In [1]:
# import pandas to read the Excel spreadsheet into a dataframe
import pandas as pd

In [2]:
# import Mr Haulage data
haulage_df = pd.read_excel('mr_haulage_order_details.xlsx')

# config display settings
pd.set_option('display.max.columns', None, 'display.width', None, 'display.max.colwidth', None)

# print sample df to see what we're working with
haulage_df.head(20)

Unnamed: 0,Order ID,Customer ID,Order Date,Order Time,Item Serial,Box Type,Delivery Region,Distance (miles)
0,1097342,733603,22/08/2021,00:14,30351,Small,South East,70
1,1097343,405061,22/08/2021,07:08,17634,Small,Greater London,32
2,1097344,842139,22/08/2021,10:15,25598,Small,South West,190
3,1097345,211806,22/08/2021,17:05,10104,Small,South West,85
4,1097346,103222,22/08/2021,23:48,3252,Small,Greater London,43
5,1097347,603400,22/08/2021,23:57,62831,Small,Greater London,33
6,1097348,837737,23/08/2021,02:11,90766,Large,West Midlands,143
7,1097349,334749,23/08/2021,04:43,93186,Large,Greater London,45
8,1097350,239710,23/08/2021,11:49,99590,Large,North East,210
9,1097351,730371,23/08/2021,14:03,39952,Small,South West,110


In [10]:
# check the shape of the df
f"haulage_df has {haulage_df.shape[0]} rows & {haulage_df.shape[1]} columns"

'haulage_df has 2000 rows & 8 columns'

In [6]:
# check the data types
haulage_df.dtypes

order_id            object
customer_id         object
order_date          object
order_time          object
item_serial         object
box_type            object
delivery_region     object
distance_(miles)    object
dtype: object

In [4]:
# convert column names to lowercase and replace whitespace with underscores
haulage_df.columns = haulage_df.columns.str.lower().str.replace(' ', '_')

# Convert columns to a string type
haulage_df = haulage_df.astype(str)

# Convert row values to lowercase and replace whitespace with underscores
haulage_df = haulage_df.apply(lambda x: x.str.lower().str.replace(' ', '_'))

haulage_df.head()

Unnamed: 0,order_id,customer_id,order_date,order_time,item_serial,box_type,delivery_region,distance_(miles)
0,1097342,733603,22/08/2021,00:14,30351,small,south_east,70
1,1097343,405061,22/08/2021,07:08,17634,small,greater_london,32
2,1097344,842139,22/08/2021,10:15,25598,small,south_west,190
3,1097345,211806,22/08/2021,17:05,10104,small,south_west,85
4,1097346,103222,22/08/2021,23:48,3252,small,greater_london,43


In [5]:
haulage_df = haulage_df.sort_values(by='order_date', ascending=True)

haulage_df

Unnamed: 0,order_id,customer_id,order_date,order_time,item_serial,box_type,delivery_region,distance_(miles)
377,1097719,378716,01/01/2022,07:39,39886,small,greater_london,22
1725,1099067,439570,01/01/2023,03:51,76996,large,east_midlands,90
1726,1099068,117777,01/01/2023,18:41,15262,small,greater_london,20
436,1097778,769510,01/02/2022,01:45,93601,large,east_midlands,114
437,1097779,153229,01/02/2022,02:34,62821,small,east_midlands,119
...,...,...,...,...,...,...,...,...
1567,1098909,718142,31/10/2022,15:55,19031,small,greater_london,37
1563,1098905,314081,31/10/2022,00:23,60931,small,greater_london,21
1565,1098907,887271,31/10/2022,09:01,86762,large,north_east,267
376,1097718,276730,31/12/2021,11:05,13104,small,south_wales,168


In [16]:
columns = haulage_df.columns.tolist()
columns

['order_id',
 'customer_id',
 'order_date',
 'order_time',
 'item_serial',
 'box_type',
 'delivery_region',
 'distance_(miles)']

In [30]:
messages = []
for col in columns:
    unique_count = haulage_df[col].nunique()
    messages.append(f"There are {unique_count} unique values in {col} column")

for message in messages:
    print(message)


There are 2000 unique values in order_id column
There are 1792 unique values in customer_id column
There are 597 unique values in order_date column
There are 1083 unique values in order_time column
There are 1982 unique values in item_serial column
There are 2 unique values in box_type column
There are 8 unique values in delivery_region column
There are 289 unique values in distance_(miles) column


In [23]:
# .t transpose

description_df = haulage_df.describe().T
description_df

Unnamed: 0,count,unique,top,freq
order_id,2000,2000,1097719,1
customer_id,2000,1792,103222,106
order_date,2000,597,24/08/2021,10
order_time,2000,1083,10:35,7
item_serial,2000,1982,29435,2
box_type,2000,2,small,1511
delivery_region,2000,8,greater_london,629
distance_(miles),2000,289,45,40


In [24]:
haulage_df.nunique()

order_id            2000
customer_id         1792
order_date           597
order_time          1083
item_serial         1982
box_type               2
delivery_region        8
distance_(miles)     289
dtype: int64

In [None]:
# convert date data to 'datetime64'
haulage_df['order_date'] = pd.to_datetime(haulage_df['order_date'], format='%d/%m/%Y')

# convert time data to 'datetime64'
haulage_df['order_time'] = pd.to_datetime(haulage_df['order_time'], format='%H:%M')

# Format to only show hour and minute, and convert back to string
haulage_df['order_time'] = haulage_df['order_time'].dt.strftime('%H:%M')

haulage_df.head()

In [None]:
haulage_df.dtypes

In [None]:
# uniform format for string type data
haulage_df[['box_type', 'delivery_region']] = haulage_df[['box_type', 'delivery_region']].apply(lambda x: x.str.lower().str.replace(' ', '_'))

haulage_df.head()

In [None]:
haulage_df.dtypes

In [None]:
# Display the count of NaN values for each column
nan_counts = haulage_df.isnull().sum(axis=0)
print(nan_counts)

In [None]:
# No. of orders per customer_id
customer_id_order_count = haulage_df['customer_id'].value_counts()
customer_id_order_count

In [None]:
# check for duplicate rows (criteria = same order_id & customer_id)

duplicates = haulage_df[haulage_df.duplicated(subset=['order_id', 'customer_id'], keep=False)]

print(f"There are {len(duplicates)} duplicates in the dataframe.")

duplicates

### The data is pretty clean etc. ready to analyse different areas

In [None]:
# create new columns to analyse seasonal trends & year-on-year trends

year_values = haulage_df['order_date'].dt.year
month_values = haulage_df['order_date'].dt.month

haulage_df.insert(4, 'order_year', year_values)
haulage_df.insert(5, 'order_mont', month_values)
haulage_df.head()


In [None]:
year_counts = haulage_df['year'].value_counts()
month_counts = haulage_df['month'].value_counts()

year_counts