In [1]:
# imports

# standard import of pandas
import pandas as pd

In [2]:
# Load the `final_merge` DataFrame from IPython's %store storage (`-r` = restore).
# NOTE(review): this only works if `%store final_merge` was run beforehand,
# presumably in an upstream notebook — confirm, otherwise a fresh run fails here.

%store -r final_merge


### Order Lead Time = days between the order date and the order's arrival scan date

In [3]:
# Peek at five randomly chosen rows to get a feel for the merged data
final_merge.sample(n=5)

Unnamed: 0,order_id,order_date,state,region,ship_mode,ready_to_ship_date,pickup_date,arrival_scan_date
800,CA-2019-110730,2019-09-23,Washington,West,Standard Processing,NaT,2019-09-27,2019-10-03
4935,CA-2019-124100,2019-03-31,New York,East,Standard Processing,NaT,2019-04-10,NaT
3661,CA-2018-110247,2018-12-04,Florida,South,Standard Class,NaT,NaT,NaT
2637,US-2020-139465,2020-08-27,New York,East,Express,NaT,2020-08-31,NaT
785,US-2020-169502,2020-08-28,Wisconsin,Central,Standard Processing,NaT,2020-09-02,NaT


In [4]:
# Build the working frame for the lead-time analysis, keeping only the two
# date columns it needs. Filtering first and copying once avoids duplicating
# the whole of `final_merge`; the trailing `.copy()` still makes `lead_time`
# independent of the source frame.
lead_time = final_merge.filter(items=["order_date", "arrival_scan_date"]).copy()

In [5]:
# Confirm that only the two date columns were kept
lead_time.columns

Index(['order_date', 'arrival_scan_date'], dtype='object')

In [6]:
# Show the first row as a quick check of the trimmed frame
lead_time.head(1)

Unnamed: 0,order_date,arrival_scan_date
0,2019-01-16,NaT


In [7]:
# Count the rows that have an arrival scan date (non-null values)
lead_time["arrival_scan_date"].notna().sum()

333

In [8]:
# Count the rows where the arrival scan date is missing
lead_time["arrival_scan_date"].isna().sum()

4677

In [9]:
# Keep only the orders that have an arrival scan date: a lead time cannot be
# computed without one. The result is reassigned instead of using
# `inplace=True`, so the step is an explicit, chainable transformation.
lead_time = lead_time.dropna(subset=["arrival_scan_date"])

In [10]:
# Confirm the drop: the summary below should list only non-null entries
# (333 expected, matching the non-null count computed before dropping)
lead_time['arrival_scan_date'].info()

<class 'pandas.core.series.Series'>
Index: 333 entries, 17 to 5005
Series name: arrival_scan_date
Non-Null Count  Dtype         
--------------  -----         
333 non-null    datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 5.2 KB


In [11]:
# Order lead time = arrival scan date minus order date (a timedelta per order)
lead_time["order_lead_time"] = lead_time["arrival_scan_date"].sub(lead_time["order_date"])

In [12]:
# Display the computed order lead time for each remaining order
lead_time["order_lead_time"]

17     12 days
24     13 days
40     11 days
59     13 days
80     10 days
         ...  
4978   14 days
4981   11 days
4984   14 days
4987    7 days
5005   11 days
Name: order_lead_time, Length: 333, dtype: timedelta64[ns]

In [13]:
# Whole-day integer version of the lead time, handy for numeric calculations
order_lead_days = lead_time["order_lead_time"].dt.days
lead_time["order_lead_time_i"] = order_lead_days
lead_time.head(n=2)

Unnamed: 0,order_date,arrival_scan_date,order_lead_time,order_lead_time_i
17,2019-09-11,2019-09-23,12 days,12
24,2019-04-23,2019-05-06,13 days,13


In [14]:
# Derive the weekday name of the order date and of the arrival scan date,
# so that possible weekday correlations can be explored later.
for date_col, weekday_col in (
    ("order_date", "ordered_day"),          # day the order was placed
    ("arrival_scan_date", "arrival_day"),   # day the order arrived
):
    lead_time[weekday_col] = lead_time[date_col].dt.day_name()

lead_time.tail(n=3)

Unnamed: 0,order_date,arrival_scan_date,order_lead_time,order_lead_time_i,ordered_day,arrival_day
4984,2019-11-25,2019-12-09,14 days,14,Monday,Monday
4987,2019-09-16,2019-09-23,7 days,7,Monday,Monday
5005,2019-10-31,2019-11-11,11 days,11,Thursday,Monday


### What days do customers receive their orders?

In [16]:
# Distinct weekdays on which orders arrived.
# Observation: no order in this data has an arrival scan on a Saturday or Sunday.
lead_time["arrival_day"].unique()

array(['Monday', 'Friday', 'Thursday', 'Tuesday', 'Wednesday'],
      dtype=object)

### KPI: Order Lead Time observed

In [18]:
# Average order lead time in days (mean = 10.83)
lead_time["order_lead_time_i"].mean()

10.834834834834835

In [19]:
# Most frequent order lead time in days (mode = 12)
lead_time["order_lead_time_i"].mode()

0    12
Name: order_lead_time_i, dtype: int64