**Table of contents**<a id='toc0_'></a>    
- 1. [Importing Data](#toc1_)    
- 2. [Exercise 4.7 Practice](#toc2_)    
- 3. [Deriving New Variables](#toc3_)    
  - 3.1. [Creating Price Label and Busiest Day Columns](#toc3_1_)    
  - 3.2. [Creating Busiest Days Column](#toc3_2_)    
  - 3.3. [The results indicate that customers order mostly during weekends with the busiest day (Saturday) and second busiest day (Sunday) accounting for about 36.6% of total orders.](#toc3_3_)    
  - 3.4. [Identifying Busiest Time of Day](#toc3_4_)    
- 4. [Exporting DataFrame as Pickle File](#toc4_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# 1. <a id='toc1_'></a>[Importing Data](#toc0_)

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import os

In [2]:
Path = r'D:\Data Analysis\01-08-2025 Instacart Basket Analysis\Data'
df_merged = pd.read_pickle(os.path.join(Path, 'Prepared Data', 'ord_pro_merge.pkl'))
df_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,pro_merged
0,3108588,1,prior,8,1,14,14.0,12427,1,1,both,Original Beef Jerky,23,19,4.4,both
1,3108588,1,prior,8,1,14,14.0,196,2,1,both,Soda,77,7,9.0,both
2,3108588,1,prior,8,1,14,14.0,10258,3,1,both,Pistachios,117,19,3.0,both
3,3108588,1,prior,8,1,14,14.0,25133,4,1,both,Organic String Cheese,21,16,8.6,both
4,3108588,1,prior,8,1,14,14.0,46149,5,0,both,Zero Calorie Cola,77,7,13.4,both


In [3]:
# Removing merge verification columns of previous exercise
df_merged = df_merged.drop(columns=['_merge', 'pro_merged'])
df_merged.shape

(32404859, 14)

# 2. <a id='toc2_'></a>[Exercise 4.7 Practice](#toc0_)

In [None]:
# Creating a subset
df = df_merged[:1000000]

In [5]:
def price_label(row):
  
  if row['prices']>15:
    return 'HighRange'
  elif (row['prices']<= 15) and (row['prices']>5):
    return 'MidRange'
  elif row['prices']<= 5:
    return 'LowRange'
  else: return 'Not Enough Data'

In [None]:
df['price_range'] = df.apply(price_label, axis=1)
df['price_range'].value_counts(dropna = False)

# 3. <a id='toc3_'></a>[Deriving New Variables](#toc0_)

## 3.1. <a id='toc3_1_'></a>[Creating Price Label and Busiest Day Columns](#toc0_)

In [7]:
df_merged.loc[df_merged['prices']>15, 'price_range'] = 'High Range'
df_merged.loc[(df_merged['prices']<= 15) & (df_merged['prices']>5), 'price_range'] = 'Mid Range'
df_merged.loc[df_merged['prices']<= 5, 'price_range'] = 'Low Range'
df_merged['price_range'].value_counts(dropna=False)

price_range
Mid Range     21860860
Low Range     10126321
High Range      417678
Name: count, dtype: int64

In [8]:
df_merged['order_day_of_week'].value_counts(dropna=False)

order_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [9]:
busyStatus = []

for value in df_merged['order_day_of_week']:
  if value == 0:
    busyStatus.append('Busiest Day')
  elif value == 4:
    busyStatus.append('Least Busiest')
  else: 
    busyStatus.append('Regularly Busy')

In [10]:
df_merged['busiest_day'] = busyStatus
df_merged['busiest_day'].value_counts(dropna=False)

busiest_day
Regularly Busy    22416875
Busiest Day        6204182
Least Busiest      3783802
Name: count, dtype: int64

In [11]:
df_merged = df_merged.drop(columns='busiest_day')
df_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range
0,3108588,1,prior,8,1,14,14.0,12427,1,1,Original Beef Jerky,23,19,4.4,Low Range
1,3108588,1,prior,8,1,14,14.0,196,2,1,Soda,77,7,9.0,Mid Range
2,3108588,1,prior,8,1,14,14.0,10258,3,1,Pistachios,117,19,3.0,Low Range
3,3108588,1,prior,8,1,14,14.0,25133,4,1,Organic String Cheese,21,16,8.6,Mid Range
4,3108588,1,prior,8,1,14,14.0,46149,5,0,Zero Calorie Cola,77,7,13.4,Mid Range


## 3.2. <a id='toc3_2_'></a>[Creating Busiest Days Column](#toc0_)

In [12]:
activeStatus = []

for dow in df_merged['order_day_of_week']:
  if dow == 0:
    activeStatus.append('Busiest Day')
  elif dow == 1:
    activeStatus.append('Second Busiest')
  elif dow == 4:
    activeStatus.append('Slowest Day')
  elif dow == 3:
    activeStatus.append('Second Slowest')
  else:
    activeStatus.append('Regularly Busy')

In [14]:
from pandas.api.types import CategoricalDtype

In [None]:
df_merged['busiest_days'] = activeStatus
busiestOrder = CategoricalDtype(categories=['Busiest Day', 'Second Busiest', 'Regularly Busy', 'Second Slowest', 'Slowest Day'], ordered=True)
df_merged['busiest_days'] = df_merged['busiest_days'].astype(busiestOrder)

In [23]:
df_merged['busiest_days'].value_counts(dropna=False).reindex(busiestOrder.categories)

Busiest Day        6204182
Second Busiest     5660230
Regularly Busy    12916111
Second Slowest     3840534
Slowest Day        3783802
Name: count, dtype: int64

## 3.3. <a id='toc3_3_'></a>[The results indicate that customers order mostly during weekends with the busiest day (Saturday) and second busiest day (Sunday) accounting for about 36.6% of total orders.](#toc0_)

## 3.4. <a id='toc3_4_'></a>[Identifying Busiest Time of Day](#toc0_)

In [24]:
df_merged['order_hour_of_day'].value_counts(dropna=False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [25]:
df_merged['order_hour_of_day'].value_counts(dropna=False).mean()

np.float64(1350202.4583333333)

In [26]:
busyHour = []

for hod in df_merged['order_hour_of_day']:
  if hod >= 9:
    busyHour.append('Most Orders')
  elif hod in [8,17,18,19]:
    busyHour.append('Average Orders')
  else:
    busyHour.append('Fewest Orders')

In [28]:
# 5. Frequency of Busiest Period of Day Column
df_merged['busiest_period_of_day'] = busyHour
busyHourOrder = CategoricalDtype(categories=['Most Orders', 'Average Orders', 'Fewest Orders'], ordered=True)
df_merged['busiest_period_of_day'] = df_merged['busiest_period_of_day'].astype(busyHourOrder)
df_merged['busiest_period_of_day'].value_counts(dropna=False).reindex(busyHourOrder.categories)

Most Orders       28908866
Average Orders     1718118
Fewest Orders      1777875
Name: count, dtype: int64

In [29]:
df_merged.head(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_days,busiest_period_of_day
0,3108588,1,prior,8,1,14,14.0,12427,1,1,Original Beef Jerky,23,19,4.4,Low Range,Second Busiest,Most Orders
1,3108588,1,prior,8,1,14,14.0,196,2,1,Soda,77,7,9.0,Mid Range,Second Busiest,Most Orders
2,3108588,1,prior,8,1,14,14.0,10258,3,1,Pistachios,117,19,3.0,Low Range,Second Busiest,Most Orders
3,3108588,1,prior,8,1,14,14.0,25133,4,1,Organic String Cheese,21,16,8.6,Mid Range,Second Busiest,Most Orders
4,3108588,1,prior,8,1,14,14.0,46149,5,0,Zero Calorie Cola,77,7,13.4,Mid Range,Second Busiest,Most Orders
5,3108588,1,prior,8,1,14,14.0,49235,6,0,Organic Half & Half,53,16,1.8,Low Range,Second Busiest,Most Orders
6,1901567,2,prior,3,1,10,3.0,47766,1,1,Organic Avocado,24,4,6.3,Mid Range,Second Busiest,Most Orders
7,1901567,2,prior,3,1,10,3.0,32792,2,1,Chipotle Beef & Pork Realstick,23,19,5.2,Mid Range,Second Busiest,Most Orders
8,1901567,2,prior,3,1,10,3.0,20574,3,1,Roasted Turkey,96,20,2.3,Low Range,Second Busiest,Most Orders
9,1901567,2,prior,3,1,10,3.0,7781,4,0,Organic Sticks Low Moisture Part Skim Mozzarel...,21,16,6.6,Mid Range,Second Busiest,Most Orders


# 4. <a id='toc4_'></a>[Exporting DataFrame as Pickle File](#toc0_)


In [None]:
df_merged.to_pickle(os.path.join(Path, 'Prepared Data', 'ord_pro_busyderived.pkl'))