**Table of contents**<a id='toc0_'></a>    
- 1. [Importing Data](#toc1_)    
- 2. [Data Wrangling](#toc2_)    
  - 2.1. [Initial Data Wrangling](#toc2_1_)    
  - 2.2. [Changing Data Types](#toc2_2_)    
  - 2.3. [Variable name change](#toc2_3_)    
  - 2.4. [Busiest hour for placing orders is 10:00 a.m.](#toc2_4_)    
  - 2.5. [Determining the meaning of department_id 4](#toc2_5_)    
  - 2.6. [Creating df_breakfast subset of department_id 14](#toc2_6_)    
  - 2.7. [Products that can be used for dinner parties.](#toc2_7_)    
  - 2.8. [Number of rows in df_dinner_parties: 7,650](#toc2_8_)    
  - 2.9. [User_id 1 details:](#toc2_9_)    
  - 2.10. [User behavior](#toc2_10_)    
- 3. [Exporting Data Frames](#toc3_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=false
	minLevel=1
	maxLevel=3
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# 1. <a id='toc1_'></a>[Importing Data](#toc0_)

In [50]:
# Importing Libraries
import pandas as pd
import numpy as np
import os

In [51]:
Path = r"D:\Data Analysis\01-08-2025 Instacart Basket Analysis\Data"

In [52]:
df_ord = pd.read_csv(os.path.join(Path, 'Original Data', 'orders.csv'), index_col=False)

In [53]:
df_pro = pd.read_csv(os.path.join(Path, 'Original Data', 'products.csv'), index_col = False)

In [54]:
df_pro.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


# 2. <a id='toc2_'></a>[Data Wrangling](#toc0_)

## 2.1. <a id='toc2_1_'></a>[Initial Data Wrangling](#toc0_)

In [57]:
df_ord.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [58]:
#Dropping eval_set column from orders.csv
df_ord.drop(columns=['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [59]:
# Identifying missing values in columns without dropping them.
df_ord['days_since_prior_order'].value_counts(dropna=False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

In [60]:
# Overwriting existing column name instead of creating a new column 
df_ord.rename(columns = {'order_dow':'order_day_of_week'}, inplace=True)

In [61]:
df_ord.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [64]:
df_dept = pd.read_csv(os.path.join(Path, 'Original Data', 'departments.csv'), index_col=False)

In [65]:
df_dept.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [66]:
# Transposing df_dept
df_dept_t = df_dept.T

In [67]:
df_dept_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [68]:
#Adding an index to df_dept_t
df_dept_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [69]:
#Creating new variable consisting of first row of df_dept_t
new_header = df_dept_t.iloc[0]
new_header

0    department
Name: department_id, dtype: object

In [70]:
#Taking the data below the header row to update df_dept_t without previous headers
df_dept_t = df_dept_t[1:]
df_dept_t

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [71]:
# Setting the header row variable as the new header of df_dept_t
df_dept_t.columns = new_header
df_dept_t

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [72]:
df_dept_t.columns

Index(['department'], dtype='object', name='department_id')

In [73]:
#Data dictionary with meanings for depatment_id column of df_dept_t
data_dict = df_dept_t.to_dict('index')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [74]:
df_pro.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [75]:
#Figuring out what 19 means in data_dict
# Alt: print(data_dict['19'])
print(data_dict.get('19'))

{'department': 'snacks'}


In [76]:
df_pro['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [77]:
df_pro[df_pro['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [78]:
#Alternative 1 - Creating a subsetting name df_snacks of department_id = 19
df_snacks = df_pro[df_pro['department_id']==19]
df_snacks

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [79]:
#Alternative 2 - Creating a subsetting name df_snacks of department_id = 19
df_snacks_2 = df_pro.loc[df_pro['department_id']==19]

In [80]:
#Alternative 3 - Creating a subsetting name df_snacks of department_id = 19
df_snacks_3 = df_pro.loc[df_pro['department_id'].isin([17,18,19])]
df_snacks_3

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
13,14,Fresh Scent Dishwasher Cleaner,74,17,6.5
14,15,Overnight Diapers Size 6,56,18,11.2
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
...,...,...,...,...,...
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49672,49668,Apple Cinnamon Scented Candles,101,17,5.6
49675,49671,Milk Chocolate Drops,45,19,3.0


## 2.2. <a id='toc2_2_'></a>[Changing Data Types](#toc0_)

In [82]:
df_ord['userid'] = df_ord['user_id'].astype('str')

In [62]:
# changing the data type of values in a column
df_ord['order_id'] = df_ord['order_id'].astype('str')

In [63]:
df_ord['order_id'].dtype

dtype('O')

## 2.3. <a id='toc2_3_'></a>[Variable name change](#toc0_)

In [None]:
df_ord.rename(columns = {'order_number':'order_sequence_number'}, inplace = True)
df_ord.head()

Unnamed: 0,order_id,user_id,eval_set,order_count,order_day_of_week,order_hour_of_day,days_since_prior_order,userid
0,2539329,1,prior,1,2,8,,1
1,2398795,1,prior,2,3,7,15.0,1
2,473747,1,prior,3,3,12,21.0,1
3,2254736,1,prior,4,4,7,29.0,1
4,431534,1,prior,5,4,15,28.0,1


## 2.4. <a id='toc2_4_'></a>[Busiest hour for placing orders is 10:00 a.m.](#toc0_)

In [84]:
df_ord['order_hour_of_day'].value_counts()

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

## 2.5. <a id='toc2_5_'></a>[Determining the meaning of department_id 4](#toc0_)

In [85]:
print(data_dict.get('4'))

{'department': 'produce'}


## 2.6. <a id='toc2_6_'></a>[Creating df_breakfast subset of department_id 14](#toc0_)

In [86]:
df_breakfast = df_pro[df_pro['department_id'].isin([14])]
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


## 2.7. <a id='toc2_7_'></a>[Products that can be used for dinner parties.](#toc0_)

In [87]:
df_dinner_parties = df_pro[df_pro['department_id'].isin([5,20,7,12])].sort_values(by=['department_id', 'product_id'])
df_dinner_parties

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
51,52,Mirabelle Brut Rose,134,5,14.4
118,119,Chardonnay Paso Robles,62,5,5.5
149,150,Brut Rosé,134,5,12.9
233,234,Tennessee Whiskey,124,5,3.1
248,249,"Pinot Grigio, California, 2010",62,5,2.7
...,...,...,...,...,...
49558,49554,Roasted Garlic Hommus,67,20,14.8
49564,49560,Selects Natural Slow Roasted Chicken Breast,96,20,14.5
49585,49581,Pinto Bean and Cheese Pupusa,13,20,10.5
49609,49605,Classic Hummus Family Size,67,20,3.5


## 2.8. <a id='toc2_8_'></a>[Number of rows in df_dinner_parties: 7,650](#toc0_)

In [88]:
df_dinner_parties.shape

(7650, 5)

## 2.9. <a id='toc2_9_'></a>[User_id 1 details:](#toc0_)

In [89]:
#order details of customer 1
cus_1 = df_ord.loc[df_ord['user_id']== 1]
cus_1

Unnamed: 0,order_id,user_id,eval_set,order_count,order_day_of_week,order_hour_of_day,days_since_prior_order,userid
0,2539329,1,prior,1,2,8,,1
1,2398795,1,prior,2,3,7,15.0,1
2,473747,1,prior,3,3,12,21.0,1
3,2254736,1,prior,4,4,7,29.0,1
4,431534,1,prior,5,4,15,28.0,1
5,3367565,1,prior,6,2,7,19.0,1
6,550135,1,prior,7,1,9,20.0,1
7,3108588,1,prior,8,1,14,14.0,1
8,2295261,1,prior,9,1,16,0.0,1
9,2550362,1,prior,10,4,8,30.0,1


In [90]:
df_cust = pd.read_csv(os.path.join(Path, 'Original Data', 'customers.csv'), index_col=False)
df_cust

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...,...,...
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799


In [91]:
df_cust.rename(columns = {'Surnam':'Surname'}, inplace = True)
df_cust.head()

Unnamed: 0,user_id,First Name,Surname,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [92]:
df_cust.loc[df_cust['user_id']==1]

Unnamed: 0,user_id,First Name,Surname,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
134862,1,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [93]:
#demographic details of customer 1
cust_1 = df_cust.loc[df_cust['user_id']==1]
cust_1

Unnamed: 0,user_id,First Name,Surname,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
134862,1,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [94]:
print('Customer Demographics:' '\n', cust_1, '\n\n', 'Customer Order Details:' '\n', cus_1)

Customer Demographics:
         user_id First Name Surname  Gender    STATE  Age date_joined  \
134862        1      Linda  Nguyen  Female  Alabama   31   2/17/2019   

        n_dependants fam_status  income  
134862             3    married   40423   

 Customer Order Details:
    order_id  user_id eval_set  order_count  order_day_of_week  \
0   2539329        1    prior            1                  2   
1   2398795        1    prior            2                  3   
2    473747        1    prior            3                  3   
3   2254736        1    prior            4                  4   
4    431534        1    prior            5                  4   
5   3367565        1    prior            6                  2   
6    550135        1    prior            7                  1   
7   3108588        1    prior            8                  1   
8   2295261        1    prior            9                  1   
9   2550362        1    prior           10                  4   
10  

In [95]:
cus_1.describe()

Unnamed: 0,user_id,order_count,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,11.0,10.0
mean,1.0,6.0,2.636364,10.090909,19.0
std,0.0,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,1.0,7.0,0.0
25%,1.0,3.5,1.5,7.5,14.25
50%,1.0,6.0,3.0,8.0,19.5
75%,1.0,8.5,4.0,13.0,26.25
max,1.0,11.0,4.0,16.0,30.0


## 2.10. <a id='toc2_10_'></a>[User behavior](#toc0_)

In [None]:
# Most orders have been made on day 4 (Wednesdays)
cus_1['order_day_of_week'].value_counts()

order_day_of_week
4    4
1    3
3    2
2    2
Name: count, dtype: int64

In [None]:
# Most orders have been made in the morning at 7 a.m. and 8 a.m.
cus_1['order_hour_of_day'].value_counts()

order_hour_of_day
8     3
7     3
12    1
15    1
9     1
14    1
16    1
Name: count, dtype: int64

# 3. <a id='toc3_'></a>[Exporting Data Frames](#toc0_)

In [None]:
df_dept_t.to_csv(os.path.join(Path, 'Prepared Data', 'departments_wrangled.csv'))

In [None]:
df_ord.to_csv(os.path.join(Path, 'Prepared Data', 'orders_wrangled.csv'))