# 1. Setup and Data Import

In [142]:
import pandas as pd
import numpy as np
import os

In [144]:
path = r'C:\Users\Jacques\OneDrive\Documents\Data Analytics course\Data Immersion\Section 4\08 April 2025 Instacart Basket Analysis\02 Data'
df_ords = pd.read_csv(os.path.join(path, 'Original Data', 'orders.csv'))
df_prods = pd.read_csv(os.path.join(path, 'Original Data', 'products.csv'))
df_dep = pd.read_csv(os.path.join(path, 'Original Data', 'departments.csv'))

# 2. Initial Cleaning (Dropping, Renaming, Changing Data Types)

In [147]:
df_ords = df_ords.drop(columns=['eval_set'])

In [149]:
df_ords.rename(columns={'order_dow': 'orders_day_of_week'}, inplace=True)

In [151]:
df_ords['order_id'] = df_ords['order_id'].astype('str')

# 3. Transposing and Cleaning df_dep

In [158]:
df_dep_t = df_dep.T

In [160]:
df_dep_t.reset_index()
new_header = df_dep_t.iloc[0]
df_dep_t_new = df_dep_t[1:]
df_dep_t_new.columns = new_header

# 4. Creating a Data Dictionary

In [163]:
data_dict = df_dep_t_new.to_dict('index')

In [165]:
data_dict['19']

{'department': 'snacks'}

# 5. Subsetting the Data

In [172]:
df_snacks = df_prods[df_prods['department_id'] == 19]
df_breakfast = df_prods[df_prods['department_id'] == 14]

In [174]:
dinner_ids = [5, 7, 12, 20]  # Update with correct IDs
df_dinner = df_prods[df_prods['department_id'].isin(dinner_ids)]

In [176]:
df_dinner.shape[0]

7650

# 6. User Analysis (user_id == 1)

In [195]:
df_ords.rename(columns={'order_hour_of_day': 'order_hour'}, inplace=True)

In [197]:
df_user_1 = df_ords[df_ords['user_id'] == '1']

In [199]:
df_user_1.describe()
df_user_1['orders_day_of_week'].value_counts()
df_user_1['order_hour'].value_counts()

Series([], Name: count, dtype: int64)

# 7. Exporting Dataframes

In [202]:
df_ords.to_csv(os.path.join(path, 'Prepared Data', 'orders_wrangled.csv'))

In [204]:
df_dep_t_new.to_csv(os.path.join(path, 'Prepared Data', 'departments_wrangled.csv'))

________________________________________________________________________________________________________________________________________________________

# Tutorial

# 1. Import the necessary libraries 

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 2. Setting the path

In [10]:
# Set path to the 'Original Data' folder
path = r'C:\Users\Jacques\OneDrive\Documents\Data Analytics course\Data Immersion\Section 4\08 April 2025 Instacart Basket Analysis\02 Data'

# 3. Importing the data

In [13]:
# Import data
df_ords = pd.read_csv(os.path.join(path, 'Original Data', 'orders.csv'))
df_prods = pd.read_csv(os.path.join(path, 'Original Data', 'products.csv'))

# 4. Previewing data

In [18]:
# Check the first few rows of orders
df_ords.head()

# Check the first few rows of products
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [20]:
df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

# 5. Importing a new dataset

In [32]:
# Importing data set departments.csv
df_dep = pd.read_csv(os.path.join(path, 'Original Data', 'departments.csv'), index_col = False)

In [34]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [36]:
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [38]:
df_dep_t = df_dep.T

In [40]:
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [42]:
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [44]:
new_header = df_dep_t.iloc[0]

In [46]:
new_header

0    department
Name: department_id, dtype: object

In [48]:
df_dep_t_new = df_dep_t[1:]

In [50]:
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [52]:
df_dep_t_new.columns = new_header

In [54]:
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [56]:
data_dict = df_dep_t_new.to_dict('index')

In [58]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [60]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [62]:
print(data_dict.get('19'))

{'department': 'snacks'}


In [64]:
df_snacks =  df_prods[df_prods['department_id']==19]

In [66]:
df_prods['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [68]:
df_prods[df_prods['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [70]:
df_snacks =  df_prods[df_prods['department_id']==19]

In [72]:
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [74]:
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [76]:
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

## Further Data Wrangling and Subsetting Procedures

In [79]:
# Check the first few rows of orders
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [81]:
# Change another identifier variable in df_ords from INT to STRING
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [85]:
# Rename a column with an unintuitive name
df_ords.rename(columns={'order_hour_of_day': 'order_hour'}, inplace=True)

In [87]:
# Find the busiest hour for placing orders
df_ords['order_hour'].value_counts().sort_index()

order_hour
0      22758
1      12398
2       7539
3       5474
4       5527
5       9569
6      30529
7      91868
8     178201
9     257812
10    288418
11    284728
12    272841
13    277999
14    283042
15    283639
16    272553
17    228795
18    182912
19    140569
20    104292
21     78109
22     61468
23     40043
Name: count, dtype: int64

In [93]:
# Find the meaning of "department_id" = 4 in df_prods using the data dictionary
print(data_dict['4'])

{'department': 'produce'}


In [95]:
for key, value in data_dict.items():
    if value['department'] == 'breakfast':
        print(f"Breakfast department_id: {key}")

Breakfast department_id: 14


In [97]:
# Create a subset of breakfast items
df_breakfast = df_prods[df_prods['department_id'] == 14]

In [103]:
for key, value in data_dict.items():
    if value['department'] in ['alcohol', 'deli', 'beverages', 'meat', 'seafood']:
        print(f"{value['department'].title()} department_id: {key}")

Alcohol department_id: 5
Beverages department_id: 7
Deli department_id: 20


In [105]:
df_dep_t_new['department'].unique()

array(['frozen', 'other', 'bakery', 'produce', 'alcohol', 'international',
       'beverages', 'pets', 'dry goods pasta', 'bulk', 'personal care',
       'meat seafood', 'pantry', 'breakfast', 'canned goods',
       'dairy eggs', 'household', 'babies', 'snacks', 'deli', 'missing'],
      dtype=object)

In [107]:
for key, value in data_dict.items():
    if value['department'] in ['alcohol', 'deli', 'beverages', 'meat seafood']:
        print(f"{value['department'].title()} department_id: {key}")

Alcohol department_id: 5
Beverages department_id: 7
Meat Seafood department_id: 12
Deli department_id: 20


In [109]:
# Subset for dinner party items
dinner_ids = [5, 7, 12, 20]  # replace with correct IDs for alcohol, deli, beverages, meat/seafood
df_dinner = df_prods[df_prods['department_id'].isin(dinner_ids)]

In [111]:
# Count number of rows in df_dinner
df_dinner.shape[0]

7650

In [127]:
#  Extract data for user_id == "1"
df_user_1 = df_ords[df_ords['user_id'] == '1']

In [131]:
df_user_1.rename(columns={'order_dow': 'orders_day_of_week'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_1.rename(columns={'order_dow': 'orders_day_of_week'}, inplace=True)


In [133]:
df_user_1.columns

Index(['order_id', 'user_id', 'eval_set', 'order_number', 'orders_day_of_week',
       'order_hour', 'days_since_prior_order'],
      dtype='object')

In [139]:
# Basic stats for user_id == "1"
df_user_1.describe()
df_user_1['orders_day_of_week'].value_counts()
df_user_1['order_hour'].value_counts()

order_hour
8     3
7     3
12    1
15    1
9     1
14    1
16    1
Name: count, dtype: int64