In [1]:
import pandas as pd
import numpy as np
import os

# 1. Create a path string
path = r'C:\Users\howel\OneDrive\Instacart Basket Analysis'

# 2. Import the datasets
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), index_col = False)
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'), index_col = False)

In [2]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [3]:
# 1. Drop the 'eval_set' column
df_ords.drop(columns = ['eval_set'], inplace = True)

# 2. Rename the column
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

# 3. Change order_id to string
df_ords['order_id'] = df_ords['order_id'].astype('str')

# Check your work to see the changes
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [4]:
# 1. Transpose the dataframe (flip rows and columns)
df_dep_t = df_dep.T

# 2. Create a new header
new_header = df_dep_t.iloc[0] # Grab the first row for the header

# 3. Create a new dataframe that takes the data under the header row
df_dep_t_new = df_dep_t[1:]

# 4. Set the header
df_dep_t_new.columns = new_header

# Check the result
df_dep_t_new.head()

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol


In [5]:
# Turn the dataframe into a dictionary
data_dict = df_dep_t_new.to_dict('index')

# Check the dictionary
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [6]:
# Change user_id to string
df_ords['user_id'] = df_ords['user_id'].astype('str')

# Check the change
df_ords['user_id'].dtype

dtype('O')

In [7]:
# --- STEP 2: Change user_id to string ---
df_ords['user_id'] = df_ords['user_id'].astype('str')


# --- STEP 3: Rename column (without overwriting) ---
# This will print a preview of the new name, but won't change the actual table permanently
df_ords.rename(columns = {'days_since_prior_order' : 'days_since_last_order'}, inplace = False)


# --- STEP 4: Find the busiest hour ---
# This counts how many orders happened in each hour (0 = midnight, 10 = 10am, etc.)
df_ords['order_hour_of_day'].value_counts()

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

In [8]:
# --- STEP 5: Meaning of Department 4 ---
print(data_dict.get(4))


# --- STEP 6: Breakfast Subset (Department 14) ---
df_breakfast = df_prods[df_prods['department_id'] == 14]


# --- STEP 7: Dinner Party Subset ---
# We use .isin() to check for multiple department IDs at once
df_dinner = df_prods[df_prods['department_id'].isin([5, 20, 7, 12])]

# Check the first few rows of the dinner party list
df_dinner.head()

None


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


In [9]:
# --- STEP 8: How many rows? ---
# The first number in the output is the number of rows.
print("Total rows in dinner party dataframe:", df_dinner.shape[0])


# --- STEP 9: Extract User 1 Info ---
# We look for user_id '1' (as a string)
df_user_1 = df_ords[df_ords['user_id'] == '1']
# Display the table
df_user_1


# --- STEP 10: User 1 Stats ---
# This calculates the count, mean, min, and max for User 1
df_user_1.describe()

Total rows in dinner party dataframe: 7650


Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0
