# Section 1: Import Libraries

In [3]:
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import os  # For operating system interactions

# Define project folder path

In [5]:
project_folder_path = r"C:\Users\marci\15-07-2024Instacart Basket Analysis\02 Data"

In [None]:
# Load data

In [7]:
orders_path = os.path.join(project_folder_path, "Original Data", "orders.csv")
df_ords = pd.read_csv(orders_path)

In [17]:
products_path = os.path.join(project_folder_path, "Original Data", "products.csv")
df_prods = pd.read_csv(products_path)

In [25]:
df_prods.head

<bound method NDFrame.head of        product_id                                       product_name  \
0               1                         Chocolate Sandwich Cookies   
1               2                                   All-Seasons Salt   
2               3               Robust Golden Unsweetened Oolong Tea   
3               4  Smart Ones Classic Favorites Mini Rigatoni Wit...   
4               5                          Green Chile Anytime Sauce   
...           ...                                                ...   
49688       49684          Vodka, Triple Distilled, Twist of Vanilla   
49689       49685                 En Croute Roast Hazelnut Cranberry   
49690       49686                                   Artisan Baguette   
49691       49687         Smartblend Healthy Metabolism Dry Cat Food   
49692       49688                             Fresh Foaming Cleanser   

       aisle_id  department_id  prices  
0            61             19     5.8  
1           104        

## Data Wrangling Procedures

# 2.  Change 'order_id' from numeric to string

In [36]:
df_ords['order_id'] = df_ords['order_id'].astype(str)A

In [42]:
df_ords['order_id'].dtype

dtype('O')

# 3. Rename 'order_dow' to 'order_day_of_week'

In [130]:
df_ords = df_ords.rename(columns={'order_dow': 'order_day_of_week'})

In [132]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


# 4.  Find the frequency of orders by hour of day

In [44]:
order_hour_freq = df_ords['order_hour_of_day'].value_counts().sort_index()
# Print the busiest hour
busiest_hour = order_hour_freq.idxmax()
print(f"The busiest hour for placing orders is: {busiest_hour}")

The busiest hour for placing orders is: 10


# transposing departments dataframe

In [46]:
departments_path = os.path.join(project_folder_path, "Original Data", "departments.csv")
df_dep = pd.read_csv(departments_path)

In [48]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [50]:
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [52]:
df_dep_t = df_dep.T

In [54]:
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [56]:
new_header = df_dep_t.iloc[0]

In [58]:
df_dep_t_new = df_dep_t[1:]

In [60]:
new_header

0    department
Name: department_id, dtype: object

In [62]:
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [64]:
df_dep_t_new.columns = new_header #set the header row as the df header

In [66]:
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# 5. Determine the meaning behind a value of 4 in the "department_id"creating a dictionary

In [68]:
data_dict = df_dep_t_new.to_dict('index')


In [70]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [80]:
#print(data_dict.get('4'))
dept_4_meaning = data_dict.get('4')
print(f"The meaning of department_id 4 is: {dept_4_meaning}")

The meaning of department_id 4 is: {'department': 'produce'}


#  6. Create a subset for breakfast items


In [100]:
# Filter breakfast items using the department dictionary
df_breakfast =  df_prods[df_prods['department_id']==14]

In [102]:
df_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


# 7.  Find all observations for dinner party items


In [116]:
df_dinnerparty = df_prods.loc[df_prods['department_id'].isin([5,20,7,12])]


In [122]:
df_dinnerparty.head()


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


# 8. Count rows in the last dataframe

In [124]:
# Count rows in dinner_party_items dataframe
dinner_party_items_count = len(df_dinnerparty)
print(f"The dinner_party_items dataframe has {dinner_party_items_count} rows.")

The dinner_party_items dataframe has 7650 rows.


# 9. Extract information for user with user_id of "1"

In [126]:
# Extract all information for user_id 1
user_1_data = df_ords[df_ords['user_id'] == 1]

# 10. Provide basic Info for user_id "1"


In [128]:
# Basic stats for user_id 1
user_1_order_stats = user_1_data.describe()
print(user_1_order_stats)

       user_id  order_number  order_day_of_week  order_hour_of_day  \
count     11.0     11.000000          11.000000          11.000000   
mean       1.0      6.000000           2.636364          10.090909   
std        0.0      3.316625           1.286291           3.477198   
min        1.0      1.000000           1.000000           7.000000   
25%        1.0      3.500000           1.500000           7.500000   
50%        1.0      6.000000           3.000000           8.000000   
75%        1.0      8.500000           4.000000          13.000000   
max        1.0     11.000000           4.000000          16.000000   

       days_since_prior_order  
count               10.000000  
mean                19.000000  
std                  9.030811  
min                  0.000000  
25%                 14.250000  
50%                 19.500000  
75%                 26.250000  
max                 30.000000  
