# 4.4 Departments

## Contents:
### Set import variable
### Departments - Import
### Departments - Extract header from data fields and apply as header
### Departments - Data dictionary
### Products - Import and analysis
### Products - Create subset of data for dept 19 (snacks)
### Exercises (1-12)

In [56]:
# Import libraries
import pandas as pd
import numpy as np
import os

### Set path for import

In [57]:
# Set path for import/export
# The data folder defaults to the original local project location, but can be
# overridden by setting the INSTACART_DATA_DIR environment variable so the
# notebook also runs on machines where the data lives elsewhere.
# An unset or empty variable falls back to the default.
path = os.environ.get('INSTACART_DATA_DIR') or r'C:\Users\XLT2\CFProjects\2023-04-07 Instacart Basket Analysis\02 Data'

### Import departments.csv

In [58]:
# Load the raw departments file from the 'Original Data' folder
departments_csv = os.path.join(path, 'Original Data', 'departments.csv')
df_dep = pd.read_csv(departments_csv, index_col = False)

In [59]:
# Preview the first rows of the raw departments table to confirm the import
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [60]:
# Check the dimensions (rows, columns) of the raw departments table
df_dep.shape

(1, 22)

### Transpose departments - wide to long

In [61]:
# Swap rows and columns so each department gets its own row (wide -> long)
df_dep_t = df_dep.transpose()

In [62]:
# Display the transposed table to confirm it is now in long format
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [63]:
# Preview the table with the index reset to a numbered range
# NOTE: the result is only displayed, not assigned, so df_dep_t itself keeps its original index
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


## Departments - Extract header from data fields and apply as header

In [64]:
# Take the first row of the transposed table (the row holding the text 'department')
# and keep it in new_header so it can be moved from the data into the column header
new_header = df_dep_t.iloc[0]

In [65]:
# Display new_header to confirm it holds the expected header value
new_header

0    department
Name: department_id, dtype: object

In [66]:
# Keep every row from position 1 onward, i.e. drop the leading header row from the data
df_dep_t_new = df_dep_t.iloc[1:]

In [67]:
# Display the new table to confirm the header row is no longer part of the data
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [68]:
# Use the saved header row as the column labels of the new table
# (new_header holds the single value 'department', matching the single data column)
df_dep_t_new.columns = new_header

In [69]:
# Display the table again to confirm the column header was applied
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


### Create data dictionary

In [70]:
# Build the data dictionary: one entry per row, keyed by that row's index label,
# each mapping column name -> value
data_dict = df_dep_t_new.to_dict(orient='index')

In [71]:
# Display the full data dictionary to confirm its keys and values
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [72]:
# Look up key '19' in the data dictionary and print its entry
dept_19_entry = data_dict.get('19')
print(dept_19_entry)

{'department': 'snacks'}


### Import products.csv

In [73]:
# Load the raw products file from the 'Original Data' folder
products_csv = os.path.join(path, 'Original Data', 'products.csv')
df_prods = pd.read_csv(products_csv, index_col = False)

In [74]:
# Preview the first rows of the products table to confirm the import
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [75]:
# Check the dimensions (rows, columns) of the products table
df_prods.shape

(49693, 5)

### Create subset of data for dept 19 (snacks)

In [76]:
# Flag each product row: True where department_id equals 19, False otherwise
df_prods['department_id'].eq(19)

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [77]:
# Show only the product rows whose department_id is 19
df_prods.loc[df_prods['department_id'] == 19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [78]:
# Store the department 19 (snacks) rows in their own table
is_snack = df_prods['department_id'] == 19
df_snacks = df_prods.loc[is_snack]

In [79]:
# Display the snacks subset to confirm it only contains department 19 rows
df_snacks

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [80]:
# Re-check the dimensions of the full products table after creating the subset
# NOTE(review): this inspects df_prods, not df_snacks - confirm whether df_snacks.shape was intended
df_prods.shape

(49693, 5)

In [81]:
# Write snacks to csv in the 'Prepared Data' folder
# NOTE: to_csv is called without index=False, so the row index is also written as an unnamed first column
df_snacks.to_csv(os.path.join(path, 'Prepared Data', 'snacks.csv'))

In [82]:
# Write prods to csv in the 'Prepared Data' folder
# NOTE: to_csv is called without index=False, so the row index is also written as an unnamed first column
df_prods.to_csv(os.path.join(path, 'Prepared Data', 'prods.csv'))

# 4.4 Final Tasks

### Step 2 - Import orders

In [83]:
# Load the raw orders file from the 'Original Data' folder
orders_csv = os.path.join(path, 'Original Data', 'orders.csv')
df_ords = pd.read_csv(orders_csv, index_col = False)

In [84]:
# Display the orders table to confirm the import
df_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [85]:
# Inspect the column names and data types of the orders table
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            int64  
 4   order_dow               int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 182.7+ MB


### Step 2 - 'order_number' data type change from int64 to str

In [86]:
# Convert 'order_number' to text and store it back in the same column
order_number_as_text = df_ords['order_number'].astype(str)
df_ords['order_number'] = order_number_as_text

In [87]:
# Confirm that 'order_number' is no longer listed as int64
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            object 
 4   order_dow               int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(4), object(2)
memory usage: 182.7+ MB


### Step 3 - Rename 'order_dow' (in place, without reassigning the df)

In [88]:
# Rename 'order_dow' to 'order_day_of_week'
# inplace = True modifies df_ords directly, so no reassignment (df_ords = ...) is needed
df_ords.rename(columns = {'order_dow' : 'order_day_of_week'}, inplace = True)

In [89]:
# Confirm the column is now listed as 'order_day_of_week'
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            object 
 4   order_day_of_week       int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(4), object(2)
memory usage: 182.7+ MB


### Step 4 - Identify busiest hour for orders placed

In [90]:
# Display the orders table before analysing order hours
df_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [91]:
# Count orders for each hour of day, listed from the most to the least frequent hour
# (dropna = False means missing values would be counted as well)
df_ords['order_hour_of_day'].value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

### Busiest hour is hour 10 (288,418 orders)

### Step 5 - Isolate and identify 'department_id' of '4'

In [92]:
# Display the products table to review its department_id column
df_prods

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [93]:
# Look up key '4' in the previously built data dictionary and print its entry
dept_4_entry = data_dict.get('4')
print(dept_4_entry)

{'department': 'produce'}


### Step 6 - Create subset of data to isolate only breakfast sales

In [94]:
# Store the department 14 (breakfast) product rows in their own table
is_breakfast = df_prods['department_id'] == 14
df_breakfast_items = df_prods[is_breakfast]

In [95]:
# Display the breakfast subset to confirm it only contains department 14 rows
df_breakfast_items

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


### Step 7 - Create subset of data to isolate only dinner party items (alcohol, deli, beverages, meat/seafood)

In [96]:
# Store the dinner party product rows in their own table
# Department ids: 5 = alcohol, 20 = deli, 7 = beverages, 12 = meat seafood
dinner_party_dept_ids = [5, 20, 7, 12]
df_dinner_party = df_prods[df_prods['department_id'].isin(dinner_party_dept_ids)]

In [97]:
# Display the dinner party subset to confirm the selected departments
df_dinner_party

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


### Step 8 - Confirm row count in dinner party df

In [98]:
# Check the row and column count of the dinner party subset
df_dinner_party.shape

(7650, 5)

In [99]:
# Inspect data types and non-null counts per column of the dinner party subset
df_dinner_party.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7650 entries, 2 to 49688
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     7650 non-null   int64  
 1   product_name   7647 non-null   object 
 2   aisle_id       7650 non-null   int64  
 3   department_id  7650 non-null   int64  
 4   prices         7650 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 358.6+ KB


### Step 9 - Create subset of data to isolate user_id '1'

In [100]:
# Display the orders table before filtering it by user
df_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [101]:
# Store the orders whose user_id equals the integer 1 in their own table
user_1_mask = df_ords['user_id'] == 1
df_ords_user_1 = df_ords.loc[user_1_mask]

In [102]:
# Display the user_id 1 subset to confirm its contents
df_ords_user_1

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


### Step 10 - View statistics of user_id == 1 df

In [103]:
# View summary statistics of the user_id 1 subset
# ('order_number' was converted to str earlier, so it does not appear among the numeric columns)
df_ords_user_1.describe()

Unnamed: 0,order_id,user_id,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,11.0,10.0
mean,1923450.0,1.0,2.636364,10.090909,19.0
std,1071950.0,0.0,1.286291,3.477198,9.030811
min,431534.0,1.0,1.0,7.0,0.0
25%,869017.0,1.0,1.5,7.5,14.25
50%,2295261.0,1.0,3.0,8.0,19.5
75%,2544846.0,1.0,4.0,13.0,26.25
max,3367565.0,1.0,4.0,16.0,30.0


### Step 12 - Export orders and departments to csv

In [54]:
# Export the wrangled orders table to the 'Prepared Data' folder
# NOTE: to_csv is called without index=False, so the row index is also written as an unnamed first column
df_ords.to_csv(os.path.join(path, 'Prepared Data', 'orders_wrangled.csv'))

In [69]:
# Export the wrangled departments table to the 'Prepared Data' folder
# The index (which holds the department ids) is written as the first column of the file
df_dep_t_new.to_csv(os.path.join(path, 'Prepared Data', 'departments_wrangled.csv'))