## 1. Import libraries

In [110]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 2. Set system path

In [111]:
# set path for data files
path = r'C:\Users\Stony\OneDrive\CareerFoundry\Data Immersion Course\Instacart Basket Analysis'

## 3. Import Orders and Product csv files as dataframes

In [112]:
# Import Orders.csv into a dataframe
df_orders = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), index_col = False)

In [113]:
# Import Products.csv into a dataframe
df_products = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

## 4. Drop uneeded columns and save dataframe with column removed

In [114]:
# Drop eval_set column from orders dataframe
df_orders.drop(columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [115]:
# Remove eval_set column from the orders dataframe and update the dataframe
df_orders = df_orders.drop(columns = ['eval_set'])

## 5. Count values in days since prior order column

In [116]:
# Value counts of days_since_prior_order column
df_orders['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

## 6. Rename order_dow column

In [117]:
# Rename order_dow column to order_day_of_week
df_orders.rename(columns = {'order_dow' : 'order_day_of_week'}, inplace = True)

In [118]:
# Validate the change was successful
df_orders.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 7. Change data type of order_id to string

In [119]:
# Change order_id data type to string
df_orders['order_id'] = df_orders['order_id'].astype('str')

In [120]:
#Validate order_id data type was changed to string
df_orders['order_id'].dtype

dtype('O')

## 8. Import departments csv into a dataframe

In [121]:
# Import departments.csv into a dataframe
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'), index_col = False)

In [122]:
# Identify the column headers and number of rows of the department dataframe
df_dep.head

<bound method NDFrame.head of   department_id       1      2       3        4        5              6  \
0    department  frozen  other  bakery  produce  alcohol  international   

           7     8                9  ...            12      13         14  \
0  beverages  pets  dry goods pasta  ...  meat seafood  pantry  breakfast   

             15          16         17      18      19    20       21  
0  canned goods  dairy eggs  household  babies  snacks  deli  missing  

[1 rows x 22 columns]>

## 9. Transpose products dataframe

In [123]:
# Transpose the wide format of the department dataframe into a long format in a new dataframe
df_dep_t = df_dep.T

In [124]:
# Validate the transpose operation was successful
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


## 10. Create an index in department_t_dataframe

In [125]:
# Create an index in the new department_t dataframe
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


## 11. Create a new header row for the department_t dataframe

In [126]:
# Create a new header for the department_t dataframe
new_header = df_dep_t.iloc[0]

In [127]:
# Delete the first row in the department_t dataframe
df_dep_t_new = df_dep_t[1:]

In [128]:
# Ensure the first row was deleted successfully
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [129]:
# Insert new header into a new dataframe called department_t_new
df_dep_t_new.columns = new_header

In [130]:
# Test to ensure new dataframe has the correct header
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


## 12. Create a data dictionary

In [131]:
# Create a data dictionary of the new department_t_new dataframe and assign it the data_dict variable
data_dict = df_dep_t_new.to_dict('index')

In [132]:
# Call new data_dict variable
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [133]:
df_products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [134]:
print(data_dict.get('19'))

{'department': 'snacks'}


## 13. Create a subset of data from the products dataframe

In [135]:
# Create a new dataframe called df_snacks containing only items with a department_id of 19, which refers to snacks
df_snacks = df_products[df_products['department_id']==19]

In [136]:
df_products['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [137]:
# Subsetting the products dataframe with only those items with a department_id of 19
df_products[df_products['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [138]:
# Create a new dataframe called snacks from the subset of data with a department_id of 19
df_snacks = df_products[df_products['department_id']==19]

In [139]:
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


## 14. Exercise 4.4 Task

# 2. Find another identifier variable in the df_ords dataframe that doesn’t need to be included in your analysis as a numeric variable and change it to a suitable format.

In [140]:
# Find another variable that does not need to be included in the analysis as a number
df_orders.dtypes

order_id                   object
user_id                     int64
order_number                int64
order_day_of_week           int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [141]:
# Change user_id data type to string
df_orders['user_id'] = df_orders['user_id'].astype('str')

In [142]:
#Validate user_id data type was changed to string
df_orders['user_id'].dtype

dtype('O')

# 3. Look for a variable in your df_ords dataframe with an unintuitive name and change its name without overwriting the dataframe.

In [143]:
df_orders.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

In [144]:
# Rename days_since_prior_order column to days_since_last_order
df_orders.rename(columns = {'days_since_prior_order' : 'days_since_last_order'}, inplace = True)

In [145]:
# Validate the column was renamed successfully
df_orders.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# 4. Your client wants to know what the busiest hour is for placing orders. Find the frequency of the corresponding variable and share your findings.

In [146]:
# Value counts of order_hour_of_day column
df_orders['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

In [147]:
# 10 AM is the busiest time of the day for users placing orders with Instacart.

# 5. Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.


In [148]:
# Use print function to determine what the value of 4 in the products dataframe
print(data_dict.get('4'))

{'department': 'produce'}


In [149]:
# The value of 4 in the department_id column represents the produce department

# 6. The sales team in your client’s organization wants to know more about breakfast item sales. Create a subset containing only the required information.

In [150]:
# Use our data dictionary to determine which department_id refers to breakfast items
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

According to our data dictionary, department_id 14 refers to breakfast items.

In [151]:
# Create a new dataframe called df_breakfast containing only breakfast items
df_breakfast = df_products[df_products['department_id']==14]

In [152]:
# Validate the new dataframe was created with only breakfast items in it
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


# 7. They’d also like to see details about products that customers might use to throw dinner parties. Your task is to find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood. You’ll need to present this subset to your client.

In [153]:
# Use our data dictionary to get the department_id numbers for the requested departments
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [154]:
# Create a subset dataframe called df_dinner_party that contains the requested departments

In [155]:
df_dinner_party = df_products.loc[df_products['department_id'].isin([5, 7, 12, 20])]

In [156]:
df_dinner_party

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


# 8. How many rows does the last dataframe you created have?

In [157]:
# The last dataframe created was df_dinner_party
df_dinner_party

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


In [158]:
# There are 7,650 rowns and 5 columns in the df_dinner_party dataframe

# 9. Extract all the information you can about the customer with a 'user_id' of 1.

In [159]:
# Create a dataframe with all the orders information on user_id 1
df_orders_userid_1 = df_orders[df_orders['user_id']=='1']

In [160]:
df_orders_userid_1

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


# 10. Provide some details about this user’s behavior. What basic stats can you provide based on the information you have?

In [161]:
# Use the describe function to see statistical data on user_id 1.
df_orders_userid_1.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


# 12. Export df_orders dataframe as "orders_wrangled.csv"

In [164]:
# Exporting df_orders to a csv file in the Prepared Data folder of my analysis
df_orders.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'))

# 13. Export the df_dep_t_new dataframe as “departments_wrangled.csv”

In [166]:
# Exporting df_department_t_new to a csv file in the Prepared Data folder of my analysis
df_dep_t_new.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'))