# In this Notebook

1. Importing libraries
2. Importing Data
3. Data Wrangling
4. Create a data dictionary
5. Subsetting
6. Exercise steps
7. Exporting data

# 1. Importing Libraries

In [53]:
#Importing Libraries
import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [54]:
#creating path
path = r"/Users/katerinapilota/Desktop/Desktop - Pilot's Mac mini/dataimmersion/python/ 02:03:21 Instacart Basket Analysis"

In [55]:
#Importing orders.csv
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), index_col = False)

In [56]:
#checking info about df_ords
df_ords.info

<bound method DataFrame.info of          order_id  user_id eval_set  order_number  order_dow  \
0         2539329        1    prior             1          2   
1         2398795        1    prior             2          3   
2          473747        1    prior             3          3   
3         2254736        1    prior             4          4   
4          431534        1    prior             5          4   
...           ...      ...      ...           ...        ...   
3421078   2266710   206209    prior            10          5   
3421079   1854736   206209    prior            11          4   
3421080    626363   206209    prior            12          1   
3421081   2977660   206209    prior            13          1   
3421082    272231   206209    train            14          6   

         order_hour_of_day  days_since_prior_order  
0                        8                     NaN  
1                        7                    15.0  
2                       12              

In [57]:
#importing products.csv
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [58]:
#checking info about df_prods
df_prods.info

<bound method DataFrame.info of        product_id                                       product_name  \
0               1                         Chocolate Sandwich Cookies   
1               2                                   All-Seasons Salt   
2               3               Robust Golden Unsweetened Oolong Tea   
3               4  Smart Ones Classic Favorites Mini Rigatoni Wit...   
4               5                          Green Chile Anytime Sauce   
...           ...                                                ...   
49688       49684          Vodka, Triple Distilled, Twist of Vanilla   
49689       49685                 En Croute Roast Hazelnut Cranberry   
49690       49686                                   Artisan Baguette   
49691       49687         Smartblend Healthy Metabolism Dry Cat Food   
49692       49688                             Fresh Foaming Cleanser   

       aisle_id  department_id  prices  
0            61             19     5.8  
1           104      

# 2 Data Wrangling

Drop columns unneccessary for analysis

In [59]:
#Dropping eval_set column from df_ords
df_ords = df_ords.drop(columns = ['eval_set'])

Check for missing values 

In [60]:
#print the first 15 rows
df_ords.head(15)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [61]:
#checking for missing values in 'days_since_prior_order' column
df_ords['days_since_prior_order'].value_counts(dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

Rename columns

In [62]:
#Renaming order_dow column 
df_ords.rename(columns = {'order_dow' : 'orders_days_of_the_week'}, inplace = True)

In [63]:
#checking result
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_days_of_the_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


Change data types

In [64]:
#changing order id to object
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [65]:
df_ords['order_id'].dtype

dtype('O')

Transpose new df from dataset for data dictionary

In [66]:
#Importing dataset departments.csv
df_dep = pd.read_csv(os.path.join(path,'02 Data', 'Original Data', 'departments.csv'), index_col = False)

In [67]:
#return head of df_dep
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [68]:
#transpose df 
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [69]:
#create new df from transposed df
df_dep_t = df_dep.T

In [70]:
#check first 5 rows
df_dep_t.head()

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce


Fix headers

In [71]:
#reset index
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [72]:
#create object that contains first row of df
new_header = df_dep_t.iloc[0]

In [73]:
#check new object
new_header

0    department
Name: department_id, dtype: object

In [74]:
#call df
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [75]:
#copying everything from row 1 onwards for new df
df_dep_t_new = df_dep_t[1:]

In [76]:
#call copied df
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [77]:
# set new_header as the df header
df_dep_t_new.columns = new_header

In [78]:
#call prepared df
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# 4. Create a Data Dictionary

In [79]:
#checking what info is needed from df_prods (meaning of numbers in dept_id)
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [80]:
# creating a new data dictionary 
data_dict = df_dep_t_new.to_dict('index')

In [81]:
#call data dictionary
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [82]:
#test dictionary - print first 5 rows of df_prods
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [83]:
#find the meaning of dept_id '19' using dictionary
print(data_dict.get('19'))

{'department': 'snacks'}


# 5. Subsetting

In [84]:
#create a subset of df containing only tuples with department_id '19'
df_snacks = df_prods[df_prods['department_id']==19]

In [85]:
#check subset
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [86]:
#craete a subset with loc functions
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [87]:
df_snacks_2.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [88]:
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

In [89]:
df_snacks_3.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


# 3. Exercise Steps

1. Change user_id data type to string

In [90]:
#changing user_id to string
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [91]:
df_ords['user_id'].dtype

dtype('O')

2. Rename 'order_hour_of_day' column

In [92]:
#renaming order_hour_of_day column
df_ords.rename(columns = {'order_hour_of_day' : 'time_of_order_24hr_time'}, inplace = True)

In [93]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_days_of_the_week,time_of_order_24hr_time,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


3. What is the busiest hour of the day

In [94]:
#using value_counts (frequency) to find businest hour of the day
df_ords['time_of_order_24hr_time'].value_counts(dropna=False) #10am is the busiest time of day

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: time_of_order_24hr_time, dtype: int64

4. What is the meaning of department_id category '4'?

In [95]:
#using dictionary to find meaning of '4' in dept_id
print(data_dict.get('4'))

{'department': 'produce'}


5. Create a subset for breakfast items

In [96]:
#create a subset for breakfast items
df_breakfast =  df_prods[df_prods['department_id']==14]

In [97]:
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


6. Create a subset for dinner parties

In [98]:
#create a subset for dinner parties
df_dinner_parties = df_prods.loc[df_prods['department_id'].isin([5,20,7,12])]

In [99]:
#call dinner parties subset
df_dinner_parties

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


7. How many rows does dinner parties subset have? It has 7650 rows

In [100]:
#9 Finding out about user_id '1'
user_1 = df_ords[df_ords['user_id']==1]

In [101]:
user_1.head()

Unnamed: 0,order_id,user_id,order_number,orders_days_of_the_week,time_of_order_24hr_time,days_since_prior_order


In [102]:
user_1.describe()

Unnamed: 0,order_number,orders_days_of_the_week,time_of_order_24hr_time,days_since_prior_order
count,0.0,0.0,0.0,0.0
mean,,,,
std,,,,
min,,,,
25%,,,,
50%,,,,
75%,,,,
max,,,,


There are no entries for user 1

# 7. Exporting data

In [104]:
#saving wrangled orders df
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.csv'), index = False)

In [105]:
#saving wrangled departments df
df_dep_t_new.to_csv(os.path.join(path, '02 Data','Prepared Data', 'departments_wrangled.csv'), index = False)

End. 