# 4.4 Instacart data wrangling and subsetting

# This script contains the following points:

## 1. Import libraries

## 2. Import orders.csv dataset and assign to df_ords

## 3. Import products.csv dataset and assign to df_prods

## 4. Dropping columns and creating a new version of df_ords

## 5. Find missing values in df_ords dataset (NaN)

## 6. Renaming and overwriting column order_dow to orders_days_of_week

## 7. Changing a variables type

## 8. Conduct descriptive statistical analysis

## 9. Transposing data using the T function from dataframe 'departments.csv'

## 10. Adding an index

## 11. Create a new header

## 12. Eliminate duplicate rows by coping rows beyond the duplicate row

## 13. Set df_dep_t_new column headers, using new_header values

## 14. Create a Data Dictionary

## 15. Find information about corresponding rows using 'get' 

## 16. Export revised departments dataframe

## 17. Subsetting using df_prods dataframe

## 18. Use loc and isin

## 19. Exercise 4.4 Tasks

## 20. Task 2 - Change user_id from int to str using df_ords dataframe

## 21.Task 3 - Change variable name

## 22. Task 4 - Find the busiest hour for placing orders

## 23. Task 5 - Use a data dictionary to determine the meaning behind a value of '4' in 'department_id' in df_prods dataframe

## 23. Task 6 - Create a subset of breakfast item sales

## 24. Task 7 - Create a subset of alcohol, deli, beverages, and meat/seafood
 
## 25. Export revised orders_wrangled dataframe



## 1. Import libraries

In [1]:
#import libraries

import pandas as pd
import numpy as np
import os

## 2. Import orders.csv dataset and assign to df_ords

In [4]:
# identify path

path = r'C:\Users\howl6\OneDrive\Certificates\CareerFoundry\Coursework\Data_Immersion\Chapter 4\Instacart Basket Analysis'

In [6]:
# assign df_ords to orders.csv dataframe

df_ords = pd.read_csv(os.path.join(path, '02_Data', 'Original_Data', '4.3_orders_products', 'orders.csv'), index_col=False)

## 3. Import products.csv dataset and assign to df_prods

In [7]:
# assign df_prods to products.csv dataframe

df_prods = pd.read_csv(os.path.join(path, '02_Data', 'Original_Data', '4.3_orders_products', 'products.csv'), index_col=False)

## 4. Dropping columns and creating a new version of df_ords

In [9]:
# drop column eval_set

df_ords = df_ords.drop (columns = ['eval_set'])

In [11]:
# preview top 5 rows

df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 5. Find missing values in df_ords dataset (NaN)

In [12]:
# identify missing values (NaN)

df_ords['days_since_prior_order'].value_counts(dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

### There are 206209 missing values

## 6. Renaming and overwriting column order_dow to orders_days_of_week

In [13]:
# rename column order_dow

df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [14]:
# preview top five rows

df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 7. Changing a variables type

In [15]:
# convert order_id to string

df_ords['order_id'] = df_ords ['order_id'].astype('str')

## 8. Conduct descriptive statistical analysis

In [18]:
# produce descriptive statistics

df_ords.describe()

Unnamed: 0,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,102978.2,17.15486,2.776219,13.45202,11.11484
std,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,0.0,0.0,0.0
25%,51394.0,5.0,1.0,10.0,4.0
50%,102689.0,11.0,3.0,13.0,7.0
75%,154385.0,23.0,5.0,16.0,15.0
max,206209.0,100.0,6.0,23.0,30.0


In [19]:
# verify change in datatype

df_ords['order_id'].dtype

dtype('O')

## 9. Transposing data using the T function from dataframe 'departments.csv'

In [20]:
# assign df_dep to departments.csv

df_dep = pd.read_csv(os.path.join(path, '02_Data', 'Original_Data', '4.4_departments', 'departments.csv'), index_col=False)

In [21]:
# preview top row 

df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [26]:
 # transpose from wide to long

df_dep_t = df_dep.T

In [27]:
# oreview top 5

df_dep_t.head()

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce


In [28]:
#preview all

df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


## 10. Adding an index

In [29]:
# add index

df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


## 11. Create a new header

In [30]:
# assign a new header

new_header = df_dep_t.iloc[0]

In [31]:

#preview new header

new_header

0    department
Name: department_id, dtype: object

## 12. Eliminate duplicate rows by coping rows beyond the duplicate row

In [32]:
# copy rows beyond duplicate row

df_dep_t_new = df_dep_t[1:]

In [33]:
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


## 13. Set df_dep_t_new column headers, using new_header values

In [34]:
# set new header values

df_dep_t_new.columns = new_header

In [35]:
# preview new rows, columns

df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


## 14. Create a Data Dictionary

In [38]:
# create data dictionary

data_dict = df_dep_t_new.to_dict('index')

In [39]:
# view data dictionary

data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

## 15. Find information about corresponding rows using 'get'

In [41]:
# find corresponding product for department 19

print(data_dict.get('19'))

{'department': 'snacks'}


## 16. Export revised departments dataframe

In [109]:
# export wrangled df_dep_t_new to departments_wrangled.csv dataframe

df_dep_t_new.to_csv(os.path.join(path, '02_Data','Prepared_Data', 'departments_wrangled.csv'))

In [40]:
#preview top 5 rows

df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


## 17. Subsetting using df_prods dataframe

In [42]:
df_prods['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [43]:
# identify products in department 19 for df_prods

df_prods[df_prods['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [44]:
# assign subset department 19 to df_snacks

df_snacks = df_prods[df_prods['department_id']==19]

In [46]:
# preview top 5 rows

df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


## 18. Use loc and isin

In [47]:
# assign subset department 19 to df_snacks_2 using loc function

df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [48]:
#preview top 5 rows

df_snacks_2.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [49]:
# assign subset department 19 to df_snacks_3 using isin function

df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

In [50]:
#preview top 5 rows

df_snacks_3.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


## 19. Exercise 4.4 Tasks

In [54]:
# preview top 5 rows

df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 20. Task 2 - Change user_id from int to str using df_ords dataframe

In [56]:
# id datatypes

df_ords.dtypes

order_id                   object
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [57]:
# change user id to string

df_ords['user_id'] = df_ords ['user_id'].astype('str')

In [59]:
# verify change in datatype

df_ords['user_id'].dtype

dtype('O')

In [60]:
# id datatypes

df_ords.dtypes

order_id                   object
user_id                    object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

## 21.Task 3 - Change variable name

In [69]:
# rename order hour of day column

df_ords.rename(columns = {'order_hour_of_day' : 'order_time_of_day'}, inplace = True)

In [70]:
# preview top 5 rows

df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 22. Task 4 - Find the busiest hour for placing orders

In [83]:
#find busiest hour for orders using value_counts

df_ords['order_time_of_day'].value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_time_of_day, dtype: int64

### The busiest time for placing orders is '10 am'.

## 23. Task 5 - using a data dictionary to determine the meaning behind a value of '4' in 'department_id' in df_prods dataframe

In [81]:
# id product in department_id 4

data_dict['4']

{'department': 'produce'}

## 23. Task 6 - Create a subset of breakfast item sales

In [84]:
# show data dictionary

data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

### The department_id for breakfast items is '14'.

In [85]:
# id products in depeartment_id 14

df_prods[df_prods['department_id']==14]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


 ## 24. Task 7 - Create a subset of alcohol, deli, beverages, and meat/seafood

### Dept_id is '5' for'alcohol', '7' for 'beverages', '12' for 'meat seafood', and '20' for 'deli'.


In [101]:
# create subset for dinner party

df_dinner_pty = df_prods.loc[df_prods['department_id'].isin([5,7,12,20])]

In [103]:
# view df_dinner_pty

df_dinner_pty

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


### Task 8 - the number of rows for the previous task is 7650.

### Task 9 - extract information regarding 'user_id' number '1'

In [105]:
# preview first row

df_ords.head(1)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order
0,2539329,1,1,2,8,


### There is missing information regarding days_since_prior_order for 'user_id' number '1'.  The user's orders are typically on the 2nd day of the week at 8 am.

## 25. Export revised orders_wrangled dataframe

In [110]:
df_ords.to_csv(os.path.join(path, '02_Data','Prepared_Data', 'orders_wrangled.csv'))