## Data Wrangling

In [83]:
# import libraries
import pandas as pd
import numpy as np
import os

In [84]:
# Root folder of the project; the data files below are located relative to it.
# NOTE(review): absolute, machine-specific path - adjust it when running on another machine.
path = r'/Users/lavinia/Documents/04-2020 Instacart Basket Analysis'
path

'/Users/lavinia/Documents/04-2020 Instacart Basket Analysis'

In [85]:
# load the orders data set from the 'Original Data' folder under the project root
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

In [86]:
# load the products data set from the 'Original Data' folder under the project root
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [87]:
# load the departments data set from the 'Original Data' folder under the project root
df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col=False,
)

In [88]:
# preview the first five rows of the orders dataframe
df_ords.head(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [89]:
# remove the eval_set column from the orders dataframe
# (re-running this cell once the column is gone raises a KeyError)
df_ords = df_ords.drop(columns='eval_set')


In [90]:
# summarise the columns and dtypes of the orders dataframe after dropping eval_set
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   order_dow               int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(5)
memory usage: 156.6 MB


## Find another identifier variable in the df_ords dataframe that doesn't need to be included in your analysis as a numeric variable and change it to a suitable format

In [91]:
# order_id is an identifier rather than a quantity, so store it as a string
df_ords = df_ords.astype({'order_id': 'str'})
df_ords['order_id'].dtype

dtype('O')

## Look for a variable in the df_ords dataframe with an unintuitive name and change its name without overwriting the dataframe

In [92]:
# give 'order_dow' the self-explanatory name 'order_day_of_week';
# inplace=True edits df_ords directly instead of assigning a new dataframe
df_ords.rename({'order_dow': 'order_day_of_week'}, axis='columns', inplace=True)

In [93]:
# preview the orders dataframe to confirm the column rename
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## The client wants to know what the busiest hour is for placing orders. Find the frequency of the corresponding variable and share your findings

In [94]:
# count the orders placed in each hour of the day (missing values, if any, are counted too)
df_ords.loc[:, 'order_hour_of_day'].value_counts(dropna=False)

# Answer: the busiest hour of the day for placing orders is 10:00 am (288,418 orders)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

## Determining the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary

In [95]:
# swap rows and columns so that each department becomes a row
# (run once only: transposing again would flip the dataframe back)
df_dep = df_dep.transpose()

In [96]:
# preview the transposed dataframe with its index shown as a regular column
# (the result is not assigned, so df_dep itself is left unchanged)
df_dep.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [97]:
# the first row holds the real column label ('department'); keep it to use as the header
new_header = df_dep.iloc[0, :]
new_header

0    department
Name: department_id, dtype: object

In [98]:
# drop the header row: keep every row from position 1 onward
# (run once only: each re-run would remove one more row)
df_dep = df_dep.iloc[1:]

In [99]:
# use the saved header row as the dataframe's column labels
df_dep.columns = new_header

In [100]:
# build the data dictionary: one entry per row label (the department id),
# each mapping the column name to that row's value
data_dict = df_dep.to_dict(orient='index')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [101]:
# look up department_id 4 in the data dictionary (its keys are strings, hence '4')
# Answer: a value of 4 in the "department_id" column means Produce
print(data_dict.get('4'))

{'department': 'produce'}


## The sales team wants to know more about breakfast item sales. Create a subset containing only the required information

In [102]:
# subset of products in department 14, which the data dictionary lists as 'breakfast'
df_breakfast = df_prods.loc[df_prods['department_id'] == 14]
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


## The sales team would also like to see details about products that customers might use to throw dinner parties. Find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood.

In [103]:
# department ids per the data dictionary: 5 = alcohol, 7 = beverages, 12 = meat seafood, 20 = deli
df_dinner_parties = df_prods[df_prods['department_id'].isin([5, 7, 12, 20])]
df_dinner_parties

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


## It's important to keep track of total counts in your dataframes. How many rows does the last dataframe you created have?

In [104]:
# shape is (number of rows, number of columns)
df_dinner_parties.shape

# Answer: the dinner-parties subset has 7650 rows

(7650, 5)

## Someone from the data engineering team at Instacart thinks they've spotted something strange about the customer with a "user_id" of "1". Extract all the information you can about this user

In [105]:
# all orders placed by the customer with user_id 1 (user_id is stored as an integer)
df_cust_userid_1 = df_ords.loc[df_ords['user_id'] == 1]

## Provide some details about this user's behaviour. What basic stats can you provide based on the information you have?

In [106]:
# summary statistics for the numeric columns of this user's orders
df_cust_userid_1.describe()

# Answer: the user placed 11 orders, on average 19 days apart (median 19.5, maximum 30),
# so this user does not purchase from Instacart frequently

Unnamed: 0,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,11.0,10.0
mean,1.0,6.0,2.636364,10.090909,19.0
std,0.0,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,1.0,7.0,0.0
25%,1.0,3.5,1.5,7.5,14.25
50%,1.0,6.0,3.0,8.0,19.5
75%,1.0,8.5,4.0,13.0,26.25
max,1.0,11.0,4.0,16.0,30.0


## Export the df_ords dataframe and df_dep dataframe

In [107]:
# write the wrangled orders dataframe to the 'Prepared Data' folder, without the index
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index=False,
)

In [None]:
# 'department_id' should be exported as a column, but the ids currently live in the index
# 1. name the index so that it gets a proper column label once the index is reset
df_dep.index.name = 'department_id'

In [None]:
# 2. reset the index to make the index a regular column
# reset_index must be *called*: without the parentheses df_dep would be rebound to the
# method object itself, and the export and info cells that follow would fail.
# Run this cell only once per session: a second run would add an extra 'index' column.
df_dep = df_dep.reset_index()

In [123]:
# write the wrangled departments dataframe to the 'Prepared Data' folder, without the index
df_dep.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'df_dep_wrangled.csv'),
    index=False,
)

In [124]:
# check the structure of the exported departments dataframe (columns, non-null counts, dtypes)
df_dep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   department_id  21 non-null     object
 1   department     21 non-null     object
dtypes: object(2)
memory usage: 464.0+ bytes
