# 4.4 Data Wrangling Orders, Products, Departments Data Sets

### This script contains the following points: <br> <br> 
1. Importing Libraries <br> <br> 
2. Importing Data Sets <br> <br> 
3. Data Checks <br>
 > 3.1 Orders Data Set <br>
 > 3.2 Products Data Set <br>
 > 3.3 Departments Data Set<br>
4. Data Wrangling <br><br>
5. Client Questions<br>
 > 5.1 What is the busiest hour for placing orders? <br>
 > 5.2 Data Dictionaries: What is the meaning of value 4 in the department_id column? <br>
 > 5.3 Subsets: Breakfast sales <br>
 > 5.4 Subsets: Dinner parties <br>
 > 5.5 Subsets: User_id = 1 <br>
6. Exporting

## 01 Importing Libraries

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

## 02 Importing Data

In [2]:
# Shortcut for importing data frames uses os.path.join()
# First create a string of the path for the main project folder
path = r'/Users/mistystone/Library/CloudStorage/OneDrive-Personal/Documents/CF_Data_Ach4_Python/2023-05_Instacart_Basket_Analysis/'

In [3]:
# Import the orders.csv file.
df_orders = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'))

In [4]:
# Import the products.csv file.
df_products = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'))

In [5]:
# Import the departments.csv file.
df_departments = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'))

## 03 Data Checks

### 03.01 Orders data set

In [6]:
# orders: data set shape
df_orders.shape

(3421083, 7)

In [7]:
# orders: dataset head
df_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [8]:
# orders: descriptive statistics
df_orders.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


Note that we don't need the eval_set column as every observation is "prior" and the column order_dow is named in a confusing manner. 

### 03.02 Products data set

In [9]:
# products: data set shape
df_products.shape

(49693, 5)

In [10]:
# products: data set head
df_products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [11]:
# products: descriptive statistics
df_products.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


### 03.03 Departments Data Set

In [12]:
# departments: data set shape
df_departments.shape

(1, 22)

In [13]:
# departments: data set head
df_departments.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


Note that this dataset needs to be transposed. 

# 04 Data Wrangling

In [14]:
# Create new dataframe df_orders_new with a dropped column eval_set.
df_orders_new = df_orders.drop(columns = ['eval_set'])

In [15]:
# Give order_dow a self-explanatory name: order_day_of_week.
df_orders_new = df_orders_new.rename(columns={'order_dow': 'order_day_of_week'})


In [16]:
# Check orders head
df_orders_new.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [17]:
# Store order_id and user_id as strings rather than integers,
# converting both columns in a single astype call.
df_orders_new = df_orders_new.astype({'order_id': 'str', 'user_id': 'str'})

In [18]:
# Check data types.
df_orders_new.dtypes

order_id                   object
user_id                    object
order_number                int64
order_day_of_week           int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [19]:
# Transpose departments dataframe.
df_departments_new = df_departments.T

In [20]:
# Goal of the next few cells: get rid of the 0 at the top of the second column.
# Preview only: display the transposed frame with its index moved into a column.
# The result is not assigned, so df_departments_new itself is left unchanged.
df_departments_new.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [21]:
# Pull the first row of the dataframe and name it new_header.
new_header = df_departments_new.iloc[0]

In [22]:
# Drop the first row (now held in new_header) and keep the remaining rows,
# overwriting df_departments_new with the result.
df_departments_new = df_departments_new.iloc[1:]

In [23]:
# Set the variable new_header as the header.
df_departments_new.columns = new_header

In [24]:
# Check departments data set
df_departments_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


## 05 Client Questions

### 05.01 What is the busiest hour for placing orders? 

In [25]:
# Frequency of order_hour
df_orders_new['order_hour_of_day'].value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

The busiest hour is the 10:00 hour

### 05.02 Data Dictionaries: What is the meaning of value 4 in the department_id column?

Using a data dictionary, what is the meaning of the value 4 in the "department_id" column of the df_products dataframe?

In [26]:
# Build a data dictionary keyed by department id:
# each index label maps to {'department': <name>}.
data_dict = df_departments_new.to_dict(orient='index')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

The value 4 in the department_id refers to "produce".

### 05.03 Subsets: Breakfast sales

The sales team in your client’s organization wants to know more about breakfast item sales. <br>
Create a subset containing only the required information.

In [27]:
# Breakfast subset: keep only the products whose department_id is 14.
is_breakfast = df_products['department_id'] == 14
df_breakfast = df_products.loc[is_breakfast]
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


### 05.04 Subsets: Dinner parties

Explore some details about products that customers might use to throw dinner parties. Find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood. (Departments 5, 7, 12, 20)

In [28]:
# Dinner-party subset: alcohol (5), beverages (7), meat/seafood (12), deli (20).
dinner_party_departments = [5, 7, 12, 20]
in_dinner_party_dept = df_products['department_id'].isin(dinner_party_departments)
df_dinnerparties = df_products.loc[in_dinner_party_dept]
df_dinnerparties.head(30)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
19,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7,6.0
22,23,Organic Turkey Burgers,49,12,8.2
34,35,Italian Herb Porcini Mushrooms Chicken Sausage,106,12,15.1
38,39,Daily Tangerine Citrus Flavored Beverage,64,7,12.5
39,40,Beef Hot Links Beef Smoked Sausage With Chile ...,106,12,22.5


In [29]:
# How many rows are in the dinnerparties dataframe?
df_dinnerparties.shape

(7650, 5)

### 05.05 Subsets: User_id = 1

In [30]:
# Subset of orders placed by user_id 1.
# Build the mask from df_orders_new itself rather than from the raw df_orders:
# a mask taken from a different frame only selects the right rows while both
# indexes happen to line up. user_id was cast to str earlier in this notebook,
# so the comparison value is the string '1', not the integer 1.
df_customer1 = df_orders_new.loc[df_orders_new['user_id'] == '1']
df_customer1.head(30)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [31]:
# Descriptive Stats for user_id = 1
df_customer1.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


User_id = 1 has placed 11 orders. <br>
They usually order on Tuesday or Wednesday, but always near the beginning of the week. <br>
They usually order at about 10:00am, never earlier than 7:00am, and never later than the 4:00pm hour. <br>
They typically have 19 days in between orders, but have placed a new order after as little as 0 days (the same day), and the longest number of days between orders is 30.

# 06 Exporting

In [32]:
df_orders_new.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [33]:
df_orders_new.shape

(3421083, 6)

In [34]:
df_departments_new.head()

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol


In [35]:
df_departments_new.shape

(21, 1)

In [36]:
# Export the wrangled orders table to the Prepared Data folder.
# to_csv writes the row index by default, so the file starts with an unnamed index column.
orders_out = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_orders_new.to_csv(orders_out)

In [37]:
# Export the wrangled departments table to the Prepared Data folder.
# The row index (the department ids) is written as the first column.
# NOTE(review): that column's header is probably blank, because the name
# 'department_id' appears to sit on the columns axis rather than on the index;
# confirm, and consider passing index_label if a named column is wanted.
departments_out = os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
df_departments_new.to_csv(departments_out)