### Table of Contents
01. Import Libraries
02. Import Data
03. Explore Data
04. Wrangling Procedures
05. Data Cleaning
06. Merge 'df_cust_data' with 'df_ords_prods_merge'
07. Export Data

# 01. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02. Import Data

In [2]:
# Folder Path
# Root folder of the project. Set the INSTACART_PROJECT_PATH environment
# variable to run the notebook on another machine; when it is not set, the
# original local folder is used, so existing behavior is unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\jrper\OneDrive\Documents\Career Foundry Data Analytics Program\Instacart Basket Analysis'
)

In [3]:
# Import customers.csv from the 'Original Data' folder
df_cust_data = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

In [4]:
# Import ords_prods_agg_variables.pkl from the 'Prepared Data' folder
ords_prods_file = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_agg_variables.pkl')
df_ords_prods_merge = pd.read_pickle(ords_prods_file)

# 03. Explore Data

In [5]:
# Preview the first 5 rows of df_cust_data
df_cust_data.head(n = 5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
# Preview the last 5 rows of df_cust_data
df_cust_data.tail(n = 5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799
206208,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095


In [7]:
# List the column names of df_cust_data
df_cust_data.columns

Index(['user_id', 'First Name', 'Surnam', 'Gender', 'STATE', 'Age',
       'date_joined', 'n_dependants', 'fam_status', 'income'],
      dtype='object')

In [8]:
# Check df_cust_data size as (number of rows, number of columns)
df_cust_data.shape

(206209, 10)

In [9]:
# Check the data type and non-null count of every column
df_cust_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [10]:
# Check for mixed-type data: print every column whose values are not all of
# one Python type (missing values are floats, so they count as a second type
# in a column of strings).
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated
# in recent pandas releases, and no filtered copy of the frame is built.
for col in df_cust_data.columns.tolist():
  value_types = df_cust_data[col].map(type)
  if value_types.nunique() > 1:
    print (col)

First Name


#### Observations: Column 'First Name' has mixed-type data

In [11]:
# Count the null values in each column
df_cust_data.isnull().sum()

user_id             0
First Name      11259
Surnam              0
Gender              0
STATE               0
Age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

#### Observations: Column 'First Name' has 11259 null values

In [12]:
# Create df_na holding only the rows whose 'First Name' is missing,
# then preview its first rows
first_name_missing = df_cust_data['First Name'].isna()
df_na = df_cust_data[first_name_missing]
df_na.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
53,76659,,Gilbert,Male,Colorado,26,1/1/2017,2,married,41709
73,13738,,Frost,Female,Louisiana,39,1/1/2017,0,single,82518
82,89996,,Dawson,Female,Oregon,52,1/1/2017,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,1/1/2017,1,married,155673
105,29778,,Dawson,Female,Utah,63,1/1/2017,3,married,151819


In [13]:
# Preview the last 5 rows of df_na (rows with a missing 'First Name')
df_na.tail(n = 5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
206038,121317,,Melton,Male,Pennsylvania,28,3/31/2020,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,4/1/2020,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,4/1/2020,1,married,45275
206162,187532,,Floyd,Female,California,39,4/1/2020,0,single,56325
206171,116898,,Delgado,Female,Colorado,23,4/1/2020,2,married,59222


#### Observations: In the rows inspected, the missing 'First Name' values do not appear to coincide with any particular value of the other variables.

In [14]:
# Check for duplicates: True if any row is an exact copy of an earlier row.
# All columns are compared, so a repeated 'user_id' with different details
# would not be reported by this check.
df_cust_data.duplicated().any()

False

#### Observations: No duplicates in 'customers' data set.

In [15]:
# View descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_cust_data.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


#### Observations: Statistics are within expected ranges. Counts are consistent with shape of df_cust_data.

# 04. Wrangling Procedures

In [16]:
# Rename the misspelled 'Surnam' column to 'Last Name'
df_cust_data = df_cust_data.rename(columns = {'Surnam' : 'Last Name'})

In [17]:
# Confirm the 'Last Name' column now appears in the first rows
df_cust_data.head(n = 5)

Unnamed: 0,user_id,First Name,Last Name,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [18]:
# Give 'n_dependants' the more descriptive name 'number_of_dependants'
df_cust_data = df_cust_data.rename(columns = {'n_dependants' : 'number_of_dependants'})

In [19]:
# Confirm the 'number_of_dependants' column now appears in the first rows
df_cust_data.head(n = 5)

Unnamed: 0,user_id,First Name,Last Name,Gender,STATE,Age,date_joined,number_of_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


#### Observations: All variables are relevant, so no columns should be dropped.

In [20]:
# Cast every 'First Name' value to str so the column holds a single Python type
# NOTE(review): the missing values are cast as well (the mixed-type check run
# after this cell prints nothing), so later isna() checks on this column will
# presumably no longer find them — confirm that is acceptable for the analysis
first_names = df_cust_data['First Name']
df_cust_data['First Name'] = first_names.astype(str)

In [21]:
# Check the dtype of 'First Name' after the cast (dtype('O') is the object dtype)
df_cust_data['First Name'].dtype

dtype('O')

In [22]:
# Ensure that mixed-type data has been resolved: print every column that still
# holds values of more than one Python type (no output means none is left).
# Series.map(type) replaces DataFrame.applymap, which is deprecated in recent
# pandas releases, and avoids building a filtered copy of the frame.
for col in df_cust_data.columns.tolist():
  types_in_col = df_cust_data[col].map(type)
  if types_in_col.nunique() > 1:
    print (col)

#### Observations: There are 11,259 NaN values in the 'First Name' column, and those values were causing that column to be flagged as having mixed-type data. The records containing the NaN values will not be removed because the missing data won't cause problems for the analysis. Instead, every value in the 'First Name' column has been cast to a string (the column's dtype was already 'object' and remains so), so the column is no longer flagged as mixed-type data. Note that this cast stores the former NaN values as the literal string 'nan', so isna() will no longer detect them.

# 05. Data Cleaning

#### Observations: There are no duplicates in this data set, and missing values will not be removed or imputed.

# 06. Merge 'df_cust_data' with 'df_ords_prods_merge'

In [23]:
# Check 'df_ords_prods_merge' size as (number of rows, number of columns) before merging
df_ords_prods_merge.shape

(32404859, 24)

In [26]:
# Remove the leftover '_merge' column from df_ords_prods_merge
df_ords_prods_merge = df_ords_prods_merge.drop('_merge', axis = 1)

In [27]:
# Remove the 'busiest_day' column from df_ords_prods_merge
df_ords_prods_merge = df_ords_prods_merge.drop('busiest_day', axis = 1)

In [29]:
# Ensure '_merge' and 'busiest_day' no longer appear among the columns
df_ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_days_2',
       'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'user_prices_mean', 'spending_flag', 'days_since_prior_order_median',
       'frequency_flag'],
      dtype='object')

#### Observations: Column 'user_id' is found in both dataframes

In [30]:
# Merge df_ords_prods_merge with df_cust_data on the shared 'user_id' key.
# how = 'left' keeps every order row, so the '_merge' indicator can report an
# order without a customer record as 'left_only'; with the default inner join
# such rows are dropped before they can be counted and the indicator can only
# ever read 'both'. Customers without any orders are still not added.
# validate = 'many_to_one' raises a MergeError if a 'user_id' occurs more than
# once in df_cust_data, which would otherwise silently multiply order rows.
df_merged = df_ords_prods_merge.merge(
    df_cust_data,
    on = 'user_id',
    how = 'left',
    validate = 'many_to_one',
    indicator = True
)

In [31]:
# Preview the first 5 rows of df_merged
df_merged.head(n = 5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,First Name,Last Name,Gender,STATE,Age,date_joined,number_of_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


In [32]:
# Count the rows in each '_merge' category ('both', 'left_only', 'right_only')
df_merged['_merge'].value_counts()

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

#### Observations: Full match between dataframes.

In [33]:
# Check size for df_merged as (number of rows, number of columns)
df_merged.shape

(32404859, 32)

#### Observations: Size matches expectations for df_merged.

# 07. Export Data

In [34]:
# Export df_merged as ords_prods_cust.pkl in the 'Prepared Data' folder
# NOTE(review): df_merged still carries the '_merge' indicator column at this
# point — confirm that downstream notebooks expect it in the exported file
export_file = os.path.join(path, '02 Data','Prepared Data', 'ords_prods_cust.pkl')
df_merged.to_pickle(export_file)