# 01. Importing libraries

In [1]:
# Import libraries
import warnings
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

from pandas.core.common import SettingWithCopyWarning
from scipy.stats import norm

# 02. Importing data

In [2]:
# Define the project folder path
# The folder can be overridden through the INSTACART_PROJECT_DIR environment variable so the
# notebook can run on another machine; when the variable is not set, the original location is used.
path = os.environ.get('INSTACART_PROJECT_DIR', r'C:\Users\lucav\Desktop\Instacart Basket Analysis')

In [3]:
# Silence FutureWarning messages from here on
warnings.simplefilter(action='ignore', category=FutureWarning)

# Build the file locations for 'orders_products_merged_frequencies&flags.pkl' and 'customers.csv'
ords_prods_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_frequencies&flags.pkl')
customers_file = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')

# Load both files into dataframes
ords_prods = pd.read_pickle(ords_prods_file)
df_customers = pd.read_csv(customers_file)

# 03. Data wrangling

## Wrangle data in 'df_customers' so that it follows consistent logic

In [4]:
# Preview 'df_customers' (first 5 rows) to inspect the original column names
df_customers.head(n = 5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [5]:
# Rename the inconsistently named columns so that every column uses lower-case snake_case
# ('Surnam' is the name of the surname column in the original file)
customer_column_names = {
    'First Name' : 'first_name',
    'Surnam' : 'last_name',
    'Gender' : 'gender',
    'STATE' : 'state',
    'Age' : 'age',
}
df_customers = df_customers.rename(columns = customer_column_names)

In [6]:
# Preview 'df_customers' (first 5 rows) to confirm the columns were renamed
df_customers.head(n = 5)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


# 04. Data quality and consistency checks

## Data type

In [7]:
# Check the data type of each column in 'df_customers'
# (shown before any type conversion is applied)
df_customers.dtypes

user_id          int64
first_name      object
last_name       object
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

The 'user_id' column is an identifier rather than a numeric quantity, so it can be changed from int64 type to string type.

In [8]:
# Store 'user_id' as text, since it is used as a key and not for calculations
df_customers['user_id'] = df_customers['user_id'].astype(str)

In [9]:
# Verify that 'user_id' type has been changed to string
# (pandas reports string columns as 'object' in the output below)
df_customers.dtypes

user_id         object
first_name      object
last_name       object
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [10]:
# Check for mixed-type data in 'df_customers'
# A column is printed when its values are stored as more than one Python type
# (for example str together with float, which is how NaN is stored).
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated since pandas 2.1;
# counting the distinct types also avoids reading the first row, so an empty dataframe no longer fails.
for col in df_customers.columns:
    if df_customers[col].map(type).nunique() > 1:
        print (col)

first_name


The 'first_name' column contains mixed-type data but it should be a string type. This requires additional checks to identify which values are stored differently.

In [11]:
# Flag the rows whose 'first_name' is stored as a string, then show all the other rows
first_name_is_str = df_customers['first_name'].apply(lambda value: isinstance(value, str))
df_customers[~first_name_is_str]

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
53,76659,,Gilbert,Male,Colorado,26,1/1/2017,2,married,41709
73,13738,,Frost,Female,Louisiana,39,1/1/2017,0,single,82518
82,89996,,Dawson,Female,Oregon,52,1/1/2017,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,1/1/2017,1,married,155673
105,29778,,Dawson,Female,Utah,63,1/1/2017,3,married,151819
...,...,...,...,...,...,...,...,...,...,...
206038,121317,,Melton,Male,Pennsylvania,28,3/31/2020,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,4/1/2020,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,4/1/2020,1,married,45275
206162,187532,,Floyd,Female,California,39,4/1/2020,0,single,56325


It can be observed that 11259 rows have 'NaN' values in the 'first_name' column. NaN is stored as a float rather than a string, which is why the column was reported as mixed-type. This issue would need to be discussed with stakeholders to better assess the impact on the analysis.

## Missing values

In [12]:
# Count the missing values in every column of 'df_customers'
df_customers.isna().sum()

user_id             0
first_name      11259
last_name           0
gender              0
state               0
age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [13]:
# Show the rows of 'df_customers' where 'first_name' is missing
first_name_missing = df_customers['first_name'].isnull()
df_customers[first_name_missing]

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
53,76659,,Gilbert,Male,Colorado,26,1/1/2017,2,married,41709
73,13738,,Frost,Female,Louisiana,39,1/1/2017,0,single,82518
82,89996,,Dawson,Female,Oregon,52,1/1/2017,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,1/1/2017,1,married,155673
105,29778,,Dawson,Female,Utah,63,1/1/2017,3,married,151819
...,...,...,...,...,...,...,...,...,...,...
206038,121317,,Melton,Male,Pennsylvania,28,3/31/2020,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,4/1/2020,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,4/1/2020,1,married,45275
206162,187532,,Floyd,Female,California,39,4/1/2020,0,single,56325


As previously discovered, 11259 values are missing in the 'first_name' column and the issue should be discussed with stakeholders. At the moment, data won't be modified because the missing values aren't deemed vital to the analysis.

## Duplicate values

In [14]:
# Mark every row that is a full duplicate of an earlier row in 'df_customers'
is_duplicate_row = df_customers.duplicated()

# Keep only the duplicated rows in 'df_customers_dups'
df_customers_dups = df_customers.loc[is_duplicate_row]

# Display 'df_customers_dups' (empty when there are no duplicates)
df_customers_dups

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income


No duplicate rows found.

## Look for anomalies

In [15]:
# Run the describe() function on 'df_customers'
# The summary statistics (min, max, quartiles) are used to look for implausible values
df_customers.describe()

Unnamed: 0,age,n_dependants,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


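No outstanding anomalies observed: ages range from 18 to 81 and the number of dependants from 0 to 3. The maximum income (593,901) is far above the 75th percentile (124,244), but an income of this size is plausible, so no values are removed.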

# Export 'df_customers' as 'customers_prepared.pkl'

In [16]:
# Save the wrangled 'df_customers' in the 'Prepared Data' folder as a pickle file
customers_prepared_file = os.path.join(path, '02 Data', 'Prepared Data', 'customers_prepared.pkl')
df_customers.to_pickle(customers_prepared_file)

# 05. Combine 'df_customers' with 'ords_prods'

In [17]:
# Preview 'df_customers' (first 5 rows) before combining it with 'ords_prods'
df_customers.head(n = 5)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [18]:
# Print the first 50 rows from 'ords_prods'
ords_prods.head(50)

Unnamed: 0,order_id,user_id,order_number,day_of_week,hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,prices,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,customer_spending,spending_flag,order_frequency,order_frequency_flag
0,2539329,1,1,2,8,,True,196,1,0,...,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,False,196,1,1,...,9.0,Mid-range product,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,False,196,1,1,...,9.0,Mid-range product,Slowest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,False,196,1,1,...,9.0,Mid-range product,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,False,196,1,1,...,9.0,Mid-range product,Slowest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
5,3367565,1,6,2,7,19.0,False,196,1,1,...,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
6,550135,1,7,1,9,20.0,False,196,1,1,...,9.0,Mid-range product,Busiest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
7,3108588,1,8,1,14,14.0,False,196,2,1,...,9.0,Mid-range product,Busiest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
8,2295261,1,9,1,16,0.0,False,196,4,1,...,9.0,Mid-range product,Busiest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
9,2550362,1,10,4,8,30.0,False,196,1,1,...,9.0,Mid-range product,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


The key column that will be used to merge 'df_customers' with 'ords_prods' is '**user_id**'.

In [19]:
# Check the data types of 'ords_prods'
# (in particular the type of the merge key 'user_id')
ords_prods.dtypes

order_id                    int64
user_id                     int64
order_number                int64
day_of_week                 int64
hour_of_day                 int64
days_since_prior_order    float64
first_order                  bool
product_id                  int64
add_to_cart_order           int64
reordered                   int64
product_name               object
aisle_id                    int64
department_id               int64
prices                    float64
price_range_loc            object
busiest_days               object
busiest_period_of_day      object
max_order                   int64
loyalty_flag               object
customer_spending         float64
spending_flag              object
order_frequency           float64
order_frequency_flag       object
dtype: object

In [20]:
# Store 'user_id' in 'ords_prods' as text so it has the same type as 'user_id' in 'df_customers'
ords_prods['user_id'] = ords_prods['user_id'].astype(str)

In [21]:
# Check that the data type of 'user_id' is the same in 'df_customers' and 'ords_prods'
# (the merge key must have the same type in both dataframes)
df_customers['user_id'].dtype == ords_prods['user_id'].dtype

True

In [22]:
# Merge 'df_customers' with 'ords_prods' using 'user_id' as key column
# how = 'inner' is the pandas default, now written out: only rows whose 'user_id' exists in both
# dataframes are kept.
# validate = 'many_to_one' makes pandas raise a MergeError if a 'user_id' appears more than once in
# 'df_customers', instead of silently duplicating order rows in the merged dataframe.
df_ords_prods_cust = ords_prods.merge(df_customers, on = 'user_id', how = 'inner', validate = 'many_to_one')

In [23]:
# Preview 'df_ords_prods_cust' (first 5 rows) to confirm the customer columns were added
df_ords_prods_cust.head(n = 5)

Unnamed: 0,order_id,user_id,order_number,day_of_week,hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,order_frequency_flag,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,2539329,1,1,2,8,,True,196,1,0,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,False,196,1,1,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,False,196,1,1,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


# 06. Export 'df_ords_prods_cust' as 'orders_products_customers_merged.pkl'

In [24]:
# Save the merged dataframe in the 'Prepared Data' folder as a pickle file
merged_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_merged.pkl')
df_ords_prods_cust.to_pickle(merged_file)