# Customer Data Set - Check and Combine

1. Import Libraries and Customer Data
2. Customer Data set descriptive statistics
3. Customer Data wrangling and consistency checks
4. Combine Customer Data with Orders & Products Data
5. Check the merged data set
6. Export Combined Data

# 01. Import libraries and data

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Project root folder; the read and export calls in this notebook build their
# file locations from it with os.path.join.
# NOTE(review): machine-specific absolute Windows path — must be edited before
# the notebook can run on another machine.
path = r'C:\Users\lizan\Desktop\Data Analytics\4.0\31-05-2022 Instacart Basket Analysis'

In [3]:
# Load the raw customer table from the project's "Original Data" folder
cx = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
)

# 02. Customer Data Descriptive Statistics

In [4]:
# Dimensions of the customer table as (rows, columns)
cx.shape

(206209, 10)

In [5]:
# Preview the first five customer records
cx.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
# Column dtypes and non-null counts for the customer table
cx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [7]:
# Summary statistics for the numeric customer columns
cx.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


# 03. Data Wrangling & Consistency Checks

In [4]:
# Cast user_id to string. The orders/products table's user_id gets the same
# cast further down, before the two frames are merged on this key.
cx['user_id'] = cx['user_id'].astype('str')

In [5]:
# Count missing values per column of the customer table
cx.isna().sum()

user_id             0
First Name      11259
Surnam              0
Gender              0
STATE               0
Age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [6]:
# Collect every customer row that is an exact repeat of an earlier row
cx_dups = cx.loc[cx.duplicated()]

In [7]:
# Display the duplicate rows (an empty frame means none were found)
cx_dups

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income


The only missing values are in `First Name` (11,259 rows), which is not relevant to the analysis, and no duplicate rows were found in the customer data set.

# 04. Combine Customer Data with Orders and Products

In [6]:
# Load the current orders + products table from the "Prepared Data" folder
ords_prods = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_spend_freq2.pkl')
)

In [9]:
# Dimensions of the orders/products table as (rows, columns)
ords_prods.shape

(32404859, 25)

In [7]:
# Column names and dtypes of the orders/products table
ords_prods.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 25 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 int64   
 2   order_number            int64   
 3   orders_day_of_week      int64   
 4   order_time              int64   
 5   days_since_prior_order  float64 
 6   new_customer            bool    
 7   product_id              int64   
 8   add_to_cart_order       int64   
 9   reordered               int64   
 10  product_name            object  
 11  aisle_id                int64   
 12  department_id           int64   
 13  prices                  float64 
 14  _merge                  category
 15  price_range_loc         object  
 16  Busiest_Day             object  
 17  Busiest_Days            object  
 18  Busiest_Hours           object  
 19  max_order               int64   
 20  loyalty_flag            object  
 21  avg_sp

In [8]:
# Cast user_id to string so the merge key has the same type as cx['user_id'],
# which received the same cast earlier in the notebook.
ords_prods['user_id'] = ords_prods['user_id'].astype('str')

# 05. Check the merged dataset

In [None]:
# Preview the first five rows of the orders/products table
ords_prods.head()

In [9]:
# Remove the `_merge` indicator column left over from the previous merge, so
# the upcoming merge (which uses indicator=True) can create its own `_merge`.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column is already gone.
ords_prods = ords_prods.drop(columns=['_merge'], errors='ignore')

In [10]:
# Merge the customer table into the orders/products table on user_id.
# - how='inner' is the pandas default, spelled out here; only rows whose
#   user_id appears in both frames are kept, so `_merge` is always 'both'.
# - validate='many_to_one' makes pandas raise MergeError if user_id is not
#   unique in `cx`; a repeated customer key would otherwise silently multiply
#   order rows (the earlier duplicate check only covers whole-row duplicates).
ords_prods_cx = ords_prods.merge(
    cx,
    on=['user_id'],
    how='inner',
    validate='many_to_one',
    indicator=True,
)

In [11]:
# Dimensions of the merged table as (rows, columns)
ords_prods_cx.shape

(32404859, 34)

In [12]:
# Preview the first five rows of the merged table
ords_prods_cx.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,False,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


In [13]:
# List every column of the merged table
ords_prods_cx.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_time', 'days_since_prior_order', 'new_customer', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'Busiest_Day',
       'Busiest_Days', 'Busiest_Hours', 'max_order', 'loyalty_flag',
       'avg_spend', 'spend_type', 'median_order_frequency', 'frequency_flag',
       'First Name', 'Surnam', 'Gender', 'STATE', 'Age', 'date_joined',
       'n_dependants', 'fam_status', 'income', '_merge'],
      dtype='object')

In [14]:
# Count missing values per column of the merged table
ords_prods_cx.isna().sum()

order_id                        0
user_id                         0
order_number                    0
orders_day_of_week              0
order_time                      0
days_since_prior_order    2076096
new_customer                    0
product_id                      0
add_to_cart_order               0
reordered                       0
product_name                    0
aisle_id                        0
department_id                   0
prices                       5127
price_range_loc                 0
Busiest_Day                     0
Busiest_Days                    0
Busiest_Hours                   0
max_order                       0
loyalty_flag                    0
avg_spend                       0
spend_type                      0
median_order_frequency          5
frequency_flag                  5
First Name                1775118
Surnam                          0
Gender                          0
STATE                           0
Age                             0
date_joined   

Missing `days_since_prior_order` values correspond to new customers, and the missing `prices` values were outliers.  

In [None]:
# Subset of merged rows where the customer's first name is missing
ords_prods_cx_nan = ords_prods_cx[ords_prods_cx['First Name'].isna()]

In [None]:
# Preview the first five rows that lack a first name
ords_prods_cx_nan.head()

Not having a first name will not be a barrier to running the analysis, so these values can remain missing.

# 06. Export Combined Data

In [15]:
# Write the merged orders/products/customers table to the "Prepared Data"
# folder as a pickle file.
# NOTE(review): the frame still carries the `_merge` indicator column at this
# point, so it is written to the file as well.
ords_prods_cx.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_merged.pkl')
)