# Part 1: Customer data set preparation and merging with the rest of the data

1. Import libraries and customer data set
2. Exploratory data analysis 
3. Data Wrangling

3.1. Dropping columns
3.2. Renaming columns
3.3. Changing data type

4. Data consistency check

4.1. Checking mixed data types 
4.2. Missing data 
4.3. Duplicates

5. Combining customer data set with ords_prods_merge data 
6. Export ords_prods_all dataset as pkl

# 1. Import libraries and customer data set

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Create path to the project root folder.
# The author's local folder stays the default; set the INSTACART_PROJECT_PATH
# environment variable to run the notebook from a different location without
# editing the code.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r"C:\Users\irikh\iCloudDrive\Data analytics\Instacart basket Analysis",
)

In [3]:
# Load the raw customer file from the project's "Original Data" folder.
# index_col=False tells pandas not to use the first CSV column as the index.
customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_customers = pd.read_csv(customers_csv, index_col=False)

# 2. Exploratory data analysis

In [4]:
# Preview the first five rows of the customer data
df_customers.head(n=5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [5]:
# Preview the last five rows of the customer data
df_customers.tail(n=5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799
206208,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095


In [6]:
# Number of rows and columns in the customer data set
df_customers.shape

(206209, 10)

In [7]:
# Column names, non-null counts and dtypes of the customer data set
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


# 3. Data Wrangling

## 3.1. Dropping columns

### No columns need to be dropped

## 3.2. Renaming columns

In [8]:
# Renaming columns: each original header is mapped to its replacement name
column_renames = {
    'Surnam': 'Surname',
    'STATE': 'US state',
    'n_dependants': 'number_of_dependants',
    'fam_status': 'marital_status',
}
df_customers = df_customers.rename(columns=column_renames)

In [9]:
# Checking result: list the column names after renaming
df_customers.columns

Index(['user_id', 'First Name', 'Surname', 'Gender', 'US state', 'Age',
       'date_joined', 'number_of_dependants', 'marital_status', 'income'],
      dtype='object')

## 3.3. Changing data type

In [10]:
# Checking the data type of every column
df_customers.dtypes

user_id                  int64
First Name              object
Surname                 object
Gender                  object
US state                object
Age                      int64
date_joined             object
number_of_dependants     int64
marital_status          object
income                   int64
dtype: object

# 4. Data consistency check

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_customers.describe()

Unnamed: 0,user_id,Age,number_of_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


## 4.1. Checking mixed data types

In [12]:
# Checking data for mixed data types.
# A column is reported as "mixed" when its values are instances of more than
# one Python type, otherwise as "consistent".
# Series.map(type) is used instead of DataFrame.applymap, which is deprecated
# since pandas 2.1; counting the distinct types also avoids indexing the first
# row, so the check does not fail on an empty frame.
for col in df_customers.columns.tolist():
    distinct_types = df_customers[col].map(type).nunique()
    if distinct_types > 1:
        print(col, ' mixed')
    else:
        print(col, ' consistent')

user_id  consistent
First Name  mixed
Surname  consistent
Gender  consistent
US state  consistent
Age  consistent
date_joined  consistent
number_of_dependants  consistent
marital_status  consistent
income  consistent


In [13]:
# Changing mixed datatype.
# 'First Name' is the only mixed column: df_customers.info() lists fewer
# non-null values for it than there are rows, so its gaps sit as NaN among
# the str values.
# astype('str') on its own turns each NaN into the literal text 'nan', which
# then looks like a real name and hides the gap from the missing-data check.
# Label the gaps explicitly before casting instead.
MISSING_FIRST_NAME = 'Unknown'
df_customers['First Name'] = (
    df_customers['First Name']
    .fillna(MISSING_FIRST_NAME)
    .astype('str')
)

In [14]:
# Checking result: repeat the mixed-type check after the conversion.
# A column is "mixed" when its values are instances of more than one Python
# type. Series.map(type) is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1.
for col in df_customers.columns.tolist():
    distinct_types = df_customers[col].map(type).nunique()
    if distinct_types > 1:
        print(col, ' mixed')
    else:
        print(col, ' consistent')

user_id  consistent
First Name  consistent
Surname  consistent
Gender  consistent
US state  consistent
Age  consistent
date_joined  consistent
number_of_dependants  consistent
marital_status  consistent
income  consistent


## 4.2. Missing data

In [15]:
# Count the missing values in every column
df_customers.isna().sum()

user_id                 0
First Name              0
Surname                 0
Gender                  0
US state                0
Age                     0
date_joined             0
number_of_dependants    0
marital_status          0
income                  0
dtype: int64

## 4.3. Duplicates

In [16]:
# Finding duplicates: flag rows that repeat an earlier row in every column,
# then keep only the flagged rows as a subset
is_duplicate_row = df_customers.duplicated()
df_dups = df_customers.loc[is_duplicate_row]

In [17]:
# Checking result: display the subset of duplicated rows (an empty frame means none were found)
df_dups

Unnamed: 0,user_id,First Name,Surname,Gender,US state,Age,date_joined,number_of_dependants,marital_status,income


# 5. Combining customer data set with ords_prods_merge data

In [18]:
# Import ords_prods_merge.pkl data.
# The location is built from `path`, like the other file operations in this
# notebook, instead of repeating the absolute project folder in a literal.
ords_prods_merge_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl')
df_ords_prods_merge = pd.read_pickle(ords_prods_merge_pkl)

In [19]:
# Checking data after importing: first ten rows
df_ords_prods_merge.head(n=10)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spending_flag,median_frequency,frequency_flag
0,2539329,1,1,2,8,,196.0,1.0,0.0,Soda,...,Mid-range product,Regularly busy,Regularly busy,Regularly busy,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,196.0,1.0,1.0,Soda,...,Mid-range product,Regularly busy,Least busy,Regularly busy,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,196.0,1.0,1.0,Soda,...,Mid-range product,Regularly busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,196.0,1.0,1.0,Soda,...,Mid-range product,Least busy,Least busy,Regularly busy,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,196.0,1.0,1.0,Soda,...,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
5,3367565,1,6,2,7,19.0,196.0,1.0,1.0,Soda,...,Mid-range product,Regularly busy,Regularly busy,Regularly busy,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
6,550135,1,7,1,9,20.0,196.0,1.0,1.0,Soda,...,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
7,3108588,1,8,1,14,14.0,196.0,2.0,1.0,Soda,...,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
8,2295261,1,9,1,16,0.0,196.0,4.0,1.0,Soda,...,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
9,2550362,1,10,4,8,30.0,196.0,1.0,1.0,Soda,...,Mid-range product,Least busy,Least busy,Regularly busy,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [20]:
# Checking data after importing: last ten rows
df_ords_prods_merge.tail(n=10)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spending_flag,median_frequency,frequency_flag
32404849,576295,202557,12,3,10,11.0,43553.0,14.0,1.0,Orange Energy Shots,...,Low-range product,Regularly busy,Least busy,Most orders,31,Regular customer,6.905655,Low spender,8.0,Frequent customer
32404850,960088,202557,13,4,12,15.0,43553.0,3.0,1.0,Orange Energy Shots,...,Low-range product,Least busy,Least busy,Most orders,31,Regular customer,6.905655,Low spender,8.0,Frequent customer
32404851,343962,202557,14,0,10,3.0,43553.0,2.0,1.0,Orange Energy Shots,...,Low-range product,Busiest day,Busiest days,Most orders,31,Regular customer,6.905655,Low spender,8.0,Frequent customer
32404852,2329472,202557,15,6,12,6.0,43553.0,2.0,1.0,Orange Energy Shots,...,Low-range product,Regularly busy,Regularly busy,Most orders,31,Regular customer,6.905655,Low spender,8.0,Frequent customer
32404853,694731,202557,16,1,14,2.0,43553.0,2.0,1.0,Orange Energy Shots,...,Low-range product,Regularly busy,Busiest days,Most orders,31,Regular customer,6.905655,Low spender,8.0,Frequent customer
32404854,1320836,202557,17,2,15,1.0,43553.0,2.0,1.0,Orange Energy Shots,...,Low-range product,Regularly busy,Regularly busy,Most orders,31,Regular customer,6.905655,Low spender,8.0,Frequent customer
32404855,31526,202557,18,5,11,3.0,43553.0,2.0,1.0,Orange Energy Shots,...,Low-range product,Regularly busy,Regularly busy,Most orders,31,Regular customer,6.905655,Low spender,8.0,Frequent customer
32404856,758936,203436,1,2,7,,42338.0,4.0,0.0,"Zucchini Chips, Pesto",...,Mid-range product,Regularly busy,Regularly busy,Regularly busy,3,New customer,7.631579,Low spender,15.0,Regular customer
32404857,2745165,203436,2,3,5,15.0,42338.0,16.0,1.0,"Zucchini Chips, Pesto",...,Mid-range product,Regularly busy,Least busy,Fewest orders,3,New customer,7.631579,Low spender,15.0,Regular customer
32404858,3093936,205420,1,4,14,,28818.0,8.0,0.0,Hot Oatmeal Multigrain Raisin,...,Mid-range product,Least busy,Least busy,Most orders,16,Regular customer,7.684746,Low spender,13.0,Regular customer


In [21]:
# Checking the data type of the key column (user_id) in ords_prods_merge before merging
df_ords_prods_merge["user_id"].dtype

dtype('int64')

In [22]:
# Checking the number of rows and columns of ords_prods_merge before merging
df_ords_prods_merge.shape

(32404859, 24)

In [23]:
# Checking the number of rows and columns of the customer data before merging
df_customers.shape

(206209, 10)

In [24]:
# Delete the merge flag left over from the previous merge so that the next
# merge can add its own '_merge' indicator column.
# The membership check keeps the cell re-runnable: a bare `del` raises
# KeyError once the column has already been removed.
if '_merge' in df_ords_prods_merge.columns:
    del df_ords_prods_merge['_merge']

In [25]:
# Merging customer data set with main using key user_id.
# how='inner' spells out the pandas default that was previously implicit: only
# rows whose user_id exists in both frames are kept, so the '_merge' flag added
# by indicator=True can only read 'both'. Compare the row count of the result
# with df_ords_prods_merge.shape to detect rows dropped for lack of a customer.
# validate='m:1' makes pandas raise a MergeError if user_id is not unique in
# df_customers; duplicate customer keys would otherwise silently multiply rows.
df_ords_prods_all = df_ords_prods_merge.merge(
    df_customers,
    on='user_id',
    how='inner',
    validate='m:1',
    indicator=True,
)

In [26]:
# Confirm the merge: count the rows in each merge-flag category
# ('both', 'left_only', 'right_only') of the merged frame
df_ords_prods_all['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [27]:
# Confirm the merge again on the same merged frame (no outer merge is performed
# here); dropna = False additionally counts rows whose merge flag is missing
df_ords_prods_all['_merge'].value_counts(dropna = False)

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [30]:
# Checking the new customer columns after merging: first thirty rows
df_ords_prods_all.head(n=30)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,First Name,Surname,Gender,US state,Age,date_joined,number_of_dependants,marital_status,income,_merge
0,2539329,1,1,2,8,,196.0,1.0,0.0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196.0,1.0,1.0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196.0,1.0,1.0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196.0,1.0,1.0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196.0,1.0,1.0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
5,3367565,1,6,2,7,19.0,196.0,1.0,1.0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
6,550135,1,7,1,9,20.0,196.0,1.0,1.0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
7,3108588,1,8,1,14,14.0,196.0,2.0,1.0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
8,2295261,1,9,1,16,0.0,196.0,4.0,1.0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
9,2550362,1,10,4,8,30.0,196.0,1.0,1.0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


In [28]:
# Checking the number of rows and columns after merging
df_ords_prods_all.shape

(32404859, 33)

# 6. Export ords_prods_all dataset as pkl

In [31]:
# Export the merged data set as a pickle file in the "Prepared Data" folder
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_all.pkl')
df_ords_prods_all.to_pickle(export_file)