# Contents
### Importing Libraries and Data 
### Data Wrangling
### Quality and consistency checks
### Combine df_cus_info with the df_ords_prods_merge dataset
### Export Data

In [6]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [7]:
# Project root folder that contains the '02 Data' directory.
# Set the INSTACART_PROJECT_DIR environment variable to run this notebook on another
# machine without editing the code; when it is unset the original location is used.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'/Users/maitran/Documents/Instacart Basket Analysis',
)

In [8]:
# Load the raw customer records from the project's original-data folder
customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_cus_info = pd.read_csv(customers_csv)

In [9]:
# Show the (rows, columns) dimensions of the freshly loaded customer data
df_cus_info.shape

(206209, 10)

In [10]:
# Preview the first rows to see the raw column names and value formats
df_cus_info.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


### Step 4. Data Wrangling

In [11]:
# Rename the 'First Name' column to snake_case
df_cus_info = df_cus_info.rename(columns={'First Name': 'first_name'})

In [12]:
# Rename the misspelled 'Surnam' column to 'surname'
df_cus_info = df_cus_info.rename(columns={'Surnam': 'surname'})

In [13]:
# Rename the 'Gender' column to lower case
df_cus_info = df_cus_info.rename(columns={'Gender': 'gender'})

In [14]:
# Rename the upper-case 'STATE' column to 'state'
df_cus_info = df_cus_info.rename(columns={'STATE': 'state'})

In [15]:
# Rename the 'Age' column to lower case
df_cus_info = df_cus_info.rename(columns={'Age': 'age'})

In [16]:
# Rename the abbreviated 'n_dependants' column to 'number_of_dependants'
df_cus_info = df_cus_info.rename(columns={'n_dependants': 'number_of_dependants'})

In [17]:
# Preview the first rows again to confirm the renamed column headers
df_cus_info.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,number_of_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


##### Observation: I renamed the columns above so that they share a consistent format. The naming now matches the other data sets as well, which makes the columns easier to read and follow once all the data sets are combined.

### Step 5. Quality and consistency checks

In [18]:
# Re-check the (rows, columns) dimensions after the renames
df_cus_info.shape

(206209, 10)

In [19]:
# List each column's dtype and non-null count to spot missing data
df_cus_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   user_id               206209 non-null  int64 
 1   first_name            194950 non-null  object
 2   surname               206209 non-null  object
 3   gender                206209 non-null  object
 4   state                 206209 non-null  object
 5   age                   206209 non-null  int64 
 6   date_joined           206209 non-null  object
 7   number_of_dependants  206209 non-null  int64 
 8   fam_status            206209 non-null  object
 9   income                206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


##### Observation: The first_name column has missing data. It has only 194950 non-null values, while every other column has 206209.

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_cus_info.describe()

Unnamed: 0,user_id,age,number_of_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [21]:
# Count the missing values in each column
df_cus_info.isna().sum()

user_id                     0
first_name              11259
surname                     0
gender                      0
state                       0
age                         0
date_joined                 0
number_of_dependants        0
fam_status                  0
income                      0
dtype: int64

##### Observation: The missing first_name values do not need to be treated, especially since the surname column is complete.

In [22]:
# Collect every row that is an exact repeat of an earlier row
duplicate_mask = df_cus_info.duplicated()
df_cus_info_dups = df_cus_info.loc[duplicate_mask]

In [23]:
# Check how many fully duplicated rows were found: the first element of the
# shape is the number of rows in the duplicates dataframe
df_cus_info_dups.shape

(0, 10)

##### Observation: No duplicates were found.

In [24]:
# Check for mixed types: print the name of every column whose values are stored
# as more than one Python type (for example str names mixed with float NaN).
# Series.map(type) replaces the deprecated DataFrame.applymap, and counting the
# distinct types also works on an empty frame, where iloc[0] would raise.
for col in df_cus_info.columns.tolist():
    if df_cus_info[col].map(type).nunique() > 1:
        print(col)

first_name


In [25]:
# Cast only the non-missing first names to str and leave the missing ones as NaN.
# A blanket astype('str') can turn every NaN into the literal text 'nan', so the
# missing first names counted by isnull() above would afterwards look like real names.
first_name = df_cus_info['first_name']
df_cus_info['first_name'] = first_name.where(first_name.isna(), first_name.astype(str))

In [26]:
# Confirm the dtype pandas now reports for first_name
df_cus_info.dtypes['first_name']

dtype('O')

### Step 6: Combine df_cus_info with the df_ords_prods_merge dataset


In [28]:
# Load the previously prepared orders/products dataframe from the prepared-data folder
orders_products_pkl = os.path.join(
    path, '02 Data', 'Prepared Data', 'orders_products_merged_aggregated.pkl'
)
df_ords_prods_merge = pd.read_pickle(orders_products_pkl)

In [29]:
# Preview the first rows of the orders/products data to locate the user_id join key
df_ords_prods_merge.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,busiest_day,busiest_days,busiest_period_of_day,price_range_loc,max_order,loyalty_flag,avg_order,spending_flag,median_prior_order,order_frequency_flag
0,2539329,1,prior,1,2,8,,196,1,0,...,Regularly busy,Regularly days,Average orders,Mid-range product,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,prior,2,3,7,15.0,196,1,1,...,Regularly busy,Slowest days,Average orders,Mid-range product,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,prior,3,3,12,21.0,196,1,1,...,Regularly busy,Slowest days,Most orders,Mid-range product,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,prior,4,4,7,29.0,196,1,1,...,Least busy,Slowest days,Average orders,Mid-range product,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,prior,5,4,15,28.0,196,1,1,...,Least busy,Slowest days,Most orders,Mid-range product,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


##### Observation: The key column for combining the data sets is user_id. Because user_id is an identifier rather than a quantitative value, its data type should be string. The user_id column must have the same data type in both data sets so that they can be combined.

In [30]:
# Convert user_id in df_cus_info to string so it is treated as an identifier
df_cus_info['user_id'] = df_cus_info['user_id'].astype(str)

In [31]:
# Verify the dtype of user_id in df_cus_info after the conversion
print(df_cus_info['user_id'].dtype)

object


In [32]:
# Convert user_id in df_ords_prods_merge to string so both join keys share one dtype
df_ords_prods_merge['user_id'] = df_ords_prods_merge['user_id'].astype(str)

In [33]:
# Verify the dtype of user_id in df_ords_prods_merge after the conversion
print(df_ords_prods_merge['user_id'].dtype)

object


In [34]:
# Attach each customer's attributes to every order line by joining on user_id.
# - how='inner' spells out the default join: only user_ids present in both frames are kept.
# - validate='m:1' makes pandas raise a MergeError if a user_id appears more than once in
#   df_cus_info, instead of silently duplicating order rows.
# - indicator='True' (a string) names the merge-source column 'True'; it is kept as-is so
#   the exported column layout does not change.
#   NOTE(review): indicator=True, which yields the conventional '_merge' column, was
#   probably intended - confirm before renaming.
df_ords_prods_cus_combined = df_ords_prods_merge.merge(
    df_cus_info,
    on='user_id',
    how='inner',
    validate='m:1',
    indicator='True',
)

In [35]:
# Preview the merged frame to confirm the customer columns were appended
df_ords_prods_cus_combined.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,first_name,surname,gender,state,age,date_joined,number_of_dependants,fam_status,income,True
0,2539329,1,prior,1,2,8,,196,1,0,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,prior,2,3,7,15.0,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,prior,3,3,12,21.0,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,prior,4,4,7,29.0,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,prior,5,4,15,28.0,196,1,1,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


### Step 8: Export Data

In [2]:
# Save the combined orders/products/customers dataframe to the prepared-data folder.
# The merge cell must have been run in the same kernel session, otherwise
# df_ords_prods_cus_combined does not exist.
customer_merged_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'customer_merged.pkl')
df_ords_prods_cus_combined.to_pickle(customer_merged_pkl)

NameError: name 'df_ords_prods_cus_combined' is not defined