# Table of Contents

1. Importing libraries and datasets
2. Data Cleaning and Wrangling
3. Consistency Checks
4. Merging Datasets
5. Exporting dataframe

# Importing libraries and datasets

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Defining path for data import.
# The project root can be overridden without editing the notebook by setting the
# INSTACART_PROJECT_PATH environment variable; when it is unset, the original
# local folder is used, so existing runs behave exactly as before.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'/Users/frederikeschulz-mullensiefen/Desktop/Master Folder_Instacart',
)

In [3]:
# Load the orders_products dataframe from its pickle in the Prepared Data folder
ordsprods_file = os.path.join(path, '02_Data', 'Prepared Data', 'ords_prods_flags.pkl')
df_ordsprods = pd.read_pickle(ordsprods_file)

In [4]:
# Columns to load for the customer dataframe. First and last name are left out:
# they are not needed, and omitting them protects customer privacy.
cust_list = [
    'user_id',
    'Gender',
    'STATE',
    'Age',
    'date_joined',
    'n_dependants',
    'fam_status',
    'income',
]

In [5]:
# Load the customer data from the original CSV, keeping only the columns in cust_list
customers_file = os.path.join(path, '02_Data', 'Original Data', 'customers.csv')
df_customer = pd.read_csv(customers_file, usecols=cust_list)

In [6]:
# Dimensions of the customer dataframe as (rows, columns)
df_customer.shape

(206209, 8)

# Data Cleaning and Wrangling

In [6]:
# Renaming column Gender to gender.
# The result is reassigned rather than using inplace=True, so the cell produces
# its output explicitly instead of silently mutating the existing dataframe.
df_customer = df_customer.rename(columns={'Gender': 'gender'})

In [7]:
# Renaming column STATE to state.
# The result is reassigned rather than using inplace=True, so the cell produces
# its output explicitly instead of silently mutating the existing dataframe.
df_customer = df_customer.rename(columns={'STATE': 'state'})

In [8]:
# Renaming column Age to age.
# The result is reassigned rather than using inplace=True, so the cell produces
# its output explicitly instead of silently mutating the existing dataframe.
df_customer = df_customer.rename(columns={'Age': 'age'})

In [9]:
# Check if renaming was successful: preview the first rows and confirm that the
# gender, state and age column names are now lower case
df_customer.head()

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


In [10]:
# Checking the data type of each column in the customer data
df_customer.dtypes

user_id          int64
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [11]:
# Convert user_id in the customer dataset to a string data type
# (user_id is later used as the key when merging with the ordsprods dataset)
user_id_as_text = df_customer['user_id'].astype('str')
df_customer['user_id'] = user_id_as_text

In [12]:
# Convert user_id in the ordsprods dataset to a string data type
# (user_id is later used as the key when merging with the customer dataset)
df_ordsprods['user_id'] = (
    df_ordsprods['user_id']
    .astype('str')
)

# Consistency Checks

In [13]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the customer data
df_customer.describe()

Unnamed: 0,age,n_dependants,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


In [14]:
# Count the missing values in each column of the customer data
df_customer.isna().sum()

user_id         0
gender          0
state           0
age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

In [15]:
# Collect the rows that repeat an earlier row in every column
is_duplicate_row = df_customer.duplicated()
df_dups = df_customer[is_duplicate_row]

In [16]:
# Checking for duplicates: display the duplicate rows collected in df_dups
# (an empty result means no fully duplicated rows were found)
df_dups

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income


In [17]:
# Checking for mixed-type data: print the name of every column that holds
# values of more than one Python type.
# Series.map(type) replaces the deprecated DataFrame.applymap, and counting the
# distinct types also works on an empty dataframe (no iloc[0] lookup needed).
for col in df_customer.columns.tolist():
  value_types = df_customer[col].map(type)
  if value_types.nunique() > 1:
    print (col)


In [18]:
# Sense check for gender column: count how often each distinct value occurs
df_customer['gender'].value_counts()

gender
Male      104067
Female    102142
Name: count, dtype: int64

In [19]:
# Sense check for state column: count how often each distinct value occurs
df_customer['state'].value_counts()

state
Florida                 4044
Colorado                4044
Illinois                4044
Alabama                 4044
District of Columbia    4044
Hawaii                  4044
Arizona                 4044
Connecticut             4044
California              4044
Indiana                 4044
Arkansas                4044
Alaska                  4044
Delaware                4044
Iowa                    4044
Idaho                   4044
Georgia                 4044
Wyoming                 4043
Mississippi             4043
Oklahoma                4043
Utah                    4043
New Hampshire           4043
Kentucky                4043
Maryland                4043
Rhode Island            4043
Massachusetts           4043
Michigan                4043
New Jersey              4043
Kansas                  4043
South Dakota            4043
Minnesota               4043
Tennessee               4043
New York                4043
Washington              4043
Louisiana               4043
Montana 

In [20]:
# Sense check for fam_status column: count how often each distinct value occurs
df_customer['fam_status'].value_counts()

fam_status
married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: count, dtype: int64

Observations: 
- The summary statistics make sense for all variables
- There are no missing values
- There are no duplicate rows
- There is no mixed-type data
- The object column values make sense

# Merging Datasets

In [22]:
# Checking the ordsprods columns for a key to merge on
df_ordsprods.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,ordertime_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,prices,price_range,busiest_days,busiest_period_of_day,max_order,loyalty_flag,average_price,spending_flag,median_order_frequency,order_frequency_flag
0,2539329,1,1,2,8,,196,1,0,Soda,...,9.0,Mid-range product,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,9.0,Mid-range product,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,9.0,Mid-range product,Slowest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,9.0,Mid-range product,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,9.0,Mid-range product,Slowest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [23]:
# Checking the customer columns for a key to merge on
df_customer.head()

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


The key for merging is user_id.

In [24]:
# Create an audit merge between the ordsprods and customer datasets on user_id.
# An outer join is used together with indicator=True so that the _merge column
# can actually report keys found in only one dataset (left_only / right_only);
# with an inner join every row is 'both' by construction and the check is vacuous.
# df_merged is only used for this coverage check.
df_merged = df_ordsprods.merge(df_customer, on='user_id', how='outer', indicator=True)

In [25]:
# Check if merge was successful: preview the first rows of the merged dataframe
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,ordertime_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,median_order_frequency,order_frequency_flag,gender,state,age,date_joined,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,...,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both


In [27]:
# Check merge: tally the values of the _merge indicator column.
# NOTE(review): left_only / right_only can only be non-zero when df_merged was
# built with a non-inner join; for an inner join every row is 'both'.
df_merged['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [None]:
# Inner join of ordsprods and customer data on user_id, this time without the
# _merge indicator column
df_final = pd.merge(df_ordsprods, df_customer, on='user_id', how='inner')

In [29]:
# Preview the first rows of the final merged dataframe
df_final.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,ordertime_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,spending_flag,median_order_frequency,order_frequency_flag,gender,state,age,date_joined,n_dependants,fam_status,income
0,2539329,1,1,2,8,,196,1,0,Soda,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423


In [None]:
# Dimensions of the final merged dataframe as (rows, columns)
df_final.shape

# Exporting dataframe

In [31]:
# Write the final merged dataframe to a pickle in the Prepared Data folder
export_file = os.path.join(path, '02_Data', 'Prepared Data', 'ords_prods_cust.pkl')
df_final.to_pickle(export_file)