# Table of Contents
### 01. Import Libraries and Data
### 02. Data Wrangling Customers DF
### 03. Data Quality and Consistency Checks for Customers DF
### 04. Merge Data
### 05. Export Data

# 01. Import Libraries and Data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Project root folder. Set the INSTACART_PROJECT_DIR environment variable to run the
# notebook on another machine; the original location is kept as the default.
path = os.environ.get('INSTACART_PROJECT_DIR', r'/Users/Katherine/Desktop/Instacart Basket Analysis')

In [4]:
# Load the customer data set from the project's "Original Data" folder.
# index_col=False keeps pandas from using the first CSV column as the index.
customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
customers = pd.read_csv(customers_csv, index_col=False)

In [5]:
# Load the orders/products merge from the "Prepared Data" folder.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files this project produced.
ords_prods_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl')
ords_prods_merge = pd.read_pickle(ords_prods_pkl)

# 02. Data Wrangling Customers DF

In [6]:
# Overview of the customers frame: column names, dtypes and non-null counts
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [7]:
# Preview the first rows of the customers frame
customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [8]:
# (rows, columns) of the customers frame as loaded
customers.shape

(206209, 10)

In [9]:
# Standardise the customer column names in a single pass (this also fixes the
# 'Surnam' typo). rename() returns a new frame, so nothing is mutated in place
# and re-running the cell is harmless.
# NOTE(review): Gender, State and Age stay capitalised while the other columns are
# snake_case — confirm which spelling downstream notebooks expect before changing.
customers = customers.rename(
    columns={
        'First Name': 'first_name',
        'Surnam': 'last_name',
        'STATE': 'State',
        'n_dependants': 'dependents',
        'fam_status': 'marital_status',
    }
)

# 03. Data Quality and Consistency Checks for Customers DF

In [14]:
# Count missing values per column.
# In the recorded output only first_name has gaps (11,259 rows); no rows are dropped for it in this notebook.
customers.isnull().sum()

user_id               0
first_name        11259
last_name             0
Gender                0
State                 0
Age                   0
date_joined           0
dependents            0
marital_status        0
income                0
dtype: int64

In [15]:
# Inspect the columns and dtypes of the orders/products frame before merging
ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 24 columns):
 #   Column                         Dtype   
---  ------                         -----   
 0   order_id                       int64   
 1   user_id                        int64   
 2   order_number                   int64   
 3   orders_day_of_week             int64   
 4   order_time                     int64   
 5   days_since_prior_order         float64 
 6   product_id                     int64   
 7   add_to_cart_order              int64   
 8   reordered                      int64   
 9   _merge                         category
 10  product_name                   object  
 11  aisle_id                       int64   
 12  department_id                  int64   
 13  prices                         float64 
 14  price_range_loc                object  
 15  busiest_day                    object  
 16  busiest_days                   object  
 17  busiest_period_of_day    

In [17]:
# Cast the merge key user_id to string in the orders/products frame.
# NOTE(review): the info() output shows user_id is int64 here, and the frame has ~32.4M rows,
# so this conversion is memory-heavy — confirm a string key is really needed downstream.
ords_prods_merge['user_id'] = ords_prods_merge['user_id'].astype('str')

In [18]:
# Cast user_id to string in the customers frame so the merge key has the same
# dtype here as in ords_prods_merge (a merge needs matching key dtypes).
customers['user_id'] = customers['user_id'].astype('str')

In [19]:
# Consistency check: list every Gender value with its frequency (dropna=False also counts NaN)
customers['Gender'].value_counts(dropna = False)

Gender
Male      104067
Female    102142
Name: count, dtype: int64

In [20]:
# Consistency check: list every State value with its frequency to spot misspelled or missing states
customers['State'].value_counts(dropna = False)

State
Florida                 4044
Colorado                4044
Illinois                4044
Alabama                 4044
District of Columbia    4044
Hawaii                  4044
Arizona                 4044
Connecticut             4044
California              4044
Indiana                 4044
Arkansas                4044
Alaska                  4044
Delaware                4044
Iowa                    4044
Idaho                   4044
Georgia                 4044
Wyoming                 4043
Mississippi             4043
Oklahoma                4043
Utah                    4043
New Hampshire           4043
Kentucky                4043
Maryland                4043
Rhode Island            4043
Massachusetts           4043
Michigan                4043
New Jersey              4043
Kansas                  4043
South Dakota            4043
Minnesota               4043
Tennessee               4043
New York                4043
Washington              4043
Louisiana               4043
Montana 

In [21]:
# Consistency check: frequency of each Age value, including NaN if present
customers['Age'].value_counts(dropna = False)

Age
19    3329
55    3317
51    3317
56    3306
32    3305
      ... 
65    3145
25    3127
66    3114
50    3102
36    3101
Name: count, Length: 64, dtype: int64

In [22]:
# Consistency check: frequency of each dependents value, including NaN if present
customers['dependents'].value_counts(dropna = False)

dependents
0    51602
3    51594
1    51531
2    51482
Name: count, dtype: int64

In [23]:
# Consistency check: list every marital_status category with its frequency, including NaN if present
customers['marital_status'].value_counts(dropna = False)

marital_status
married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: count, dtype: int64

In [24]:
# Consistency check: frequency of each income value, including NaN if present
customers['income'].value_counts(dropna = False)

income
57192     10
95891     10
95710     10
97532      9
98675      9
          ..
73141      1
71524      1
74408      1
44780      1
148828     1
Name: count, Length: 108012, dtype: int64

In [25]:
# Collect every customer row that exactly repeats an earlier row (all columns equal)
is_repeat = customers.duplicated()
duplicates = customers.loc[is_repeat]

In [26]:
# Show the duplicate rows found; the recorded output is empty, i.e. no fully duplicated customers
duplicates

Unnamed: 0,user_id,first_name,last_name,Gender,State,Age,date_joined,dependents,marital_status,income


# 04. Merge Data

In [27]:
# Shape of the customers frame going into the merge
customers.shape

(206209, 10)

In [28]:
# Shape of the orders/products frame going into the merge (baseline row count)
ords_prods_merge.shape

(32404859, 24)

In [29]:
# Attach the customer attributes to every order line via the shared user_id key.
# how='inner' is the pandas default, written out so the join type is explicit.
# validate='many_to_one' makes pandas raise a MergeError if customers ever contains
# a repeated user_id, which would otherwise silently duplicate order lines.
df_merged = ords_prods_merge.merge(customers, on = 'user_id', how = 'inner', validate = 'many_to_one')

In [30]:
# Preview the merged frame to confirm the customer columns were appended
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,...,order_frequency,first_name,last_name,Gender,State,Age,date_joined,dependents,marital_status,income
0,2539329,1,1,2,8,,196,1,0,both,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,both,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,both,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,both,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,both,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [31]:
# Shape after the merge; compare the row count with ords_prods_merge.shape to see
# whether the join dropped or duplicated any order lines
df_merged.shape

(32404859, 33)

# 05. Export Data

In [32]:
# Write the merged orders/products/customers frame to the "Prepared Data" folder as a pickle
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers.pkl')
df_merged.to_pickle(export_file)