# Cleaning, Wrangling, and Merging Customers 

## Contains:
### Exploring customers dataset
### Check for mixed datatypes - none
### Dropping two name columns for privacy
### Rename columns for clarity
### Check for missing values & duplicates
### Descriptive statistics for all variables
### Merge customers.csv with ords_prods_merged 
### Drop merge indicator columns

## Import libraries and customers.csv

In [1]:
#import libraries
import pandas as pd
import numpy as np
import os

In [2]:
#create file path
#defaults to the original local project folder; set the INSTACART_PROJECT_DIR
#environment variable to point the notebook at a different project root
path = os.environ.get('INSTACART_PROJECT_DIR',
                      r'C:\Users\krist\08.2023_InstacartBasketAnalysis')

In [3]:
#load the customers csv from the Original_Data folder and preview the first rows
df_cust = pd.read_csv(
    os.path.join(path, '02.Data', 'Original_Data', 'customers.csv')
)
df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [4]:
#check data types, non-null counts, and shape (rows x columns) of customers
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


### 206,209 rows, 10 columns
### no mixed data types noted

## Drop unnecessary columns for analysis and rename unclear columns

In [5]:
#remove the first name and surname columns to preserve privacy;
#user_id is the only identifier needed
df_cust = df_cust.drop(['First Name', 'Surnam'], axis = 1)

In [6]:
#rename state
#reassign the renamed frame instead of mutating with inplace=True
df_cust = df_cust.rename(columns = {'STATE':'State'})

In [7]:
#rename n_dependants
#reassign the renamed frame instead of mutating with inplace=True
df_cust = df_cust.rename(columns = {'n_dependants':'num_dependants'})

In [8]:
#check column changes: name columns should be gone, State and num_dependants renamed
df_cust.columns

Index(['user_id', 'Gender', 'State', 'Age', 'date_joined', 'num_dependants',
       'fam_status', 'income'],
      dtype='object')

## Check for missing values

In [9]:
#count missing values in each column
df_cust.isna().sum()

user_id           0
Gender            0
State             0
Age               0
date_joined       0
num_dependants    0
fam_status        0
income            0
dtype: int64

### no missing values

## Check for duplicates

In [10]:
#collect any fully duplicated rows in their own dataframe and display it
df_dupes = df_cust.loc[df_cust.duplicated()]
df_dupes

Unnamed: 0,user_id,Gender,State,Age,date_joined,num_dependants,fam_status,income


### no duplicates present

## Look at descriptive statistics

In [11]:
#summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_cust.describe()

Unnamed: 0,user_id,Age,num_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


### min age 18, max 81, mean 49.5
### min number of dependants 0, max 3, mean 1.5 (median 1)
### min income 25,903, max 593,901, mean 94,633 -- max is more than 6x the mean, which could skew any analysis involving income

## Look at possible entries for gender, state, and fam_status

In [12]:
#count customers for each distinct Gender entry
df_cust['Gender'].value_counts()

Male      104067
Female    102142
Name: Gender, dtype: int64

### two options for gender are male/female. 49.5% female 50.5% male

In [13]:
#count customers for each distinct State entry
df_cust['State'].value_counts()

Florida                 4044
Colorado                4044
Illinois                4044
Alabama                 4044
District of Columbia    4044
Hawaii                  4044
Arizona                 4044
Connecticut             4044
California              4044
Indiana                 4044
Arkansas                4044
Alaska                  4044
Delaware                4044
Iowa                    4044
Idaho                   4044
Georgia                 4044
Wyoming                 4043
Mississippi             4043
Oklahoma                4043
Utah                    4043
New Hampshire           4043
Kentucky                4043
Maryland                4043
Rhode Island            4043
Massachusetts           4043
Michigan                4043
New Jersey              4043
Kansas                  4043
South Dakota            4043
Minnesota               4043
Tennessee               4043
New York                4043
Washington              4043
Louisiana               4043
Montana       

In [14]:
#number of distinct State entries
df_cust['State'].nunique()

51

### state options include all 50 states plus DC for 51 options

In [15]:
#count customers for each distinct fam_status entry
df_cust['fam_status'].value_counts()

married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: fam_status, dtype: int64

### options for fam_status are married, single, divorced/widowed, and living with parents and siblings. 70% are married, 16% single, 9% divorced/widowed, 5% living with parents and siblings

## Merge customers with orders_products_merged

In [16]:
#load the prepared orders/products pickle from Prepared_Data and preview the first rows
df_ords_prods_merge = pd.read_pickle(
    os.path.join(path, '02.Data', 'Prepared_Data',
                 'orders_products_merged_4.8_task.pkl')
)
df_ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,_merge,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,users_avg_product_price,Spending_flag,median_days_since_last_order,frequency_flag
0,2539329,1,1,2,8,11.0,196,1,0,both,...,Mid-range product,Regularly busy,Regularly busy,average orders,10,New customer,6.367797,Low spender,20.0,Regular customer
1,2398795,1,2,3,7,15.0,196,1,1,both,...,Mid-range product,Regularly busy,Least busy days,average orders,10,New customer,6.367797,Low spender,20.0,Regular customer
2,473747,1,3,3,12,21.0,196,1,1,both,...,Mid-range product,Regularly busy,Least busy days,most orders,10,New customer,6.367797,Low spender,20.0,Regular customer
3,2254736,1,4,4,7,29.0,196,1,1,both,...,Mid-range product,Least busy,Least busy days,average orders,10,New customer,6.367797,Low spender,20.0,Regular customer
4,431534,1,5,4,15,28.0,196,1,1,both,...,Mid-range product,Least busy,Least busy days,most orders,10,New customer,6.367797,Low spender,20.0,Regular customer


### Use .merge() to merge df_cust and df_ords_prods_merge on user_id. 
### Use inner join because right, left, and outer would result in too many missing values for analysis

In [17]:
#check that data types of key are the same
#both sides need .dtype so that the two column dtypes are compared with each other
df_ords_prods_merge['user_id'].dtype == df_cust['user_id'].dtype

True

In [18]:
#merge ords_prods with customers on user_id inner join
#validate raises a MergeError if user_id is not unique in df_cust, which would
#otherwise silently multiply the order rows
df_ords_prods_all = df_ords_prods_merge.merge(df_cust, on = 'user_id', 
                                              how = 'inner',
                                              validate = 'many_to_one')

In [19]:
#check new dataframe: row count, column list, and dtypes after the merge
df_ords_prods_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 32 columns):
 #   Column                        Dtype   
---  ------                        -----   
 0   order_id                      int64   
 1   user_id                       int64   
 2   order_number                  int64   
 3   order_day_of_week             int64   
 4   order_hour_of_day             int64   
 5   days_since_last_order         float64 
 6   product_id                    int64   
 7   add_to_cart_order             int64   
 8   reordered                     int64   
 9   _merge                        category
 10  product_name                  object  
 11  aisle_id                      int64   
 12  department_id                 int64   
 13  prices                        float64 
 14  _exists                       category
 15  price_range_loc               object  
 16  busiest_day                   object  
 17  busiest_days                  object  
 18  

### merge successful

### drop indicator columns from previous merges

In [20]:
#remove the _merge and _exists indicator columns
df_ords_prods_all = df_ords_prods_all.drop(['_merge', '_exists'],
                                           axis = 'columns')

In [21]:
#check drop was successful: _merge and _exists should no longer be listed
df_ords_prods_all.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_last_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'users_avg_product_price', 'Spending_flag',
       'median_days_since_last_order', 'frequency_flag', 'Gender', 'State',
       'Age', 'date_joined', 'num_dependants', 'fam_status', 'income'],
      dtype='object')

In [22]:
#check shape (rows, columns) before export
df_ords_prods_all.shape

(32404859, 30)

In [23]:
#write the fully merged dataframe to a pickle file in Prepared_Data
df_ords_prods_all.to_pickle(
    os.path.join(path, '02.Data', 'Prepared_Data', 'orders_products_all.pkl')
)