**Table of contents**<a id='toc0_'></a>    
- 1. [Importing Data](#toc1_)    
- 2. [orders_products_prior.csv Data Checks](#toc2_)    
- 3. [customers.csv Data Checks](#toc3_)    
  - 3.1. [Wranging and Consistency Checks](#toc3_1_)    
  - 3.2. [Changing Data Types](#toc3_2_)    
- 4. [Combining Order_Products with Customer Data Frame](#toc4_)    
- 5. [Exporting combined data frame as a Pickle file](#toc5_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# 1. <a id='toc1_'></a>[Importing Data](#toc0_)

In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import os

In [2]:
Path = r'D:\Data Analysis\01-08-2025 Instacart Basket Analysis\Data'
df_customer = pd.read_csv(os.path.join(Path, 'Original Data', 'customers.csv'), index_col=False)
df_merged = pd.read_pickle(os.path.join(Path, 'Prepared Data', 'ord_pro_4.9.pkl'))
df_customer.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [3]:
# Importing orders_products_prior.csv
df_OrdProPrior = pd.read_csv(os.path.join(Path, 'Original Data', 'orders_products_prior.csv'), index_col=False)
df_OrdProPrior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


# 2. <a id='toc2_'></a>[orders_products_prior.csv Data Checks](#toc0_)

In [4]:
# Checking for mixed-type data with for-loop did not output any column names: none found.
for col in df_OrdProPrior.columns.tolist():
  weird = (df_OrdProPrior[[col]].map(type) != df_OrdProPrior[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_OrdProPrior[weird]) > 0:
    print (col)

In [None]:
# Checking for missing values
df_OrdProPrior.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

There are no null values.

In [None]:
# Verifying if there are any duplicate values
df_OrdProPrior[df_OrdProPrior.duplicated()]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


Task 7. No duplicate values found in the df_OrdProPrior data frame.

In [None]:
df_OrdProPrior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB


# 3. <a id='toc3_'></a>[customers.csv Data Checks](#toc0_)

## 3.1. <a id='toc3_1_'></a>[Wranging and Consistency Checks](#toc0_)

In [3]:
# for-loop checking per column if dataframe contains any mixed-type columns. Weird checks wheter the datatype within columns are consistent and if stmt prints column names.
for col in df_customer.columns.tolist():
  weird = (df_customer[[col]].map(type) != df_customer[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_customer[weird]) > 0:
    print (col)

First Name


In [4]:
df_customer['First Name'].dtype

dtype('O')

In [5]:
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [6]:
#Figuring out which columns have null values
df_customer.isnull().sum()

user_id             0
First Name      11259
Surnam              0
Gender              0
STATE               0
Age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [7]:
#Renaming STATE column to State
df_customer.rename(columns={'STATE':'State'}, inplace=True)

In [8]:
#Renaming Surnam column to Surname
df_customer.rename(columns={'Surnam':'Surname'}, inplace=True)
df_customer.head()

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [9]:
#Creating a new variable that acts like a flag based on the missing value.
# ALt: df_cust_null = df_cust_null.loc[df_cust_null['First Name'].isna() == True]
df_cust_null = df_customer.loc[df_customer['First Name'].isnull()]
df_cust_null

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,income
53,76659,,Gilbert,Male,Colorado,26,1/1/2017,2,married,41709
73,13738,,Frost,Female,Louisiana,39,1/1/2017,0,single,82518
82,89996,,Dawson,Female,Oregon,52,1/1/2017,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,1/1/2017,1,married,155673
105,29778,,Dawson,Female,Utah,63,1/1/2017,3,married,151819
...,...,...,...,...,...,...,...,...,...,...
206038,121317,,Melton,Male,Pennsylvania,28,3/31/2020,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,4/1/2020,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,4/1/2020,1,married,45275
206162,187532,,Floyd,Female,California,39,4/1/2020,0,single,56325


In [10]:
#Confirming no surname appears more than once per user id
unique = (df_cust_null.groupby('user_id')['Surname'].nunique()>1).sum()
unique

np.int64(0)

In [11]:
# Figuring out if any customer surname has a corresponding firstname in df_customer
df_customer.loc[df_customer['Surname'].isin(df_cust_null['Surname'])].sort_values(by=['First Name', 'user_id'], ascending=True)

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,income
63419,29,Betty,Bean,Female,North Dakota,31,12/31/2017,3,married,68265
140627,61,,Fischer,Male,Florida,45,3/22/2019,2,married,166565
53438,81,,Sanchez,Male,Ohio,49,11/4/2017,1,married,123542
30661,101,,Dixon,Female,Wisconsin,58,6/27/2017,3,married,112780
139882,128,,Davila,Female,New Mexico,72,3/18/2019,2,married,137562
...,...,...,...,...,...,...,...,...,...,...
192246,206179,,Dawson,Female,Mississippi,24,1/13/2020,3,married,55101
126472,206181,,Grimes,Male,Pennsylvania,66,12/30/2018,0,divorced/widowed,135047
194120,206184,,Gilbert,Male,South Dakota,50,1/24/2020,1,married,163788
176880,206189,,Simpson,Female,Virginia,75,10/16/2019,2,married,156024


In [12]:
#No need to impute any values since Betty Bean does not appear more than once
df_cust_null[df_cust_null['user_id']== 29]

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,income


In [13]:
# Creating new data frame that doesn't include null First Names
df_cust_clean = df_customer.loc[df_customer['First Name'].isna() == False]
df_cust_clean.shape

(194950, 10)

In [14]:
# Verifying if there are any duplicate values
df_cust_clean[df_cust_clean.duplicated()]

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,income


In [15]:
df_cust_clean.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,194950.0,194950.0,194950.0,194950.0
mean,103140.014947,49.511167,1.4996,94664.811495
std,59527.70581,18.476519,1.117959,42477.262653
min,1.0,18.0,0.0,25903.0
25%,51597.25,33.0,0.0,59924.0
50%,103091.5,49.0,1.0,93572.5
75%,154700.75,66.0,2.0,124261.0
max,206209.0,81.0,3.0,593901.0


## 3.2. <a id='toc3_2_'></a>[Changing Data Types](#toc0_)


In [16]:
df_cust_clean['user_id'] = df_cust_clean['user_id'].astype('str')
df_cust_clean[['Gender', 'fam_status']] = df_cust_clean[['Gender', 'fam_status']].astype('category')
df_cust_clean[['Age', 'n_dependants']] = df_cust_clean[['Age', 'n_dependants']].astype('int8')
df_cust_clean['income'] = df_cust_clean['income'].astype('int32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cust_clean['user_id'] = df_cust_clean['user_id'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cust_clean[['Gender', 'fam_status']] = df_cust_clean[['Gender', 'fam_status']].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cust_clean[['Age', 'n_dependants']]

In [17]:
df_cust_clean.head(10)

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
5,133128,Cynthia,Noble,Female,Kentucky,43,1/1/2017,2,married,49643
6,152052,Chris,Walton,Male,Montana,20,1/1/2017,0,single,61746
7,168851,Joseph,Hickman,Male,South Carolina,30,1/1/2017,0,single,63712
8,69965,Jeremy,Vang,Male,Texas,47,1/1/2017,1,married,162432
9,82820,Shawn,Chung,Male,Virginia,26,1/1/2017,2,married,32072


In [18]:
#Ensuring new data types did not alter the numeric values
df_cust_clean.describe()

Unnamed: 0,Age,n_dependants,income
count,194950.0,194950.0,194950.0
mean,49.511167,1.4996,94664.811495
std,18.476519,1.117959,42477.262653
min,18.0,0.0,25903.0
25%,33.0,0.0,59924.0
50%,49.0,1.0,93572.5
75%,66.0,2.0,124261.0
max,81.0,3.0,593901.0


In [19]:
df_cust_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 194950 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   user_id       194950 non-null  object  
 1   First Name    194950 non-null  object  
 2   Surname       194950 non-null  object  
 3   Gender        194950 non-null  category
 4   State         194950 non-null  object  
 5   Age           194950 non-null  int8    
 6   date_joined   194950 non-null  object  
 7   n_dependants  194950 non-null  int8    
 8   fam_status    194950 non-null  category
 9   income        194950 non-null  int32   
dtypes: category(2), int32(1), int8(2), object(5)
memory usage: 10.4+ MB


In [20]:
#Confirming no NaN values exist in any row
df_cust_clean[df_cust_clean.isna().any(axis=1)]

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,income


In [21]:
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,department_id,prices,price_range,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spender_type,orders_frequency
0,3108588,1,8,1,14,14.0,12427,1,1,Original Beef Jerky,...,19,4.4,Low Range,Second Busiest,Most Orders,10,New Customer,6.367797,Low Spender,Non-frequent customer
1,3108588,1,8,1,14,14.0,196,2,1,Soda,...,7,9.0,Mid Range,Second Busiest,Most Orders,10,New Customer,6.367797,Low Spender,Non-frequent customer
2,3108588,1,8,1,14,14.0,10258,3,1,Pistachios,...,19,3.0,Low Range,Second Busiest,Most Orders,10,New Customer,6.367797,Low Spender,Non-frequent customer
3,3108588,1,8,1,14,14.0,25133,4,1,Organic String Cheese,...,16,8.6,Mid Range,Second Busiest,Most Orders,10,New Customer,6.367797,Low Spender,Non-frequent customer
4,3108588,1,8,1,14,14.0,46149,5,0,Zero Calorie Cola,...,7,13.4,Mid Range,Second Busiest,Most Orders,10,New Customer,6.367797,Low Spender,Non-frequent customer


# 4. <a id='toc4_'></a>[Combining Order_Products with Customer Data Frame](#toc0_)

In [22]:
# Creating a new data frame that merges all datasets
ord_pro_all = df_merged.merge(df_cust_clean, on='user_id', indicator=True)

In [23]:
ord_pro_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30629741 entries, 0 to 30629740
Data columns (total 31 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                object  
 1   user_id                 object  
 2   order_number            int8    
 3   order_day_of_week       int8    
 4   order_hour_of_day       int8    
 5   days_since_prior_order  float32 
 6   product_id              int32   
 7   add_to_cart_order       int16   
 8   reordered               int8    
 9   product_name            object  
 10  aisle_id                int16   
 11  department_id           int8    
 12  prices                  float32 
 13  price_range             category
 14  busiest_days            category
 15  busiest_period_of_day   category
 16  max_order               int8    
 17  loyalty_flag            category
 18  mean_price              float32 
 19  spender_type            category
 20  orders_frequency        category
 21  First 

In [24]:
ord_pro_all['_merge'].value_counts()

_merge
both          30629741
left_only            0
right_only           0
Name: count, dtype: int64

In [25]:
ord_pro_all = ord_pro_all.drop(columns=['_merge'])
ord_pro_all.shape

(30629741, 30)

In [26]:
# Verifying spender_type
ord_pro_all['spender_type'].value_counts(dropna=False)

spender_type
Low Spender     30034178
High Spender      595563
Name: count, dtype: int64

In [27]:
# Verifying orders_frequency
ord_pro_all['orders_frequency'].value_counts(dropna=False)

orders_frequency
Frequent Customer        20362711
Regular Customer          6824080
Non-frequent customer     3442950
Name: count, dtype: int64

# 5. <a id='toc5_'></a>[Exporting combined data frame as a Pickle file](#toc0_)

In [28]:
ord_pro_all.to_pickle(os.path.join(Path, 'Prepared Data', 'ord_pro_cust.pkl'))