# Part 1
________________________________________________________________________________________________________________________________________________________

# Step 1: Import Libraries and Load Customer Data

In [3]:
# Import the libraries
import pandas as pd
import numpy as np
import os

# Define the path
path = r'C:\Users\Jacques\OneDrive\Documents\Data Analytics course\Data Immersion\Section 4\08 April 2025 Instacart Basket Analysis'

# Load the customer dataset
df_customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

# Load the ords_prods_merge dataset
df_ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))

# Preview the dataset
df_customers.head()

# Preview the dataset
df_ords_prods_merge.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,product_merge_flag
0,2539329,1,prior,1,2,8,,196,1,0,both,Soda,77,7,9.0,both
1,2539329,1,prior,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both
2,2539329,1,prior,1,2,8,,12427,3,0,both,Original Beef Jerky,23,19,4.4,both
3,2539329,1,prior,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23,19,4.7,both
4,2539329,1,prior,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both


In [5]:
# print the number of rows
print("Number of rows:", df_ords_prods_merge.shape[0])

Number of rows: 32404859


# Step 2: Wrangle the Data

Rename confusing column names (e.g., 'fam_status' → 'family_status')

In [108]:
# Rename the 'fam_status' column to 'family_status' for improved readability and consistency
df_customers.rename(columns={'fam_status': 'family_status'}, inplace=True)

Drop unnecessary columns

In [111]:
# Reset the index to default integer values and drop the current index
df_customers.reset_index(drop=True, inplace=True)

In [113]:
# Preview the dataset
df_customers.head()

Unnamed: 0,user_id,First Name,Surname,Gender,STATE,Age,date_joined,n_dependants,family_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,2017-01-01,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,2017-01-01,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2017-01-01,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,2017-01-01,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,2017-01-01,1,married,40374


# Step 3: Perform Data Quality Checks

Check for missing values

In [117]:
# Check for missing values in each column of the dataframe
df_customers.isnull().sum()

user_id          0
First Name       0
Surname          0
Gender           0
STATE            0
Age              0
date_joined      0
n_dependants     0
family_status    0
income           0
dtype: int64

Fill in the missing values

In [120]:
# Fill missing values in the 'First Name' column with the placeholder 'Unknown'
df_customers['First Name'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_customers['First Name'].fillna('Unknown', inplace=True)


Re-check for any remaining missing values

In [123]:
# Recheck for missing values after filling to confirm the update was successful
df_customers.isnull().sum()

user_id          0
First Name       0
Surname          0
Gender           0
STATE            0
Age              0
date_joined      0
n_dependants     0
family_status    0
income           0
dtype: int64

Check for duplicates

In [126]:
# Check for duplicate rows in the dataframe
df_customers.duplicated().sum()

0

Convert mixed data types

In [129]:
# Display basic information about the dataframe, including data types and non-null counts
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   user_id        206209 non-null  int64         
 1   First Name     206209 non-null  object        
 2   Surname        206209 non-null  object        
 3   Gender         206209 non-null  category      
 4   STATE          206209 non-null  category      
 5   Age            206209 non-null  int64         
 6   date_joined    206209 non-null  datetime64[ns]
 7   n_dependants   206209 non-null  int64         
 8   family_status  206209 non-null  category      
 9   income         206209 non-null  int64         
dtypes: category(3), datetime64[ns](1), int64(4), object(2)
memory usage: 11.6+ MB


Convert date_joined to datetime

In [132]:
# Convert the 'date_joined' column from string (object) to datetime format
df_customers['date_joined'] = pd.to_datetime(df_customers['date_joined'])

Fix the typo in Surnam

In [135]:
# Rename the incorrectly named 'Surnam' column to 'Surname'
df_customers.rename(columns={'Surnam': 'Surname'}, inplace=True)

Convert object columns to category type

In [138]:
# Convert selected categorical columns from object type to category type to save memory and improve performance
df_customers['Gender'] = df_customers['Gender'].astype('category')
df_customers['family_status'] = df_customers['family_status'].astype('category')
df_customers['STATE'] = df_customers['STATE'].astype('category')

In [140]:
# Display updated dataframe info to confirm the data type changes
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   user_id        206209 non-null  int64         
 1   First Name     206209 non-null  object        
 2   Surname        206209 non-null  object        
 3   Gender         206209 non-null  category      
 4   STATE          206209 non-null  category      
 5   Age            206209 non-null  int64         
 6   date_joined    206209 non-null  datetime64[ns]
 7   n_dependants   206209 non-null  int64         
 8   family_status  206209 non-null  category      
 9   income         206209 non-null  int64         
dtypes: category(3), datetime64[ns](1), int64(4), object(2)
memory usage: 11.6+ MB


# Step 4: Merge with Main DataFrame

Check if user_id exists in both DataFrames

In [144]:
# Confirm that 'user_id' exists as a column in both dataframes before attempting a merge
print('user_id in df_customers:', 'user_id' in df_customers.columns)
print('user_id in ords_prods_merge:', 'user_id' in ords_prods_merge.columns)

user_id in df_customers: True
user_id in ords_prods_merge: True


Check data types of user_id in both

In [147]:
# Print the data type of 'user_id' in both dataframes to ensure they match for merging
print('df_customers:', df_customers['user_id'].dtype)
print('ords_prods_merge:', ords_prods_merge['user_id'].dtype)

df_customers: int64
ords_prods_merge: int64


Merge the two DataFrames

In [150]:
# Merge the customer data with the main orders-products dataframe using a left join on 'user_id'
df_merged = ords_prods_merge.merge(df_customers, on='user_id', how='left')

# Step 5: Export the Combined Data

In [158]:
# Export the merged dataframe as a pickle file to the 'Prepared Data' folder for future use
df_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers.pkl'))

In [160]:
# Rename df_merged to ords_prods_cust_merge for consistency with project naming conventions
ords_prods_cust_merge = df_merged