In [7]:
#Import libraries 
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 1. Importing Customer Dataset 

In [9]:
#Import dataset 
# Project folder path 
path = r'C:\Users\hp\08-2024 Instacart Basket Analysis'
# Import customers dataset
customer = pd.read_csv(os.path.join(path,'Data', 'Original Data', 'customers.csv'))

In [7]:
customer.shape

(206209, 10)

In [9]:
customer.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [11]:
customer.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [15]:
customer.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

## 1.1 Data Wrangling 

In [27]:
# Renaming columns
customer = customer.rename(columns={ 
    'First Name' : 'First_Name',
    'Surnam' : 'Last_Name', 
    'STATE' : 'State', 
    'date_joined' : 'Date_Joined', 
    'n_dependants' : 'Number_of_Dependents', 
    'fam_status' : 'Family Status', 
    'income' : 'Income'})

In [29]:
customer.head()

Unnamed: 0,user_id,First_Name,Last_Name,Gender,State,Age,Date_Joined,Number_of_Dependents,Family Status,Income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [31]:
# Convert `user_id` to string 
customer['user_id'] = customer['user_id'].astype(str)

## 1.2. Mixed Column 

In [29]:
#Check for mixed column 
for col in customer.columns.tolist():
  weird = (customer[[col]].map(type) != customer[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (customer[weird]) > 0:
    print (col)

First_Name


#### Insights : 
The fact that First_Name is printed means that it contains elements of different data types.

In [34]:
#Identify the Problematic Entries:
weird2 = (customer['First_Name'].map(type) != customer['First_Name'].iloc[0].apply(type))
print(customer[weird])

AttributeError: 'str' object has no attribute 'apply'

In [36]:
first_type = type(customer['First_Name'].iloc[0])

In [38]:
weird = customer['First_Name'].map(type) != first_type

In [40]:
print(customer[weird])

       user_id First_Name Last_Name  Gender         State  Age Date_Joined  \
53       76659        NaN   Gilbert    Male      Colorado   26    1/1/2017   
73       13738        NaN     Frost  Female     Louisiana   39    1/1/2017   
82       89996        NaN    Dawson  Female        Oregon   52    1/1/2017   
99       96166        NaN   Oconnor    Male      Oklahoma   51    1/1/2017   
105      29778        NaN    Dawson  Female          Utah   63    1/1/2017   
...        ...        ...       ...     ...           ...  ...         ...   
206038  121317        NaN    Melton    Male  Pennsylvania   28   3/31/2020   
206044  200799        NaN  Copeland  Female        Hawaii   52    4/1/2020   
206090  167394        NaN     Frost  Female        Hawaii   61    4/1/2020   
206162  187532        NaN     Floyd  Female    California   39    4/1/2020   
206171  116898        NaN   Delgado  Female      Colorado   23    4/1/2020   

        Number_of_Dependents Family Status  Income  
53        

##### Mostlikely, it is about missing values. NaN (Not a Number) is not considered a string (str).
We need further investigations 

# 1.3 Missing Values

In [43]:
# finding missing values
missing_values = customer.isnull().sum()
missing_values

user_id                     0
First_Name              11259
Last_Name                   0
Gender                      0
State                       0
Age                         0
Date_Joined                 0
Number_of_Dependents        0
Family Status               0
Income                      0
dtype: int64

#### the only column with missing values is the "First_Name" column, and it’s missing 11259 values.
We have user_id and Last_Name informations, so I will not change the data for now. 

# 1.4. Duplicates

In [52]:
# Looking for full duplicates
customer_dups = customer[customer.duplicated()]
customer_dups

Unnamed: 0,user_id,First_Name,Last_Name,Gender,State,Age,Date_Joined,Number_of_Dependents,Family Status,Income


In [54]:
# Find duplicate rows based on specific columns
duplicates = customer[customer.duplicated(subset=['First_Name', 'Last_Name'])]

#### No duplicates found. 

# 2. Combining Dataframes

In [11]:
#Importing dataframe
ords_prods_merge = pd.read_pickle(os.path.join(path,'Data', 'Prepared Data', 'ords_prods_aggregated.pkl'))

In [35]:
# Convert `user_id` to string 
ords_prods_merge['user_id'] = ords_prods_merge['user_id'].astype(str)

### I encountered a significant issue when attempting to merge the two datasets, as I ran into a memory error. To resolve this, I decided to switch browsers and optimize the data types in the datasets before performing the merge

In [71]:
ords_prods_merge.dtypes

product_id                   int64
product_name                object
aisle_id                     int64
department_id                int64
prices                     float64
order_id                     int64
user_id                     object
order_number                 int64
order_dow                    int64
order_hour_of_day            int64
days_since_prior_order     float64
add_to_cart_order            int64
reordered                    int64
_merge                    category
price_range_loc             object
busiest day                 object
busiest days                object
busiest_period_of_day       object
max_order                    int64
loyalty_flag                object
average_spend              float64
spender_flag                object
Customer_frequency         float64
frequency_flag              object
dtype: object

In [62]:
pd.set_option('display.max_columns', None)

In [None]:
# Drop the '_merge' column
ords_prods_merge = ords_prods_merge.drop(columns=['_merge'])

In [13]:
# 1. Convert all 'object' and 'category' columns to 'category' dtype
ords_prods_merge[ords_prods_merge.select_dtypes(include=['object', 'category']).columns] = ords_prods_merge.select_dtypes(include=['object', 'category']).apply(lambda x: x.astype('category'))

# 2. Convert 'int64' columns to 'int32' if within range
ords_prods_merge[ords_prods_merge.select_dtypes(include=['int64']).columns] = ords_prods_merge.select_dtypes(include=['int64']).apply(lambda x: x.astype('int32') if x.min() > np.iinfo(np.int32).min and x.max() < np.iinfo(np.int32).max else x)

# 3. Convert all 'float64' columns to 'float32'
ords_prods_merge[ords_prods_merge.select_dtypes(include=['float64']).columns] = ords_prods_merge.select_dtypes(include=['float64']).astype('float32')

In [64]:
#Verifying the changes 
ords_prods_merge.dtypes

product_id                   int32
product_name              category
aisle_id                     int32
department_id                int32
prices                     float32
order_id                     int32
user_id                     object
order_number                 int32
order_dow                    int32
order_hour_of_day            int32
days_since_prior_order     float32
add_to_cart_order            int32
reordered                    int32
price_range_loc           category
busiest day               category
busiest days              category
busiest_period_of_day     category
max_order                    int32
loyalty_flag              category
average_spend              float32
spender_flag              category
Customer_frequency         float32
frequency_flag            category
dtype: object

### Combining 'Customers' and 'orders products merge' datasets 

In [39]:
#Combining datasets 
df_merged = customer.merge(ords_prods_merge, on = 'user_id', indicator = True)

In [41]:
df_merged.head()

Unnamed: 0,user_id,First_Name,Last_Name,Gender,State,Age,Date_Joined,Number_of_Dependents,Family Status,Income,...,busiest day,busiest days,busiest_period_of_day,max_order,loyalty_flag,average_spend,spender_flag,Customer_frequency,frequency_flag,_merge
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regularly busy,Busiest days,Average orders,8,New customer,7.988889,Low_spender,19.0,Regular customer,both
1,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regularly busy,Regularly busy,Most orders,8,New customer,7.988889,Low_spender,19.0,Regular customer,both
2,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regularly busy,Busiest days,Most orders,8,New customer,7.988889,Low_spender,19.0,Regular customer,both
3,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regularly busy,Regularly busy,Average orders,8,New customer,7.988889,Low_spender,19.0,Regular customer,both
4,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regularly busy,Slowest days,Most orders,8,New customer,7.988889,Low_spender,19.0,Regular customer,both


In [44]:
df_merged.shape

(32404161, 33)

# 3. Exporting dataset 

In [68]:
#exporting as a pickle
df_merged.to_pickle(os.path.join(path, 'Data','Prepared Data', 'ords_prods_cust_merge.pkl'))