# 4.9 IC Intro to Data Visualisation with Python - Task P 1

## Content  

1. **Importing Libraries**
2. **Importing Data Frames**
3. **Data Wrangling**
4. **Data Quality Checks**  
   4.1 Missing Values  
   4.2 Duplicates  
   4.3 Checking and Changing Data Types  
5. **Merging the Data Frames**
6. **Saving the New Data Frame**

## 1. Importing Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 2. Importing Data Frames

In [2]:
# Root folder of the project; the data paths in this notebook are built from it
# with os.path.join.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
path = r'/home/justem/CF - Data Analyst/Achievement 4/02-2025 Instacart Basket Analysis'

In [3]:
# Load the raw customers table from the 'Original Data' folder
customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_customers = pd.read_csv(customers_csv)

In [4]:
# Load the latest prepared data set (pickled frame in the 'Prepared Data' folder)
prepared_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'df_ords_prods_merge_agg_variables.pkl')
df = pd.read_pickle(prepared_pkl)

## 3. Data Wrangling

In [5]:
# Dimensions (rows, columns) of the customers frame
df_customers.shape

(206209, 10)

In [6]:
# Preview the first rows to see the raw column names and value formats
df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [7]:
# Harmonise the column names: lower-case, more descriptive labels
column_renames = {
    'Gender': 'gender',
    'STATE': 'state',
    'Age': 'age',
    'n_dependants': 'num_dependants',
    'fam_status': 'family_status',
}
df_customers = df_customers.rename(columns=column_renames)

In [8]:
# Remove the customer name columns for privacy and compliance
name_columns = ['First Name', 'Surnam']
df_customers = df_customers.drop(columns=name_columns)

## 4. Data Quality Checks

### 4.1 Missing Values

In [9]:
# Count the missing values in every column
df_customers.isna().sum()

user_id           0
gender            0
state             0
age               0
date_joined       0
num_dependants    0
family_status     0
income            0
dtype: int64

There are no missing values in this data set.

### 4.2 Duplicates

In [10]:
# Locate fully duplicated rows and report how many were found, so the
# conclusion drawn from this check is backed by visible output.
df_dups = df_customers[df_customers.duplicated()]
print(f"Fully duplicated rows: {len(df_dups)}")

# user_id is the key of the later merge, so it is also checked on its own:
# a repeated user_id with differing attributes is not caught by the row check.
print(f"Repeated user_id values: {df_customers['user_id'].duplicated().sum()}")

The dataframe has no duplicates.

### 4.3 Checking and Changing Data Types

In [11]:
# Check for mixed types: a column is flagged when its values are instances
# of more than one Python type.
mixed_columns = [
    name
    for name in df_customers.columns
    if df_customers[name].map(type).nunique() > 1
]
for name in mixed_columns:
    print(f"Mixed types found in column: {name}")

In [12]:
# user_id is an identifier rather than a quantity, so it is stored as a string
df_customers = df_customers.astype({'user_id': 'str'})

## 5. Merging the Data Frames

In [None]:
# Make sure key columns are the same data type

In [13]:
# Cast the join key of the large frame to string so that it has the same data
# type as the customers key, which was cast to string earlier in this notebook.
df['user_id'] = df['user_id'].astype('str')

In [14]:
# Remove an existing '_merge' column, if there is one, so that the indicator
# column created by the later merge does not collide with it.
if '_merge' in df.columns:
    df = df.drop(columns='_merge')

In [15]:
# Inspect the column data types of df before the conversions that follow
df.dtypes

order_id                     int64
user_id                     object
order_number                 int64
orders_day_of_week           int64
order_hour_of_day            int64
days_since_prior_order     float64
product_id                   int64
add_to_cart_order            int64
reordered                    int64
product_name                object
aisle_id                     int64
department_id                int64
prices                     float64
max_order                    int64
loyalty_flag                object
mean_spending              float64
spending_flag               object
median_order_prior_days    float64
order_frequency_flag        object
dtype: object

In [16]:
# Inspect the column data types of df_customers before the conversions that follow
df_customers.dtypes

user_id           object
gender            object
state             object
age                int64
date_joined       object
num_dependants     int64
family_status     object
income             int64
dtype: object

The data types had to be converted to smaller ones because my PC could not process the merge, even with a larger swap space.

In [17]:
# Converting int64 to smaller ints
def _downcast_ints(frame, dtype_map):
    """Return *frame* with the integer columns in *dtype_map* cast to smaller types.

    A plain ``astype`` from int64 to a narrower integer type wraps
    out-of-range values around silently, so every column is range-checked
    before the cast.

    Args:
        frame: DataFrame holding the columns named in *dtype_map*.
        dtype_map: Mapping of column name to target integer dtype name.

    Returns:
        A new DataFrame in which the requested columns have the target dtypes.

    Raises:
        ValueError: If a column holds a value outside its target dtype's range.
    """
    for column, target in dtype_map.items():
        limits = np.iinfo(target)
        lowest, highest = frame[column].min(), frame[column].max()
        if lowest < limits.min or highest > limits.max:
            raise ValueError(
                f"Column '{column}' has values in [{lowest}, {highest}], "
                f"which do not fit into {target}"
            )
    return frame.astype(dtype_map)


df = _downcast_ints(df, {
    'order_id': 'int32',
    'order_number': 'int16',
    'orders_day_of_week': 'int8',
    'order_hour_of_day': 'int8',
    'product_id': 'int32',
    'add_to_cart_order': 'int16',
    'reordered': 'int8',
    'aisle_id': 'int16',
    'department_id': 'int8',
    'max_order': 'int16',
})

df_customers = _downcast_ints(df_customers, {
    'age': 'int8',
    'num_dependants': 'int8',
    'income': 'int32',
})

In [18]:
# Store the float columns as float32 instead of float64
float_columns = [
    'days_since_prior_order',
    'prices',
    'mean_spending',
    'median_order_prior_days',
]
df = df.astype(dict.fromkeys(float_columns, 'float32'))

In [19]:
# Store the object (text) columns of both frames as category
# NOTE(review): user_id becomes a separate categorical in each frame; confirm
# that the merge keeps the intended key dtype.
df_category_columns = [
    'user_id',
    'product_name',
    'loyalty_flag',
    'spending_flag',
    'order_frequency_flag',
]
df = df.astype(dict.fromkeys(df_category_columns, 'category'))

customer_category_columns = [
    'user_id',
    'gender',
    'state',
    'family_status',
]
df_customers = df_customers.astype(dict.fromkeys(customer_category_columns, 'category'))

In [20]:
# Converting date_joined to datetime.
# The raw values are month/day/year strings such as '1/1/2017'. Passing the
# format explicitly avoids relying on format inference and makes any value
# that deviates from it raise instead of being parsed ambiguously.
df_customers['date_joined'] = pd.to_datetime(df_customers['date_joined'], format='%m/%d/%Y')

In [21]:
# Merging both dataframes with a left join: every row of df is kept and the
# customer attributes are attached via user_id.
# validate='many_to_one' makes pandas raise a MergeError if a user_id occurs
# more than once in df_customers, which would otherwise silently duplicate
# rows of df. indicator=True adds the '_merge' column with the match source.
df_ords_prods_customer_merge = df.merge(
    df_customers,
    on='user_id',
    how='left',
    indicator=True,
    validate='many_to_one',
)

In [17]:
# Preview the merged frame, including the customer columns and the _merge flag
df_ords_prods_customer_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,median_order_prior_days,order_frequency_flag,gender,state,age,date_joined,num_dependants,family_status,income,_merge
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
1,2539329,1,1,2,8,0.0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
2,2539329,1,1,2,8,0.0,12427,3,0,Original Beef Jerky,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
3,2539329,1,1,2,8,0.0,26088,4,0,Aged White Cheddar Popcorn,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both
4,2539329,1,1,2,8,0.0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,...,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,both


In [22]:
# Dimensions (rows, columns) of the merged frame
df_ords_prods_customer_merge.shape

(32404859, 27)

## 6. Saving the new Data Frame

In [19]:
# Persist the merged frame as a pickle in the 'Prepared Data' folder
final_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'df_final_data.pkl')
df_ords_prods_customer_merge.to_pickle(final_pkl)