# This script contains the following:
- Import libraries
- Load datasets
- Checking dataframes
- Wrangling customer dataframe
    - Modify column names
    - Dropping first_name column
- Data Quality and Consistency Checks
    - Outliers check
    - Mixed types check
    - Missing values check
    - Duplicate check
- Combining customer & order_products dataframe  
- Export


# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy


# Load dataframes

In [None]:
# Root folder that holds the project's data (absolute path)
path = r'/Users/macbook/Dropbox/Mac/Documents/Pro/Data Analyst/Course_Career foundry/A4_Python/2023.08_Instacart basket analysis/02_data'

# Customer table: read from the CSV in the 'original data' folder
customers_file_path = os.path.join(path, 'original data', 'customers.csv')
df_cust = pd.read_csv(customers_file_path)

# Orders/products table: read from the pickle in the 'prepared data' folder
order_prods_aggregated = os.path.join(path, 'prepared data', 'orders_products_aggregated.pkl')
df_ords = pd.read_pickle(order_prods_aggregated)

# Checking dataframes 

In [None]:
# Print a structural summary of the customer dataframe
# (column names, dtypes and non-null counts)
df_cust.info()

In [None]:
# Preview the first rows of the customer dataframe
df_cust.head()

In [None]:
# Print a structural summary of the orders & products aggregated dataframe
# (column names, dtypes and memory usage)
df_ords.info()

In [None]:
# Preview the first rows of the orders & products aggregated dataframe
df_ords.head()

# Wrangling customer dataframe

In [None]:
# Summary statistics of the customer dataframe, rounded to whole numbers
df_cust.describe().round()

In [None]:
# Re-inspect the per-column non-null counts to spot columns with missing values
df_cust.info()

### NOTE: it appears that the 'First Name' column has null values

In [None]:
# Count the missing entries in the 'First Name' column
df_cust['First Name'].isna().sum()

### NOTE: the first_name column has 11259 null values, but these can be ignored: customers are identified by surname and user_id only

## Modify column names

In [None]:
# Normalise every column label to lowercase
lowercase_labels = df_cust.columns.str.lower()
df_cust.columns = lowercase_labels

# Print the first rows to confirm the new labels
print(df_cust.head())

In [None]:
# Map the lowercased labels onto their final snake_case names
# ('surnam' is presumably the spelling used in the source file -- verify against customers.csv)
column_renames = {'first name': 'first_name', 'surnam': 'surname'}
df_cust.rename(columns=column_renames, inplace=True)

In [None]:
# Preview the first rows to verify that the columns were renamed
df_cust.head()

## Dropping first_name column


In [None]:
# Remove the 'first_name' column from the customer dataframe in place
df_cust.drop('first_name', axis=1, inplace=True)

# Print the dataframe to confirm the column is gone
print(df_cust)


# Data Quality and Consistency Checks


## Outliers check

In [None]:
# Rounded summary statistics: compare min/max against the quartiles to spot outliers
df_cust.describe().round()

In [None]:
# Boxplot of the 'income' column, one box per 'gender' value,
# used here as a visual check for outlying incomes
df_cust.boxplot(column='income', by='gender')

## Mixed types check

In [None]:

# Check for mixed types in the customers dataframe.
# A column is reported when its values are not all of one Python type.
# The check works per Series (Series.map) instead of DataFrame.applymap,
# which is deprecated in recent pandas versions, and counting distinct
# types also copes with an empty dataframe (no first row to compare with).
for col in df_cust.columns:
    value_types = df_cust[col].map(type)
    if value_types.nunique() > 1:
        print(col)


### NOTE: the check prints nothing, so there are no mixed-type issues

## Missing values check

In [None]:
# Count the missing values in each column of the customer dataframe
null_counts = df_cust.isna().sum()
null_counts

### NOTE: the null values in first_name are no longer an issue because that column was dropped from the dataframe

## Duplicate check

In [None]:
# Flag rows that repeat an earlier row in full, then collect them for display
is_duplicate = df_cust.duplicated()
duplicate_rows = df_cust.loc[is_duplicate]
duplicate_rows

### NOTE: no duplicate rows

In [None]:
# Histogram of the 'income' column using 60 bins
df_cust['income'].plot.hist(bins = 60)

# Combining customer & order_products dataframe  

In [None]:
# Preview the orders & products dataframe before merging
# (the merge key used below is 'user_id')
df_ords.head()

In [None]:
# Preview the customer dataframe before merging
# (the merge key used below is 'user_id')
df_cust.head()

In [None]:
# Join the customer attributes onto the orders & products rows by 'user_id'.
# No `how` is passed, so pandas' default join type applies; indicator=True
# adds a '_merge' column recording where each row came from.
df_all = pd.merge(df_ords, df_cust, on='user_id', indicator=True)

In [None]:
# Preview the first rows of the merged dataframe
df_all.head()

In [None]:
# Remove the unnecessary '_merge' indicator column from the merged dataframe
df_all.drop('_merge', axis=1, inplace=True)

In [None]:
# List the merged dataframe's columns to confirm '_merge' has been dropped
df_all.columns

In [None]:
# Print a structural summary of the merged dataframe before exporting it
df_all.info()

# Export dataframe

In [None]:
# Save the merged dataframe in pickle format inside the 'prepared data' folder
# (the target file name 'all_data' carries no extension)
export_path = os.path.join(path, 'prepared data', 'all_data')
df_all.to_pickle(export_path)