In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import pandas as pd

from src.data.quality import check_duplicates, summarize_cols, check_multicolinearity, check_missing
from src.visualizations.plots import plot_num_features_grid, plot_bin_features_grid

# Set directory paths
# The project root is given relative to the kernel's working directory; the
# data-loading cell joins it with 'data/raw/raw_data.csv'.
# NOTE(review): assumes the notebook is run from a folder one level below the
# project root — confirm.
project_root_dir = Path('../')

# Data Quality

In [None]:
# Load the raw dataset from the project's data/raw folder
df_raw = pd.read_csv(project_root_dir.joinpath('data/raw/raw_data.csv'))

# Report the table dimensions, then preview the first rows as the cell output
print(f"Number of rows:{len(df_raw)}")
print(f"Number of columns:{len(df_raw.columns)}")
df_raw.head()

In [None]:
# Run the three project data-quality helpers on the raw frame, in order.
# NOTE(review): Jupyter only renders the value of the LAST expression in a
# cell, so the first two calls are visible only if the helpers print/display
# their findings themselves — confirm against src.data.quality.

# Check for missing values
check_missing(df_raw)

# Check for duplicates in CustomerId
check_duplicates(df_raw, 'CustomerId')

# Automate summary for all columns
summarize_cols(df_raw)

**Summary**
- 10,000 rows, each representing a unique customer, with no missing values
- Identification Features: RowNumber, CustomerId
- Numerical Features:
    - Continuous: CreditScore (350-850), Balance (0-250898.09), EstimatedSalary (11.58-199992.48)
    - Discrete: Age (18-92), Tenure (0-10), NumOfProducts (1-4)
- Categorical Features: Surname (2932), Geography (3), Gender (2), HasCrCard (2), IsActiveMember (2), Exited (2)

# Data Exploration

In [None]:
# Split the columns of interest into three groups: the target label,
# the numerical (not binned) features, and the binned features.
target = 'Exited'

num_features = [
    'CreditScore',
    'Age',
    'Balance',
    'EstimatedSalary',
]

bin_features = [
    'Geography',
    'Gender',
    'Tenure',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
]

In [None]:
# Numerical features
# Pass the `target` variable defined with the feature lists instead of
# repeating the column name, so the label lives in exactly one place.
plot_num_features_grid(df_raw, num_features, target=target)

In [None]:
# Binned features
# Use the `target` variable defined with the feature lists rather than a
# second hard-coded copy of the column name.
plot_bin_features_grid(df_raw, bin_features, target)

**Summary**
- *CreditScore* 
    - Right-censored normal distribution, most likely because credit scores are capped at 850.
    - Differences by churn status are minimal, as evidenced by the plot and the effect size.
- *Age* 
    - Right-skewed normal distribution with spikes at ages that are multiples of 10, possibly due to rounding during data entry.
    - The distribution of churners is shifted toward a higher age group.
- *Balance*
    - Zero-inflated normal distribution.
    - Differences by churn status are minimal, except for a high concentration of non-churners with zero balance.
- *EstimatedSalary*
    - Uniformly distributed.
    - Essentially no difference when stratified by churn status.
- *Geography*
    - 2:1:1 ratio between customers from France, Germany, and Spain, respectively.
    - Customers from Germany are twice as likely to churn compared to those from France and Spain.
- *Tenure*
    - Essentially evenly distributed, with a drop at years 0 and 10 for both the general and the churned populations.
- *NumOfProducts*
    - Majority of customers split between 1 or 2 products.
    - Churn rate varies widely: 3+ products have an extremely high churn rate, 1 product has an above-average churn rate, and 2 products have a very low churn rate.
- *HasCrCard*
    - Approximately 70% of customers have a credit card.
    - Customers with and without a credit card have an equal proportion of churners.
- *IsActiveMember*
    - Slightly more active members than inactive members.
    - Inactive members are slightly more likely to churn.

In [None]:
# Multicollinearity check over both feature groups (findings are summarised
# in the markdown below). The label comes from the `target` variable defined
# with the feature lists rather than a repeated string literal.
check_multicolinearity(df_raw, num_features, bin_features, target)

**Summary**
- There is a small amount of pairwise correlation between Balance and NumOfProducts, as well as between Balance and Geography.
- The VIF table shows that most features have no, or only a negligible amount of, multicollinearity with the other features.