Import libraries

In [17]:
## main lib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from imblearn.over_sampling import SMOTE
from PIL import Image

## sklearn -- preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline #, FeatureUnion
# from sklearn_features.transformers import DataFrameSelector
from sklearn.compose import ColumnTransformer

## sklearn -- models
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression

## sklearn -- metrics
from sklearn.metrics import f1_score, confusion_matrix

# Data Preparation

Read dataset

In [18]:
# Build the dataset path relative to the current working directory and load
# the CSV into a DataFrame.
TRAIN_PATH = os.path.join(os.getcwd(), 'dataset.csv')
df = pd.read_csv(TRAIN_PATH)

Drop first 3 features

In [19]:
# Remove the three leading identifier-like columns.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

Filter out rows whose Age exceeds a threshold (80)

In [20]:
# Drop, in place, every row whose Age is above 80 (rows are matched by index label).
df.drop(index=df.index[df['Age'] > 80], inplace=True)

Split into features and target

In [21]:
# 'Exited' is the target; every remaining column is a feature.
y = df['Exited']
X = df.drop(columns='Exited')

Split into train and test sets

In [22]:
# Stratified 80/20 split with a fixed seed: the class ratio of `y` is kept in
# both parts and the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=45,
)

# Data processing

Define the column groups (numeric columns, categorical columns, and ready-to-use columns)

In [23]:
# Columns that get median imputation + scaling.
num_cols = ['Age', 'CreditScore', 'Balance', 'EstimatedSalary']
# Columns that get most-frequent imputation + one-hot encoding.
categ_cols = ['Gender', 'Geography']
# Every remaining column, kept in the DataFrame's own column order.
# A set difference would return these names in an order that can change
# between kernel sessions (string hashing is randomised), which would shuffle
# the columns of the transformed feature matrix from run to run.
ready_cols = [
    col for col in X_train.columns.tolist()
    if col not in num_cols and col not in categ_cols
]

Pipeline for Numerical columns

In [24]:
# Numeric features: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

Pipeline for Categorical columns

In [25]:
# Categorical features: fill missing values with the most frequent category,
# then one-hot encode as a dense array, dropping the first level of each feature.
categ_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False)),
])

Pipeline for Ready columns

In [26]:
# Remaining ("ready") features: only impute missing values with the most
# frequent value; no scaling or encoding is applied.
ready_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

Combine all Pipelines

In [27]:
# Route each column group through its own sub-pipeline; the outputs are
# concatenated in the order listed here (numerical, categorical, ready).
all_pipeline = ColumnTransformer([
    ('numerical', num_pipeline, num_cols),
    ('categorical', categ_pipeline, categ_cols),
    ('ready', ready_pipeline, ready_cols),
])

In [28]:
# Display the assembled ColumnTransformer as the cell output.
all_pipeline

Apply the pipeline

In [29]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split, so that no statistics computed from
# the test data are used during fitting.
X_train_final = all_pipeline.fit_transform(X_train)
X_test_final = all_pipeline.transform(X_test)

In [30]:
# Sanity check: (rows, columns) of the transformed training matrix.
X_train_final.shape

(7990, 11)

In [31]:
# Sanity check: (rows, columns) of the transformed test matrix.
X_test_final.shape

(1998, 11)

# Handling Class Imbalance

1. Use the algorithm without accounting for class imbalance

2. Prepare class weights to handle the imbalanced dataset

In [45]:
# Complement of each class's share of the training labels:
# the rarer a class is, the larger its value.
vals_count = 1.0 - np.bincount(y_train) / len(y_train)
vals_count

array([0.20387985, 0.79612015])

In [46]:
# Rescale so the values sum to 1.
vals_count = vals_count / vals_count.sum()
vals_count

array([0.20387985, 0.79612015])

In [47]:
# Map each class label (its position in `vals_count`) to its weight.
# Built from the array itself rather than a hard-coded range(2), so the
# mapping always covers exactly the classes present in `vals_count`.
dict_weights = dict(enumerate(vals_count))
dict_weights

{0: np.float64(0.20387984981226537), 1: np.float64(0.7961201501877346)}

3. Use SMOTE for oversampling

In [49]:
# Oversample the minority class until it reaches 70% of the majority class size.
# random_state is fixed (same seed as the split and the model) so the synthetic
# samples, and therefore every downstream score, are reproducible across runs.
over = SMOTE(sampling_strategy=0.7, random_state=45)
# NOTE: `X_train_resmapled` keeps its original spelling because other cells
# may reference the variable under this name.
X_train_resmapled, y_train_resampled = over.fit_resample(X_train_final, y_train)

# Modeling 

In [None]:
def train_model(X_train, y_train, plot_name='', class_weight=None):
    """Train a random forest, then save its confusion-matrix plot and F1 scores.

    The classifier is fitted on the given training data and evaluated on the
    notebook-level globals ``X_test_final`` and ``y_test``, which must already
    be defined when this function is called.

    Parameters
    ----------
    X_train : array-like
        Training features passed to ``RandomForestClassifier.fit``.
    y_train : array-like
        Training labels.
    plot_name : str, default ''
        Used as the plot title, as the stem of the saved ``<plot_name>.png``
        file, and as a label in ``metrics.txt``.
    class_weight : optional
        Forwarded unchanged to ``RandomForestClassifier(class_weight=...)``.

    Returns
    -------
    bool
        Always ``True``.

    Side effects
    ------------
    * Sets the global ``clf_name`` to the classifier's class name.
    * Writes ``<plot_name>.png`` relative to the current working directory.
    * Appends a block of scores to ``metrics.txt``.
    """

    # Kept as a global so code outside this function that reads `clf_name`
    # continues to work.
    global clf_name

    clf = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=45, class_weight=class_weight)
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test_final)

    ## Using f1_score
    score_train = f1_score(y_train, y_pred_train)
    score_test = f1_score(y_test, y_pred_test)

    clf_name = clf.__class__.__name__

    ## Plot the confusion matrix
    # Rows are the true labels and columns the predicted labels.
    conf_mat = confusion_matrix(y_test, y_pred_test)
    tick_positions = np.arange(2) + 0.5  # centres of the two heatmap cells

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        # fmt='d': the cells hold integer counts, so show them without decimals.
        sns.heatmap(conf_mat, annot=True, cbar=False, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'{plot_name}')
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')
        ax.set_xticks(tick_positions)
        ax.set_xticklabels([False, True])
        ax.set_yticks(tick_positions)
        ax.set_yticklabels([False, True])

        ## Save the plot locally
        fig.savefig(f'{plot_name}.png', bbox_inches='tight', dpi=300)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

    ## Write scores to a file
    with open('metrics.txt', 'a') as f:
        f.write(f'{clf_name} {plot_name}\n')
        f.write(f"F1-score of Training is: {score_train*100:.2f} %\n")
        f.write(f"F1-Score of Validation is: {score_test*100:.2f} %\n")
        f.write('----'*10 + '\n')

    return True