In [52]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn import FunctionSampler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  as imb_pipeline
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, auc, accuracy_score
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "darkgrid" theme for every figure drawn below.
sns.set_style('darkgrid')

import warnings
# NOTE(review): this silences *every* warning for the whole kernel, including
# deprecation and convergence warnings raised by the libraries used below.
# Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Global seed handed to the estimators defined later in this notebook.
random_state = 42

In [4]:
# Load the raw CSV, rename PAY_0 to PAY_1 (raising if that column is missing)
# and discard the ID column.
df = (
    pd.read_csv('../data/default-of-credit-card-clients.csv')
    .rename(columns={"PAY_0": "PAY_1"}, errors="raise")
    .drop(columns=['ID'])
)

### Preprocessing

In [1]:
# Group unknown category codes into the catch-all code: EDUCATION values that
# are not strictly between 0 and 4 become 4 ("Others"), and MARRIAGE values
# that are not strictly between 0 and 3 become 3.
df.loc[~((df["EDUCATION"] > 0) & (df["EDUCATION"] < 4)), "EDUCATION"] = 4
df.loc[~((df["MARRIAGE"] > 0) & (df["MARRIAGE"] < 3)), "MARRIAGE"] = 3

## Utils

In [8]:
def split_data(X,y, random_state=42, test_size=0.25):
    """Split the dataset into stratified train and test partitions.

    Parameters
    ----------
    X : features, in any form accepted by ``train_test_split``.
    y : labels aligned with ``X``; also used to stratify the split so both
        partitions keep the class proportions of ``y``.
    random_state : seed forwarded to ``train_test_split``.
    test_size : share (or absolute number) of samples placed in the test
        partition. Defaults to 0.25, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    return X_train, X_test, y_train, y_test


def outliers_lof(X,y, n_neighbors=20):
    """Drop the samples that LocalOutlierFactor flags as outliers.

    Both inputs are converted to NumPy arrays; the rows of ``X`` (and the
    matching entries of ``y``) predicted as inliers are returned as
    ``(X, y)``.
    """
    features = np.asarray(X)
    labels = np.asarray(y)

    detector = LocalOutlierFactor(n_neighbors=n_neighbors, n_jobs=-1)
    # Only the samples predicted as 1 (inliers) are kept.
    keep = detector.fit_predict(features) == 1

    return features[keep, :], labels[keep]

def make_tests(outliers_options,scaling,pca_options,resampling_options):
    """Enumerate every preprocessing combination to evaluate.

    Returns a list of ``[outlier, scale, pca, resampling]`` lists, one per
    element of the cartesian product of the four option sequences, with the
    resampling option varying fastest.
    """
    return [
        [outlier, scale, pca, resampler]
        for outlier in outliers_options
        for scale in scaling
        for pca in pca_options
        for resampler in resampling_options
    ]

### Pipeline

In [65]:
# Hyper-parameter grids explored by GridSearchCV, keyed by model name.
# Every parameter carries the "clf__" prefix because the estimator is
# registered as the "clf" step of the pipeline built in training_pipeline.
param_grids = {
    "rf": {
        "clf__n_estimators": [80, 100, 120],
        "clf__criterion": ["gini", "entropy"],
        "clf__max_features": ['sqrt', 'log2'],
        "clf__n_jobs": [-1],
    },
    "svc": {
        "clf__kernel": ['linear', 'rbf', 'poly'],
        "clf__gamma": ['scale', 'auto'],
        "clf__C": [0.8, 1.0, 1.2],
    },
    "knn": {
        "clf__n_neighbors": [10, 50, 75],
        "clf__weights": ["uniform", "distance"],
        "clf__algorithm": ["auto", "ball_tree", "kd_tree"],
        "clf__n_jobs": [-1],
    },
    "logisticregression": {
        "clf__penalty": ['l1', 'l2'],
        "clf__tol": [1e-1, 1e-2, 1e-3, 1e-4],
        "clf__C": [0.6, 0.8, 1.0, 1.2, 1.4],
        # 'saga' is the single solver used for both the l1 and l2 penalties.
        "clf__solver": ['saga'],
        "clf__max_iter": [500],
    },
}

# Base estimators, keyed by model name. n_neighbors=9 is only a starting
# value: the "knn" grid above overrides it during the search.
models = {
    "rf": RandomForestClassifier(random_state=random_state),
    "knn": KNeighborsClassifier(n_neighbors=9),
    "svc": SVC(),
    "logistic": LogisticRegression(random_state=random_state),
}

# The logistic-regression entries were registered under different names in
# the two dictionaries ("logisticregression" vs "logistic") and the notebook
# looks them up as "lr". Expose all three names in both dictionaries so any
# of them resolves to the same grid / estimator; the legacy keys are kept.
param_grids["lr"] = param_grids["logistic"] = param_grids["logisticregression"]
models["lr"] = models["logisticregression"] = models["logistic"]

In [80]:
def _make_resampler(resampling, random_state):
    """Return the imblearn sampler for a resampling option (None when the option is falsy)."""
    if not resampling:
        return None
    if resampling == "NearMiss":
        # NearMiss is left on its default sampling strategy.
        return NearMiss(version=3)
    if resampling == "SMOTE":
        # A float strategy is the minority/majority ratio after resampling,
        # so 0.5 yields one minority sample for every two majority samples
        # (not a 50/50 balance).
        return SMOTE(sampling_strategy=0.5, random_state=random_state)
    if resampling in ("SMOTEENN", "SMOTEEN"):
        # "SMOTEEN" is the misspelling this function used to check for; it is
        # still accepted so existing callers keep working.
        # sampling_strategy is passed by keyword rather than positionally.
        return SMOTEENN(sampling_strategy=0.77, random_state=random_state)
    raise ValueError(f"Unknown resampling option: {resampling!r}")


def training_pipeline(df, tests, model, param_grid, random_state=42):
    """Evaluate every preprocessing combination for one model.

    For each entry of ``tests`` an imblearn pipeline is assembled
    (optional LOF outlier removal, scaling, PCA and resampling, followed by
    ``model`` as the "clf" step), its hyper-parameters are tuned with
    ``GridSearchCV`` (F1 scoring, default cross-validation) on the training
    split, and the refitted best estimator is scored on the held-out test
    split.

    Parameters
    ----------
    df : DataFrame containing the features and the
        "default payment next month" target column.
    tests : sequence of ``[outliers_option, scaling, pca, resampling]``
        entries, as produced by ``make_tests``.
    model : estimator used as the final pipeline step.
    param_grid : grid for ``GridSearchCV``; keys must use the "clf__" prefix.
    random_state : seed used for the train/test split, PCA and the resamplers.

    Returns
    -------
    results : list of ``(label, f1, best_params)`` tuples, one per test, where
        ``f1`` is the test-set F1 rounded to two decimals.
    y_pred_max : test-set predictions of the combination with the highest
        test F1 (``None`` when ``tests`` is empty).
    y_test_max : the matching true labels (``None`` when ``tests`` is empty).

    Raises
    ------
    ValueError
        If a test names a resampling option that is not recognised.

    Notes
    -----
    NOTE(review): the "best" combination is picked by its test-set F1, so the
    reported best score is an optimistic estimate.
    """
    target = "default payment next month"
    y, X = df[target], df.drop(target, axis=1)
    # Honour the caller's seed instead of a hard-coded 42.
    X_train, X_test, y_train, y_test = split_data(X, y, random_state)

    results = []

    # Start from "nothing seen yet" so the outputs are always bound, even when
    # every run scores an F1 of 0.
    max_f1 = None
    y_pred_max = None
    y_test_max = None

    for counter, curr_test in enumerate(tests, start=1):
        outliers_option, scaling, pca, resampling = curr_test

        labels = []
        pipe_steps = []

        if outliers_option == "LOF":
            labels.append('LOF')
            pipe_steps.append(("outlier", FunctionSampler(func=outliers_lof)))

        if scaling:
            labels.append('SCALE')
            pipe_steps.append(("scale", StandardScaler()))

        if pca:
            labels.append('PCA')
            pipe_steps.append(("PCA", PCA(n_components=11, random_state=random_state)))

        sampler = _make_resampler(resampling, random_state)
        if sampler is not None:
            labels.append(f'{resampling}')
            pipe_steps.append((resampling, sampler))

        pipe_steps.append(('clf', model))
        pipeline = imb_pipeline(pipe_steps)

        # Human-readable description of the preprocessing chain, e.g.
        # "LOF + SCALE + SMOTE" (empty when no step is enabled).
        field_0 = ' + '.join(labels)

        gridsearch = GridSearchCV(pipeline, param_grid, scoring='f1', n_jobs=-1)
        gridsearch.fit(X_train, y_train)

        y_pred = gridsearch.best_estimator_.predict(X_test)
        f1 = round(f1_score(y_test, y_pred), 2)

        if max_f1 is None or f1 > max_f1:
            y_pred_max = y_pred
            y_test_max = y_test
            max_f1 = f1

        results.append((field_0, f1, gridsearch.best_params_))

        # The reported metric is the F1 score (the message used to say "recall").
        print(f"Done {counter}/{len(tests)} \t | f1 : {f1} | >> {field_0}")

    return results, y_pred_max, y_test_max

def plot_summary(result,y_pred, y_test):
    """Plot the per-combination F1 scores next to the best run's confusion matrix.

    Parameters
    ----------
    result : iterable of tuples whose first item is the preprocessing label
        and whose second item is the F1 score (as returned by
        ``training_pipeline``).
    y_pred : predicted labels of the best run.
    y_test : true labels matching ``y_pred``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so callers can chain ``.show()`` or
    ``.savefig(...)``.
    """
    fig = plt.figure(constrained_layout=False, figsize=(14, 4))
    grid = fig.add_gridspec(nrows=1, ncols=6, left=0.1, right=0.90, wspace=0.5, hspace=0.5)
    ax1 = fig.add_subplot(grid[0, 0:3])
    ax2 = fig.add_subplot(grid[0, 3:5])

    # x = F1 score, y = preprocessing label; best score first.
    scores = pd.DataFrame(
        [(entry[1], entry[0]) for entry in result], columns=['x', 'y']
    ).sort_values(['x'], ascending=False)

    sns.barplot(x='x', y='y', data=scores, palette='viridis', ax=ax1)
    ax1.set_xlabel("F1-score", fontsize=14)
    ax1.set_ylabel("")
    # Only the label size needs changing: barplot already labels each bar
    # with its category, and re-setting the labels by hand fails whenever
    # the number of labels differs from the number of ticks.
    ax1.tick_params(axis='y', labelsize=13)

    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        cm,
        annot=True,
        cmap='viridis',
        annot_kws={"size": 14},
        linecolor='w',
        # seaborn's documented parameter is `linewidths`; seaborn always
        # forwards its own `linewidths` to matplotlib, so passing the
        # `linewidth` alias as well is ambiguous.
        linewidths=4,
        ax=ax2,
        fmt='d',
    )

    ax2.set_xlabel("Predicted labels", fontsize=14)
    ax2.set_ylabel("True labels", fontsize=14)
    return plt

# Random Forest

In [None]:
# Preprocessing search space for the random forest: scaling always on, no PCA,
# every outlier-removal / resampling combination.
outliers_options = [None, "LOF"]
scaling = [True]
pca_options = [None]
resampling_options = [None, "NearMiss", "SMOTE", "SMOTEENN"]

tests = make_tests(
    outliers_options=outliers_options,
    scaling=scaling,
    pca_options=pca_options,
    resampling_options=resampling_options,
)

results_rf, y_pred_rf, y_test_rf = training_pipeline(
    df=df, tests=tests, model=models["rf"], param_grid=param_grids["rf"]
)

plt_rf = plot_summary(result=results_rf, y_pred=y_pred_rf, y_test=y_test_rf)
# plt_rf.savefig('rf-results.svg')
plt_rf.show()

# KNN

In [None]:
# Preprocessing search space for KNN: scaling and PCA always on, every
# outlier-removal / resampling combination.
outliers_options = [None, "LOF"]
scaling = [True]
pca_options = ["PCA"]
resampling_options = [None, "NearMiss", "SMOTE", "SMOTEENN"]

tests = make_tests(
    outliers_options=outliers_options,
    scaling=scaling,
    pca_options=pca_options,
    resampling_options=resampling_options,
)

results_knn, y_pred_knn, y_test_knn = training_pipeline(
    df=df, tests=tests, model=models["knn"], param_grid=param_grids["knn"]
)

plt_knn = plot_summary(result=results_knn, y_pred=y_pred_knn, y_test=y_test_knn)
# plt_knn.savefig('knn-results.svg')
plt_knn.show()

# Logistic Regression

In [None]:
# Preprocessing search space for logistic regression: scaling always on, no
# PCA, every outlier-removal / resampling combination.
pca_options = [None]
scaling = [True]
outliers_options = [None, "LOF"]
resampling_options = [None, "NearMiss", "SMOTE", "SMOTEENN"]

tests = make_tests(outliers_options,scaling,pca_options,resampling_options)

# The estimator is registered as "logistic" in `models` and its grid as
# "logisticregression" in `param_grids`; the previous "lr" lookups raised
# KeyError on both dictionaries.
results_lr, y_pred_lr, y_test_lr = training_pipeline(
    df, tests, models["logistic"], param_grids["logisticregression"]
)

plt_lr = plot_summary(results_lr, y_pred_lr, y_test_lr)
# plt_lr.savefig('lr-results.svg')
plt_lr.show()

# SVM

In [None]:
# Preprocessing search space for the SVM: scaling always on, no PCA, every
# outlier-removal / resampling combination.
# The stray quote in `[None"]` made this cell a SyntaxError.
# NOTE(review): `[None]` is the minimal fix; use ["PCA"] if PCA was intended.
pca_options = [None]
scaling = [True]
outliers_options = [None, "LOF"]
resampling_options = [None, "NearMiss", "SMOTE", "SMOTEENN"]

tests = make_tests(outliers_options,scaling,pca_options,resampling_options)

results_svm, y_pred_svm, y_test_svm = training_pipeline(df, tests, models["svc"], param_grids["svc"])

plt_svm = plot_summary(results_svm, y_pred_svm, y_test_svm)
# plt_svm.savefig('svm-results.svg')
plt_svm.show()