In [1]:
#numpy,pandas,scipy, math, matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from math import sqrt

#model metrics
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import classification_report

#cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


# Estimators
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

# RFE 
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

#Hot encoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
##################################################
##         Import and Prepare the Data          ##
##################################################

rawDefaultCredit= pd.read_csv('dataset/default of credit card clients.csv', header =1)

# Data and Text Cleaning

# ID: represents the number of the observation and has no value, needs to get removed
rawDefaultCredit= rawDefaultCredit.drop("ID", axis=1)

# PAY_0 should be renamed PAY_1
# (the inplace parameter will change the dataframe without assignment)
rawDefaultCredit.rename(columns={"PAY_0": "PAY_1"}, inplace=True)

# Default Payment next month (not standard name needs to get renamed)
rawDefaultCredit.rename(columns={"default payment next month": "DEFAULT"}, inplace=True)

# Replace 4, 5, 6 to 0 to unify others to one unique value
rawDefaultCredit['EDUCATION'].replace([0, 5, 6], [4, 4, 4], inplace=True)

rawDefaultCredit.to_csv('dataset/defaultCreditCardClients.csv')

defaultCredit = rawDefaultCredit


In [3]:
defaultCredit['DEFAULT'] = defaultCredit['DEFAULT'].astype('category')
defaultCredit['SEX'] = defaultCredit['SEX'].astype('category')
defaultCredit['EDUCATION'] = defaultCredit['EDUCATION'].astype('category')
defaultCredit['MARRIAGE'] = defaultCredit['MARRIAGE'].astype('category')
defaultCredit['PAY_1'] = defaultCredit['PAY_1'].astype('category')
defaultCredit['PAY_2'] = defaultCredit['PAY_2'].astype('category')
defaultCredit['PAY_3'] = defaultCredit['PAY_3'].astype('category')
defaultCredit['PAY_4'] = defaultCredit['PAY_4'].astype('category')
defaultCredit['PAY_5'] = defaultCredit['PAY_5'].astype('category')
defaultCredit['PAY_6'] = defaultCredit['PAY_6'].astype('category')

In [4]:
#One hot Encoding
df = defaultCredit
#X = allFeatures
cat_columns = ['SEX', 'EDUCATION', 'MARRIAGE']

In [5]:
# use when different features need different preprocessing
from sklearn.compose import make_column_transformer
col_transformer = make_column_transformer(
        (OneHotEncoder(drop='first'), cat_columns),
        remainder='passthrough')

X_ohe = col_transformer.fit_transform(defaultCredit)
X_ohe

array([[1.0, 1.0, 0.0, ..., 0, 0, 1],
       [1.0, 1.0, 0.0, ..., 0, 2000, 1],
       [1.0, 1.0, 0.0, ..., 1000, 5000, 0],
       ...,
       [0.0, 1.0, 0.0, ..., 2000, 3100, 1],
       [0.0, 0.0, 1.0, ..., 52964, 1804, 1],
       [0.0, 1.0, 0.0, ..., 1000, 1000, 1]], dtype=object)

In [6]:
#Dependent Variable Training Set (y Training)
depVar = defaultCredit['DEFAULT']

#Training Data is divided into two parts: X-train and y_train
#Testing data follows the same rules and contains two sets: X_test and y_test (ground truth)
X_train, X_test, y_train, y_test = train_test_split(X_ohe, depVar, test_size=0.25, random_state=123)

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Classification Report", "Cross validation", "Score", "LogLoss"]
log = pd.DataFrame(columns=log_cols)


classifiers = [
    GradientBoostingClassifier(max_depth=2),      #82.2267%
    AdaBoostClassifier(),              #81.8000%
    RandomForestClassifier(max_depth=2, n_estimators=50, max_features="auto"),  #81.5600%
    LinearDiscriminantAnalysis(),                 #81.2267%
    KNeighborsClassifier(n_neighbors=2),          #76.4400%
    DecisionTreeClassifier(max_depth=2),           #72.6000%
#    SVC(gamma=2, C=1, probability=True),  #78.2533% Takes a long time to run
]

for clf in classifiers:
    print("="*60)
    name = clf.__class__.__name__
    print(name)
    clf.fit(X_train, y_train)
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
     
    class_rep = classification_report(y_test, train_predictions )
    print("Classification Report:\n {}".format(class_rep))
     
    cross_val = cross_val_score(clf, X_train, y_train)
    print("Cross Validation: {}".format(cross_val))
    
    #score
    score = clf.score(X_train,y_train)
    print("Score: {}".format(score))
    
    #Predictions
    train_predictions_proba = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions_proba)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, class_rep, cross_val, score, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*60)

GradientBoostingClassifier
Accuracy: 100.0000%
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5873
           1       1.00      1.00      1.00      1627

    accuracy                           1.00      7500
   macro avg       1.00      1.00      1.00      7500
weighted avg       1.00      1.00      1.00      7500

Cross Validation: [1. 1. 1. 1. 1.]
Score: 1.0
Log Loss: 1.4877390396612129e-05
AdaBoostClassifier
Accuracy: 100.0000%
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5873
           1       1.00      1.00      1.00      1627

    accuracy                           1.00      7500
   macro avg       1.00      1.00      1.00      7500
weighted avg       1.00      1.00      1.00      7500

Cross Validation: [1. 1. 1. 1. 1.]
Score: 1.0
Log Loss: 9.992007221626413e-16
RandomForestClassifier
Accuracy: 89.8667%
Classificatio