In [5]:
# Importing all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Reading the zomato.csv dataset
df=pd.read_csv('EDA_FE.csv')

In [7]:
# First five rows of dataframe
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [8]:
# Shape of the dataframe
df.shape

(46728, 24)

## Pipeline

In [9]:
X = df.drop(['default.payment.next.month'],axis=1)
y = df['default.payment.next.month']

In [10]:
# Handling Feature Scaling
from sklearn.preprocessing import StandardScaler 
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [11]:
# Define which columns should be ordinal-encoded and which should be scaled
numerical_cols = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
                    'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

In [12]:
## Numerical Pipeline with Outlier Handling
num_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

In [13]:
# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('scaler',StandardScaler())
    ]
)

In [14]:
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols)
])

## Model

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, classification_report, roc_curve, auc


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify = y)

In [17]:
preprocessor.fit_transform(X_train)

array([[ 1.65281841, -0.26985567,  2.12954382, ...,  0.06095606,
        -0.75289628, -0.71814273],
       [ 1.65281841,  0.52499661, -0.68299927, ..., -1.61967035,
        -1.61327459, -1.56767586],
       [-0.82421858,  0.86564759, -0.67421769, ..., -0.77935715,
         0.96786032, -0.71814273],
       ...,
       [ 0.85377422,  0.18434564, -0.1840693 , ...,  0.06095606,
         0.10748202,  0.13139041],
       [-0.82421858, -1.51890925, -0.04244758, ...,  0.06095606,
         0.10748202,  0.13139041],
       [ 0.74205985, -0.04275501, -0.54250759, ..., -0.77935715,
         0.10748202,  0.13139041]])

In [18]:
preprocessor.transform(X_test)

array([[ 0.21453887, -0.26985567, -0.2641655 , ...,  0.06095606,
         0.10748202, -0.71814273],
       [ 0.29444329,  1.43339922,  1.30498737, ...,  0.06095606,
         0.10748202,  0.13139041],
       [-0.10507881,  0.41144629, -0.68324434, ..., -0.77935715,
        -0.75289628,  0.13139041],
       ...,
       [-0.58450532, -1.51890925,  0.35699864, ...,  0.06095606,
         0.10748202,  0.13139041],
       [ 0.69396538, -0.26985567,  2.57449089, ...,  0.06095606,
         0.10748202,  0.13139041],
       [-0.98402742, -1.06470794, -0.29999979, ...,  0.90126926,
         0.96786032,  1.83045667]])

In [19]:
preprocessor.get_feature_names_out()

array(['num_pipeline__LIMIT_BAL', 'num_pipeline__AGE',
       'num_pipeline__BILL_AMT1', 'num_pipeline__BILL_AMT2',
       'num_pipeline__BILL_AMT3', 'num_pipeline__BILL_AMT4',
       'num_pipeline__BILL_AMT5', 'num_pipeline__BILL_AMT6',
       'num_pipeline__PAY_AMT1', 'num_pipeline__PAY_AMT2',
       'num_pipeline__PAY_AMT3', 'num_pipeline__PAY_AMT4',
       'num_pipeline__PAY_AMT5', 'num_pipeline__PAY_AMT6',
       'cat_pipeline__SEX', 'cat_pipeline__EDUCATION',
       'cat_pipeline__MARRIAGE', 'cat_pipeline__PAY_0',
       'cat_pipeline__PAY_2', 'cat_pipeline__PAY_3',
       'cat_pipeline__PAY_4', 'cat_pipeline__PAY_5',
       'cat_pipeline__PAY_6'], dtype=object)

In [20]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [21]:
X_train

Unnamed: 0,num_pipeline__LIMIT_BAL,num_pipeline__AGE,num_pipeline__BILL_AMT1,num_pipeline__BILL_AMT2,num_pipeline__BILL_AMT3,num_pipeline__BILL_AMT4,num_pipeline__BILL_AMT5,num_pipeline__BILL_AMT6,num_pipeline__PAY_AMT1,num_pipeline__PAY_AMT2,...,num_pipeline__PAY_AMT6,cat_pipeline__SEX,cat_pipeline__EDUCATION,cat_pipeline__MARRIAGE,cat_pipeline__PAY_0,cat_pipeline__PAY_2,cat_pipeline__PAY_3,cat_pipeline__PAY_4,cat_pipeline__PAY_5,cat_pipeline__PAY_6
0,1.652818,-0.269856,2.129544,2.286463,2.206927,2.710843,-0.615998,-0.650106,0.382556,1.137645,...,-0.284153,-1.016588,-1.088172,1.049648,-0.156357,-0.038007,0.008190,0.060956,-0.752896,-0.718143
1,1.652818,0.524997,-0.682999,-0.678517,-0.675469,-0.664687,-0.656936,-0.650106,-0.333527,-0.277491,...,-0.284153,-1.016588,-1.088172,-0.893200,-1.941574,-1.688327,-1.662114,-1.619670,-1.613275,-1.567676
2,-0.824219,0.865648,-0.674218,-0.674509,-0.669799,-0.658174,-0.652056,-0.642054,-0.332661,-0.255454,...,-0.240513,-1.016588,-1.088172,-0.893200,-1.048965,0.787153,-0.826962,-0.779357,0.967860,-0.718143
3,0.693965,-0.383406,-0.683244,-0.679974,-0.675469,-0.667065,-0.656936,-0.650106,-0.341036,-0.277491,...,-0.284153,-1.016588,0.344194,1.049648,0.736252,-1.688327,-1.662114,-1.619670,-1.613275,-1.567676
4,-0.009329,0.752097,0.241531,0.292564,0.358745,0.475365,0.550705,0.618035,-0.110139,-0.083231,...,-0.101148,0.983683,0.344194,-0.893200,1.628860,0.787153,0.843341,0.901269,1.828239,0.980924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31302,0.374348,-0.724057,-0.552052,-0.493849,-0.428573,-0.404784,-0.292952,-0.204431,-0.045520,-0.052549,...,-0.020959,-1.016588,-1.088172,1.049648,-0.156357,-0.038007,0.008190,0.901269,0.107482,0.131390
31303,-0.184983,-0.951158,-0.599704,-0.550239,-0.626079,-0.629216,-0.631227,-0.625549,0.328406,-0.085548,...,-0.248700,-1.016588,0.344194,1.049648,-1.048965,-0.863167,-0.826962,-0.779357,-0.752896,-0.718143
31304,0.853774,0.184346,-0.184069,-0.368486,-0.103628,-0.047191,0.633202,0.233612,-0.219018,0.815404,...,-0.076782,-1.016588,-1.088172,1.049648,-0.156357,-0.038007,0.008190,0.060956,0.107482,0.131390
31305,-0.824219,-1.518909,-0.042448,-0.047525,0.018882,-0.227573,-0.197093,-0.167217,-0.174975,-0.059273,...,-0.213246,0.983683,1.776561,1.049648,-0.156357,-0.038007,0.008190,0.060956,0.107482,0.131390


In [22]:
X_test

Unnamed: 0,num_pipeline__LIMIT_BAL,num_pipeline__AGE,num_pipeline__BILL_AMT1,num_pipeline__BILL_AMT2,num_pipeline__BILL_AMT3,num_pipeline__BILL_AMT4,num_pipeline__BILL_AMT5,num_pipeline__BILL_AMT6,num_pipeline__PAY_AMT1,num_pipeline__PAY_AMT2,...,num_pipeline__PAY_AMT6,cat_pipeline__SEX,cat_pipeline__EDUCATION,cat_pipeline__MARRIAGE,cat_pipeline__PAY_0,cat_pipeline__PAY_2,cat_pipeline__PAY_3,cat_pipeline__PAY_4,cat_pipeline__PAY_5,cat_pipeline__PAY_6
0,0.214539,-0.269856,-0.264165,-0.242731,-0.222009,-0.154456,-0.288072,-0.426825,-0.196635,-0.197933,...,0.694753,0.983683,0.344194,-0.893200,-0.156357,-0.038007,0.008190,0.060956,0.107482,-0.718143
1,0.294443,1.433399,1.304987,1.309959,1.300240,0.929530,0.466455,0.119544,0.070795,-0.034242,...,5.921130,0.983683,3.208928,-0.893200,-0.156357,-0.038007,0.008190,0.060956,0.107482,0.131390
2,-0.105079,0.411446,-0.683244,-0.668805,-0.672707,-0.616393,-0.122832,-0.306831,-0.283493,-0.266755,...,-0.105338,0.983683,-1.088172,-0.893200,0.736252,-0.863167,-0.826962,-0.779357,-0.752896,0.131390
3,1.812627,0.979198,-0.584754,-0.613103,-0.609765,-0.607720,-0.589864,-0.582174,-0.232736,-0.164483,...,0.184737,0.983683,1.776561,-0.893200,-0.156357,-0.038007,0.008190,0.060956,-0.752896,-0.718143
4,1.013583,-0.496956,1.334287,1.432169,1.560164,1.745936,1.695809,1.787526,0.019966,0.005367,...,0.012368,-1.016588,-1.088172,1.049648,-0.156357,-0.038007,0.008190,0.060956,0.107482,0.131390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15416,-0.664410,-0.269856,0.036995,-0.063654,-0.189281,-0.279628,-0.482034,-0.525710,-0.195841,-0.182395,...,0.042020,0.983683,0.344194,-0.893200,-0.156357,-0.038007,0.008190,0.060956,0.107482,0.131390
15417,-0.824219,-0.724057,-0.071488,-0.014552,0.008297,0.068048,0.169576,0.165747,-0.104724,-0.246075,...,-0.169155,-1.016588,-1.088172,-0.893200,-0.156357,-0.038007,0.843341,0.060956,0.107482,1.830457
15418,-0.584505,-1.518909,0.356999,0.399654,0.480775,0.510711,-0.179997,-0.504952,-0.140319,-0.065884,...,-0.147883,0.983683,0.344194,1.049648,-0.156357,-0.038007,0.008190,0.060956,0.107482,0.131390
15419,0.693965,-0.269856,2.574491,2.641435,2.794268,2.919717,2.566157,2.716561,0.350788,0.285456,...,0.141870,0.983683,-1.088172,1.049648,-0.156357,-0.038007,0.008190,0.060956,0.107482,0.131390


In [23]:
def evaluate_classification_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    confusion = confusion_matrix(true, predicted)
    return accuracy, precision, recall, f1, confusion


In [25]:
## Train multiple models

models = {
            'SVM': SVC(C=0.01, degree=5, kernel='poly'),
            'KNN': KNeighborsClassifier(),
            'DecisionTree': DecisionTreeClassifier(),
            'GradientBoosting': GradientBoostingClassifier(),
            'LogisticRegression': LogisticRegression(max_iter=1000),
            'AdaBoosting':AdaBoostClassifier(),
            'Naive Bayes': GaussianNB()
            }

In [31]:
trained_model_list=[]
model_list = []
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

In [32]:
list(models)

['SVM',
 'KNN',
 'DecisionTree',
 'GradientBoosting',
 'LogisticRegression',
 'AdaBoosting',
 'Naive Bayes']

In [33]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    print(model)

SVC(C=0.01, degree=5, kernel='poly')
KNeighborsClassifier()
DecisionTreeClassifier()
GradientBoostingClassifier()
LogisticRegression(max_iter=1000)
AdaBoostClassifier()
GaussianNB()


In [34]:
models.keys()

dict_keys(['SVM', 'KNN', 'DecisionTree', 'GradientBoosting', 'LogisticRegression', 'AdaBoosting', 'Naive Bayes'])

In [35]:
models.values()

dict_values([SVC(C=0.01, degree=5, kernel='poly'), KNeighborsClassifier(), DecisionTreeClassifier(), GradientBoostingClassifier(), LogisticRegression(max_iter=1000), AdaBoostClassifier(), GaussianNB()])

In [37]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    # This is a validation (test) score
    accuracy, precision, recall, f1, _ = evaluate_classification_model(y_test, y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

    print('=' * 35)
    print('\n')


SVM
Model Training Performance
Accuracy: 0.5901692497244018
Precision: 0.5532730338801165
Recall: 0.9361867704280156
F1 Score: 0.6955097321256504


KNN
Model Training Performance
Accuracy: 0.74982167174632
Precision: 0.7420206081930133
Recall: 0.7658884565499352
F1 Score: 0.7537656369670667


DecisionTree
Model Training Performance
Accuracy: 0.7456066402957007
Precision: 0.738086256758456
Recall: 0.7613488975356679
F1 Score: 0.7495371257102726


GradientBoosting
Model Training Performance
Accuracy: 0.7856818623954348
Precision: 0.8069686411149826
Recall: 0.7509727626459144
F1 Score: 0.7779643936849178


LogisticRegression
Model Training Performance
Accuracy: 0.721872770896829
Precision: 0.7208521626856036
Recall: 0.724124513618677
F1 Score: 0.7224846328049175


AdaBoosting
Model Training Performance
Accuracy: 0.7567602619804163
Precision: 0.7706823465062218
Recall: 0.7309987029831387
F1 Score: 0.7503161818544898


Naive Bayes
Model Training Performance
Accuracy: 0.6028143440762597
Prec