In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Raw table as read from disk; kept untouched under `data`.
data = pd.read_csv("../../Data/cleanedNotRecoded.csv")

# Build the modelling frame in one chain:
#   1. drop the columns that are not used as features,
#   2. recode the two ordered categoricals to integers (labels missing from a
#      mapping become NaN, which is how Series.map treats unmatched values),
#   3. one-hot encode the two nominal columns, dropping the first level of each.
df = (
    data
    .drop(columns=["ID", "Customer_ID", "Name", "SSN", "Occupation"])
    .assign(
        Credit_Score=lambda frame: frame["Credit_Score"].map(
            {"Good": 2, "Standard": 1, "Poor": 0}
        ),
        Credit_Mix=lambda frame: frame["Credit_Mix"].map(
            {"Good": 2, "Standard": 1, "Bad": 0}
        ),
    )
    .pipe(
        pd.get_dummies,
        columns=["Payment_of_Min_Amount", "Payment_Behaviour"],
        drop_first=True,
    )
)

In [4]:
from sklearn.ensemble import IsolationForest

# Isolation forest used as an outlier filter; contamination=0.1 sets the
# share of rows that will be labelled as outliers.
# NOTE(review): df still contains the Credit_Score target at this point, so
# the label takes part in outlier scoring — confirm this is intended.
iforestModel = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

# Fit on the full frame and label every row in one call
# (-1 for outliers, 1 for inliers).
row_labels = iforestModel.fit_predict(df)

# Keep the inlier rows only; the column set of df is unchanged.
df = df[row_labels == 1]

In [5]:
# Target vector: the integer-coded credit score.
y = df["Credit_Score"]
# Feature matrix: every remaining column of df.
X = df.drop(columns=["Credit_Score"])

In [6]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X, y = smote.fit_resample(X, y)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier

# 10-fold stratified splitter shared by the cross-validation calls below.
# shuffle=True needs a fixed random_state, otherwise the fold assignment (and
# therefore every reported CV score) changes on each run.
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [9]:
from sklearn.linear_model import LogisticRegression

from sklearn.multiclass import OneVsRestClassifier

# Bagged one-vs-rest logistic regression: 100 members, each fit on a draw of
# 70% of the rows and 70% of the features (both sampled with replacement).
# OneVsRestClassifier(LogisticRegression()) is the replacement for the
# deprecated LogisticRegression(multi_class='ovr').
bagLogclf = BaggingClassifier(
    OneVsRestClassifier(LogisticRegression()),
    n_estimators=100,
    bootstrap=True,
    max_samples=0.7,
    bootstrap_features=True,
    max_features=0.7,
    oob_score=True,   # also score on the rows each member did not train on
    n_jobs=-1,
    random_state=42,  # fixed seed so the draws and the metrics are repeatable
)

bagLogclf.fit(X_train, y_train)
print("OOB score: " ,bagLogclf.oob_score_)

# Performance on the data the ensemble was fitted on.
y_pred = bagLogclf.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Cross-validation refits a fresh clone of the ensemble on each fold.
# NOTE(review): X_train was oversampled and scaled before these folds are
# formed, so the fold scores may be optimistic; putting those steps in a
# pipeline would avoid that.
scores = cross_val_score(bagLogclf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

# Performance on the held-out split.
y_pred = bagLogclf.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

OOB score:  0.7222627387007005
Accuracy: 0.7228960752327032
f1_score: 0.7167494706662209
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.78      0.75     34649
           1       0.72      0.54      0.62     34754
           2       0.72      0.85      0.78     34807

    accuracy                           0.72    104210
   macro avg       0.72      0.72      0.72    104210
weighted avg       0.72      0.72      0.72    104210

Cross-validation accuracy scores: [0.72737741 0.72632185 0.72411477 0.71912484 0.72411477 0.71768544
 0.72478649 0.72584205 0.72257941 0.71653392]
Mean accuracy: 0.7228480951923999
Accuracy: 0.7221049399301424
f1_score: 0.7162198882458685
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.77      0.75      8772
           1       0.71      0.55      0.62      8667
           2       0.72      0.85      0.78      8614

    accuracy             

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Bagged 3-nearest-neighbour classifier: 100 members, each fit on a draw of
# 70% of the rows and 70% of the features (both sampled with replacement).
bagKnnclf = BaggingClassifier(
    KNeighborsClassifier(n_neighbors=3),
    n_estimators=100,
    bootstrap=True,
    max_samples=0.7,
    bootstrap_features=True,
    max_features=0.7,
    oob_score=True,   # also score on the rows each member did not train on
    n_jobs=-1,
    random_state=42,  # fixed seed so the draws and the metrics are repeatable
)

bagKnnclf.fit(X_train, y_train)
print("OOB score: " ,bagKnnclf.oob_score_)

# Performance on the data the ensemble was fitted on.
y_pred = bagKnnclf.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Cross-validation refits a fresh clone of the ensemble on each fold.
# NOTE(review): X_train was oversampled and scaled before these folds are
# formed, so the fold scores may be optimistic; putting those steps in a
# pipeline would avoid that.
scores = cross_val_score(bagKnnclf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

# Performance on the held-out split.
y_pred = bagKnnclf.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

OOB score:  0.830371365511947
Accuracy: 0.9291718645043662
f1_score: 0.928132290733403
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94     34649
           1       0.97      0.84      0.90     34754
           2       0.91      0.99      0.95     34807

    accuracy                           0.93    104210
   macro avg       0.93      0.93      0.93    104210
weighted avg       0.93      0.93      0.93    104210

Cross-validation accuracy scores: [0.8310143  0.8310143  0.8305345  0.83379714 0.82698397 0.82496881
 0.8386911  0.8290951  0.83360522 0.84147395]
Mean accuracy: 0.8321178389789848
Accuracy: 0.8333781138448547
f1_score: 0.8296620338572958
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.85      8772
           1       0.87      0.68      0.77      8667
           2       0.81      0.96      0.87      8614

    accuracy               

In [11]:
from sklearn.naive_bayes import GaussianNB

# Bagged Gaussian naive Bayes: 100 members, each fit on a draw of 70% of the
# rows and 70% of the features (both sampled with replacement).
bagNbclf = BaggingClassifier(
    GaussianNB(),
    n_estimators=100,
    bootstrap=True,
    max_samples=0.7,
    bootstrap_features=True,
    max_features=0.7,
    oob_score=True,   # also score on the rows each member did not train on
    n_jobs=-1,
    random_state=42,  # fixed seed so the draws and the metrics are repeatable
)

bagNbclf.fit(X_train, y_train)
print("OOB score: " ,bagNbclf.oob_score_)

# Performance on the data the ensemble was fitted on.
y_pred = bagNbclf.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Cross-validation refits a fresh clone of the ensemble on each fold.
# NOTE(review): X_train was oversampled and scaled before these folds are
# formed, so the fold scores may be optimistic; putting those steps in a
# pipeline would avoid that.
scores = cross_val_score(bagNbclf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

# Performance on the held-out split.
y_pred = bagNbclf.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

OOB score:  0.6938393628250648
Accuracy: 0.695000479800403
f1_score: 0.6810511154311096
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.77      0.74     34649
           1       0.72      0.44      0.54     34754
           2       0.67      0.88      0.76     34807

    accuracy                           0.70    104210
   macro avg       0.70      0.69      0.68    104210
weighted avg       0.70      0.70      0.68    104210

Cross-validation accuracy scores: [0.69628634 0.69926111 0.68995298 0.69609442 0.69091258 0.69139238
 0.69004894 0.69417522 0.70329143 0.70530659]
Mean accuracy: 0.6956722003646483
Accuracy: 0.6908609373200782
f1_score: 0.6773174829731899
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.76      0.73      8772
           1       0.71      0.44      0.54      8667
           2       0.67      0.88      0.76      8614

    accuracy              

In [9]:
from sklearn.svm import SVC

# Bagged linear-kernel SVM: 100 members, each fit on a draw of 70% of the
# rows and 70% of the features (both sampled with replacement).
bagSVMLinearclf = BaggingClassifier(
    SVC(kernel='linear', decision_function_shape='ovr'),
    n_estimators=100,
    bootstrap=True,
    max_samples=0.7,
    bootstrap_features=True,
    max_features=0.7,
    oob_score=True,   # also score on the rows each member did not train on
    n_jobs=-1,
    random_state=42,  # fixed seed so the draws and the metrics are repeatable
)

bagSVMLinearclf.fit(X_train, y_train)
print("OOB score: " ,bagSVMLinearclf.oob_score_)

# Performance on the data the ensemble was fitted on.
y_pred = bagSVMLinearclf.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Performance on the held-out split.
y_pred = bagSVMLinearclf.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

OOB score:  0.7214566740236061
Accuracy: 0.722003646483063
f1_score: 0.7192232757381906
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.76      0.76     34649
           1       0.68      0.59      0.63     34754
           2       0.73      0.81      0.77     34807

    accuracy                           0.72    104210
   macro avg       0.72      0.72      0.72    104210
weighted avg       0.72      0.72      0.72    104210

Accuracy: 0.7217594902698345
f1_score: 0.7192166048022725
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.75      0.75      8772
           1       0.68      0.60      0.64      8667
           2       0.73      0.81      0.77      8614

    accuracy                           0.72     26053
   macro avg       0.72      0.72      0.72     26053
weighted avg       0.72      0.72      0.72     26053



In [10]:
from sklearn.linear_model import RidgeClassifier

# Bagged ridge classifier: 100 members, each fit on a draw of 70% of the
# rows and 70% of the features (both sampled with replacement).
bagRigclf = BaggingClassifier(
    RidgeClassifier(),
    n_estimators=100,
    bootstrap=True,
    max_samples=0.7,
    bootstrap_features=True,
    max_features=0.7,
    oob_score=True,   # also score on the rows each member did not train on
    n_jobs=-1,
    random_state=42,  # fixed seed so the draws and the metrics are repeatable
)

bagRigclf.fit(X_train, y_train)
print("OOB score: " ,bagRigclf.oob_score_)

# Performance on the data the ensemble was fitted on.
y_pred = bagRigclf.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Performance on the held-out split.
y_pred = bagRigclf.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

OOB score:  0.7099030803185875
Accuracy: 0.7116303617695039
f1_score: 0.7026794736083785
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.78      0.75     34649
           1       0.72      0.50      0.59     34754
           2       0.70      0.86      0.77     34807

    accuracy                           0.71    104210
   macro avg       0.71      0.71      0.70    104210
weighted avg       0.71      0.71      0.70    104210

Accuracy: 0.712278816259164
f1_score: 0.703766963443878
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.77      0.75      8772
           1       0.72      0.50      0.59      8667
           2       0.70      0.86      0.77      8614

    accuracy                           0.71     26053
   macro avg       0.71      0.71      0.70     26053
weighted avg       0.71      0.71      0.70     26053



In [13]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Bagged linear discriminant analysis: 100 members, each fit on a draw of
# 70% of the rows and 70% of the features (both sampled with replacement).
bagLDAclf = BaggingClassifier(
    LinearDiscriminantAnalysis(),
    n_estimators=100,
    bootstrap=True,
    max_samples=0.7,
    bootstrap_features=True,
    max_features=0.7,
    oob_score=True,   # also score on the rows each member did not train on
    n_jobs=-1,
    random_state=42,  # fixed seed so the draws and the metrics are repeatable
)

bagLDAclf.fit(X_train, y_train)
print("OOB score: " ,bagLDAclf.oob_score_)

# Performance on the data the ensemble was fitted on.
y_pred = bagLDAclf.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Performance on the held-out split.
y_pred = bagLDAclf.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

OOB score:  0.723471835716342
Accuracy: 0.7242203243450724
f1_score: 0.7203105089834717
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.75      0.75     34649
           1       0.69      0.58      0.63     34754
           2       0.72      0.84      0.78     34807

    accuracy                           0.72    104210
   macro avg       0.72      0.72      0.72    104210
weighted avg       0.72      0.72      0.72    104210

Accuracy: 0.7234867385713737
f1_score: 0.7198690143924721
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.75      0.75      8772
           1       0.69      0.59      0.63      8667
           2       0.72      0.84      0.77      8614

    accuracy                           0.72     26053
   macro avg       0.72      0.72      0.72     26053
weighted avg       0.72      0.72      0.72     26053



In [14]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Bagged quadratic discriminant analysis: 100 members, each fit on a draw of
# 70% of the rows and 70% of the features (both sampled with replacement).
bagQDAclf = BaggingClassifier(
    QuadraticDiscriminantAnalysis(),
    n_estimators=100,
    bootstrap=True,
    max_samples=0.7,
    bootstrap_features=True,
    max_features=0.7,
    oob_score=True,   # also score on the rows each member did not train on
    n_jobs=-1,
    random_state=42,  # fixed seed so the draws and the metrics are repeatable
)

bagQDAclf.fit(X_train, y_train)
print("OOB score: " ,bagQDAclf.oob_score_)

# Performance on the data the ensemble was fitted on.
y_pred = bagQDAclf.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Performance on the held-out split.
y_pred = bagQDAclf.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

OOB score:  0.5723059207369734
Accuracy: 0.6107763170521063
f1_score: 0.6043461691377693
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.37      0.51     34649
           1       0.48      0.78      0.60     34754
           2       0.74      0.68      0.71     34807

    accuracy                           0.61    104210
   macro avg       0.68      0.61      0.60    104210
weighted avg       0.68      0.61      0.60    104210

Accuracy: 0.6065712202049668
f1_score: 0.5988993799045289
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.36      0.49      8772
           1       0.48      0.78      0.59      8667
           2       0.74      0.69      0.71      8614

    accuracy                           0.61     26053
   macro avg       0.67      0.61      0.60     26053
weighted avg       0.67      0.61      0.60     26053

