In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned (not yet recoded) dataset; `data` keeps the frame exactly as read.
data = pd.read_csv("../../Data/cleanedNotRecoded.csv")

# Ordinal codes for the two ranked categorical columns.
# Series.map leaves NaN for any value that is not a key of the mapping.
credit_score_codes = {"Good": 2, "Standard": 1, "Poor": 0}
credit_mix_codes = {"Good": 2, "Standard": 1, "Bad": 0}

# Work on a copy: drop the columns that are not used as features, recode the
# ranked categories, then one-hot encode the nominal columns (first level dropped).
df = (
    data.copy()
    .drop(columns=["ID", "Customer_ID", "Name", "SSN", "Occupation"])
    .assign(
        Credit_Score=lambda frame: frame["Credit_Score"].map(credit_score_codes),
        Credit_Mix=lambda frame: frame["Credit_Mix"].map(credit_mix_codes),
    )
    .pipe(
        pd.get_dummies,
        columns=["Payment_of_Min_Amount", "Payment_Behaviour"],
        drop_first=True,
    )
)

In [3]:
from sklearn.ensemble import IsolationForest

# Isolation Forest configured to flag 10% of the rows as outliers.
# The detector is fitted on every column df holds at this point.
iforestModel = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
iforestModel.fit(df)

# predict() returns 1 for inliers and -1 for outliers; keep only the inliers.
inlier_mask = iforestModel.predict(df) == 1
df = df[inlier_mask]

In [4]:
# Separate the Credit_Score target from the predictor columns.
target_column = "Credit_Score"
y = df[target_column]
X = df.drop(columns=[target_column])

In [5]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes with SMOTE so that all Credit_Score classes
# have the same number of rows.
# The sampler is seeded so the synthetic rows, and every metric computed from
# them, are reproducible, in line with the random_state=42 used by the
# IsolationForest and train_test_split steps of this notebook.
# NOTE(review): the resampling runs on the full X, y before the train/test split,
# so synthetic rows interpolated from observations that later land in the test
# set can end up in the training set (and vice versa). Consider splitting first
# and calling fit_resample on the training portion only.
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the per-feature min/max on the training rows only, then apply the
# same [0, 1] scaling to both partitions.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import VotingClassifier

# 10-fold stratified splitter shared by every cross_val_score call below.
# The shuffle is seeded so the folds, and therefore the reported cross-validation
# scores, are identical from run to run (same seed as the other randomized steps).
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier


# Ensemble 1: Gaussian naive Bayes + linear-kernel SVM + ridge classifier,
# combined by majority (hard) vote.
nbModel = GaussianNB()
svmLModel = SVC(kernel='linear', decision_function_shape='ovr')
RgModel = RidgeClassifier()

ensemble_members = [('nb', nbModel), ('svm', svmLModel), ('rg', RgModel)]
votingclf1 = VotingClassifier(estimators=ensemble_members, voting='hard')
votingclf1.fit(X_train, y_train)

# Scores on the rows the ensemble was fitted on.
train_predictions = votingclf1.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_f1 = f1_score(y_train, train_predictions, average='weighted')
train_report = classification_report(y_train, train_predictions)

print("=====Train======")
print("Accuracy:", train_accuracy)
print("f1_score:", train_f1)
print("Classification Report:\n", train_report)

# Stratified cross-validation on the training partition; cross_val_score
# refits clones of the ensemble, so the fitted votingclf1 is left untouched.
scores = cross_val_score(
    votingclf1, X_train, y_train, cv=stratified_kfold, scoring='accuracy'
)

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

# Scores on the held-out test partition.
y_pred = votingclf1.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average='weighted')
test_report = classification_report(y_test, y_pred)

print("=====Test======")
print("Accuracy:", test_accuracy)
print("f1_score:", test_f1)
print("Classification Report:\n", test_report)

Accuracy: 0.7253334612801074
f1_score: 0.7217456983004743
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.74      0.74     34649
           1       0.71      0.59      0.65     34754
           2       0.72      0.84      0.78     34807

    accuracy                           0.73    104210
   macro avg       0.72      0.73      0.72    104210
weighted avg       0.72      0.73      0.72    104210

Cross-validation accuracy scores: [0.73006429 0.72488245 0.72238749 0.72401881 0.72632185 0.73121581
 0.72641781 0.72430669 0.72066021 0.72075617]
Mean accuracy: 0.7251031570866521
Accuracy: 0.7230261390242966
f1_score: 0.7198055956994289
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.72      0.73      8772
           1       0.70      0.60      0.65      8667
           2       0.72      0.84      0.78      8614

    accuracy                           0.72     26053
  

In [10]:
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression


# Ensemble 2: linear-kernel SVM + ridge classifier + one-vs-rest logistic
# regression, combined by majority (hard) vote.
#
# LogisticRegression's `multi_class` argument is deprecated in recent
# scikit-learn releases; the one-vs-rest scheme is therefore expressed with
# OneVsRestClassifier, which fits one binary logistic regression per class.
from sklearn.multiclass import OneVsRestClassifier

svmLModel = SVC(kernel='linear', decision_function_shape='ovr')
RgModel = RidgeClassifier()
logModel = OneVsRestClassifier(LogisticRegression())

votingclf2 = VotingClassifier(
                estimators=[('svm', svmLModel), ('rg', RgModel), ('lr', logModel)],
                voting='hard')
votingclf2.fit(X_train, y_train)

# Scores on the rows the ensemble was fitted on.
y_pred = votingclf2.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Stratified cross-validation on the training partition; cross_val_score
# refits clones of the ensemble, so the fitted votingclf2 is left untouched.
scores = cross_val_score(votingclf2, X_train, y_train, cv=stratified_kfold, scoring='accuracy')

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

# Scores on the held-out test partition.
y_pred = votingclf2.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7241051722483447
f1_score: 0.7212711002965854
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.72      0.73     34649
           1       0.69      0.61      0.65     34754
           2       0.73      0.84      0.78     34807

    accuracy                           0.72    104210
   macro avg       0.72      0.72      0.72    104210
weighted avg       0.72      0.72      0.72    104210

Cross-validation accuracy scores: [0.73188753 0.72545821 0.72737741 0.71662988 0.72507437 0.72756933
 0.72814509 0.72238749 0.71643796 0.716342  ]
Mean accuracy: 0.7237309279339794
Accuracy: 0.7200322419682954
f1_score: 0.7174965231022974
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.71      0.73      8772
           1       0.68      0.62      0.65      8667
           2       0.73      0.84      0.78      8614

    accuracy                           0.72     26053
  

In [11]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis



# Ensemble 3: ridge classifier + one-vs-rest logistic regression + linear
# discriminant analysis, combined by majority (hard) vote.
#
# LogisticRegression's `multi_class` argument is deprecated in recent
# scikit-learn releases; the one-vs-rest scheme is therefore expressed with
# OneVsRestClassifier, which fits one binary logistic regression per class.
from sklearn.multiclass import OneVsRestClassifier

RgModel = RidgeClassifier()
logModel = OneVsRestClassifier(LogisticRegression())
LDAModel = LinearDiscriminantAnalysis()

votingclf3 = VotingClassifier(
                estimators=[('rg', RgModel), ('lr', logModel), ('lda', LDAModel)],
                voting='hard')
votingclf3.fit(X_train, y_train)

# Scores on the rows the ensemble was fitted on.
y_pred = votingclf3.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Stratified cross-validation on the training partition; cross_val_score
# refits clones of the ensemble, so the fitted votingclf3 is left untouched.
scores = cross_val_score(votingclf3, X_train, y_train, cv=stratified_kfold, scoring='accuracy')

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

# Scores on the held-out test partition.
y_pred = votingclf3.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7229056712407639
f1_score: 0.719926277992461
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.73      0.74     34649
           1       0.69      0.60      0.64     34754
           2       0.73      0.84      0.78     34807

    accuracy                           0.72    104210
   macro avg       0.72      0.72      0.72    104210
weighted avg       0.72      0.72      0.72    104210

Cross-validation accuracy scores: [0.71384704 0.72181173 0.72958449 0.72152385 0.7197006  0.72171577
 0.72641781 0.72209961 0.72795317 0.72209961]
Mean accuracy: 0.7226753670473083
Accuracy: 0.7187655932138334
f1_score: 0.716111074866372
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.71      0.73      8772
           1       0.68      0.61      0.64      8667
           2       0.73      0.84      0.78      8614

    accuracy                           0.72     26053
   m

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier


# Ensemble 4: one-vs-rest logistic regression + linear discriminant analysis +
# AdaBoost, combined by majority (hard) vote.
#
# LogisticRegression's `multi_class` argument is deprecated in recent
# scikit-learn releases; the one-vs-rest scheme is therefore expressed with
# OneVsRestClassifier, which fits one binary logistic regression per class.
from sklearn.multiclass import OneVsRestClassifier

logModel = OneVsRestClassifier(LogisticRegression())
LDAModel = LinearDiscriminantAnalysis()
# Seeded so the boosted trees, and the scores below, are reproducible.
adaModel = AdaBoostClassifier(random_state=42)

votingclf4 = VotingClassifier(
                estimators=[('lr', logModel), ('lda', LDAModel), ('ada', adaModel)],
                voting='hard')
votingclf4.fit(X_train, y_train)

# Scores on the rows the ensemble was fitted on.
y_pred = votingclf4.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Stratified cross-validation on the training partition; cross_val_score
# refits clones of the ensemble, so the fitted votingclf4 is left untouched.
scores = cross_val_score(votingclf4, X_train, y_train, cv=stratified_kfold, scoring='accuracy')

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

# Scores on the held-out test partition.
y_pred = votingclf4.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.729450148738125
f1_score: 0.7270954064536475
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.73      0.74     34649
           1       0.69      0.62      0.66     34754
           2       0.74      0.84      0.78     34807

    accuracy                           0.73    104210
   macro avg       0.73      0.73      0.73    104210
weighted avg       0.73      0.73      0.73    104210





Cross-validation accuracy scores: [0.73217542 0.72584205 0.72814509 0.72401881 0.73793302 0.72977641
 0.7348623  0.72641781 0.73179157 0.72593801]
Mean accuracy: 0.7296900489396412
Accuracy: 0.726480635627375
f1_score: 0.7244040042416015
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.71      0.73      8772
           1       0.69      0.63      0.66      8667
           2       0.73      0.84      0.78      8614

    accuracy                           0.73     26053
   macro avg       0.73      0.73      0.72     26053
weighted avg       0.73      0.73      0.72     26053



In [13]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis




# Ensemble 5: linear discriminant analysis + AdaBoost + quadratic discriminant
# analysis, combined by majority (hard) vote.
LDAModel = LinearDiscriminantAnalysis()
# Seeded so the boosted trees, and the scores below, are reproducible.
adaModel = AdaBoostClassifier(random_state=42)
QDAModel = QuadraticDiscriminantAnalysis()

votingclf5 = VotingClassifier(
                estimators=[('lda', LDAModel), ('ada', adaModel), ('qda', QDAModel)],
                voting='hard')
votingclf5.fit(X_train, y_train)

# Scores on the rows the ensemble was fitted on.
y_pred = votingclf5.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Stratified cross-validation on the training partition; cross_val_score
# refits clones of the ensemble, so the fitted votingclf5 is left untouched.
scores = cross_val_score(votingclf5, X_train, y_train, cv=stratified_kfold, scoring='accuracy')

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

# Scores on the held-out test partition.
y_pred = votingclf5.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.7425870837731504
f1_score: 0.7396118332378583
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.75      0.76     34649
           1       0.73      0.62      0.67     34754
           2       0.74      0.85      0.79     34807

    accuracy                           0.74    104210
   macro avg       0.74      0.74      0.74    104210
weighted avg       0.74      0.74      0.74    104210





Cross-validation accuracy scores: [0.73783706 0.7434987  0.74868055 0.74100374 0.74042798 0.74042798
 0.7363017  0.7430189  0.74829671 0.74292294]
Mean accuracy: 0.7422416274829671
Accuracy: 0.7372663416880973
f1_score: 0.7345866877092023
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.74      0.75      8772
           1       0.71      0.63      0.67      8667
           2       0.74      0.85      0.79      8614

    accuracy                           0.74     26053
   macro avg       0.74      0.74      0.73     26053
weighted avg       0.74      0.74      0.73     26053



In [14]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC



# Ensemble 6: AdaBoost + quadratic discriminant analysis + RBF-kernel SVM,
# combined by majority (hard) vote.
# AdaBoost is seeded so the boosted trees, and the scores below, are reproducible.
adaModel = AdaBoostClassifier(random_state=42)
QDAModel = QuadraticDiscriminantAnalysis()
svmRBFModel = SVC(kernel='rbf', decision_function_shape='ovr')

votingclf6 = VotingClassifier(
                estimators=[('ada', adaModel), ('qda', QDAModel), ('svc', svmRBFModel)],
                voting='hard')
votingclf6.fit(X_train, y_train)

# Scores on the rows the ensemble was fitted on.
y_pred = votingclf6.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Stratified cross-validation on the training partition; cross_val_score
# refits clones of the ensemble, so the fitted votingclf6 is left untouched.
scores = cross_val_score(votingclf6, X_train, y_train, cv=stratified_kfold, scoring='accuracy')

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

# Scores on the held-out test partition.
y_pred = votingclf6.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.7578639286057001
f1_score: 0.7545859896141368
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.77      0.78     34649
           1       0.76      0.63      0.69     34754
           2       0.74      0.87      0.80     34807

    accuracy                           0.76    104210
   macro avg       0.76      0.76      0.75    104210
weighted avg       0.76      0.76      0.75    104210





Cross-validation accuracy scores: [0.75415027 0.75568563 0.75885232 0.76182708 0.75606948 0.74551387
 0.75040783 0.76413012 0.75021591 0.7562614 ]
Mean accuracy: 0.755311390461568
Accuracy: 0.7516216942386673
f1_score: 0.7486530191477928
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.76      0.76      8772
           1       0.74      0.63      0.68      8667
           2       0.74      0.87      0.80      8614

    accuracy                           0.75     26053
   macro avg       0.75      0.75      0.75     26053
weighted avg       0.75      0.75      0.75     26053



In [15]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier


# Ensemble 7: quadratic discriminant analysis + RBF-kernel SVM + gradient
# boosting, combined by majority (hard) vote.
QDAModel = QuadraticDiscriminantAnalysis()
svmRBFModel = SVC(kernel='rbf', decision_function_shape='ovr')
# Seeded so the boosted trees, and the scores below, are reproducible.
gbModel = GradientBoostingClassifier(random_state=42)

votingclf7 = VotingClassifier(
                estimators=[('qda', QDAModel), ('svc', svmRBFModel), ('gb', gbModel)],
                voting='hard')
votingclf7.fit(X_train, y_train)

# Scores on the rows the ensemble was fitted on.
y_pred = votingclf7.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

# Stratified cross-validation on the training partition; cross_val_score
# refits clones of the ensemble, so the fitted votingclf7 is left untouched.
scores = cross_val_score(votingclf7, X_train, y_train, cv=stratified_kfold, scoring='accuracy')

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

# Scores on the held-out test partition.
y_pred = votingclf7.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7663563957393724
f1_score: 0.7636581506484836
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.77      0.78     34649
           1       0.76      0.65      0.70     34754
           2       0.75      0.88      0.81     34807

    accuracy                           0.77    104210
   macro avg       0.77      0.77      0.76    104210
weighted avg       0.77      0.77      0.76    104210

Cross-validation accuracy scores: [0.757221   0.7600998  0.76422608 0.76000384 0.77430189 0.76029172
 0.76739276 0.75962    0.7677766  0.75856444]
Mean accuracy: 0.7629498128778428
Accuracy: 0.7589145204007216
f1_score: 0.7563729827588032
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.76      0.77      8772
           1       0.75      0.65      0.69      8667
           2       0.75      0.87      0.80      8614

    accuracy                           0.76     26053
  