In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv("../../Data/cleanedNotRecoded.csv")

df = data.copy()

df = df.drop(["ID", "Customer_ID", "Name", "SSN", "Occupation"], axis=1)

df["Credit_Score"] = df["Credit_Score"].map({"Good":2, "Standard":1, "Poor":0})
df["Credit_Mix"] = df["Credit_Mix"].map({"Good":2, "Standard":1, "Bad":0})

df = pd.get_dummies(df, columns=['Payment_of_Min_Amount', 'Payment_Behaviour'], drop_first=True)

In [4]:
from sklearn.ensemble import IsolationForest

iforestModel = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

iforestModel.fit(df)

# Predict anomalies (-1 for outliers, 1 for inliers)
df['anomaly'] = iforestModel.predict(df)

df = df[df['anomaly'] == 1].drop(columns=['anomaly'])

In [5]:
X = df.drop(["Credit_Score"], axis=1)
y = df["Credit_Score"]

In [6]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X, y = smote.fit_resample(X, y)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True)

meta_classifier = LogisticRegression()

In [9]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

base_classifiers3 = [
    ('rg', RidgeClassifier()),
    ('lg', LogisticRegression()),
    ('lda', LinearDiscriminantAnalysis())
]


stacking_clf3 = StackingClassifier(
    estimators=base_classifiers3,
    final_estimator=meta_classifier,
    cv=stratified_kfold
)


stacking_clf3.fit(X_train, y_train)

y_pred = stacking_clf3.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(stacking_clf3, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = stacking_clf3.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy: 0.7255541694655023
f1_score: 0.7242544604487424
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.71      0.73     34649
           1       0.67      0.65      0.66     34754
           2       0.75      0.82      0.78     34807

    accuracy                           0.73    104210
   macro avg       0.73      0.73      0.72    104210
weighted avg       0.73      0.73      0.72    104210



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation accuracy scores: [0.72142789 0.72104405 0.72536225 0.72795317 0.72334709 0.73092793
 0.72804913 0.72440265 0.72200365 0.73035217]
Mean accuracy: 0.7254869974090778
Accuracy: 0.7238321882316816
f1_score: 0.7225108661070865
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.69      0.73      8772
           1       0.66      0.65      0.66      8667
           2       0.75      0.83      0.79      8614

    accuracy                           0.72     26053
   macro avg       0.72      0.72      0.72     26053
weighted avg       0.72      0.72      0.72     26053



In [10]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

base_classifiers4 = [
    ('xgb', xgb.XGBClassifier(objective='multi:softmax', num_class=3)),
    ('rf', RandomForestClassifier()),
    ('et', ExtraTreesClassifier())
]


stacking_clf4 = StackingClassifier(
    estimators=base_classifiers4,
    final_estimator=meta_classifier,
    cv=stratified_kfold
)


stacking_clf4.fit(X_train, y_train)

y_pred = stacking_clf4.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(stacking_clf4, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = stacking_clf4.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
f1_score: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     34649
           1       1.00      1.00      1.00     34754
           2       1.00      1.00      1.00     34807

    accuracy                           1.00    104210
   macro avg       1.00      1.00      1.00    104210
weighted avg       1.00      1.00      1.00    104210

Cross-validation accuracy scores: [0.88072162 0.88168122 0.88312062 0.87937818 0.87477209 0.87582766
 0.87985798 0.88014586 0.88427214 0.88360042]
Mean accuracy: 0.8803377794837347
Accuracy: 0.8833531647027214
f1_score: 0.8827643161422751
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89      8772
           1       0.85      0.81      0.83      8667
           2       0.92      0.94      0.93      8614

    accuracy                           0.88     26053
   macro avg       0.88      0.8