In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score


from sklearn.ensemble import IsolationForest



import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression

In [9]:
data = pd.read_csv("../../Data/cleaned_train.csv")

df = data.copy()

df = df.drop(["ID", "Customer_ID", "Name", "SSN", "Occupation"], axis=1)

df["Credit_Score"] = df["Credit_Score"].map({"Good":2, "Standard":1, "Poor":0})

In [10]:
iforestModel = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

iforestModel.fit(df)

# Predict anomalies (-1 for outliers, 1 for inliers)
df['anomaly'] = iforestModel.predict(df)

df = df[df['anomaly'] == 1].drop(columns=['anomaly'])

In [11]:
X = df.drop("Credit_Score", axis=1)
y = df["Credit_Score"]

categorical_features = ["Credit_Mix", "Payment_of_Min_Amount" , "Spending_Behaviour", "Paying_Behaviour"]
numerical_features = ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Total_EMI_per_month',
       'Amount_invested_monthly', 'Monthly_Balance']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model Training

In [6]:
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss')

model.fit(X_train, y_train)

y_pred = model.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = model.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

scores = cross_val_score(model, X_test, y_test, cv=10, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

Accuracy: 0.8570353531394529
f1_score: 0.8572462456775826
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84     19947
           1       0.88      0.86      0.87     34441
           2       0.83      0.85      0.84      9397

    accuracy                           0.86     63785
   macro avg       0.85      0.86      0.85     63785
weighted avg       0.86      0.86      0.86     63785

Cross-validation accuracy scores: [0.79040602 0.78758426 0.78852485 0.79824424 0.79589277 0.78739417
 0.79805582 0.79288178 0.80244591 0.80354343]
Mean accuracy: 0.7944973243628123
Accuracy: 0.7892393553646454
f1_score: 0.7893554442556232
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.79      0.78      4987
           1       0.82      0.80      0.81      8610
           2       0.74      0.74      0.74      2350

    accuracy                           0.79     15947
  

In [7]:
logModel = LogisticRegression(multi_class='ovr', max_iter=1000, random_state=42)

logModel.fit(X_train, y_train)

y_pred = logModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(logModel, X_train, y_train, cv=10, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = logModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

scores = cross_val_score(logModel, X_test, y_test, cv=10, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

Accuracy: 0.6733871599905934
f1_score: 0.6701405647909849
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.58      0.63     19947
           1       0.69      0.77      0.73     34441
           2       0.55      0.52      0.54      9397

    accuracy                           0.67     63785
   macro avg       0.65      0.62      0.63     63785
weighted avg       0.67      0.67      0.67     63785

Cross-validation accuracy scores: [0.67236244 0.6729895  0.66687569 0.6751842  0.67063803 0.67340859
 0.68250235 0.66948887 0.67952336 0.66588272]
Mean accuracy: 0.672885574633881
Accuracy: 0.6660813946196776
f1_score: 0.662280259647057
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.57      0.62      4987
           1       0.69      0.77      0.73      8610
           2       0.54      0.50      0.52      2350

    accuracy                           0.67     15947
   m

In [13]:
smote = SMOTE(random_state=42)

X_train, y_train = smote.fit_resample(X_train, y_train)

In [7]:
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss')

model.fit(X_train, y_train)

y_pred = model.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = model.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

scores = cross_val_score(model, X_test, y_test, cv=10, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

Accuracy: 0.8888824366307598
f1_score: 0.8882329843924506
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.89      0.88     34441
           1       0.88      0.83      0.85     34441
           2       0.91      0.95      0.93     34441

    accuracy                           0.89    103323
   macro avg       0.89      0.89      0.89    103323
weighted avg       0.89      0.89      0.89    103323

Cross-validation accuracy scores: [0.76550856 0.76686345 0.80557437 0.850271   0.85336818 0.86178862
 0.88230739 0.88124274 0.88424313 0.88124274]
Mean accuracy: 0.8432410187148147
Accuracy: 0.7804602746598106
f1_score: 0.7812052427700502
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.79      0.77      4987
           1       0.82      0.78      0.80      8610
           2       0.69      0.76      0.72      2350

    accuracy                           0.78     15947
  

In [14]:
logModel = LogisticRegression(multi_class='ovr', max_iter=1000, random_state=42)

logModel.fit(X_train, y_train)

y_pred = logModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(logModel, X_train, y_train, cv=10, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = logModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

scores = cross_val_score(logModel, X_test, y_test, cv=10, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

Accuracy: 0.7204591426884623
f1_score: 0.7169455503256028
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.73      0.74     34441
           1       0.68      0.59      0.63     34441
           2       0.74      0.84      0.79     34441

    accuracy                           0.72    103323
   macro avg       0.72      0.72      0.72    103323
weighted avg       0.72      0.72      0.72    103323

Cross-validation accuracy scores: [0.70831317 0.71605536 0.72031356 0.71989934 0.71389857 0.71912505
 0.72435153 0.72560976 0.72880372 0.72435153]
Mean accuracy: 0.7200721575485238
Accuracy: 0.6612529002320185
f1_score: 0.6659833488294807
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.71      0.66      4987
           1       0.82      0.59      0.68      8610
           2       0.48      0.84      0.61      2350

    accuracy                           0.66     15947
  