In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv("../../Data/cleanedNotRecoded.csv")

df = data.copy()

df = df.drop(["ID", "Customer_ID", "Name", "SSN", "Occupation"], axis=1)

df["Credit_Score"] = df["Credit_Score"].map({"Good":2, "Standard":1, "Poor":0})
df["Credit_Mix"] = df["Credit_Mix"].map({"Good":2, "Standard":1, "Bad":0})

df = pd.get_dummies(df, columns=['Payment_of_Min_Amount', 'Payment_Behaviour'], drop_first=True)

In [4]:
from sklearn.ensemble import IsolationForest

iforestModel = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

iforestModel.fit(df)

# Predict anomalies (-1 for outliers, 1 for inliers)
df['anomaly'] = iforestModel.predict(df)

df = df[df['anomaly'] == 1].drop(columns=['anomaly'])

In [5]:
X = df.drop(["Credit_Score"], axis=1)
y = df["Credit_Score"]

In [6]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X, y = smote.fit_resample(X, y)

In [7]:
X.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130253,130254,130255,130256,130257,130258,130259,130260,130261,130262
Month,1,2,3,4,5,6,7,8,1,2,...,2,3,4,4,5,2,1,1,5,1
Age,23,23,23,23,23,23,23,23,28,28,...,52,22,27,22,28,26,39,52,36,23
Annual_Income,19114.12,19114.12,19114.12,19114.12,19114.12,19114.12,19114.12,19114.12,34847.84,34847.84,...,41113.35,81233.009242,16674.01,34565.73,44890.9,43269.35,18692.02,29709.73,148688.261247,32612.36
Monthly_Inhand_Salary,1824.84,1824.84,1824.84,1824.84,1824.84,1824.84,1824.84,1824.84,3037.99,3037.99,...,3446.714027,6692.779988,1342.5,2976.48,3715.91,3558.78,1622.67,2587.81,12390.906638,2670.7
Num_Bank_Accounts,3,3,3,3,3,3,3,3,2,2,...,1,1,1,1,3,2,2,8,3,5
Num_Credit_Card,4,4,4,4,4,4,4,4,4,4,...,3,3,4,1,7,3,4,7,4,5
Interest_Rate,3,3,3,3,3,3,3,3,6,6,...,9,7,5,3,6,3,1,11,2,12
Num_of_Loan,4,4,4,4,4,4,4,4,1,1,...,2,3,1,2,4,3,1,2,1,4
Type_of_Loan,0.076445,0.076451,0.076448,0.076449,0.076449,0.076449,0.076449,0.076449,-0.084963,-0.084963,...,-0.108052,-0.407008,-0.237896,-0.387186,-0.036686,-0.125772,-0.26362,0.02248,-0.272527,0.341497
Delay_from_due_date,3,-1,3,5,6,8,3,3,3,7,...,6,12,13,13,13,3,4,12,13,3


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True)

In [34]:

from sklearn.linear_model import LogisticRegression


logModel = LogisticRegression(multi_class='ovr')

selector = SelectFromModel(estimator=logModel)

pipeline = Pipeline([
    ('feature_selection', selector),
    ('classification', logModel)
])


pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(pipeline, X_train, y_train, cv = stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = pipeline.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

selected_features = selector.get_support(indices=True)
print(f"Selected features: {selected_features}")

accuracy = pipeline.score(X_test, y_test)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7075712503598504
f1_score: 0.7043217449237004
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.72      0.73     34649
           1       0.66      0.58      0.62     34754
           2       0.72      0.82      0.77     34807

    accuracy                           0.71    104210
   macro avg       0.70      0.71      0.70    104210
weighted avg       0.70      0.71      0.70    104210

Cross-validation accuracy scores: [0.70895308 0.70588235 0.71020056 0.70904904 0.71615008 0.70645811
 0.70118031 0.71605412 0.71068036 0.71183188]
Mean accuracy: 0.70964398810095
Accuracy: 0.7130080988753694
f1_score: 0.7098116107330638
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.72      0.73      8772
           1       0.67      0.59      0.62      8667
           2       0.73      0.83      0.78      8614

    accuracy                           0.71     26053
   m

In [35]:
import xgboost as xgb

XGBModel = xgb.XGBClassifier(objective='multi:softmax', num_class=3)

selector = SelectFromModel(estimator=XGBModel)

pipeline = Pipeline([
    ('feature_selection', selector),
    ('classification', XGBModel)
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(pipeline, X_train, y_train, cv = stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = pipeline.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

selected_features = selector.get_support(indices=True)
print(f"Selected features: {selected_features}")

accuracy = pipeline.score(X_test, y_test)
print(f'Accuracy: {accuracy}')

Accuracy: 0.771749352269456
f1_score: 0.76966385817135
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.78      0.79     34649
           1       0.76      0.67      0.71     34754
           2       0.75      0.87      0.81     34807

    accuracy                           0.77    104210
   macro avg       0.77      0.77      0.77    104210
weighted avg       0.77      0.77      0.77    104210

Cross-validation accuracy scores: [0.75299875 0.75434219 0.74580175 0.74964015 0.74580175 0.76547356
 0.75309471 0.75136743 0.74743307 0.74676135]
Mean accuracy: 0.7512714710680356
Accuracy: 0.7645568648524163
f1_score: 0.7622183514666626
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.76      0.78      8772
           1       0.75      0.66      0.70      8667
           2       0.75      0.87      0.81      8614

    accuracy                           0.76     26053
   ma

In [14]:
from sklearn.ensemble import RandomForestClassifier

RFModel = RandomForestClassifier(n_estimators=100)

selector = SelectFromModel(estimator=RFModel)

pipeline = Pipeline([
    ('feature_selection', selector),
    ('classification', RFModel)
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(pipeline, X_train, y_train, cv = stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = pipeline.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

selected_features = selector.get_support(indices=True)
print(f"Selected features: {selected_features}")

accuracy = pipeline.score(X_test, y_test)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9949333077439785
f1_score: 0.9949266469498333
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     34649
           1       1.00      0.99      0.99     34754
           2       0.99      1.00      1.00     34807

    accuracy                           0.99    104210
   macro avg       0.99      0.99      0.99    104210
weighted avg       0.99      0.99      0.99    104210



KeyboardInterrupt: 

In [25]:
from sklearn.neighbors import KNeighborsClassifier

knnModel = KNeighborsClassifier(n_neighbors=3)

selector = SelectFromModel(estimator=knnModel)

pipeline = Pipeline([
    ('feature_selection', selector),
    ('classification', knnModel)
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(pipeline, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = pipeline.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

selected_features = selector.get_support(indices=True)
print(f"Selected features: {selected_features}")

accuracy = pipeline.score(X_test, y_test)
print(f'Accuracy: {accuracy}')

ValueError: when `importance_getter=='auto'`, the underlying estimator KNeighborsClassifier should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.

In [24]:
selector