In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("../../Data/cleanedNotRecoded.csv")

df = data.copy()

df = df.drop(["ID", "Customer_ID", "Name", "SSN", "Occupation"], axis=1)

df["Credit_Score"] = df["Credit_Score"].map({"Good":2, "Standard":1, "Poor":0})
df["Credit_Mix"] = df["Credit_Mix"].map({"Good":2, "Standard":1, "Bad":0})

df = pd.get_dummies(df, columns=['Payment_of_Min_Amount', 'Payment_Behaviour'], drop_first=True)

In [3]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,88582,88583,88584,88585,88586,88587,88588,88589,88590,88591
Month,1,2,3,4,5,6,7,8,1,2,...,7,8,1,2,3,4,5,6,7,8
Age,23,23,23,23,23,23,23,23,28,28,...,28,28,25,25,25,25,25,25,25,25
Annual_Income,19114.12,19114.12,19114.12,19114.12,19114.12,19114.12,19114.12,19114.12,34847.84,34847.84,...,20002.88,20002.88,39628.99,39628.99,39628.99,39628.99,39628.99,39628.99,39628.99,39628.99
Monthly_Inhand_Salary,1824.84,1824.84,1824.84,1824.84,1824.84,1824.84,1824.84,1824.84,3037.99,3037.99,...,1929.91,1929.91,3359.42,3359.42,3359.42,3359.42,3359.42,3359.42,3359.42,3359.42
Num_Bank_Accounts,3,3,3,3,3,3,3,3,2,2,...,10,10,4,4,4,4,4,4,4,4
Num_Credit_Card,4,4,4,4,4,4,4,4,4,4,...,8,8,6,6,6,6,6,6,6,6
Interest_Rate,3,3,3,3,3,3,3,3,6,6,...,29,29,7,7,7,7,7,7,7,7
Num_of_Loan,4,4,4,4,4,4,4,4,1,1,...,5,5,2,2,2,2,2,2,2,2
Type_of_Loan,0.076445,0.076451,0.076448,0.076449,0.076449,0.076449,0.076449,0.076449,-0.084963,-0.084963,...,-0.171175,-0.171175,-0.268888,-0.268888,-0.268888,-0.268888,-0.268888,-0.268888,-0.268888,-0.268888
Delay_from_due_date,3,-1,3,5,6,8,3,3,3,7,...,33,33,23,23,20,23,18,27,20,18


In [4]:
from sklearn.ensemble import IsolationForest

iforestModel = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

iforestModel.fit(df)

# Predict anomalies (-1 for outliers, 1 for inliers)
df['anomaly'] = iforestModel.predict(df)

df = df[df['anomaly'] == 1].drop(columns=['anomaly'])

In [5]:
X = df.drop(["Credit_Score"], axis=1)
y = df["Credit_Score"]

In [6]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X, y = smote.fit_resample(X, y)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Model Training

In [8]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True)

In [30]:
from sklearn.linear_model import LogisticRegression


logModel = LogisticRegression(multi_class='ovr')

logModel.fit(X_train, y_train)

y_pred = logModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(logModel, X_train, y_train, cv = stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = logModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7216485941848191
f1_score: 0.7191852741084561
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.72      0.73     34649
           1       0.68      0.62      0.65     34754
           2       0.74      0.83      0.78     34807

    accuracy                           0.72    104210
   macro avg       0.72      0.72      0.72    104210
weighted avg       0.72      0.72      0.72    104210

Cross-validation accuracy scores: [0.72363497 0.72296325 0.71989252 0.71749352 0.71931676 0.72440265
 0.72037233 0.71691776 0.72603397 0.72382689]
Mean accuracy: 0.7214854620477882
Accuracy: 0.7204928415153725
f1_score: 0.7180209241707463
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.71      0.72      8772
           1       0.68      0.62      0.65      8667
           2       0.74      0.84      0.78      8614

    accuracy                           0.72     26053
  

In [32]:
import xgboost as xgb

XGBModel = xgb.XGBClassifier(objective='multi:softmax', num_class=3)

XGBModel.fit(X_train, y_train)

y_pred = XGBModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(XGBModel, X_train, y_train, cv = stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = XGBModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8709240955762403
f1_score: 0.8697078332457627
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88     34649
           1       0.86      0.78      0.82     34754
           2       0.88      0.94      0.91     34807

    accuracy                           0.87    104210
   macro avg       0.87      0.87      0.87    104210
weighted avg       0.87      0.87      0.87    104210

Cross-validation accuracy scores: [0.83705978 0.83657998 0.83792342 0.83226178 0.83600422 0.84061031
 0.83849918 0.83744362 0.8386911  0.83849918]
Mean accuracy: 0.837357259380098
Accuracy: 0.8392891413656777
f1_score: 0.8379789542468771
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      8772
           1       0.81      0.75      0.78      8667
           2       0.86      0.92      0.89      8614

    accuracy                           0.84     26053
   

In [33]:
from sklearn.ensemble import RandomForestClassifier

RFModel = RandomForestClassifier(n_estimators=100)

RFModel.fit(X_train, y_train)

y_pred = RFModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(RFModel, X_train, y_train, cv = stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = RFModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9999712119758181
f1_score: 0.9999712116994588
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     34649
           1       1.00      1.00      1.00     34754
           2       1.00      1.00      1.00     34807

    accuracy                           1.00    104210
   macro avg       1.00      1.00      1.00    104210
weighted avg       1.00      1.00      1.00    104210

Cross-validation accuracy scores: [0.88033778 0.8752519  0.8771711  0.87448421 0.87486805 0.87947414
 0.87477209 0.87697918 0.87947414 0.87333269]
Mean accuracy: 0.8766145283562038
Accuracy: 0.8807814838982075
f1_score: 0.8792453537701721
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89      8772
           1       0.88      0.78      0.83      8667
           2       0.89      0.95      0.92      8614

    accuracy                           0.88     26053
  

In [34]:
from sklearn.neighbors import KNeighborsClassifier

knnModel = KNeighborsClassifier(n_neighbors=3)

knnModel.fit(X_train, y_train)

y_pred = knnModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(knnModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = knnModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8914403608099031
f1_score: 0.8903629042500644
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89     34649
           1       0.91      0.81      0.86     34754
           2       0.89      0.96      0.92     34807

    accuracy                           0.89    104210
   macro avg       0.89      0.89      0.89    104210
weighted avg       0.89      0.89      0.89    104210

Cross-validation accuracy scores: [0.78361002 0.78322618 0.77948373 0.7814989  0.77670089 0.78380194
 0.77957969 0.77957969 0.78687266 0.77094329]
Mean accuracy: 0.7805296996449477
Accuracy: 0.787241392545964
f1_score: 0.7840647233084519
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.80      0.79      8772
           1       0.79      0.66      0.72      8667
           2       0.81      0.90      0.85      8614

    accuracy                           0.79     26053
   

In [35]:
from sklearn.linear_model import RidgeClassifier

RgModel = RidgeClassifier()

RgModel.fit(X_train, y_train)

y_pred = RgModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(RgModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = RgModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7169945302754054
f1_score: 0.713209279292735
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.72      0.73     34649
           1       0.69      0.58      0.63     34754
           2       0.72      0.84      0.77     34807

    accuracy                           0.72    104210
   macro avg       0.72      0.72      0.71    104210
weighted avg       0.72      0.72      0.71    104210

Cross-validation accuracy scores: [0.71624604 0.71912484 0.7144228  0.71125612 0.72027637 0.70837732
 0.72219557 0.71605412 0.71567028 0.72344305]
Mean accuracy: 0.716706650033586
Accuracy: 0.716424212182858
f1_score: 0.7130004119606841
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.71      0.73      8772
           1       0.69      0.59      0.64      8667
           2       0.71      0.84      0.77      8614

    accuracy                           0.72     26053
   ma

In [36]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

LDAModel = LinearDiscriminantAnalysis()

LDAModel.fit(X_train, y_train)

y_pred = LDAModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(LDAModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = LDAModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7227713271279148
f1_score: 0.720456090235295
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.72      0.74     34649
           1       0.68      0.62      0.65     34754
           2       0.73      0.83      0.78     34807

    accuracy                           0.72    104210
   macro avg       0.72      0.72      0.72    104210
weighted avg       0.72      0.72      0.72    104210

Cross-validation accuracy scores: [0.72238749 0.72430669 0.73140773 0.72574609 0.71519048 0.72622589
 0.71998848 0.71691776 0.71576624 0.73083197]
Mean accuracy: 0.7228768832165817
Accuracy: 0.721836256861014
f1_score: 0.7198288660200366
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.71      0.73      8772
           1       0.68      0.63      0.65      8667
           2       0.73      0.83      0.78      8614

    accuracy                           0.72     26053
   m

In [38]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

QDAModel = QuadraticDiscriminantAnalysis()

QDAModel.fit(X_train, y_train)

y_pred = QDAModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(QDAModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = QDAModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7380961520007677
f1_score: 0.7345132486334214
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.75      0.74     34649
           1       0.73      0.61      0.66     34754
           2       0.74      0.86      0.80     34807

    accuracy                           0.74    104210
   macro avg       0.74      0.74      0.73    104210
weighted avg       0.74      0.74      0.73    104210

Cross-validation accuracy scores: [0.73994818 0.73457442 0.73802898 0.74004414 0.73841282 0.73658958
 0.73064005 0.73591786 0.73985222 0.73668554]
Mean accuracy: 0.7370693791382785
Accuracy: 0.7350401105438913
f1_score: 0.7315484124204177
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.74      0.74      8772
           1       0.72      0.61      0.66      8667
           2       0.74      0.86      0.80      8614

    accuracy                           0.74     26053
  

In [9]:
from sklearn.naive_bayes import GaussianNB

nbModel = GaussianNB()

nbModel.fit(X_train, y_train)

y_pred = nbModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(nbModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = nbModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6971979656462911
f1_score: 0.6844875173057786
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.77      0.74     34649
           1       0.71      0.45      0.55     34754
           2       0.68      0.87      0.76     34807

    accuracy                           0.70    104210
   macro avg       0.70      0.70      0.68    104210
weighted avg       0.70      0.70      0.68    104210

Cross-validation accuracy scores: [0.69974091 0.70204395 0.70357931 0.69417522 0.69244794 0.69782171
 0.69964495 0.69571058 0.6896651  0.69849343]
Mean accuracy: 0.6973323097591402
Accuracy: 0.694430583809926
f1_score: 0.6816716226969318
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.76      0.74      8772
           1       0.71      0.45      0.55      8667
           2       0.67      0.87      0.76      8614

    accuracy                           0.69     26053
   

In [9]:
from sklearn.svm import SVC

svmLModel = SVC(kernel='linear', decision_function_shape='ovr')

svmLModel.fit(X_train, y_train)

y_pred = svmLModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(svmLModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = svmLModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7164187697917667
f1_score: 0.7146916218165095
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.72      0.73     34649
           1       0.68      0.62      0.65     34754
           2       0.73      0.81      0.76     34807

    accuracy                           0.72    104210
   macro avg       0.72      0.72      0.71    104210
weighted avg       0.72      0.72      0.71    104210

Cross-validation accuracy scores: [0.71077632 0.71576624 0.71375108 0.71135208 0.71739756 0.71864504
 0.7158622  0.72056425 0.7192208  0.71950868]
Mean accuracy: 0.7162844256789176
Accuracy: 0.7185736767358846
f1_score: 0.7170894044464331
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.72      0.73      8772
           1       0.68      0.63      0.65      8667
           2       0.73      0.81      0.77      8614

    accuracy                           0.72     26053
  

In [9]:
from sklearn.svm import SVC

svmRBFModel = SVC(kernel='rbf', decision_function_shape='ovr')

svmRBFModel.fit(X_train, y_train)

y_pred = svmRBFModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(svmRBFModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = svmRBFModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.763621533442088
f1_score: 0.7611770089878207
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.76      0.78     34649
           1       0.75      0.65      0.70     34754
           2       0.74      0.88      0.80     34807

    accuracy                           0.76    104210
   macro avg       0.77      0.76      0.76    104210
weighted avg       0.77      0.76      0.76    104210

Cross-validation accuracy scores: [0.76528164 0.75395835 0.75347855 0.76192304 0.75942808 0.75933212
 0.75904424 0.7577008  0.7634584  0.75760484]
Mean accuracy: 0.7591210056616446
Accuracy: 0.757110505508003
f1_score: 0.7546765613199589
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77      8772
           1       0.74      0.65      0.69      8667
           2       0.74      0.87      0.80      8614

    accuracy                           0.76     26053
   m

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

gbModel = GradientBoostingClassifier()

gbModel.fit(X_train, y_train)

y_pred = gbModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(gbModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = gbModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7709912676326648
f1_score: 0.7690195546127165
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.77      0.79     34649
           1       0.75      0.67      0.71     34754
           2       0.77      0.87      0.81     34807

    accuracy                           0.77    104210
   macro avg       0.77      0.77      0.77    104210
weighted avg       0.77      0.77      0.77    104210

Cross-validation accuracy scores: [0.76269072 0.76125132 0.77209481 0.77055945 0.76604932 0.76969581
 0.77670089 0.76662508 0.76816045 0.76864025]
Mean accuracy: 0.7682468093273199
Accuracy: 0.7650174643994934
f1_score: 0.7632279160093267
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.76      0.78      8772
           1       0.73      0.67      0.70      8667
           2       0.76      0.86      0.81      8614

    accuracy                           0.77     26053
  

In [12]:
from sklearn.ensemble import ExtraTreesClassifier

etModel = ExtraTreesClassifier()

etModel.fit(X_train, y_train)

y_pred = etModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(etModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = etModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
f1_score: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     34649
           1       1.00      1.00      1.00     34754
           2       1.00      1.00      1.00     34807

    accuracy                           1.00    104210
   macro avg       1.00      1.00      1.00    104210
weighted avg       1.00      1.00      1.00    104210

Cross-validation accuracy scores: [0.87832262 0.87285289 0.8762115  0.87314077 0.87697918 0.87333269
 0.87189329 0.87227713 0.86843873 0.87381249]
Mean accuracy: 0.8737261299299492
Accuracy: 0.8753310559244617
f1_score: 0.8741897288517506
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88      8772
           1       0.86      0.79      0.82      8667
           2       0.89      0.95      0.92      8614

    accuracy                           0.88     26053
   macro avg       0.87      0.8

In [13]:
from sklearn.tree import DecisionTreeClassifier

dtModel = DecisionTreeClassifier()

dtModel.fit(X_train, y_train)

y_pred = dtModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(dtModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = dtModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
f1_score: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     34649
           1       1.00      1.00      1.00     34754
           2       1.00      1.00      1.00     34807

    accuracy                           1.00    104210
   macro avg       1.00      1.00      1.00    104210
weighted avg       1.00      1.00      1.00    104210

Cross-validation accuracy scores: [0.78217062 0.79349391 0.79915555 0.79925151 0.78955954 0.79358987
 0.79454947 0.78946358 0.79762019 0.80030707]
Mean accuracy: 0.79391613088955
Accuracy: 0.7997159636126357
f1_score: 0.7992080779909695
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.80      0.81      8772
           1       0.74      0.72      0.73      8667
           2       0.85      0.87      0.86      8614

    accuracy                           0.80     26053
   macro avg       0.80      0.80 

In [14]:
from sklearn.ensemble import AdaBoostClassifier

adaModel = AdaBoostClassifier()

adaModel.fit(X_train, y_train)

y_pred = adaModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))

scores = cross_val_score(adaModel, X_train, y_train, cv=stratified_kfold, scoring='accuracy') 

print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")

y_pred = adaModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.7320602629306209
f1_score: 0.7300002723286987
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.73      0.74     34649
           1       0.69      0.63      0.66     34754
           2       0.74      0.83      0.78     34807

    accuracy                           0.73    104210
   macro avg       0.73      0.73      0.73    104210
weighted avg       0.73      0.73      0.73    104210





Cross-validation accuracy scores: [0.72421073 0.73745322 0.73409462 0.73994818 0.73006429 0.7382209
 0.73380674 0.73428654 0.72728145 0.72708953]
Mean accuracy: 0.7326456194223203
Accuracy: 0.7280927340421448
f1_score: 0.7264655511738718
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.73      0.74      8772
           1       0.68      0.64      0.66      8667
           2       0.74      0.82      0.78      8614

    accuracy                           0.73     26053
   macro avg       0.73      0.73      0.73     26053
weighted avg       0.73      0.73      0.73     26053



In [13]:
from sklearn.ensemble import ExtraTreesClassifier

etModel = ExtraTreesClassifier(n_estimators=200)

etModel.fit(X_train, y_train)

y_pred = etModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))


y_pred = etModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
f1_score: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     34649
           1       1.00      1.00      1.00     34754
           2       1.00      1.00      1.00     34807

    accuracy                           1.00    104210
   macro avg       1.00      1.00      1.00    104210
weighted avg       1.00      1.00      1.00    104210

Accuracy: 0.8754078225156412
f1_score: 0.874323100727209
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88      8772
           1       0.86      0.79      0.82      8667
           2       0.89      0.94      0.92      8614

    accuracy                           0.88     26053
   macro avg       0.87      0.88      0.87     26053
weighted avg       0.87      0.88      0.87     26053



In [9]:
from sklearn.ensemble import ExtraTreesClassifier

etModel = ExtraTreesClassifier(n_estimators=200)

etModel.fit(X_train, y_train)

y_pred = etModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))


y_pred = etModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
f1_score: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     34649
           1       1.00      1.00      1.00     34754
           2       1.00      1.00      1.00     34807

    accuracy                           1.00    104210
   macro avg       1.00      1.00      1.00    104210
weighted avg       1.00      1.00      1.00    104210

Accuracy: 0.8791310021878479
f1_score: 0.8779934315003547
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89      8772
           1       0.87      0.79      0.83      8667
           2       0.89      0.95      0.92      8614

    accuracy                           0.88     26053
   macro avg       0.88      0.88      0.88     26053
weighted avg       0.88      0.88      0.88     26053



In [11]:
from sklearn.ensemble import ExtraTreesClassifier

etModel = ExtraTreesClassifier(n_estimators=800)

etModel.fit(X_train, y_train)

y_pred = etModel.predict(X_train)

print("=====Train======")
print("Accuracy:", accuracy_score(y_train, y_pred))
print("f1_score:", f1_score(y_train, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_train, y_pred))


y_pred = etModel.predict(X_test)

print("=====Test======")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
f1_score: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     34649
           1       1.00      1.00      1.00     34754
           2       1.00      1.00      1.00     34807

    accuracy                           1.00    104210
   macro avg       1.00      1.00      1.00    104210
weighted avg       1.00      1.00      1.00    104210

Accuracy: 0.8789390857098991
f1_score: 0.8778012484642196
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89      8772
           1       0.86      0.79      0.83      8667
           2       0.90      0.95      0.92      8614

    accuracy                           0.88     26053
   macro avg       0.88      0.88      0.88     26053
weighted avg       0.88      0.88      0.88     26053

