# Model Building Section

In [1]:
import pickle

# Load the data from the pickle file
with open('train_test_data.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.ensemble import StackingClassifier

In [3]:
# Excluding the unncessary columns from the input features for the models by dropping them now
columns_to_exclude = ['ID', 'Customer_ID', 'Month', 'SSN', 'Type_of_Loan', 'Name']  # List of columns to exclude
X_train = X_train.drop(columns=columns_to_exclude)
X_test = X_test.drop(columns=columns_to_exclude)

# XG Boost

In [4]:
# !pip install xgboost
# Ran this code one to install, now it is being commented out

In [5]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the target variable
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Instantiate XGBoost classifier
xgb_classifier = xgb.XGBClassifier()

# Train XGBoost classifier using the encoded target variables
xgb_classifier.fit(X_train, y_train_encoded)

# Predict
y_pred_xgb = xgb_classifier.predict(X_test)

# Evaluate accuracy
accuracy_xgb = accuracy_score(y_test_encoded, y_pred_xgb)
print("XGBoost Classifier Accuracy:", accuracy_xgb)

XGBoost Classifier Accuracy: 0.7519308001235712


## Bagging

In [6]:
# Ensemble Methods - Bagging
bagging = BaggingClassifier()
bagging.fit(X_train, y_train)
y_pred_bagging = bagging.predict(X_test)
bagging_score = accuracy_score(y_test, y_pred_bagging)
print("Bagging Classifier Accuracy:", bagging_score)

Bagging Classifier Accuracy: 0.7966481309854804


## Logistic Regression

In [7]:
# Train the logistic regression model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# Predictions using logistic regression model
logistic_regression_preds = logistic_regression_model.predict(X_test)

# Evaluate the logistic regression model
logistic_regression_accuracy = logistic_regression_model.score(X_test, y_test)
print("Logistic Regression Accuracy:", logistic_regression_accuracy)

# All of the other models follow the same basic procedures

Logistic Regression Accuracy: 0.6581711461229534


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(criterion = 'entropy', n_estimators = 500, max_depth = 5, random_state = 101)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

rf_score = accuracy_score(y_test, y_pred_rf)

print("Random Forest Classifier Accuracy:", rf_score)

Random Forest Classifier Accuracy: 0.6762434352795799


## Gradient Boosting

In [9]:
# Gradient Boosting Machine 
gbm = GradientBoostingClassifier()
gbm.fit(X_train, y_train)
y_pred_gbm = gbm.predict(X_test)
gbm_score = accuracy_score(y_test, y_pred_gbm)
print("GBM Accuracy:", gbm_score)

GBM Accuracy: 0.6984090206981773


## Naive Bayes

In [10]:
# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
nb_score = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", nb_score)

Naive Bayes Accuracy: 0.5841056533827618


## Neural Network

In [11]:
# Neural Network
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
mlp_score = accuracy_score(y_test, y_pred_mlp)
print("Neural Network Accuracy:", mlp_score)

Neural Network Accuracy: 0.6981773246833488




## Decision Tree

In [12]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
dt_score = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_score)

Decision Tree Accuracy: 0.72505406240346


## AdaBoost

In [13]:
# Ensemble Methods - AdaBoost
adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)
y_pred_adaboost = adaboost.predict(X_test)
adaboost_score = accuracy_score(y_test, y_pred_adaboost)
print("AdaBoost Classifier Accuracy:", adaboost_score)

AdaBoost Classifier Accuracy: 0.6719956750077232


## Support Vector Machine (SVM)

In [14]:
# This one takes a while to run
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
svm_score = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", svm_score)

SVM Accuracy: 0.6677479147358666


## Stacking

In [15]:
# Ensemble Methods - Stacking (running this one takes a long time)
estimators = [('dt', DecisionTreeClassifier()), ('bagging', BaggingClassifier()), ('gbm', GradientBoostingClassifier())]
stacking = StackingClassifier(estimators=estimators)
stacking.fit(X_train, y_train)
y_pred_stacking = stacking.predict(X_test)
stacking_score = accuracy_score(y_test, y_pred_stacking)
print("Stacking Classifier Accuracy:", stacking_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Stacking Classifier Accuracy: 0.7763361136855113


In [17]:
import pandas as pd

# Define evaluation data
evaluation_data = [("XG Boost", accuracy_xgb), ("Bagging", bagging_score), 
                   ("Random Forest", rf_score), ("Gradient Boosting", gbm_score),
                   ("Naive Bayes", nb_score),
                   ("Neural Network", mlp_score), ("Decision Tree", dt_score),
                   ("Adaboost", adaboost_score), ("SVM", svm_score), ("Stacking", stacking_score)]

# Create a pandas DataFrame
evaluation_df = pd.DataFrame(evaluation_data, columns=["Model Name", "Accuracy Score"])

# Sort DataFrame by 'R2 Score' column in descending order
evaluation_df = evaluation_df.sort_values(by="Accuracy Score", ascending=False)

# Display the DataFrame
print(evaluation_df)


          Model Name  Accuracy Score
1            Bagging        0.796648
9           Stacking        0.776336
0           XG Boost        0.751931
6      Decision Tree        0.725054
3  Gradient Boosting        0.698409
5     Neural Network        0.698177
2      Random Forest        0.676243
7           Adaboost        0.671996
8                SVM        0.667748
4        Naive Bayes        0.584106


Although the Stacking ensemble method had a slightly higher accuracy, we will use Bagging as our final model because the computation time is significantly faster, and the accuracy is only about 0.003 worse than the stacking model. 

## Classification Reports

In [21]:
# Print the classification reports of each of the models

from sklearn.metrics import classification_report

# Transform predicted labels back to original classes just for XG Boost
y_pred_xgb_original = label_encoder.inverse_transform(y_pred_xgb)

print("XG Boost:")
print(classification_report(y_test, y_pred_xgb_original))

print("Bagging:")
print(classification_report(y_test, y_pred_bagging))

print("Logistic Regression:")
print(classification_report(y_test, logistic_regression_preds))

print("Random Forest:")
print(classification_report(y_test, y_pred_rf))

print("Gradient Boosting:")
print(classification_report(y_test, y_pred_gbm))

print("Naive Bayes:")
print(classification_report(y_test, y_pred_nb))

print("Neural Network:")
print(classification_report(y_test, y_pred_mlp))

print("Decision Tree:")
print(classification_report(y_test, y_pred_dt))

print("Adaboost:")
print(classification_report(y_test, y_pred_adaboost))

print("SVM:")
print(classification_report(y_test, y_pred_svm))

print("Stacking:")
print(classification_report(y_test, y_pred_stacking))

XG Boost:
              precision    recall  f1-score   support

        Good       0.58      0.85      0.69      2100
        Poor       0.74      0.83      0.78      4018
    Standard       0.86      0.68      0.76      6830

    accuracy                           0.75     12948
   macro avg       0.73      0.78      0.74     12948
weighted avg       0.78      0.75      0.75     12948

Bagging:
              precision    recall  f1-score   support

        Good       0.71      0.82      0.76      2100
        Poor       0.77      0.86      0.81      4018
    Standard       0.85      0.75      0.80      6830

    accuracy                           0.80     12948
   macro avg       0.78      0.81      0.79     12948
weighted avg       0.80      0.80      0.80     12948

Logistic Regression:
              precision    recall  f1-score   support

        Good       0.49      0.81      0.61      2100
        Poor       0.64      0.70      0.67      4018
    Standard       0.80      0.59  

The F1 scores generated in the classification report also show that XG Boost and Bagging are the best models. Based on both the accuracy scores and the classification reports, both the XG Boost model and the Bagging model will move on to the parameter tuning and complex model design stage. 