# Model Building Section

In [1]:
import pickle

# Load the data from the pickle file
with open('train_test_data.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.ensemble import StackingClassifier

In [3]:
# Excluding the unncessary columns from the input features for the models by dropping them now
columns_to_exclude = ['ID', 'Customer_ID', 'Month', 'SSN', 'Type_of_Loan', 'Name']  # List of columns to exclude
X_train = X_train.drop(columns=columns_to_exclude)
X_test = X_test.drop(columns=columns_to_exclude)

## Logistic Regression

In [4]:
# Train the logistic regression model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# Predictions using logistic regression model
logistic_regression_preds = logistic_regression_model.predict(X_test)

# Evaluate the logistic regression model
logistic_regression_accuracy = logistic_regression_model.score(X_test, y_test)
print("Logistic Regression Accuracy:", logistic_regression_accuracy)

# All of the other models follow the same basic procedures

Logistic Regression Accuracy: 0.655313561940068


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Random Forest Classifier

In [5]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(criterion = 'entropy', n_estimators = 500, max_depth = 5, random_state = 101)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

rf_score = accuracy_score(y_test, y_pred_rf)

print(rf_score)

0.6706827309236948


## Gradient Boosting

In [6]:
# Gradient Boosting Machine (XGBoost)
gbm = GradientBoostingClassifier()
gbm.fit(X_train, y_train)
y_pred_gbm = gbm.predict(X_test)
gbm_score = accuracy_score(y_test, y_pred_gbm)
print("GBM Accuracy:", gbm_score)

GBM Accuracy: 0.7219647822057461


## Support Vector Machine (SVM)

In [7]:
# This one takes a while to run
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
svm_score = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", svm_score)

SVM Accuracy: 0.6574760580784678


## K-Nearest Neighbors (KNN)

In [8]:
# K-Nearest Neighbors
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
knn_score = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy:", knn_score)

KNN Accuracy: 0.6898362681495211


## Naive Bayes

In [9]:
# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
nb_score = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", nb_score)

Naive Bayes Accuracy: 0.6105189990732159


## Neural Network

In [10]:
# Neural Network
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
mlp_score = accuracy_score(y_test, y_pred_mlp)
print("Neural Network Accuracy:", mlp_score)

Neural Network Accuracy: 0.7144732777262898




## Decision Tree

In [11]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
dt_score = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_score)

Decision Tree Accuracy: 0.7391102873030584


## Ensemble Methods

## Bagging

In [15]:
# Ensemble Methods - Bagging
bagging = BaggingClassifier()
bagging.fit(X_train, y_train)
y_pred_bagging = bagging.predict(X_test)
bagging_score = accuracy_score(y_test, y_pred_bagging)
print("Bagging Classifier Accuracy:", bagging_score)

Bagging Classifier Accuracy: 0.7893110905159098


## AdaBoost

In [16]:
# Ensemble Methods - AdaBoost
adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)
y_pred_adaboost = adaboost.predict(X_test)
adaboost_score = accuracy_score(y_test, y_pred_adaboost)
print("AdaBoost Classifier Accuracy:", adaboost_score)

AdaBoost Classifier Accuracy: 0.6731541550818659


## Stacking

In [17]:
# Ensemble Methods - Stacking (running this one takes a long time)
estimators = [('dt', DecisionTreeClassifier()), ('bagging', BaggingClassifier()), ('gbm', GradientBoostingClassifier())]
stacking = StackingClassifier(estimators=estimators)
stacking.fit(X_train, y_train)
y_pred_stacking = stacking.predict(X_test)
stacking_score = accuracy_score(y_test, y_pred_stacking)
print("Stacking Classifier Accuracy:", stacking_score)

Stacking Classifier Accuracy: 0.7925548347235094


Although the Stacking ensemble method had a slightly higher accuracy, we will use Baggin s our final model because the computation time is significantly faster, and the accuracy is only about 0.003 worse than the stacking model. 