# Model Building Section

In [1]:
import pickle

# Restore the four train/test splits stored in the pickle file.
# NOTE(review): pickle.load can execute arbitrary code - only open files you trust.
with open('train_test_data.pkl', 'rb') as pickle_file:
    loaded_splits = pickle.load(pickle_file)

# The pickled object is expected to unpack into exactly these four pieces.
X_train, X_test, y_train, y_test = loaded_splits


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.ensemble import StackingClassifier



In [3]:
# Exclude the unnecessary columns from the model input features by dropping them now.
# errors='ignore' keeps this cell idempotent: because X_train / X_test are reassigned
# in place, re-running the cell after the columns are already gone would otherwise
# raise KeyError.
columns_to_exclude = ['ID', 'Customer_ID', 'Month', 'SSN', 'Type_of_Loan', 'Name']  # List of columns to exclude
X_train = X_train.drop(columns=columns_to_exclude, errors='ignore')
X_test = X_test.drop(columns=columns_to_exclude, errors='ignore')

# XGBoost

In [4]:
# !pip install xgboost
# Ran this once to install xgboost; it is now commented out

In [5]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Map the class labels to integer codes. The encoder is fit on the training
# labels only and then reused unchanged for the test labels.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Default-parameter XGBoost classifier, trained against the encoded labels.
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train_encoded)

# Predict the test split and compare with the encoded test labels.
y_pred_xgb = xgb_classifier.predict(X_test)
accuracy_xgb = accuracy_score(y_true=y_test_encoded, y_pred=y_pred_xgb)
print("XGBoost Classifier Accuracy:", accuracy_xgb)

XGBoost Classifier Accuracy: 0.7723200494284832


# LSTM

In [6]:
#pip install keras

In [9]:
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense


# Reshape input data for LSTM into (samples, timesteps, 1).
# X_train / X_test are DataFrames (they were filtered with .drop(columns=...)),
# and DataFrames have no .reshape, so convert to a float NumPy array first.
# Each feature column is fed to the LSTM as one timestep with a single value.
X_train_lstm = X_train.to_numpy(dtype='float32').reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_lstm = X_test.to_numpy(dtype='float32').reshape(X_test.shape[0], X_test.shape[1], 1)

# Train on the integer-encoded labels produced by `label_encoder` in the XGBoost
# cell, and size the output layer from the number of classes it found, so the
# network works for any class count instead of assuming a binary 0/1 target.
n_classes = len(label_encoder.classes_)

# Define LSTM model: two stacked LSTM layers followed by a softmax classifier.
model_lstm = Sequential()
model_lstm.add(LSTM(units=50, return_sequences=True, input_shape=(X_train_lstm.shape[1], 1)))
model_lstm.add(LSTM(units=50))
model_lstm.add(Dense(n_classes, activation='softmax'))

# Compile LSTM model; the sparse loss takes integer class ids directly.
model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train LSTM model.
# NOTE(review): the test split doubles as validation data here, so the accuracy
# printed below is measured on data that was monitored during training.
model_lstm.fit(
    X_train_lstm,
    y_train_encoded,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_lstm, y_test_encoded),
)

# Evaluate LSTM model
loss, accuracy = model_lstm.evaluate(X_test_lstm, y_test_encoded)
print("LSTM Model Accuracy:", accuracy)


ModuleNotFoundError: No module named 'tensorflow.keras'

## Logistic Regression

In [None]:
# Train the logistic regression model with scikit-learn's default settings.
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# Predictions using logistic regression model
logistic_regression_preds = logistic_regression_model.predict(X_test)

# Evaluate the logistic regression model from the predictions computed above.
# `.score(X_test, y_test)` returns the same mean accuracy but runs predict() a
# second time and leaves `logistic_regression_preds` unused; accuracy_score also
# matches how every other model in this notebook is scored.
logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_preds)
print("Logistic Regression Accuracy:", logistic_regression_accuracy)

# All of the other models follow the same basic procedure: fit on the training split, predict on the test split, and report the accuracy

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 500 entropy-criterion trees, each capped at depth 5, with a
# fixed seed so repeated runs build the same forest.
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=5,
    random_state=101,
).fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred_rf = rf.predict(X_test)
rf_score = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(rf_score)

## Gradient Boosting

In [None]:
# Gradient Boosting Machine (scikit-learn's GradientBoostingClassifier; this is
# not XGBoost, which is trained separately in the XGBoost section).
# random_state is fixed (same seed as the random forest cell) so the reported
# accuracy is reproducible across Restart & Run All.
gbm = GradientBoostingClassifier(random_state=101)
gbm.fit(X_train, y_train)
y_pred_gbm = gbm.predict(X_test)
gbm_score = accuracy_score(y_test, y_pred_gbm)
print("GBM Accuracy:", gbm_score)

## Support Vector Machine (SVM)

In [None]:
# Support vector classifier with default settings.
# Heads-up: this cell is slow to run.
svm = SVC().fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred_svm = svm.predict(X_test)
svm_score = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print("SVM Accuracy:", svm_score)

## K-Nearest Neighbors (KNN)

In [None]:
# K-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred_knn = knn.predict(X_test)
knn_score = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print("KNN Accuracy:", knn_score)

## Naive Bayes

In [None]:
# Gaussian naive Bayes classifier with default settings.
nb = GaussianNB().fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred_nb = nb.predict(X_test)
nb_score = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
print("Naive Bayes Accuracy:", nb_score)

## Neural Network

In [None]:
# Neural Network (scikit-learn multi-layer perceptron, default architecture).
# random_state fixes the weight initialisation and batch shuffling so the
# reported accuracy is reproducible; the seed matches the random forest cell.
mlp = MLPClassifier(random_state=101)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
mlp_score = accuracy_score(y_test, y_pred_mlp)
print("Neural Network Accuracy:", mlp_score)

## Decision Tree

In [None]:
# Decision Tree with default settings.
# random_state is fixed (same seed as the random forest cell) so the tree, and
# therefore the reported accuracy, is the same on every run.
dt = DecisionTreeClassifier(random_state=101)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
dt_score = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_score)

## Ensemble Methods

## Bagging

In [None]:
# Ensemble Methods - Bagging (default base estimator and ensemble size).
# Bagging draws random bootstrap samples, so random_state is fixed (same seed
# as the random forest cell) to make the reported accuracy reproducible. This
# matters because models in this notebook are compared on small accuracy gaps.
bagging = BaggingClassifier(random_state=101)
bagging.fit(X_train, y_train)
y_pred_bagging = bagging.predict(X_test)
bagging_score = accuracy_score(y_test, y_pred_bagging)
print("Bagging Classifier Accuracy:", bagging_score)

## AdaBoost

In [None]:
# Ensemble Methods - AdaBoost (default base estimator and number of rounds).
# random_state is fixed (same seed as the random forest cell) so repeated runs
# report the same accuracy.
adaboost = AdaBoostClassifier(random_state=101)
adaboost.fit(X_train, y_train)
y_pred_adaboost = adaboost.predict(X_test)
adaboost_score = accuracy_score(y_test, y_pred_adaboost)
print("AdaBoost Classifier Accuracy:", adaboost_score)

## Stacking

In [None]:
# Ensemble Methods - Stacking (running this one takes a long time)
# Three base learners feed StackingClassifier's default final estimator.
# Each base learner is seeded (same seed as the random forest cell) so the
# stacked model's accuracy is reproducible; StackingClassifier itself takes no
# random_state argument.
estimators = [
    ('dt', DecisionTreeClassifier(random_state=101)),
    ('bagging', BaggingClassifier(random_state=101)),
    ('gbm', GradientBoostingClassifier(random_state=101)),
]
stacking = StackingClassifier(estimators=estimators)
stacking.fit(X_train, y_train)
y_pred_stacking = stacking.predict(X_test)
stacking_score = accuracy_score(y_test, y_pred_stacking)
print("Stacking Classifier Accuracy:", stacking_score)

Although the Stacking ensemble achieved slightly higher accuracy, we will use Bagging as our final model: it trains significantly faster, and its accuracy is only about 0.003 lower than the Stacking model's.