In [10]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='darkgrid')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris, make_classification
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from imblearn.over_sampling import SMOTE


In [5]:
# Read the modelling dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer="modelling_data.csv")

In [6]:
# Separate the target variable (y) from the other variables (X); the label
# and the two ID columns are removed from the predictor frame.
y = data['Loan Status']
X = data.drop(columns=['Loan Status', "Loan ID", "Customer ID"])

# Hold out 33% of the rows as a test set (seeded so the split is repeatable)
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Report the dimensions of the full dataset and of each piece of the split
shape_dict = {
    'shape of whole dataset': data.shape,
    'shape of train_X': train_X.shape,
    'shape of test_X': test_X.shape,
    'shape of train_y': train_y.shape,
    'shape of test_y': test_y.shape,
}
for label, dims in shape_dict.items():
    print(label, dims)

shape of whole dataset (20371, 19)
shape of train_X (13648, 16)
shape of test_X (6723, 16)
shape of train_y (13648,)
shape of test_y (6723,)


In [8]:
# Oversample with SMOTE on the TRAINING split only; the test split is not
# passed to the sampler, so it is left untouched for evaluation.
#
# The sampler is bound to `smote_sampler` rather than `sm`: `sm` is already the
# alias of `statsmodels.api` in the imports cell, and rebinding it here would
# make any later `sm.<statsmodels function>` call hit the SMOTE object instead.
smote_sampler = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)
smote_X, smote_y = smote_sampler.fit_resample(train_X, train_y)

# Class counts after resampling (shown as the cell output)
smote_y.value_counts()

Loan Status
0    10393
1    10393
Name: count, dtype: int64

In [11]:
# Synthetic toy dataset (1000 samples, 20 features, 2 classes). It is bound to
# its own names so it does not overwrite the loan-data `X` / `y` derived from `data`.
# NOTE(review): no visible cell uses this data — consider deleting this cell.
synthetic_X, synthetic_y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

In [12]:
# Base learners for the ensemble, each seeded with the same random_state
# so that refitting gives repeatable results.
random_forest_model = RandomForestClassifier(
    random_state=42,
)
decision_tree_model = DecisionTreeClassifier(
    random_state=42,
)
logistic_regression_model = LogisticRegression(
    random_state=42,
)

In [13]:
# Combine the three base learners into one ensemble; voting='hard' means the
# predicted class is decided by majority vote over the learners' class labels.
voting_classifier = VotingClassifier(
    estimators=[
        ('logistic_regression', logistic_regression_model),
        ('decision_tree', decision_tree_model),
        ('random_forest', random_forest_model),
    ],
    voting='hard',
)


In [14]:
# Train the ensemble on the SMOTE-resampled training set
voting_classifier.fit(X=smote_X, y=smote_y)

In [15]:
# Predict labels for the held-out test features (these were never resampled)
pred_y = voting_classifier.predict(X=test_X)

In [16]:
# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=test_y, y_pred=pred_y)
print(f"Ensemble Model Accuracy: {accuracy}")

Ensemble Model Accuracy: 0.775844117209579
