In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from tabulate import tabulate
import joblib

In [2]:
# Path to save
file_path = '../data/processed/final_bank_loan_data.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Age,Gender,Experience,Income,Family,CCAvg,Education,Mortgage,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,3,1,49.0,4,1.6,1,0,1,0,1,0,0.0,0
1,45,3,19,34.0,3,1.5,1,0,2,0,1,0,0.0,0
2,39,3,15,11.0,1,1.0,1,0,2,0,0,0,0.0,0
3,35,3,9,100.0,1,2.7,2,0,2,0,0,0,0.0,0
4,35,3,8,45.0,4,1.0,2,0,2,0,0,0,0.0,1


In [3]:
# Scale numerical features
scaler = StandardScaler()
numerical_features = ['Age', 'Experience', 'Income', 'CCAvg', 'Mortgage']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [4]:
df.head()

Unnamed: 0,Age,Gender,Experience,Income,Family,CCAvg,Education,Mortgage,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,-0.820716,3,-1.666881,-0.514747,4,-0.173929,1,-0.557249,1,0,1,0,0.0,0
1,-0.040774,3,-0.096378,-0.847215,3,-0.232277,1,-0.557249,2,0,1,0,0.0,0
2,-0.274757,3,-0.445379,-1.356999,1,-0.524017,1,-0.557249,2,0,0,0,0.0,0
3,-0.430745,3,-0.96888,0.615643,1,0.467898,2,-0.557249,2,0,0,0,0.0,0
4,-0.430745,3,-1.05613,-0.603405,4,-0.524017,2,-0.557249,2,0,0,0,0.0,1


In [5]:
# Split the data into training and testing sets
X = df.drop(columns=['Personal Loan'])
y = df['Personal Loan']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=69)

In [6]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(random_state=69),
    'Support Vector Machine': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=69)
}

In [7]:
# Train and evaluate models
results = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })

# Results in a table format
print(tabulate(results, headers='keys', tablefmt='pretty'))

+------------------------+--------------------+--------------------+--------------------+--------------------+
|         Model          |      Accuracy      |     Precision      |       Recall       |      F1-Score      |
+------------------------+--------------------+--------------------+--------------------+--------------------+
|  Logistic Regression   | 0.9646017699115044 | 0.9624897153971943 | 0.9646017699115044 |  0.96304094471213  |
|  K-Nearest Neighbors   | 0.9700476514635806 | 0.9693057795781328 | 0.9700476514635806 | 0.9671080210303664 |
|     Decision Tree      | 0.9782164737916951 | 0.9780331887919568 | 0.9782164737916951 | 0.9781187019910971 |
|     Random Forest      | 0.9877467665078284 | 0.9876455585004167 | 0.9877467665078284 | 0.9873358977748566 |
| Support Vector Machine | 0.9768550034036759 | 0.9767715188782845 | 0.9768550034036759 | 0.9750303703671679 |
|   Gradient Boosting    | 0.9877467665078284 | 0.9875264031769397 | 0.9877467665078284 | 0.9874597013347739 |
+

In [8]:
# Best model based on accuracy
best_model_info = max(results, key=lambda x: x['Accuracy'])
best_model_name = best_model_info['Model']
best_model = models[best_model_name]

print(f'Best Model: {best_model_name}')
print(f'Accuracy: {best_model_info["Accuracy"]:.4f}')
print(f'Precision: {best_model_info["Precision"]:.4f}')
print(f'Recall: {best_model_info["Recall"]:.4f}')
print(f'F1-Score: {best_model_info["F1-Score"]:.4f}')

Best Model: Random Forest
Accuracy: 0.9877
Precision: 0.9876
Recall: 0.9877
F1-Score: 0.9873


In [9]:
model_filename = '../models/bank_loan_model.pkl'
joblib.dump(best_model, model_filename)

['../models/bank_loan_model.pkl']