In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from tabulate import tabulate
import joblib

In [2]:
# Path to save
file_path = '../data/processed/final_bank_loan_data.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Age,Gender,Experience,Income,Family,CCAvg,Education,Mortgage,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,1,49.0,4,1.6,1,0,1,0,1,0,0.0,0
1,45,1,19,34.0,3,1.5,1,0,2,0,1,0,0.0,0
2,39,1,15,11.0,1,1.0,1,0,2,0,0,0,0.0,0
3,35,1,9,100.0,1,2.7,2,0,2,0,0,0,0.0,0
4,35,1,8,45.0,4,1.0,2,0,2,0,0,0,0.0,1


In [3]:
# Scale numerical features
scaler = StandardScaler()
numerical_features = ['Age', 'Experience', 'Income', 'CCAvg', 'Mortgage']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [4]:
df.head()

Unnamed: 0,Age,Gender,Experience,Income,Family,CCAvg,Education,Mortgage,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,-0.690057,1,-1.679035,-0.563392,4,-0.222532,1,-0.565221,1,0,1,0,0.0,0
1,-0.051473,1,-0.107944,-0.881798,3,-0.277885,1,-0.565221,2,0,1,0,0.0,0
2,-0.243048,1,-0.457075,-1.370023,1,-0.554645,1,-0.565221,2,0,0,0,0.0,0
3,-0.370765,1,-0.980772,0.519192,1,0.386341,2,-0.565221,2,0,0,0,0.0,0
4,-0.370765,1,-1.068055,-0.6483,4,-0.554645,2,-0.565221,2,0,0,0,0.0,1


In [5]:
# Split the data into training and testing sets
X = df.drop(columns=['Personal Loan'])
y = df['Personal Loan']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=69)

In [10]:
X.columns

Index(['Age', 'Gender', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Home Ownership', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

In [6]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(random_state=69),
    'Support Vector Machine': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=69)
}

In [7]:
# Train and evaluate models
results = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1-Score': report['weighted avg']['f1-score']
    })

# Results in a table format
print(tabulate(results, headers='keys', tablefmt='pretty'))

+------------------------+--------------------+--------------------+--------------------+--------------------+
|         Model          |      Accuracy      |     Precision      |       Recall       |      F1-Score      |
+------------------------+--------------------+--------------------+--------------------+--------------------+
|  Logistic Regression   | 0.9504950495049505 | 0.9485426518992197 | 0.9504950495049505 | 0.9491900150773754 |
|  K-Nearest Neighbors   | 0.9471947194719472 | 0.9464351313773138 | 0.9471947194719472 | 0.9410602543175728 |
|     Decision Tree      | 0.976897689768977  | 0.9785983816271535 | 0.976897689768977  | 0.9774326473210838 |
|     Random Forest      | 0.9834983498349835 | 0.9834297989750221 | 0.9834983498349835 |  0.98346112159915  |
| Support Vector Machine | 0.9691969196919692 | 0.9685086874188022 | 0.9691969196919692 | 0.9678085018696443 |
|   Gradient Boosting    | 0.9856985698569857 | 0.9857682064502746 | 0.9856985698569857 | 0.9857304566044089 |
+

In [8]:
# Best model based on accuracy
best_model_info = max(results, key=lambda x: x['Accuracy'])
best_model_name = best_model_info['Model']
best_model = models[best_model_name]

print(f'Best Model: {best_model_name}')
print(f'Accuracy: {best_model_info["Accuracy"]:.4f}')
print(f'Precision: {best_model_info["Precision"]:.4f}')
print(f'Recall: {best_model_info["Recall"]:.4f}')
print(f'F1-Score: {best_model_info["F1-Score"]:.4f}')

Best Model: Gradient Boosting
Accuracy: 0.9857
Precision: 0.9858
Recall: 0.9857
F1-Score: 0.9857


In [9]:
model_filename = '../models/bank_loan_model.pkl'
joblib.dump(best_model, model_filename)

['../models/bank_loan_model.pkl']