In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

In [2]:
# Data source (Kaggle "Bank Marketing Dataset"):
#   https://www.kaggle.com/datasets/janiobachmann/bank-marketing-dataset
# A local copy could be read instead, e.g. pd.read_csv("bank.csv", sep=';'),
# but this notebook pulls the Kaggle copy through kagglehub.
import os
import kagglehub

# dataset_download yields the location of the dataset files; bank.csv is
# looked up inside it below.
path = kagglehub.dataset_download("janiobachmann/bank-marketing-dataset")
print("Path to dataset files:", path)

# This copy of bank.csv is read as comma-separated.
file_path = os.path.join(path, "bank.csv")
data = pd.read_csv(file_path, sep=',')

Using Colab cache for faster access to the 'bank-marketing-dataset' dataset.
Path to dataset files: /kaggle/input/bank-marketing-dataset


In [3]:
import joblib

# Encode the target variable ('yes' -> 1, 'no' -> 0).
# The mapping is guarded so that re-running this cell is safe: once `deposit`
# is numeric, applying the yes/no map again would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['deposit']):
    deposit_encoded = data['deposit'].map({'yes': 1, 'no': 0})
    # Series.map leaves NaN for any label missing from the dict; fail loudly
    # instead of letting NaN targets flow into the split and the models.
    unmapped = deposit_encoded.isna()
    if unmapped.any():
        bad_labels = data.loc[unmapped, 'deposit'].unique().tolist()
        raise ValueError(
            f"Unexpected values in 'deposit' (expected 'yes'/'no'): {bad_labels}"
        )
    data['deposit'] = deposit_encoded

# One-hot encoding for categorical features (first level of each dropped)
data = pd.get_dummies(data, drop_first=True)

# Split features and target
X = data.drop('deposit', axis=1)
y = data['deposit']

# Persist the feature column order so the same layout can be rebuilt when
# the saved models are used outside this notebook.
joblib.dump(X.columns.tolist(), "feature_columns.pkl")

# ---------------------------------------
# 3. Train-Test Split
# ---------------------------------------
# 75/25 split, stratified on the target, with a fixed seed.
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Feature scaling: the scaler is fitted on the training split only and the
# fitted statistics are then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# ---------------------------------------
# 4. Define Models
# ---------------------------------------
# Display name -> unfitted estimator. The display names are also used later
# to derive the file names of the saved models.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # ignores it and only emits a "Parameters ... are not used" warning.
    "XGBoost": XGBClassifier(
        eval_metric='logloss',
        random_state=42
    )
}

# ---------------------------------------
# 5. Train, Predict & Evaluate
# ---------------------------------------
# Metrics computed from hard class predictions, in the order they appear in
# each result row after accuracy and AUC.
label_metrics = [precision_score, recall_score, f1_score, matthews_corrcoef]

results = []

for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the score for the positive class (label 1).
    y_prob = model.predict_proba(X_test)[:, 1]

    # Row layout: model name, accuracy, AUC (from scores), then label metrics.
    row = [name, accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_prob)]
    row.extend(metric(y_test, y_pred) for metric in label_metrics)
    results.append(row)

# ---------------------------------------
# 6. Results Table
# ---------------------------------------
# Column names follow the order in which each result row was assembled.
result_columns = [
    "Model", "Accuracy", "AUC",
    "Precision", "Recall", "F1 Score", "MCC",
]
results_df = pd.DataFrame(results, columns=result_columns)

print(results_df)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


                 Model  Accuracy       AUC  Precision    Recall  F1 Score  \
0  Logistic Regression  0.828377  0.906653   0.831108  0.800303  0.815414   
1        Decision Tree  0.796489  0.795468   0.790447  0.776097  0.783206   
2                  KNN  0.778932  0.841185   0.800512  0.710287  0.752705   
3          Naive Bayes  0.715872  0.807618   0.800910  0.532526  0.639709   
4        Random Forest  0.857757  0.921905   0.825934  0.886536  0.855162   
5              XGBoost  0.855249  0.921720   0.831647  0.870651  0.850702   

        MCC  
0  0.655568  
1  0.591562  
2  0.557361  
3  0.444382  
4  0.717499  
5  0.711075  


In [4]:
import joblib

# Save models: one pickle per estimator, named after its display name
# (spaces -> underscores, lower-cased), e.g. "Random Forest" -> random_forest.pkl
for name, model in models.items():
    filename = f"{name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, filename)

# Save scaler: the fitted StandardScaler is stored alongside the models.
joblib.dump(scaler, "scaler.pkl")

print("All models and scaler saved successfully.")

All models and scaler saved successfully.
