<a href="https://colab.research.google.com/github/NagaShivaChaitanya7/Bank-Marketing-Campaign-Term-Deposit-Prediction/blob/main/Bank_Marketing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
Bank Marketing — Professional Pipeline
Generated: 2025-09-05 06:27:04

This script reproduces the preprocessing, modeling, and evaluation pipeline used in the professional report.
It trains multiple models and saves results to console.
Run: python bank_marketing_professional_pipeline.py
Dependencies: pandas, numpy, scikit-learn, matplotlib, (optional) xgboost
Place 'bank-full (1).csv' in the same directory as this script before running.
"""

import os
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
try:
    from xgboost import XGBClassifier
    xgb_available = True
except Exception:
    xgb_available = False

def load_data(path='bank-full (1).csv', sample_n=10000):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Dataset not found at {path}. Place the CSV in the same folder.")
    df = pd.read_csv(path, sep=';')
    df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
    if 'y' not in df.columns:
        raise ValueError("Target column 'y' not found.")
    df['y'] = df['y'].map({'yes':1,'no':0}).astype(int)
    if len(df) > sample_n:
        df_sample, _ = train_test_split(df, train_size=sample_n, stratify=df['y'], random_state=42)
    else:
        df_sample = df
    return df_sample

def build_preprocessor(X):
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = [c for c in X.columns if c not in numeric_cols]
    pre = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
                             ('num', StandardScaler(), numeric_cols)])
    return pre, numeric_cols, categorical_cols

def main():
    df = load_data()
    X = df.drop(columns=['y']); y = df['y'].astype(int)
    pre, num_cols, cat_cols = build_preprocessor(X)
    # Models
    models = {
        'LogisticRegression': LogisticRegression(max_iter=500, class_weight='balanced', solver='liblinear'),
        'RandomForest': RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
    }
    if xgb_available:
        models['XGBoost'] = XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.05, use_label_encoder=False, eval_metric='logloss', random_state=42)

    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    results = []
    for name, clf in models.items():
        pipe = Pipeline([('pre', pre), ('clf', clf)])
        pipe.fit(Xtr, ytr)
        ypred = pipe.predict(Xte)
        yproba = pipe.predict_proba(Xte)[:,1] if hasattr(pipe, 'predict_proba') else pipe.decision_function(Xte)
        auc = roc_auc_score(yte, yproba)
        print(f'=== Model: {name} ===')
        print('ROC-AUC:', auc)
        print('Classification report:\n', classification_report(yte, ypred, digits=4))
        print('Confusion matrix:\n', confusion_matrix(yte, ypred))
        results.append((name, auc))
    print('\nBest by AUC:', sorted(results, key=lambda x: x[1], reverse=True)[0])

if __name__ == '__main__':
    main()

=== Model: LogisticRegression ===
ROC-AUC: 0.8901544850015971
Classification report:
               precision    recall  f1-score   support

           0     0.9626    0.8460    0.9005      1766
           1     0.3929    0.7521    0.5161       234

    accuracy                         0.8350      2000
   macro avg     0.6777    0.7991    0.7083      2000
weighted avg     0.8960    0.8350    0.8556      2000

Confusion matrix:
 [[1494  272]
 [  58  176]]
=== Model: RandomForest ===
ROC-AUC: 0.9100604001510002
Classification report:
               precision    recall  f1-score   support

           0     0.9113    0.9836    0.9461      1766
           1     0.6915    0.2778    0.3963       234

    accuracy                         0.9010      2000
   macro avg     0.8014    0.6307    0.6712      2000
weighted avg     0.8856    0.9010    0.8818      2000

Confusion matrix:
 [[1737   29]
 [ 169   65]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== Model: XGBoost ===
ROC-AUC: 0.9122564392949444
Classification report:
               precision    recall  f1-score   support

           0     0.9296    0.9649    0.9469      1766
           1     0.6287    0.4487    0.5237       234

    accuracy                         0.9045      2000
   macro avg     0.7792    0.7068    0.7353      2000
weighted avg     0.8944    0.9045    0.8974      2000

Confusion matrix:
 [[1704   62]
 [ 129  105]]

Best by AUC: ('XGBoost', np.float64(0.9122564392949444))
