### 0. Setup

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd



### 1. Load Dataset


In [25]:
# Load the dataset
df = pd.read_csv(r"F:/Ironhack/ML_Project\data\bank-full-preprocessed.csv", sep=",")
df.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,total_contacts,has_previous_contact,pdays_category,has_any_loan,age_group,month_num,month_sin,month_cos,balance_log
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,1,-1,0,unknown,0,1,0,no_previous,1,55-64,5,0.5,-0.866025,7.670429
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,1,-1,0,unknown,0,1,0,no_previous,1,35-44,5,0.5,-0.866025,3.401197
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,1,-1,0,unknown,0,1,0,no_previous,1,25-34,5,0.5,-0.866025,1.098612
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,1,-1,0,unknown,0,1,0,no_previous,1,45-54,5,0.5,-0.866025,7.317876
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,1,-1,0,unknown,0,1,0,no_previous,0,25-34,5,0.5,-0.866025,0.693147


## 2. Encoding & Train/Test Split

Use `ColumnTransformer` + `OneHotEncoder` for categoricals; pass-through numerics.
We **stratify** by `y` due to class imbalance.

In [24]:
y = df['y']
X = df.drop(columns=['y'])
numeric_features = X.select_dtypes(include=['int64','float64','Int64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object','category','bool']).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ('num', 'passthrough', numeric_features)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train_proc = preprocess.fit_transform(X_train)
X_test_proc  = preprocess.transform(X_test)

ohe = preprocess.named_transformers_['cat'] if categorical_features else None
cat_names = (ohe.get_feature_names_out(categorical_features) if ohe is not None else np.array([]))
feature_names = np.concatenate([cat_names, np.array(numeric_features)])

pd.DataFrame(X_train_proc, columns=feature_names).head()



Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,contact_unknown,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,pdays_category_no_previous,age_group_25-34,age_group_35-44,age_group_45-54,age_group_55-64,age_group_65+,age_group_<25,age,balance,day,campaign,pdays,previous,total_contacts,has_previous_contact,has_any_loan,month_num,month_sin,month_cos,balance_log
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,36.0,861.0,29.0,2.0,-1.0,0.0,2.0,0.0,0.0,8.0,-0.866025,-0.5,6.759255
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,24.0,3462.0,5.0,4.0,-1.0,0.0,4.0,0.0,0.0,4.0,0.866025,-0.5,8.149891
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,44.0,244.0,12.0,4.0,-1.0,0.0,4.0,0.0,1.0,8.0,-0.866025,-0.5,5.501258
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,48.0,0.0,31.0,6.0,-1.0,0.0,6.0,0.0,0.0,7.0,-0.5,-0.866025,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,38.0,257.0,26.0,6.0,-1.0,0.0,6.0,0.0,0.0,8.0,-0.866025,-0.5,5.55296


## 3. Scaling

Scale numeric features.

In [4]:
# Example scaler to apply after preprocessing when training linear models
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

num_idx = list(range(len(cat_names), len(feature_names)))
normalizer = MinMaxScaler()

X_train_normalized = X_train_proc.copy()
X_train_normalized[:, num_idx] = normalizer.fit_transform(X_train_normalized[:, num_idx])
X_test_normalized = X_test_proc.copy()
X_test_normalized[:, num_idx] = normalizer.transform(X_test_normalized[:, num_idx])

pd.DataFrame(X_train_normalized, columns=feature_names).head()



Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,campaign,pdays,previous,total_contacts,has_previous_contact,has_any_loan,month_num,month_sin,month_cos,balance_log
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.2,0.0,0.0,0.2,0.0,0.0,0.636364,0.066987,0.25,0.93669
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.6,0.0,0.0,0.6,0.0,0.0,0.272727,0.933013,0.25,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.6,0.0,0.0,0.6,0.0,1.0,0.636364,0.066987,0.25,0.879418
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.545455,0.25,0.066987,0.628967
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.636364,0.066987,0.25,0.881772


### 4. Modeling

In [13]:
def evaluate_model(model, X_train_normalized, y_train, X_test_normalized, y_test):
    """Train model and return key metrics."""
    model.fit(X_train_normalized, y_train)
    y_pred = model.predict(X_test_normalized)
    y_prob = model.predict_proba(X_test_normalized)[:,1] if hasattr(model, "predict_proba") else None

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_prob) if y_prob is not None else None
    }
    return metrics


In [14]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42, class_weight="balanced"),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight="balanced"),
    "AdaBoost": AdaBoostClassifier(random_state=42, algorithm="SAMME")}


In [15]:
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    metrics = evaluate_model(model, X_train_proc, y_train, X_test_proc, y_test)
    results[name] = metrics

# Convert to DataFrame for easy viewing
results_df = pd.DataFrame(results).T.sort_values(by="ROC-AUC", ascending=False)
print (results_df)


Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Random Forest...
Training Gradient Boosting...
Training Decision Tree...
Training AdaBoost...
                     Accuracy  Precision    Recall        F1   ROC-AUC
Gradient Boosting    0.894504   0.652941  0.209830  0.317597  0.799685
Random Forest        0.893398   0.627027  0.219282  0.324930  0.787740
Logistic Regression  0.764901   0.281148  0.648393  0.392224  0.777041
AdaBoost             0.893841   0.671329  0.181474  0.285714  0.764749
Decision Tree        0.837664   0.307692  0.310019  0.308851  0.608798


In [16]:
importances = pd.Series(models["Random Forest"].feature_importances_, index=feature_names)
importances.sort_values(ascending=False).head(10)


balance             0.103426
day                 0.101281
balance_log         0.098602
age                 0.097574
poutcome_success    0.040074
campaign            0.035759
total_contacts      0.035738
month_sin           0.025120
poutcome_unknown    0.024663
month_num           0.024380
dtype: float64

In [17]:
importances = pd.Series(models["Gradient Boosting"].feature_importances_, index=feature_names)
importances.sort_values(ascending=False).head(10)


poutcome_success    0.402748
age                 0.083216
month_sin           0.067269
has_any_loan        0.056117
day                 0.049666
contact_unknown     0.043903
month_oct           0.040685
month_cos           0.033403
month_jun           0.031786
month_mar           0.025316
dtype: float64

In [9]:
coefs = pd.Series(models["Logistic Regression"].coef_[0], index=feature_names)
coefs.sort_values(ascending=False).head(10)


poutcome_success     1.680045
month_oct            0.794307
month_mar            0.790889
month_sep            0.603790
contact_cellular     0.484552
age_group_65+        0.484121
age_group_<25        0.474052
month_sin            0.270280
job_retired          0.238433
contact_telephone    0.226011
dtype: float64

#### 4.1 Grid Search

In [None]:
# Create stratified k-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics
scoring = {
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1",
    "roc_auc": "roc_auc"
}

pipe = Pipeline([
    ("prep", preprocess),
    ("clf", None)
])

In [None]:
# Logistic Regression grid and base model
logreg_grid = {
    "clf": [LogisticRegression(max_iter=5000, solver="saga", class_weight="balanced", n_jobs=-1, random_state=42)],
    "clf__penalty": ["l2", "elasticnet"],
    "clf__C": [1.0, 0.5, 0.25, 0.1],
    "clf__l1_ratio": [0.0, 0.5, 0.9],
}

In [None]:
# Random Forest grid and base model
rf_grid = {
    "clf": [RandomForestClassifier(n_estimators=400, random_state=42, class_weight="balanced_subsample", n_jobs=-1)],
    "clf__max_depth": [None, 8, 12, 20],
    "clf__min_samples_split": [2, 10, 20],
    "clf__min_samples_leaf": [1, 3, 5],
    "clf__max_features": ["sqrt", "log2", 0.5],
}

In [None]:
# Gradient Boosting grid and base model
gb_grid = {
    "clf": [GradientBoostingClassifier(random_state=42)],
    "clf__n_estimators": [200, 400],
    "clf__learning_rate": [0.05, 0.1],
    "clf__max_depth": [2, 3, 4],
    "clf__subsample": [0.7, 1.0],
    "clf__max_features": [None, "sqrt"],
}

In [None]:
# Adjust scale_pos_weight based on class imbalance
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
spw = max(1.0, neg / max(1, pos))  # imbalance adjustment

# XGBoost grid and base model
xgb_base = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=spw
)

xgb_grid = {
    "clf": [xgb_base],
    "clf__n_estimators": [300, 500],
    "clf__learning_rate": [0.05, 0.1, 0.2],
    "clf__max_depth": [3, 5, 7],
    "clf__subsample": [0.7, 1.0],
    "clf__colsample_bytree": [0.7, 1.0],
    "clf__min_child_weight": [1, 3, 5],
    "clf__reg_lambda": [1, 5, 10],
}

In [None]:
# Function to run grid search and display results
def run_grid(grid, name):
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring=scoring,
        refit="roc_auc",
        cv=cv,
        n_jobs=-1,
        verbose=1,
        return_train_score=False
    )
    gs.fit(X_train, y_train)
    
    print(f"\n=== {name} — Best ROC-AUC: {gs.best_score_:.4f}")
    print("Best params:", gs.best_params_)
    
    y_pred = gs.predict(X_test)
    y_proba = gs.predict_proba(X_test)[:, 1]
    
    print("\nTest metrics:")
    print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
    print(f"F1:        {f1_score(y_test, y_pred):.4f}")
    print(f"ROC-AUC:   {roc_auc_score(y_test, y_proba):.4f}")
    print("\nClassification report:\n", classification_report(y_test, y_pred))
    
    return gs


Run four models × multiple parameter combinations × cross-validation

In [43]:
gs_logreg = run_grid(logreg_grid, "Logistic Regression")
gs_rf     = run_grid(rf_grid,     "Random Forest")
gs_gb     = run_grid(gb_grid,     "Gradient Boosting")
gs_xgb    = run_grid(xgb_grid,    "XGBoost")


Fitting 5 folds for each of 24 candidates, totalling 120 fits





=== Logistic Regression — Best ROC-AUC: 0.7545
Best params: {'clf': LogisticRegression(class_weight='balanced', max_iter=5000, n_jobs=-1,
                   random_state=42, solver='saga'), 'clf__C': 1.0, 'clf__l1_ratio': 0.0, 'clf__penalty': 'l2'}

Test metrics:
Accuracy:  0.6960
Precision: 0.2330
Recall:    0.6975
F1:        0.3493
ROC-AUC:   0.7665

Classification report:
               precision    recall  f1-score   support

           0       0.95      0.70      0.80      7985
           1       0.23      0.70      0.35      1058

    accuracy                           0.70      9043
   macro avg       0.59      0.70      0.58      9043
weighted avg       0.86      0.70      0.75      9043

Fitting 5 folds for each of 108 candidates, totalling 540 fits


KeyboardInterrupt: 

In [None]:
# Compare all models and select the best
models_all = [
    (gs_logreg, "LogReg"),
    (gs_rf, "RF"),
    (gs_gb, "GB"),
    (gs_xgb, "XGB")
]
# Select best by ROC-AUC
best = max(models_all, key=lambda t: t[0].best_score_)
print("\n🏆 Overall Best Model by CV ROC-AUC:", best[1], f"({best[0].best_score_:.4f})")
final_clf = best[0].best_estimator_


In [None]:
# Get feature names
cat_names = preprocess.named_transformers_["cat"].named_steps["ohe"].get_feature_names_out(categorical_features)
feature_names = np.concatenate([cat_names, np.array(numeric_features, dtype=object)])

# For RF, GB, or XGB
clf = final_clf.named_steps["clf"]
if hasattr(clf, "feature_importances_"):
    importances = pd.Series(clf.feature_importances_, index=feature_names).sort_values(ascending=False)
    display(importances.head(15))
