# Hyperparameter Tuning

This notebook:
- Tunes the Logistic Regression classifier using cross-validation
- Uses a full preprocessing + SMOTE pipeline
- Optimizes ROC-AUC score
- Saves the final tuned model

In [1]:
from pathlib import Path

import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score

## Load train-test split

In [2]:
# Feature matrices are kept as DataFrames so later steps can select columns by name.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/X_train.csv', '../data/X_test.csv')
)

# Targets are read the same way, then flattened to 1-D arrays.
y_train, y_test = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in ('../data/y_train.csv', '../data/y_test.csv')
)

## Define feature groups

In [3]:
# Columns that are standardised by the preprocessing step.
numeric_features = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
]

# Columns one-hot encoded with drop="if_binary" by the preprocessing step.
binary_features = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
    "MultipleLines",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]

# Columns one-hot encoded with drop="first" by the preprocessing step.
categorical_features = [
    "InternetService",
    "Contract",
    "PaymentMethod",
]

## Preprocessing pipeline

In [4]:
# One transformer per feature group; each is applied only to its own columns.
numeric_scaler = StandardScaler()
# drop="if_binary": two-category columns collapse to a single indicator column.
binary_encoder = OneHotEncoder(drop="if_binary")
# drop="first": the first category of each column is omitted from the encoding.
categorical_encoder = OneHotEncoder(drop="first")

# Entries are (step name, transformer, column list).
column_steps = [
    ("num", numeric_scaler, numeric_features),
    ("bin", binary_encoder, binary_features),
    ("cat", categorical_encoder, categorical_features),
]

preprocessor = ColumnTransformer(transformers=column_steps)

## Build full pipeline

In [5]:
# The imblearn Pipeline is used (rather than sklearn's) so that SMOTE can sit
# between preprocessing and the classifier as a resampling step.
oversampler = SMOTE(random_state=42)
classifier = LogisticRegression(random_state=42)

pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("smote", oversampler),
        ("model", classifier),
    ]
)

# Search space for the classifier; the "model__" prefix routes each
# parameter to the pipeline step named "model".
param_grid = {
    "model__C": [0.01, 0.1, 1, 10, 100],
    "model__penalty": ["l1", "l2"],
    "model__solver": ["liblinear", "saga"],
    "model__max_iter": [100, 200, 300, 400, 500],
}

## Best hyperparameters

In [6]:
# Exhaustive search over param_grid, scored by ROC-AUC with 5-fold CV,
# using all available cores.
search_settings = {
    "estimator": pipeline,
    "param_grid": param_grid,
    "scoring": "roc_auc",
    "cv": 5,
    "n_jobs": -1,
}
grid = GridSearchCV(**search_settings)

grid.fit(X_train, y_train)
print(f"Best parameters found: {grid.best_params_}")

Best parameters found: {'model__C': 0.1, 'model__max_iter': 100, 'model__penalty': 'l1', 'model__solver': 'liblinear'}


## Evaluate tuned model on test set

In [7]:
# Pipeline refit by the grid search with the best parameter combination.
best_pipeline = grid.best_estimator_

# Column 1 of predict_proba is the score used for ROC-AUC;
# hard labels feed the classification report.
y_proba = best_pipeline.predict_proba(X_test)[:, 1]
y_pred = best_pipeline.predict(X_test)

test_auc = roc_auc_score(y_test, y_proba)
print(f"Test ROC-AUC: {test_auc}")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Test ROC-AUC: 0.8339321119629757

Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.72      0.80      1033
           1       0.50      0.78      0.61       374

    accuracy                           0.73      1407
   macro avg       0.70      0.75      0.70      1407
weighted avg       0.79      0.73      0.75      1407



## Save final tuned model pipeline

In [8]:
# Persist the tuned pipeline (preprocessing + SMOTE + model) as a single artifact.
MODEL_PATH = "../models/final_pipeline.pkl"

# Create the target directory first: joblib.dump does not create missing
# directories, and failing here would discard the result of the grid search.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Left as the last expression so the cell still displays the written file list.
joblib.dump(best_pipeline, MODEL_PATH)

['../models/final_pipeline.pkl']