# Solutions IV: Model Tuning with GridSearch

In [None]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

## Load Data

In [None]:
# Load the banking data set; the CSV file uses ";" as field separator.
data_path = "../../0_data/banking/bank-additional-full.csv"
df = pd.read_csv(data_path, sep=";")

# Preview the first three rows.
df.head(3)

In [None]:
# Split features and labels.
# "duration" and "pdays" are left out of the feature matrix together with the
# target column "y".
X = df.drop(columns=["duration", "pdays", "y"])

# Encode the target as integers (yes -> 1, no -> 0).
# Series.map is used instead of Series.replace: replace only reaches an
# integer dtype through silent downcasting of the object column, which recent
# pandas versions deprecate. map builds the integer Series directly; any label
# other than "yes"/"no" becomes NaN instead of passing through unnoticed.
y = df["y"].map({"yes": 1, "no": 0})

## Create the model

In [None]:
# Collect the column labels per dtype family; they are used to route each
# column to the matching preprocessing step of the ColumnTransformer.
numerical = X.select_dtypes(include="number").columns
categorical = X.select_dtypes(include="object").columns

In [None]:
# Preprocessing: one-hot encode the categorical columns (as a dense array;
# categories not seen during fit are ignored instead of raising) and
# standardize the numerical columns.
categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numerical_scaler = StandardScaler()

transformer = ColumnTransformer(
    transformers=[
        ("encode_categorical", categorical_encoder, categorical),
        ("scale_numerical", numerical_scaler, numerical),
    ]
)

In [None]:
# Create the gradient boosting model: preprocessing followed by the classifier.
# random_state is fixed because HistGradientBoostingClassifier draws random
# numbers, e.g. for its internal train/validation split when early stopping is
# active (enabled automatically for data sets with more than 10,000 samples).
# Without a seed, repeated runs of the same configuration can score differently.
hgb = Pipeline(
    steps=[
        ("preparation", transformer),
        ("model", HistGradientBoostingClassifier(random_state=42)),
    ]
)

In [None]:
# Candidate values for each hyperparameter of the search.
max_iter_grid = [50, 100, 250]
learning_rate_grid = [0.1, 0.3, 0.5]

# Parameter search space.
# Keys follow the Pipeline convention <step>__<parameter>, so "model__..."
# addresses the estimator registered under the step name "model".
parameters = {
    "model__max_iter": max_iter_grid,
    "model__learning_rate": learning_rate_grid,
}

In [None]:
# Set up 5-fold cross-validation with shuffling.
# A fixed random_state makes the shuffled fold assignment identical on every
# run, which removes the splits as a source of run-to-run variation in the
# cross-validation scores.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Create GridSearch: every parameter combination is evaluated on each fold
# and scored by recall; n_jobs=-1 runs the fits on all available CPU cores.
gs = GridSearchCV(
    hgb,
    param_grid=parameters,
    scoring="recall",
    cv=kfold,
    n_jobs=-1,
)

In [None]:
# Perform GridSearch on the data.
# Fits the pipeline for every parameter combination on each cross-validation
# split of (X, y). As refit is left at GridSearchCV's default (True), the best
# combination is afterwards refit on the full data.
gs.fit(X, y)

In [None]:
# Display the GridSearch results (as DataFrame).
# cv_results_ is a dict of per-candidate arrays; wrapping it in a DataFrame
# gives one row per evaluated parameter combination.
pd.DataFrame(gs.cv_results_)

In [None]:
# Best parameter settings
# (the combination from the search space with the best mean cross-validated
# score, here recall).
gs.best_params_

In [None]:
# Best estimator instance
# (the pipeline configured with the best parameter settings; available because
# GridSearchCV refits by default after the search).
gs.best_estimator_