# Wstęp do analizy danych i uczenia maszynowego

## 5. Optymalizacja hiperparametrów (HPO) i walidacja krzyżowa (CV) w scikit-learn

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from seaborn import load_dataset
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Load seaborn's "titanic" example dataset and preview its first five rows.
df = load_dataset(name="titanic")
df.head(n=5)

In [None]:
# Count the missing values in each column.
df.isna().sum(axis=0)

In [None]:
# Separate the predictors from the target column "survived".
# NOTE(review): "alive" and "pclass" are presumably dropped because they
# duplicate "survived" (target leakage) and "class" respectively - confirm
# against the dataset.
# `columns=` already identifies the axis, so the redundant `axis=1` is omitted.
X = df.drop(columns=["survived", "alive", "pclass"])
y = df["survived"]

In [None]:
# Step 1: hold out 10% of the data as the final test set, stratified on the
# target so the class balance is preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# Step 2: carve the validation set out of the remaining 90%.
# 2/9 of 90% is 20% of the full data, leaving roughly 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=2 / 9,
    stratify=y_train,
    random_state=42,
)

In [None]:
# Columns routed to the numeric preprocessing branch.
numeric_features = [
    "age",
    "fare",
    "sibsp",
    "parch",
]

# Columns routed to the categorical (impute + one-hot) branch.
categorical_features = [
    "sex",
    "embarked",
    "class",
    "who",
    "adult_male",
    "deck",
    "embark_town",
    "alone",
]

In [None]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform
# time instead of raising.
categorical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own column group and concatenate the results.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

In [None]:
# Baseline model: shared preprocessing followed by a random forest that keeps
# its default hyperparameters (only the seed is fixed).
baseline_model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

# Fit on the training split, then take the probability column at index 1
# for the validation split and score it with ROC AUC.
y_val_pred = baseline_model.fit(X_train, y_train).predict_proba(X_val)[:, 1]
baseline_roc_auc = roc_auc_score(y_true=y_val, y_score=y_val_pred)
print(f"Baseline ROC AUC: {baseline_roc_auc:.4f}")

In [None]:
# Hyperparameter tuning
#
# The "classifier__" prefix routes each parameter to the pipeline step named
# "classifier". The grid spans 3 * 4 * 3 * 3 = 108 combinations, each of which
# is scored with 5-fold cross-validation.
param_grid = {
    "classifier__n_estimators": [50, 100, 200],
    "classifier__max_depth": [3, 5, 7, 9],
    "classifier__min_samples_split": [2, 4, 8],
    "classifier__min_samples_leaf": [1, 2, 4],
}

# The baseline pipeline serves as the template estimator for the search.
grid_search = GridSearchCV(
    baseline_model,
    param_grid,
    cv=5,
    scoring="roc_auc",
    verbose=1,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [None]:
# Hyperparameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Mean cross-validated score (ROC AUC, per the `scoring` setting) of the best combination.
grid_search.best_score_

In [None]:
# Compare the untuned baseline with the tuned pipeline on the held-out test split.
best_model = grid_search.best_estimator_

# Baseline: score the probability column at index 1.
y_test_pred_baseline = baseline_model.predict_proba(X_test)[:, 1]
baseline_test_roc_auc = roc_auc_score(y_true=y_test, y_score=y_test_pred_baseline)

# Tuned model: same metric on the same rows.
y_test_pred_best = best_model.predict_proba(X_test)[:, 1]
best_test_roc_auc = roc_auc_score(y_true=y_test, y_score=y_test_pred_best)

print(f"Baseline Test ROC AUC: {baseline_test_roc_auc:.4f}")
print(f"Best Model Test ROC AUC: {best_test_roc_auc:.4f}")

Po więcej szczegółów warto zajrzeć do dokumentacji scikit-learn:
- [Grid Search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
- [Randomized Search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

### Praca domowa

Wykorzystaj wiedzę z tego wykładu i laboratoriów, aby przeprowadzić optymalizację hiperparametrów (HPO) w swoim projekcie.