In [44]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from tqdm.notebook import tqdm

In [45]:
df = pd.read_csv("train.csv")


In [46]:
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

In [47]:
df["Embarked"] = df["Embarked"].fillna("S")
emb_dummies = pd.get_dummies(df["Embarked"], prefix="Embarked", drop_first=True)


In [48]:
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 
                                   'Sir','Jonkheer','Dona'],'Rare')
df['Title'] = df['Title'].map({'Mr':0,'Miss':1,'Mrs':2,'Master':3,'Rare':4})
df['Title'] = df['Title'].fillna(0)

  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


In [49]:
# FamilySize
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

In [50]:
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'FamilySize', 'Title']].join(emb_dummies)
y = df['Survived']


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41, stratify=y)


In [52]:
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'tree_method': 'hist',
        'device': 'cuda',
        'random_state': 41,
        'n_jobs': -1
    }
    
    model = xgb.XGBClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    return scores.mean()


In [53]:
from tqdm import tqdm

n_trials = 50
study = optuna.create_study(direction='maximize', study_name="XGBoost_Titanic")

for _ in tqdm(range(n_trials), desc="Optuna Trials"):
    study.optimize(objective, n_trials=1, show_progress_bar=False)


[I 2025-12-05 11:41:53,230] A new study created in memory with name: XGBoost_Titanic
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2025-12-05 11:41:58,223] Trial 0 finished with value: 0.7993806451612905 and parameters: {'n_estimators': 480, 'learning_rate': 0.025348545473143025, 'max_depth': 7, 'subsample': 0.6153218338601127, 'colsample_bytree': 0.5412839753871777, 'reg_alpha': 6.574295220334044e-06, 'reg_lambda': 0.17420322308870234}. Best is trial 0 with value: 0.7993806451612905.
Optuna Trials:   2%|▏         | 1/50 [00:04<04:04,  4.99s/it][I 2025-12-05 11:42:14,678] Trial 1 finished with value: 0.8025935483870968 and parameters: {'n_estimators': 1078, 'learning_rate': 0.004019859388339389, 'max_depth': 13, 'subsample': 0.7786069757211693, 'colsample_bytree': 0.9525973621915714, 'reg_alpha': 0.04283174646848456, 'reg_lambda': 0.16599556804179388}.

In [54]:
print("\n--- Risultati Ottimizzazione ---")
print(f"Miglior Trial (Tentativo #{study.best_trial.number})")
print(f"Accuratezza Migliore (CV): {study.best_value:.4f}")
print("Migliori Iperparametri:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")



--- Risultati Ottimizzazione ---
Miglior Trial (Tentativo #19)
Accuratezza Migliore (CV): 0.8202
Migliori Iperparametri:
  n_estimators: 882
  learning_rate: 0.07144478795218712
  max_depth: 11
  subsample: 0.8711115449940225
  colsample_bytree: 0.8822594638701202
  reg_alpha: 2.518522562541505
  reg_lambda: 0.011206403555939475


In [55]:
best_params = study.best_params
final_model = xgb.XGBClassifier(**best_params, tree_method='gpu_hist', predictor='gpu_predictor',
                                random_state=41, n_jobs=-1)
final_model.fit(X_train, y_train)
final_acc = final_model.score(X_test, y_test)
print(f"\nAccuratezza Finale sul Test Set: {final_acc:.4f}")

XGBoostError: Invalid Input: 'gpu_hist', valid values are: {'approx', 'auto', 'exact', 'hist'}