In [None]:
import pandas as pd
# Load the training data; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("train.csv")

# Print the frame summary, then display the per-column count of distinct values as the cell output.
df.info()
df.nunique()

In [None]:
# Display descriptive statistics of df (pandas defaults).
df.describe()

In [None]:
# Display the number of missing values in each column of df.
df.isna().sum()

In [None]:
# Encode the passenger sex as an integer code (male -> 0, female -> 1).
# The dtype guard keeps the cell idempotent: Series.map() turns every value
# that is missing from the mapping into NaN, so mapping an already-encoded
# column a second time would silently wipe it out.
if not pd.api.types.is_numeric_dtype(df["Sex"]):
    df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

In [None]:
# Keep every known age and put the column mean wherever the age is missing.
df["Age"] = df["Age"].where(df["Age"].notna(), df["Age"].mean())

In [None]:
# Encode the embarkation port as a numeric code (S -> 0, C -> 1, Q -> 2).
# Any value outside the mapping (missing entries included) becomes NaN.
# The dtype guard keeps the cell idempotent: without it, a second run would
# map the already-encoded codes through the dict and turn them all into NaN.
if not pd.api.types.is_numeric_dtype(df["Embarked"]):
    df["Embarked"] = df["Embarked"].map({"S": 0, "C": 1, "Q": 2})

In [None]:
# Modelling frame: a new DataFrame with the same rows as df, minus the
# columns listed below (df itself is left unchanged).
dfd = df.drop(labels=["Cabin", "Name", "Ticket", "Fare"], axis="columns")

In [None]:
import matplotlib.pyplot as plt
import numpy as np


# Feature columns: every numeric column of dfd except the target
# ("Survived") and the row identifier ("PassengerId").
numeric_cols = dfd.select_dtypes(include=[np.number]).columns
numeric_cols = numeric_cols.drop(["Survived"])
numeric_cols = [col for col in numeric_cols if col != "PassengerId"]

# Dedicated, seeded generator: the figure is identical on every run and the
# global NumPy random state is left untouched.
jitter_rng = np.random.default_rng(0)

fig, ax = plt.subplots(figsize=(12, 6))

for i, col in enumerate(numeric_cols):
    data = dfd[col].dropna()
    if len(data) == 0:
        # Nothing to draw for a column that holds only missing values.
        continue

    # Horizontal jitter around the column's slot on the x axis so that
    # overlapping points remain distinguishable.
    x = jitter_rng.normal(i, 0.08, size=len(data))
    ax.scatter(x, data.values, alpha=0.6, s=10, color='blue')

ax.set_xticks(range(len(numeric_cols)))
ax.set_xticklabels(numeric_cols, rotation=45)
ax.set_title("Distribuzione valori con jitter (Prima della rimozione outlier)")
ax.set_ylabel("Valore")
fig.tight_layout()
plt.show()


In [None]:
# Display the number of missing values in each column of dfd.
dfd.isna().sum()

In [None]:
# Replace with NaN every value of dfd that lies outside the fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. The quartiles are read from df, which
# this cell never modifies, so re-running the cell yields the same fences.
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    # A zero IQR means the whole middle half of the column is one single
    # value (typical of sparse count columns). Both fences would collapse
    # onto that value and every other observation would be erased, so the
    # column is left as it is.
    if IQR == 0:
        continue

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    dfd[col] = np.where(dfd[col].between(lower, upper), dfd[col], np.nan)

In [None]:
from sklearn.model_selection import train_test_split


# Feature matrix (an independent copy of the selected dfd columns) and the
# target column taken from df.
X = dfd.loc[:, numeric_cols].copy()
y = df.loc[:, 'Survived']

# Hold out 30% of the rows for the final evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=41,
    test_size=0.3,
)

In [None]:
import xgboost as xgb
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

X = dfd[numeric_cols].copy()
y = df['Survived']

X = X.fillna(X.median())

# X_train / X_test were split off from the un-imputed features before this
# cell, so filling X alone never reaches the model. Fill the splits as well,
# with medians learned from the training rows only so that no information
# from the held-out rows leaks into the imputation.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)


In [None]:
def objective(trial):
    """Score one Optuna trial with cross-validated accuracy.

    Args:
        trial: Optuna trial used to draw the hyper-parameter values.

    Returns:
        Mean accuracy of an ``XGBClassifier`` over 5 stratified, shuffled
        folds of the module-level ``X_train`` / ``y_train``.
    """
    # Values drawn by the trial. The suggest_* calls run in the order they
    # are written here.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.1, 0.5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.8),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 9.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 8.0, log=True),
    }
    # Settings that are identical for every trial.
    fixed = {
        'objective': 'binary:logistic',
        'n_jobs': -1,
        'random_state': 41,
        'tree_method': 'hist',
        'device': 'cuda',
    }

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    classifier = xgb.XGBClassifier(**fixed, **sampled)
    fold_scores = cross_val_score(
        classifier, X_train, y_train, cv=folds, scoring='accuracy'
    )
    return fold_scores.mean()


# Seed the sampler so the sequence of suggested hyper-parameters (and hence
# the reported best trial) can be reproduced from one run to the next.
study = optuna.create_study(
    direction='maximize',
    study_name="XGBoost_Titanic",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)


print("\n--- Risultati Ottimizzazione ---")
print(f"Miglior Trial (Tentativo #{study.best_trial.number})")
print(f"Accuratezza Migliore (CV): {study.best_value:.4f}")
print("Migliori Iperparametri:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

best_params = study.best_params
# study.best_params only contains the values suggested by the trial. The
# fixed settings used by objective() are passed again explicitly so that the
# final model is the very configuration whose CV score is reported above.
final_model = xgb.XGBClassifier(
    **best_params,
    objective='binary:logistic',
    tree_method='hist',
    device='cuda',
    n_jobs=-1,
    random_state=41,
)
final_model.fit(X_train, y_train)
# Accuracy on the held-out rows, which take no part in the search above.
final_acc = final_model.score(X_test, y_test)
print(f"\nAccuratezza Finale sul Test Set: {final_acc:.4f}")
