In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization

# Carica il dataset
xlsx_file = 'dataset/richerDataset.xlsx'
df = pd.read_excel(xlsx_file)

# Pulisci il dataset
df = df[df['k_m2'] != 0]
df = df.drop(['Porosity_Tot', 'SSA_Tot'], axis=1)
df.reset_index(drop=True, inplace=True)

# Trasforma la colonna 'k_m2' con log10
df['k_m2'] = np.log10(df['k_m2'])

# Scala i dati
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_data, columns=df.columns)

# Prepara le feature e i target
features_X = df_scaled.drop(['k_m2'], axis=1)
targets_X = df_scaled[['k_m2']]

# Funzione per valutare il modello
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return r2, mse, mae, rmse, y_pred

# Funzione di cross-validation
def cross_val_model(model, X, y, cv):
    scores = cross_val_score(model, X, y.values.ravel(), cv=cv, scoring='r2')
    return scores.mean(), scores.std()

# Parametri per l'ottimizzazione bayesiana
def xgb_evaluate(max_depth, learning_rate, n_estimators, gamma, min_child_weight, subsample, colsample_bytree, random_state):
    params = {
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'random_state': int(random_state)
    }
    xgb = XGBRegressor(**params)
    cv_result = cross_val_model(xgb, features_X, targets_X, cv=KFold(n_splits=5))
    return cv_result[0]  # Return the mean R^2 score

# Definizione dei parametri da ottimizzare
param_bounds = {
    'max_depth': (1, 25),
    'learning_rate': (0.01, 0.5),
    'n_estimators': (50, 500),
    'gamma': (0, 0.5),
    'min_child_weight': (1, 20),
    'subsample': (0.3, 1.0),
    'colsample_bytree': (0.6, 1.0),
    'random_state': (1, 100)
}

# Ottimizzazione bayesiana
optimizer = BayesianOptimization(f=xgb_evaluate, pbounds=param_bounds, random_state=50)
optimizer.maximize(init_points=10, n_iter=1000)

# Migliori iperparametri
best_params = optimizer.max['params']
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])

print(f"Migliori iperparametri trovati: {best_params}")

# Train the model with the best parameters
final_model = XGBRegressor(**best_params)
final_model.fit(features_X, targets_X.values.ravel())

# Split dei dati per valutazione finale
X_train, X_test, y_train, y_test = train_test_split(features_X, targets_X, test_size=0.2, random_state=50)
final_model.fit(X_train, y_train)
r2, mse, mae, rmse, y_pred = evaluate_model(final_model, X_test, y_test)

print(f"R^2: {r2}")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")

# Plot dei valori reali vs. predetti
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Valori Reali')
plt.ylabel('Valori Predetti')
plt.title('Valori Reali vs. Valori Predetti')
plt.grid(True)
plt.show()


|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | random... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [30m1         | [30m-0.56     | [30m0.7978    | [30m0.114     | [30m0.1352    | [30m10.51     | [30m8.169     | [30m498.5     | [30m41.41     | [30m0.8403    |
| [30m2         | [30m-0.7941   | [30m0.9042    | [30m0.155     | [30m0.1798    | [30m9.442     | [30m3.764     | [30m487.7     | [30m91.01     | [30m0.692     |
| [30m3         | [30m-0.6373   | [30m0.7254    | [30m0.4441    | [30m0.3405    | [30m10.39     | [30m10.64     | [30m285.8     | [30m92.87     | [30m0.7       |
| [30m4         | [30m-1.057    | [30m0.8673    | [30m0.02613   | [30m0.1703    | [30m2.354     | [30m4.417     | [30m466.7     | [30m93.86     | [30m0.7999    |
| [30m5         | [30m-2.044    | [30m0.8931    | [30m0.2309    

XGBoostError: Invalid Parameter format for seed expect long but value='53.908405191152'