In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import optuna
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

num_trials = 100;

# Load data
shear_strength = pd.read_csv('shear_strength.csv', header=None)
new_or_old = pd.read_csv('new_or_old.csv', header=None)
load_type = pd.read_csv('load_type.csv', header=None)
wall_l = pd.read_csv('wall_l.csv', header=None)
wall_h = pd.read_csv('wall_h.csv', header=None)
wall_t = pd.read_csv('wall_t.csv', header=None)
leaf_num = pd.read_csv('leaf_num.csv', header=None)
bond_pattern = pd.read_csv('bond_pattern.csv', header=None)
ft_mortar = pd.read_csv('ft_mortar.csv', header=None)
ft_brick = pd.read_csv('ft_brick.csv', header=None)

# Combine features and target
X = pd.concat([new_or_old, wall_l, wall_t, leaf_num, bond_pattern, ft_mortar, ft_brick], axis=1)
X.columns = ['new_or_old', 'wall_l', 'wall_t', 'leaf_num', 'bond_pattern', 'ft_mortar', 'ft_brick']
y_raw = shear_strength
y_raw.columns = ['shear_strength']
y = np.log(y_raw)  # log-transform (natural log)
y.columns = ['shear_strength']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

trial_results = []

# Optuna objective function
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 1000)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.5, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 200)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])
    subsample = trial.suggest_float("subsample", 0.5, 1.0)

    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        subsample=subsample,
        random_state=0
    )

    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=cv, scoring="neg_mean_squared_error")
    return -scores.mean()

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=num_trials)

# Best parameters
print("Best parameters found by Optuna:", study.best_params)

trial_results = sorted(
    [(t.number, t.value, t.params) for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE],
    key=lambda x: x[1]
)[:10]

print("\nTop 10 Hyperparameter Combinations and Scores:")
for rank, (trial_number, mse, params) in enumerate(trial_results, start=1):
    model = GradientBoostingRegressor(
        n_estimators=params["n_estimators"],
        learning_rate=params["learning_rate"],
        max_depth=params["max_depth"],
        min_samples_split=params["min_samples_split"],
        min_samples_leaf=params["min_samples_leaf"],
        max_features=params["max_features"],
        subsample=params["subsample"],
        random_state=0
    )
    
    model.fit(X_train, y_train.values.ravel())

    y_train_pred_log = model.predict(X_train)
    y_test_pred_log = model.predict(X_test)

    y_train_pred = np.exp(y_train_pred_log)
    y_test_pred = np.exp(y_test_pred_log)

    y_train_np = y_raw.iloc[y_train.index].to_numpy().squeeze()
    y_test_np = y_raw.iloc[y_test.index].to_numpy().squeeze()
    
    # Calculate metrics for training set
    r2_train = r2_score(y_train_np, y_train_pred)
    rmse_train = root_mean_squared_error(y_train_np, y_train_pred)
    ratio_train = y_train_np / y_train_pred
    mean_ratio_train = np.mean(ratio_train)
    cov_ratio_p_train = np.cov(ratio_train, rowvar=False, ddof=1)
    cov_ratio_p_train = cov_ratio_p_train * 100
    
    # Calculate metrics for testing set
    r2_test = r2_score(y_test_np, y_test_pred)
    rmse_test = root_mean_squared_error(y_test_np, y_test_pred)
    ratio_test = y_test_np / y_test_pred
    mean_ratio_test = np.mean(ratio_test)
    cov_ratio_p_test = np.cov(ratio_test, rowvar=False, ddof=1)
    cov_ratio_p_test = cov_ratio_p_test * 100
    
    # Print results
    print(f"Rank {rank}: Trial {trial_number}")
    print(f"Parameters: {params}")
    print(f"Training Set - R²: {r2_train:.3f}, RMSE: {rmse_train:.3f}, mean_ratio: {mean_ratio_train:.3f}, cov_ratio(%): {cov_ratio_p_train:.1f}")
    print(f"Testing Set  - R²: {r2_test:.3f}, RMSE: {rmse_test:.3f}, mean_ratio: {mean_ratio_test:.3f}, cov_ratio(%): {cov_ratio_p_test:.1f}\n")
    

[I 2025-04-03 21:13:12,317] A new study created in memory with name: no-name-dced0524-b3b0-462f-931b-24bd4db19a02
[I 2025-04-03 21:13:27,054] Trial 0 finished with value: 0.2738937680152601 and parameters: {'n_estimators': 878, 'learning_rate': 0.13871691111084622, 'max_depth': 195, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'subsample': 0.7980934688130048}. Best is trial 0 with value: 0.2738937680152601.
[I 2025-04-03 21:13:30,028] Trial 1 finished with value: 0.8243387379660249 and parameters: {'n_estimators': 188, 'learning_rate': 0.003963340237820319, 'max_depth': 83, 'min_samples_split': 4, 'min_samples_leaf': 19, 'max_features': 'log2', 'subsample': 0.7217200969096822}. Best is trial 0 with value: 0.2738937680152601.
[I 2025-04-03 21:13:43,496] Trial 2 finished with value: 0.6539725883445409 and parameters: {'n_estimators': 873, 'learning_rate': 0.004652245670197514, 'max_depth': 187, 'min_samples_split': 16, 'min_samples_leaf': 18, 'max_features': 'sq

Best parameters found by Optuna: {'n_estimators': 825, 'learning_rate': 0.012782286566708418, 'max_depth': 177, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'subsample': 0.9990257725293431}

Top 10 Hyperparameter Combinations and Scores:
Rank 1: Trial 65
Parameters: {'n_estimators': 825, 'learning_rate': 0.012782286566708418, 'max_depth': 177, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'subsample': 0.9990257725293431}
Training Set - R²: 0.951, RMSE: 0.097, mean_ratio: 1.006, cov_ratio(%): 1.1
Testing Set  - R²: 0.865, RMSE: 0.170, mean_ratio: 1.066, cov_ratio(%): 18.9

Rank 2: Trial 66
Parameters: {'n_estimators': 731, 'learning_rate': 0.014189768014958177, 'max_depth': 178, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'subsample': 0.9811745010443043}
Training Set - R²: 0.951, RMSE: 0.097, mean_ratio: 1.006, cov_ratio(%): 1.1
Testing Set  - R²: 0.863, RMSE: 0.172, mean_ratio: 1.054, cov_ratio(%): 17.8

R