In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.tree import DecisionTreeRegressor
import optuna
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

num_trials = 100;

# Load data
shear_strength = pd.read_csv('shear_strength.csv', header=None)
new_or_old = pd.read_csv('new_or_old.csv', header=None)
load_type = pd.read_csv('load_type.csv', header=None)
wall_l = pd.read_csv('wall_l.csv', header=None)
wall_h = pd.read_csv('wall_h.csv', header=None)
wall_t = pd.read_csv('wall_t.csv', header=None)
leaf_num = pd.read_csv('leaf_num.csv', header=None)
bond_pattern = pd.read_csv('bond_pattern.csv', header=None)
ft_mortar = pd.read_csv('ft_mortar.csv', header=None)
ft_brick = pd.read_csv('ft_brick.csv', header=None)

# Combine features and target
X = pd.concat([new_or_old, wall_l, wall_t, leaf_num, bond_pattern, ft_mortar, ft_brick], axis=1)
X.columns = ['new_or_old', 'wall_l', 'wall_t', 'leaf_num', 'bond_pattern', 'ft_mortar', 'ft_brick']
y_raw = shear_strength
y_raw.columns = ['shear_strength']
y = np.log(y_raw)  # log-transform (natural log)
y.columns = ['shear_strength']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

trial_results = []

# Optuna objective function
def objective(trial):
    max_depth = trial.suggest_int("max_depth", 1, 200)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])
    
    model = DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=0
    )

    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=cv, scoring="neg_mean_squared_error")
    return -scores.mean()

# Run Optuna optimization
sampler = optuna.samplers.TPESampler(seed=5)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=num_trials)

# Best parameters
print("Best parameters found by Optuna:", study.best_params)

trial_results = sorted(
    [(t.number, t.value, t.params) for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE],
    key=lambda x: x[1]
)[:10] 

print("\nTop 10 Hyperparameter Combinations and Scores:")
for rank, (trial_number, mse, params) in enumerate(trial_results, start=1):
    model = DecisionTreeRegressor(
        max_depth=params["max_depth"],
        min_samples_split=params["min_samples_split"],
        min_samples_leaf=params["min_samples_leaf"],
        max_features=params["max_features"],
        random_state=0
    )
    
    model.fit(X_train, y_train.values.ravel())

    y_train_pred_log = model.predict(X_train)
    y_test_pred_log = model.predict(X_test)

    y_train_pred = np.exp(y_train_pred_log)
    y_test_pred = np.exp(y_test_pred_log)

    y_train_np = y_raw.iloc[y_train.index].to_numpy().squeeze()
    y_test_np = y_raw.iloc[y_test.index].to_numpy().squeeze()
    
    # Calculate metrics for training set
    r2_train = r2_score(y_train_np, y_train_pred)
    rmse_train = root_mean_squared_error(y_train_np, y_train_pred)
    ratio_train = y_train_np / y_train_pred
    mean_ratio_train = np.mean(ratio_train)
    cov_ratio_p_train = np.cov(ratio_train, rowvar=False, ddof=1)
    cov_ratio_p_train = cov_ratio_p_train * 100
    
    # Calculate metrics for testing set
    r2_test = r2_score(y_test_np, y_test_pred)
    rmse_test = root_mean_squared_error(y_test_np, y_test_pred)
    ratio_test = y_test_np / y_test_pred
    mean_ratio_test = np.mean(ratio_test)
    cov_ratio_p_test = np.cov(ratio_test, rowvar=False, ddof=1)
    cov_ratio_p_test = cov_ratio_p_test * 100
    
    # Print results
    print(f"Rank {rank}: Trial {trial_number}")
    print(f"Parameters: {params}")
    print(f"Training Set - R²: {r2_train:.3f}, RMSE: {rmse_train:.3f}, mean_ratio: {mean_ratio_train:.3f}, cov_ratio(%): {cov_ratio_p_train:.1f}")
    print(f"Testing Set  - R²: {r2_test:.3f}, RMSE: {rmse_test:.3f}, mean_ratio: {mean_ratio_test:.3f}, cov_ratio(%): {cov_ratio_p_test:.1f}\n")
    

[I 2025-04-14 10:09:28,896] A new study created in memory with name: no-name-0e50e6f1-a748-4282-8b30-e00c0dcc7f98
[I 2025-04-14 10:09:28,931] Trial 0 finished with value: 0.5207411564829705 and parameters: {'max_depth': 45, 'min_samples_split': 18, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.5207411564829705.
[I 2025-04-14 10:09:28,978] Trial 1 finished with value: 0.5097781504669585 and parameters: {'max_depth': 154, 'min_samples_split': 11, 'min_samples_leaf': 6, 'max_features': None}. Best is trial 1 with value: 0.5097781504669585.
[I 2025-04-14 10:09:29,025] Trial 2 finished with value: 0.7754149488454439 and parameters: {'max_depth': 89, 'min_samples_split': 5, 'min_samples_leaf': 18, 'max_features': 'log2'}. Best is trial 1 with value: 0.5097781504669585.
[I 2025-04-14 10:09:29,080] Trial 3 finished with value: 0.5830570601450449 and parameters: {'max_depth': 126, 'min_samples_split': 13, 'min_samples_leaf': 12, 'max_features': 'log2'}. Best is t

Best parameters found by Optuna: {'max_depth': 154, 'min_samples_split': 19, 'min_samples_leaf': 12, 'max_features': None}

Top 10 Hyperparameter Combinations and Scores:
Rank 1: Trial 37
Parameters: {'max_depth': 154, 'min_samples_split': 19, 'min_samples_leaf': 12, 'max_features': None}
Training Set - R²: 0.651, RMSE: 0.258, mean_ratio: 1.169, cov_ratio(%): 53.5
Testing Set  - R²: 0.617, RMSE: 0.287, mean_ratio: 1.264, cov_ratio(%): 45.9

Rank 2: Trial 43
Parameters: {'max_depth': 175, 'min_samples_split': 18, 'min_samples_leaf': 12, 'max_features': None}
Training Set - R²: 0.651, RMSE: 0.258, mean_ratio: 1.169, cov_ratio(%): 53.5
Testing Set  - R²: 0.617, RMSE: 0.287, mean_ratio: 1.264, cov_ratio(%): 45.9

Rank 3: Trial 44
Parameters: {'max_depth': 198, 'min_samples_split': 20, 'min_samples_leaf': 12, 'max_features': None}
Training Set - R²: 0.651, RMSE: 0.258, mean_ratio: 1.169, cov_ratio(%): 53.5
Testing Set  - R²: 0.617, RMSE: 0.287, mean_ratio: 1.264, cov_ratio(%): 45.9

Rank 4: