In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.svm import SVR
import optuna
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

num_trials = 100;

# Load data
shear_strength = pd.read_csv('shear_strength.csv', header=None)
new_or_old = pd.read_csv('new_or_old.csv', header=None)
load_type = pd.read_csv('load_type.csv', header=None)
wall_l = pd.read_csv('wall_l.csv', header=None)
wall_h = pd.read_csv('wall_h.csv', header=None)
wall_t = pd.read_csv('wall_t.csv', header=None)
leaf_num = pd.read_csv('leaf_num.csv', header=None)
bond_pattern = pd.read_csv('bond_pattern.csv', header=None)
ft_mortar = pd.read_csv('ft_mortar.csv', header=None)
ft_brick = pd.read_csv('ft_brick.csv', header=None)

# Combine features and target
X = pd.concat([new_or_old, wall_l, wall_t, leaf_num, bond_pattern, ft_mortar, ft_brick], axis=1)
X.columns = ['new_or_old', 'wall_l', 'wall_t', 'leaf_num', 'bond_pattern', 'ft_mortar', 'ft_brick']
y_raw = shear_strength
y_raw.columns = ['shear_strength']
y = np.log(y_raw)  # log-transform (natural log)
y.columns = ['shear_strength']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

trial_results = []

# Optuna objective function
def objective(trial):
    C = trial.suggest_float("C", 0.01, 100.0, log=True)
    epsilon = trial.suggest_float("epsilon", 1e-4, 1.0, log=True)
    kernel = trial.suggest_categorical("kernel", ["rbf", "linear", "poly", "sigmoid"])

    if kernel in ["rbf", "poly", "sigmoid"]:
        gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
    else:
        gamma = "scale" 

    model = SVR(
        C=C,
        epsilon=epsilon,
        kernel=kernel,
        gamma=gamma
    )

    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=cv, scoring="neg_mean_squared_error")
    return -scores.mean()

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=num_trials)

# Best parameters
print("Best parameters found by Optuna:", study.best_params)

trial_results = sorted(
    [(t.number, t.value, t.params) for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE],
    key=lambda x: x[1]
)[:10] 

print("\nTop 10 Hyperparameter Combinations and Scores:")
for rank, (trial_number, mse, params) in enumerate(trial_results, start=1):
    model = SVR(
        C=params["C"],
        epsilon=params["epsilon"],
        kernel=params["kernel"],
        gamma=params.get("gamma", "scale") 
    )
    
    # Fit model
    model.fit(X_train, y_train.values.ravel())

    y_train_pred_log = model.predict(X_train)
    y_test_pred_log = model.predict(X_test)

    y_train_pred = np.exp(y_train_pred_log)
    y_test_pred = np.exp(y_test_pred_log)

    y_train_np = y_raw.iloc[y_train.index].to_numpy().squeeze()
    y_test_np = y_raw.iloc[y_test.index].to_numpy().squeeze()
    
    # Calculate metrics for training set
    r2_train = r2_score(y_train_np, y_train_pred)
    rmse_train = root_mean_squared_error(y_train_np, y_train_pred)
    ratio_train = y_train_np / y_train_pred
    mean_ratio_train = np.mean(ratio_train)
    cov_ratio_p_train = np.cov(ratio_train, rowvar=False, ddof=1)
    cov_ratio_p_train = cov_ratio_p_train * 100
    
    # Calculate metrics for testing set
    r2_test = r2_score(y_test_np, y_test_pred)
    rmse_test = root_mean_squared_error(y_test_np, y_test_pred)
    ratio_test = y_test_np / y_test_pred
    mean_ratio_test = np.mean(ratio_test)
    cov_ratio_p_test = np.cov(ratio_test, rowvar=False, ddof=1)
    cov_ratio_p_test = cov_ratio_p_test * 100
    
    # Print results
    print(f"Rank {rank}: Trial {trial_number}")
    print(f"Parameters: {params}")
    print(f"Training Set - R²: {r2_train:.3f}, RMSE: {rmse_train:.3f}, mean_ratio: {mean_ratio_train:.3f}, cov_ratio(%): {cov_ratio_p_train:.1f}")
    print(f"Testing Set  - R²: {r2_test:.3f}, RMSE: {rmse_test:.3f}, mean_ratio: {mean_ratio_test:.3f}, cov_ratio(%): {cov_ratio_p_test:.1f}\n")
    

[I 2025-04-04 10:59:50,955] A new study created in memory with name: no-name-984a073c-866d-40e9-8677-0327003d5cb3
[I 2025-04-04 10:59:50,994] Trial 0 finished with value: 0.81968629647966 and parameters: {'C': 0.04086299349152498, 'epsilon': 0.005982097225061503, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.81968629647966.
[I 2025-04-04 10:59:51,036] Trial 1 finished with value: 0.8567780522226236 and parameters: {'C': 0.031457624734351576, 'epsilon': 0.1621627513565354, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: 0.81968629647966.
[I 2025-04-04 10:59:51,096] Trial 2 finished with value: 0.7196898254099477 and parameters: {'C': 0.5991165468659588, 'epsilon': 0.0006265452158603236, 'kernel': 'poly', 'gamma': 'auto'}. Best is trial 2 with value: 0.7196898254099477.
[I 2025-04-04 10:59:51,141] Trial 3 finished with value: 0.4099424119948353 and parameters: {'C': 9.308025095138955, 'epsilon': 0.0014598522209710928, 'kernel': 'rbf', 'gamma': 'auto'}. 

Best parameters found by Optuna: {'C': 1.5139663948074236, 'epsilon': 0.16259922229457582, 'kernel': 'rbf', 'gamma': 'scale'}

Top 10 Hyperparameter Combinations and Scores:
Rank 1: Trial 63
Parameters: {'C': 1.5139663948074236, 'epsilon': 0.16259922229457582, 'kernel': 'rbf', 'gamma': 'scale'}
Training Set - R²: 0.878, RMSE: 0.152, mean_ratio: 1.077, cov_ratio(%): 19.6
Testing Set  - R²: 0.820, RMSE: 0.196, mean_ratio: 1.080, cov_ratio(%): 22.5

Rank 2: Trial 83
Parameters: {'C': 2.1660966175726264, 'epsilon': 0.17147261093211327, 'kernel': 'rbf', 'gamma': 'scale'}
Training Set - R²: 0.883, RMSE: 0.149, mean_ratio: 1.089, cov_ratio(%): 20.0
Testing Set  - R²: 0.819, RMSE: 0.197, mean_ratio: 1.108, cov_ratio(%): 26.7

Rank 3: Trial 71
Parameters: {'C': 1.253250216905487, 'epsilon': 0.11587168312861032, 'kernel': 'rbf', 'gamma': 'scale'}
Training Set - R²: 0.875, RMSE: 0.154, mean_ratio: 1.062, cov_ratio(%): 21.1
Testing Set  - R²: 0.827, RMSE: 0.193, mean_ratio: 1.049, cov_ratio(%): 19