In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
import optuna
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

num_trials = 100;

# Load data
shear_strength = pd.read_csv('shear_strength.csv', header=None)
new_or_old = pd.read_csv('new_or_old.csv', header=None)
load_type = pd.read_csv('load_type.csv', header=None)
wall_l = pd.read_csv('wall_l.csv', header=None)
wall_h = pd.read_csv('wall_h.csv', header=None)
wall_t = pd.read_csv('wall_t.csv', header=None)
leaf_num = pd.read_csv('leaf_num.csv', header=None)
bond_pattern = pd.read_csv('bond_pattern.csv', header=None)
ft_mortar = pd.read_csv('ft_mortar.csv', header=None)
ft_brick = pd.read_csv('ft_brick.csv', header=None)

# Combine features and target
X = pd.concat([new_or_old, wall_l, wall_t, leaf_num, bond_pattern, ft_mortar, ft_brick], axis=1)
X.columns = ['new_or_old', 'wall_l', 'wall_t', 'leaf_num', 'bond_pattern', 'ft_mortar', 'ft_brick']
y_raw = shear_strength 
y_raw.columns = ['shear_strength']
y = np.log(y_raw)  # log-transform (natural log)
y.columns = ['shear_strength']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

trial_results = []

# Optuna objective function
def objective(trial):
    n_neighbors = trial.suggest_int("n_neighbors", 1, 50)
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    p = trial.suggest_int("p", 1, 2)  # 1 = Manhattan, 2 = Euclidean

    model = KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights=weights,
        p=p,
        n_jobs=-1
    )

    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=cv, scoring="neg_mean_squared_error")
    return -scores.mean()

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=num_trials)

# Best parameters
print("Best parameters found by Optuna:", study.best_params)

trial_results = sorted(
    [(t.number, t.value, t.params) for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE],
    key=lambda x: x[1]
)[:10]

print("\nTop 10 Hyperparameter Combinations and Scores:")
for rank, (trial_number, mse, params) in enumerate(trial_results, start=1):
    model = KNeighborsRegressor(
        n_neighbors=params["n_neighbors"],
        weights=params["weights"],
        p=params["p"],
        n_jobs=-1
    )
    
    model.fit(X_train, y_train.values.ravel())

    y_train_pred_log = model.predict(X_train)
    y_test_pred_log = model.predict(X_test)

    y_train_pred = np.exp(y_train_pred_log)
    y_test_pred = np.exp(y_test_pred_log)

    y_train_np = y_raw.iloc[y_train.index].to_numpy().squeeze()
    y_test_np = y_raw.iloc[y_test.index].to_numpy().squeeze()
    
    # Calculate metrics for training set
    r2_train = r2_score(y_train_np, y_train_pred)
    rmse_train = root_mean_squared_error(y_train_np, y_train_pred)
    ratio_train = y_train_np / y_train_pred
    mean_ratio_train = np.mean(ratio_train)
    cov_ratio_p_train = np.cov(ratio_train, rowvar=False, ddof=1)
    cov_ratio_p_train = cov_ratio_p_train * 100
    
    # Calculate metrics for testing set
    r2_test = r2_score(y_test_np, y_test_pred)
    rmse_test = root_mean_squared_error(y_test_np, y_test_pred)
    ratio_test = y_test_np / y_test_pred
    mean_ratio_test = np.mean(ratio_test)
    cov_ratio_p_test = np.cov(ratio_test, rowvar=False, ddof=1)
    cov_ratio_p_test = cov_ratio_p_test * 100
    
    # Print results
    print(f"Rank {rank}: Trial {trial_number}")
    print(f"Parameters: {params}")
    print(f"Training Set - R²: {r2_train:.3f}, RMSE: {rmse_train:.3f}, mean_ratio: {mean_ratio_train:.3f}, cov_ratio(%): {cov_ratio_p_train:.1f}")
    print(f"Testing Set  - R²: {r2_test:.3f}, RMSE: {rmse_test:.3f}, mean_ratio: {mean_ratio_test:.3f}, cov_ratio(%): {cov_ratio_p_test:.1f}\n")
    

[I 2025-04-04 01:04:19,194] A new study created in memory with name: no-name-9efccb29-3cdb-48c2-b87a-4fe384ec9c74
[W 2025-04-04 01:04:19,351] Trial 0 failed with parameters: {'n_neighbors': 38, 'weights': 'distance', 'p': 2} because of the following error: The value nan is not acceptable.
[W 2025-04-04 01:04:19,354] Trial 0 failed with value nan.
[I 2025-04-04 01:04:19,586] Trial 1 finished with value: 0.28151413911350026 and parameters: {'n_neighbors': 9, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 0.28151413911350026.
[I 2025-04-04 01:04:19,827] Trial 2 finished with value: 0.2759150690892734 and parameters: {'n_neighbors': 14, 'weights': 'distance', 'p': 1}. Best is trial 2 with value: 0.2759150690892734.
[W 2025-04-04 01:04:19,989] Trial 3 failed with parameters: {'n_neighbors': 45, 'weights': 'distance', 'p': 1} because of the following error: The value nan is not acceptable.
[W 2025-04-04 01:04:19,989] Trial 3 failed with value nan.
[I 2025-04-04 01:04:20,145] Tri

Best parameters found by Optuna: {'n_neighbors': 2, 'weights': 'distance', 'p': 1}

Top 10 Hyperparameter Combinations and Scores:
Rank 1: Trial 14
Parameters: {'n_neighbors': 2, 'weights': 'distance', 'p': 1}
Training Set - R²: 0.933, RMSE: 0.113, mean_ratio: 1.031, cov_ratio(%): 1.7
Testing Set  - R²: 0.792, RMSE: 0.211, mean_ratio: 1.129, cov_ratio(%): 23.6

Rank 2: Trial 2
Parameters: {'n_neighbors': 14, 'weights': 'distance', 'p': 1}
Training Set - R²: 0.951, RMSE: 0.097, mean_ratio: 1.005, cov_ratio(%): 1.1
Testing Set  - R²: 0.872, RMSE: 0.166, mean_ratio: 0.989, cov_ratio(%): 13.1

Rank 3: Trial 11
Parameters: {'n_neighbors': 22, 'weights': 'distance', 'p': 1}
Training Set - R²: 0.951, RMSE: 0.097, mean_ratio: 1.005, cov_ratio(%): 1.1
Testing Set  - R²: 0.877, RMSE: 0.163, mean_ratio: 0.976, cov_ratio(%): 12.4

Rank 4: Trial 1
Parameters: {'n_neighbors': 9, 'weights': 'distance', 'p': 1}
Training Set - R²: 0.951, RMSE: 0.097, mean_ratio: 1.005, cov_ratio(%): 1.1
Testing Set  - 