In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import random

# -----------------------------
# Load dataset
# -----------------------------
data = pd.read_csv("Udata.csv")

X = data.iloc[:, 0:5].values   # First 5 columns (features)
y = data.iloc[:, 5].values     # 6th column (label)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# -----------------------------
# Define objective function
# -----------------------------
def fitness_function(params):
    depth, learning_rate, iterations = params
    model = CatBoostRegressor(
        depth=int(depth),
        learning_rate=learning_rate,
        iterations=int(iterations),
        loss_function='RMSE',
        verbose=0
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return rmse

# -----------------------------
# Whale Optimization Algorithm (WOA)
# -----------------------------
def whale_optimization(fitness_func, n_whales=10, max_iter=20):
    # Search space for CatBoost hyperparameters
    depth_bounds = (4, 12)
    lr_bounds = (0.01, 0.3)
    iter_bounds = (100, 1000)

    # Initialize whales randomly
    whales = []
    for _ in range(n_whales):
        depth = random.randint(*depth_bounds)
        lr = random.uniform(*lr_bounds)
        iters = random.randint(*iter_bounds)
        whales.append([depth, lr, iters])

    # Evaluate initial fitness
    fitness = [fitness_func(w) for w in whales]
    best_whale = whales[np.argmin(fitness)]
    best_score = min(fitness)

    for t in range(max_iter):
        a = 2 - t * (2 / max_iter)  # linearly decreases from 2 to 0

        for i in range(n_whales):
            r = random.random()
            A = 2 * a * r - a
            C = 2 * r
            p = random.random()

            if p < 0.5:
                if abs(A) < 1:
                    # Encircling prey
                    D = abs(C * np.array(best_whale) - np.array(whales[i]))
                    whales[i] = np.array(best_whale) - A * D
                else:
                    # Search for prey (random whale)
                    rand_whale = whales[random.randint(0, n_whales - 1)]
                    D = abs(C * np.array(rand_whale) - np.array(whales[i]))
                    whales[i] = np.array(rand_whale) - A * D
            else:
                # Spiral updating position
                D = abs(np.array(best_whale) - np.array(whales[i]))
                b = 1
                l = (random.random() * 2 - 1)
                whales[i] = np.array(D * np.exp(b * l) * np.cos(2 * np.pi * l) + np.array(best_whale))

            # Ensure bounds
            whales[i][0] = np.clip(whales[i][0], *depth_bounds)
            whales[i][1] = np.clip(whales[i][1], *lr_bounds)
            whales[i][2] = np.clip(whales[i][2], *iter_bounds)

        # Evaluate fitness
        fitness = [fitness_func(w) for w in whales]
        current_best = whales[np.argmin(fitness)]
        current_score = min(fitness)

        if current_score < best_score:
            best_whale = current_best
            best_score = current_score

        print(f"Iteration {t+1}/{max_iter}, Best RMSE = {best_score:.4f}")

    return best_whale, best_score

# -----------------------------
# Run WOA + CatBoost
# -----------------------------
best_params, best_rmse = whale_optimization(fitness_function, n_whales=10, max_iter=20)
print("Best Parameters (depth, learning_rate, iterations):", best_params)
print("Best RMSE:", best_rmse)

# -----------------------------
# Train final CatBoost with best params
# -----------------------------
final_model = CatBoostRegressor(
    depth=int(best_params[0]),
    learning_rate=best_params[1],
    iterations=int(best_params[2]),
    loss_function='RMSE',
    verbose=100
)

final_model.fit(X_train, y_train)
final_preds = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, final_preds))

print("Final Test RMSE:", final_rmse)


Iteration 1/20, Best RMSE = 0.0121
Iteration 2/20, Best RMSE = 0.0119
Iteration 3/20, Best RMSE = 0.0118
Iteration 4/20, Best RMSE = 0.0115
Iteration 5/20, Best RMSE = 0.0115
Iteration 6/20, Best RMSE = 0.0115
Iteration 7/20, Best RMSE = 0.0115
Iteration 8/20, Best RMSE = 0.0115
Iteration 9/20, Best RMSE = 0.0115
Iteration 10/20, Best RMSE = 0.0115
Iteration 11/20, Best RMSE = 0.0114
Iteration 12/20, Best RMSE = 0.0114
Iteration 13/20, Best RMSE = 0.0114
Iteration 14/20, Best RMSE = 0.0114
Iteration 15/20, Best RMSE = 0.0114
Iteration 16/20, Best RMSE = 0.0114
Iteration 17/20, Best RMSE = 0.0114
Iteration 18/20, Best RMSE = 0.0114
Iteration 19/20, Best RMSE = 0.0114
Iteration 20/20, Best RMSE = 0.0114
Best Parameters (depth, learning_rate, iterations): [1.02046464e+01 2.51836572e-01 7.99533268e+02]
Best RMSE: 0.011440499636254944
0:	learn: 0.2298135	total: 7.02ms	remaining: 5.6s
100:	learn: 0.0137502	total: 1.02s	remaining: 7.09s
200:	learn: 0.0093034	total: 2.12s	remaining: 6.31s
300: