In [59]:

from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
# Transforms X into a clustered dataset by appending k one-hot cluster-indicator columns; k is the number of clusters, and cluster labels lie in the range [0, k-1].
def transform_to_clustered_dataset_train(X: np.ndarray, Xy: pd.DataFrame, cluster_predictor, k: int):
    """Append a one-hot encoding of each row's predicted cluster to ``X``.

    Labels come from ``cluster_predictor.predict(Xy)``. Row ``i`` of the result
    is row ``i`` of ``X`` followed by ``k`` indicator columns, of which only the
    column matching that row's label is 1.0.

    Args:
        X: 2-D feature matrix, one row per sample.
        Xy: Passed unchanged to ``cluster_predictor.predict``; must yield
            exactly one label per row of ``X``.
        cluster_predictor: Any object exposing ``predict``.
        k: Number of clusters, i.e. number of indicator columns appended.

    Returns:
        ``np.ndarray`` of shape ``(X.shape[0], X.shape[1] + k)``.

    Raises:
        ValueError: If the number of predicted labels differs from the number
            of rows in ``X``.
        IndexError: If any label lies outside ``[0, k - 1]``.
    """
    labels = np.asarray(cluster_predictor.predict(Xy))
    n_rows, n_features = X.shape

    if labels.shape != (n_rows,):
        raise ValueError(
            f"expected {n_rows} cluster labels (one per row of X), got shape {labels.shape}"
        )
    # A negative label would otherwise wrap around and overwrite a real feature
    # column instead of failing, so the range is checked explicitly.
    if n_rows and (labels.min() < 0 or labels.max() >= k):
        raise IndexError(f"cluster labels must lie in [0, {k - 1}]")

    res = np.concatenate((X, np.zeros((n_rows, k))), axis=1)
    # One vectorized assignment: for row i, set column (n_features + labels[i]).
    res[np.arange(n_rows), n_features + labels.astype(np.intp)] = 1.0
    return res

def transform_to_clustered_dataset_test(X: np.ndarray, kmeans_model: "KMeans", k: int):
    """Append a one-hot nearest-cluster encoding to ``X`` without using a target.

    The last coordinate of every cluster center is dropped (the callers in this
    notebook fit KMeans on the features with the target appended as the last
    column), and each row of ``X`` is assigned to the nearest of the first
    ``k`` truncated centers by Euclidean distance. Ties go to the lowest
    cluster index.

    Args:
        X: 2-D feature matrix whose column count equals the center
            dimensionality minus one.
        kmeans_model: Fitted model; only ``cluster_centers_`` is read.
        k: Number of clusters, i.e. number of indicator columns appended.

    Returns:
        ``np.ndarray`` of shape ``(X.shape[0], X.shape[1] + k)``.

    Raises:
        ValueError: If ``k`` is smaller than 1, or if some row has no finite
            distance to any center (e.g. it contains NaN or inf).
        IndexError: If ``k`` exceeds the number of fitted centers.
    """
    if k < 1:
        raise ValueError("k must be a positive number of clusters")

    X = np.asarray(X)
    n_rows, n_features = X.shape
    centers = kmeans_model.cluster_centers_[:, :-1]

    # One distance column per center. Indexing centers[i] keeps the IndexError
    # for a k larger than the number of fitted centers.
    dists = np.stack(
        [np.linalg.norm(X - centers[i], axis=1) for i in range(k)], axis=1
    )
    # NaN/inf distances can never be "nearest"; map them to +inf so argmin
    # only picks them when nothing finite is available.
    dists[~np.isfinite(dists)] = np.inf
    clusters = dists.argmin(axis=1)

    if np.isinf(dists.min(axis=1)).any():
        raise ValueError(
            "could not assign a cluster: at least one row has no finite distance to any center"
        )

    res = np.concatenate((X, np.zeros((n_rows, k))), axis=1)
    res[np.arange(n_rows), n_features + clusters] = 1.0
    return res

In [60]:
# Read the three pre-split datasets into module-level frames used by later cells:
#   train_df - fits every stage of the pipeline (scalers, KMeans, GBR, Ridge)
#   test_df  - scored by f(), i.e. the split the hyper-parameter search optimises
#   val_df   - only evaluated in print_train_test_validate()
train_df = pd.read_csv('data/train3.csv')
test_df = pd.read_csv('data/test3.csv')
val_df = pd.read_csv('data/validate3.csv')

In [61]:
from metrics import print_metrics
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler
import numpy as np

# Names of the three cost columns. Only y1 is used as the regression target in
# the functions below; all three are dropped from the feature matrix.
# Column names in another order (presumably an earlier choice of y1 - confirm):
# "dir_costs", "traffic_costs_s_r", "lost_trips_costs_s_r"
y1_name, y2_name, y3_name = "traffic_costs_s_r", "dir_costs", "lost_trips_costs_s_r"


def f(clusters_num, learning_rate, max_depth, alpha, n_estimators):
    """Objective function: R² on ``test_df`` of the cluster + GBR + Ridge stack.

    Pipeline (everything is fitted on ``train_df`` only):
      1. max-abs scale the features (all columns except the three cost columns);
      2. fit KMeans on the scaled features with the y1 target appended as the
         last column (jointly max-abs scaled again);
      3. append one-hot cluster indicators to the features;
      4. fit a GradientBoostingRegressor on y1 and append its prediction as an
         extra feature;
      5. max-abs scale again and fit a Ridge regression on y1.

    NOTE(review): training rows get their cluster from ``kmeans.predict`` on
    features *plus target*, while test rows are assigned to the nearest center
    using features only - the two encodings are not produced the same way.

    Args:
        clusters_num: Number of KMeans clusters; truncated with ``int()``.
        learning_rate: Learning rate of the gradient boosting regressor.
        max_depth: Tree depth of the regressor; truncated with ``int()``.
        alpha: Ridge regularisation strength.
        n_estimators: Number of boosting stages; truncated with ``int()``.

    Returns:
        The R² score of the Ridge predictions against y1 of ``test_df``.
    """
    target_cols = [y1_name, y2_name, y3_name]
    y_train = train_df[y1_name]
    y_test = test_df[y1_name]

    # Feature scaling: statistics come from the training split only.
    raw_train = train_df.drop(columns=target_cols)
    feature_scaler = MaxAbsScaler().fit(raw_train)
    train_features = feature_scaler.transform(raw_train)
    test_features = feature_scaler.transform(test_df.drop(columns=target_cols))

    # KMeans input: scaled features with the target as an extra last column.
    target_column = y_train.values.reshape(-1, 1)
    train_with_target = np.concatenate((train_features, target_column), axis=1)
    train_with_target = MaxAbsScaler().fit(train_with_target).transform(train_with_target)

    n_clusters = int(clusters_num)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans.fit(train_with_target)

    train_clustered = transform_to_clustered_dataset_train(
        train_features, train_with_target, kmeans, n_clusters
    )
    test_clustered = transform_to_clustered_dataset_test(test_features, kmeans, n_clusters)

    # First-stage model; its prediction becomes one more input column.
    booster = GradientBoostingRegressor(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        random_state=42,
    )
    booster.fit(train_clustered, y_train)

    def with_booster_column(matrix):
        """Return ``matrix`` with the booster's prediction appended as a column."""
        return np.concatenate((matrix, booster.predict(matrix).reshape(-1, 1)), axis=1)

    train_stacked = with_booster_column(train_clustered)
    test_stacked = with_booster_column(test_clustered)

    stack_scaler = MaxAbsScaler().fit(train_stacked)
    train_stacked = stack_scaler.transform(train_stacked)
    test_stacked = stack_scaler.transform(test_stacked)

    # Second-stage model, scored on the test split.
    ridge = Ridge(alpha=alpha)
    ridge.fit(train_stacked, y_train)

    return r2_score(y_test, ridge.predict(test_stacked))

def print_train_test_validate(clusters_num, learning_rate, max_depth, alpha, n_estimators):
    """Fit the cluster + GBR + Ridge stack once and print metrics for all splits.

    Runs the same pipeline as ``f`` (feature scaling, KMeans on features plus
    the y1 target, one-hot cluster columns, gradient boosting prediction as an
    extra feature, final Ridge on y1), fitted on ``train_df`` only, and reports
    ``print_metrics`` for the test, train and validation splits, in that order.

    Args:
        clusters_num: Number of KMeans clusters; truncated with ``int()``.
        learning_rate: Learning rate of the gradient boosting regressor.
        max_depth: Tree depth of the regressor; truncated with ``int()``.
        alpha: Ridge regularisation strength.
        n_estimators: Number of boosting stages; truncated with ``int()``.

    Returns:
        None. Results are printed.
    """
    target_cols = [y1_name, y2_name, y3_name]
    y_train, y_test, y_val = train_df[y1_name], test_df[y1_name], val_df[y1_name]

    # Feature scaling: statistics come from the training split only.
    raw_train = train_df.drop(columns=target_cols)
    feature_scaler = MaxAbsScaler().fit(raw_train)

    def scaled_features(frame):
        """Drop the cost columns of ``frame`` and apply the train-fitted scaler."""
        return feature_scaler.transform(frame.drop(columns=target_cols))

    train_features = feature_scaler.transform(raw_train)
    test_features = scaled_features(test_df)
    val_features = scaled_features(val_df)

    # KMeans input: scaled features with the target as an extra last column.
    target_column = y_train.values.reshape(-1, 1)
    train_with_target = np.concatenate((train_features, target_column), axis=1)
    train_with_target = MaxAbsScaler().fit(train_with_target).transform(train_with_target)

    n_clusters = int(clusters_num)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans.fit(train_with_target)

    # Training rows use kmeans.predict (features + target); held-out rows use
    # the nearest center on features only.
    train_clustered = transform_to_clustered_dataset_train(
        train_features, train_with_target, kmeans, n_clusters
    )
    test_clustered = transform_to_clustered_dataset_test(test_features, kmeans, n_clusters)
    val_clustered = transform_to_clustered_dataset_test(val_features, kmeans, n_clusters)

    # First-stage model; its prediction becomes one more input column.
    booster = GradientBoostingRegressor(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        random_state=42,
    )
    booster.fit(train_clustered, y_train)

    def with_booster_column(matrix):
        """Return ``matrix`` with the booster's prediction appended as a column."""
        return np.concatenate((matrix, booster.predict(matrix).reshape(-1, 1)), axis=1)

    train_stacked = with_booster_column(train_clustered)
    test_stacked = with_booster_column(test_clustered)
    val_stacked = with_booster_column(val_clustered)

    stack_scaler = MaxAbsScaler().fit(train_stacked)
    train_stacked = stack_scaler.transform(train_stacked)
    test_stacked = stack_scaler.transform(test_stacked)
    val_stacked = stack_scaler.transform(val_stacked)

    # Second-stage model.
    ridge = Ridge(alpha=alpha)
    ridge.fit(train_stacked, y_train)

    # Report order is test, train, val.
    reports = (
        ("------ test metrics ------", y_test, test_stacked),
        ("------ train metrics ------", y_train, train_stacked),
        ("------ val metrics ------", y_val, val_stacked),
    )
    for header, y_true, stacked in reports:
        print(header)
        print_metrics(y_true, ridge.predict(stacked))


In [62]:
from bayes_opt import BayesianOptimization

# Search space handed to the optimizer; every value is proposed as a float.
# f() truncates clusters_num, max_depth and n_estimators with int(), so e.g.
# 5 clusters is only reached when exactly 5.0 is proposed (the upper bound);
# learning_rate and alpha are passed through unchanged.
pbounds = {'clusters_num': (2, 5), 
           'learning_rate': (0.0001, 0.5),
           'max_depth': (1, 300),
           'alpha': (0, 10),
           'n_estimators':(1, 100)}

# Maximises f, i.e. the R² on test_df; random_state fixes the seed of the search.
optimizer = BayesianOptimization(
    f=f,
    pbounds=pbounds,
    random_state=1)

# 2 initial points, then 100 optimisation iterations; each one refits the
# whole pipeline inside f().
optimizer.maximize(
    init_points=2,
    n_iter=100)
# Best result found so far; its 'params' entry is reused in the next cell.
print(optimizer.max)

|   iter    |  target   |   alpha   | cluste... | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.5393   [0m | [0m4.17     [0m | [0m4.161    [0m | [0m0.0001572[0m | [0m91.4     [0m | [0m15.53    [0m |
| [95m2        [0m | [95m0.6968   [0m | [95m0.9234   [0m | [95m2.559    [0m | [95m0.1728   [0m | [95m119.6    [0m | [95m54.34    [0m |
| [0m3        [0m | [0m0.696    [0m | [0m0.9744   [0m | [0m2.045    [0m | [0m0.3161   [0m | [0m121.0    [0m | [0m52.79    [0m |
| [0m4        [0m | [0m0.6798   [0m | [0m0.0      [0m | [0m2.0      [0m | [0m0.5      [0m | [0m150.4    [0m | [0m84.15    [0m |
| [0m5        [0m | [0m0.6779   [0m | [0m10.0     [0m | [0m2.0      [0m | [0m0.5      [0m | [0m94.45    [0m | [0m100.0    [0m |
| [0m6        [0m | [0m0.6853   [0m | [0m0.4108   [0m | [0m4.063    [0m | [0m0.449    [0m | [0m188.4

In [63]:
# Refit with the best hyper-parameters found by the search and report metrics
# on every split. The keys of optimizer.max['params'] are exactly the keys of
# pbounds, which match the parameter names of print_train_test_validate.
print_train_test_validate(**optimizer.max['params'])

------ test metrics ------
Mean Squared Error (MSE):              539546095525101.8125000000
Root Mean Squared Error (RMSE):        23228131.5547570847
Mean Absolute Error (MAE):             9667125.9069930166
R-squared (R²):                        0.7471480013
Mean Absolute Percentage Error (MAPE): 1.4258608869
Max Error (ME):                        231067723.4253291488
Median Absolute Error (MedAE):         3852532.4280663133
------ train metrics ------
Mean Squared Error (MSE):              16639377555004.9335937500
Root Mean Squared Error (RMSE):        4079139.3154690037
Mean Absolute Error (MAE):             2419947.9389392314
R-squared (R²):                        0.9882311441
Mean Absolute Percentage Error (MAPE): 0.2473506453
Max Error (ME):                        42311107.1860775054
Median Absolute Error (MedAE):         1601284.4643022688
------ val metrics ------
Mean Squared Error (MSE):              314302144365528.3750000000
Root Mean Squared Error (RMSE):        1772856