In [1]:
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler

# read datasets
train_df = pd.read_csv('data/train2.csv')
test_df = pd.read_csv('data/test2.csv')

y1_name, y2_name, y3_name = "dir_costs", "traffic_costs_s_r", "lost_trips_costs_s_r"
train_y1, train_y2, train_y3 = train_df[y1_name], train_df[y2_name], train_df[y3_name]
test_y1, test_y2, test_y3 = test_df[y1_name], test_df[y2_name], test_df[y3_name]

# scale features
X_train = train_df.drop(columns=[y1_name, y2_name, y3_name])
scaler = MaxAbsScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

X_test = test_df.drop(columns=[y1_name, y2_name, y3_name])
X_test = scaler.transform(X_test)


<h1> Train GBR model using random hyperparameters </h1>

In [2]:
from metrics import print_metrics
from sklearn.ensemble import GradientBoostingRegressor

# Create an instance of GradientBoostingRegressor
gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the regressor to the training data
gb_regressor.fit(X_train, train_y3)

print("------ test metrics ------")
print_metrics(test_y3, gb_regressor.predict(X_test))

print("------ train metrics ------")
print_metrics(train_y3, gb_regressor.predict(X_train))


------ test metrics ------
Mean Squared Error (MSE):              782183991403215.2500000000
Root Mean Squared Error (RMSE):        27967552.4743087813
Mean Absolute Error (MAE):             14350359.1337993350
R-squared (R²):                        0.6896669741
Mean Absolute Percentage Error (MAPE): 0.1501666605
Max Error (ME):                        230163727.4108173549
Median Absolute Error (MedAE):         6162288.9150794968
------ train metrics ------
Mean Squared Error (MSE):              271620877573933.7812500000
Root Mean Squared Error (RMSE):        16480924.6577348858
Mean Absolute Error (MAE):             9855529.0997086167
R-squared (R²):                        0.8992476870
Mean Absolute Percentage Error (MAPE): 0.1181805488
Max Error (ME):                        114024938.1598487198
Median Absolute Error (MedAE):         5655040.8151182309


<h1> Optimize Gradient boost parameters using Differential evolution</h1>

In [3]:
from sklearn.metrics import r2_score
from scipy.optimize import differential_evolution

# define objective function
def objective_function(params, train_X, train_y, test_X, test_y):
    regressor = GradientBoostingRegressor(n_estimators=int(params[0]), learning_rate=params[1], max_depth=int(params[2]), random_state=42)
    regressor.fit(train_X, train_y)
    pred_y = regressor.predict(test_X)
    r2 = r2_score(test_y, pred_y)
    return -r2

In [4]:
optimization_res = differential_evolution(func=objective_function, 
                                          bounds=[(2, 300), (0.0001, 0.5), (2, 10)], 
                                          updating='deferred',
                                          workers=10, 
                                          disp=True,
                                          tol=0.00001,
                                          atol=0.00001,
                                          maxiter=50,
                                          args=(X_train, train_y3, X_test, test_y3))
print(optimization_res)

differential_evolution step 1: f(x)= -0.692742
differential_evolution step 2: f(x)= -0.695701
differential_evolution step 3: f(x)= -0.695701
differential_evolution step 4: f(x)= -0.697608
differential_evolution step 5: f(x)= -0.705371
differential_evolution step 6: f(x)= -0.705371
differential_evolution step 7: f(x)= -0.70554
differential_evolution step 8: f(x)= -0.70554
differential_evolution step 9: f(x)= -0.711373
differential_evolution step 10: f(x)= -0.711373
differential_evolution step 11: f(x)= -0.711373
differential_evolution step 12: f(x)= -0.711373
differential_evolution step 13: f(x)= -0.711373
differential_evolution step 14: f(x)= -0.711373
differential_evolution step 15: f(x)= -0.711373
differential_evolution step 16: f(x)= -0.711373
differential_evolution step 17: f(x)= -0.711373
differential_evolution step 18: f(x)= -0.711373
differential_evolution step 19: f(x)= -0.711373
differential_evolution step 20: f(x)= -0.711373
differential_evolution step 21: f(x)= -0.711373
dif

In [5]:
optimization_res.x

array([2.40196902e+02, 1.51313415e-01, 2.31307748e+00])

In [6]:
from metrics import print_metrics
from sklearn.ensemble import GradientBoostingRegressor

# Create an instance of GradientBoostingRegressor
gb_regressor = GradientBoostingRegressor(n_estimators=int(optimization_res.x[0]), learning_rate=optimization_res.x[1], max_depth=int(optimization_res.x[2]), random_state=42)

# Fit the regressor to the training data
gb_regressor.fit(X_train, train_y3)

print("------ test metrics ------")
print_metrics(test_y3, gb_regressor.predict(X_test))

print("------ train metrics ------")
print_metrics(train_y3, gb_regressor.predict(X_train))


------ test metrics ------
Mean Squared Error (MSE):              723888876129478.1250000000
Root Mean Squared Error (RMSE):        26905183.0718446895
Mean Absolute Error (MAE):             14499530.3755175173
R-squared (R²):                        0.7127956749
Mean Absolute Percentage Error (MAPE): 0.1680235227
Max Error (ME):                        234231889.0085158348
Median Absolute Error (MedAE):         7469819.9944704622
------ train metrics ------
Mean Squared Error (MSE):              243482517207629.6875000000
Root Mean Squared Error (RMSE):        15603926.3394707702
Mean Absolute Error (MAE):             9855775.2816351615
R-squared (R²):                        0.9096850470
Mean Absolute Percentage Error (MAPE): 0.1261398595
Max Error (ME):                        108583489.6071152687
Median Absolute Error (MedAE):         6326936.7840599492
