In [4]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import math

In [5]:
# Read the complete dataset and shorten two column names in one step:
# 'start MTU (UTC)' becomes 'ds' and the NO1 day-ahead price becomes 'y'.
data = pd.read_csv('../datasets/complete_data/df.csv').rename(
    columns={
        'start MTU (UTC)': 'ds',
        'Day-ahead Price [EUR/MWh] BZN|NO1': 'y',
    }
)

# 'y' is the regression target; every column other than 'y' and 'ds' is a feature.
y = data['y']
X = data.drop(columns=['y', 'ds'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): train_test_split shuffles rows by default. If the rows are
# time-ordered (the 'ds' column suggests so), this mixes earlier and later
# observations across train and test -- confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Candidate hyperparameter values explored by the grid search (3 x 3 x 3 = 27 combinations)
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7],
    learning_rate=[0.05, 0.1, 0.2],
)

In [6]:
# Base estimator whose hyperparameters are tuned below.
# random_state is fixed so that repeated runs of the search fit identical
# models: gradient boosting permutes the features at every split, so ties
# between equally good splits could otherwise be resolved differently per run.
model = GradientBoostingRegressor(random_state=0)

# Exhaustive search over param_grid using 5-fold cross-validation.
# Candidates are ranked by negated MSE (scikit-learn maximises the score,
# so the highest score corresponds to the lowest mean squared error).
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Run the search on the training data only; the test set stays untouched
grid_search.fit(X_train, y_train)

# Report the hyperparameter combination with the best cross-validated score
best_params = grid_search.best_params_
print(best_params)

In [7]:
# Train a fresh model on the full training set using the best hyperparameters.
# A fixed random_state makes re-running this cell reproduce the same fitted
# model; it is merged first so that a 'random_state' entry in best_params
# (if one is ever added to the grid) takes precedence instead of clashing.
model = GradientBoostingRegressor(**{'random_state': 0, **best_params})
model.fit(X_train, y_train)

In [8]:
# Predict the target for the held-out test set
predicted_values = model.predict(X_test)

# Mean absolute error (MAE) between the predicted and actual values
mae = mean_absolute_error(y_test, predicted_values)
print("Mean absolute error:            ", mae)

# Mean absolute percentage error (MAPE) between the predicted and actual values.
# scikit-learn returns this as a fraction (0.01 means 1%), not as a percentage.
mape = mean_absolute_percentage_error(y_test, predicted_values)
print("Mean absolute percentage error: ", mape)

# Mean squared error (MSE) between the predicted and actual values
mse = mean_squared_error(y_test, predicted_values)
print("Mean squared error:             ", mse)

# Root mean squared error (RMSE): square root of the MSE, in the units of the target
rmse = math.sqrt(mse)
print("Root mean squared error:        ", rmse)

Mean absolute error:             1.0431643260838601
Mean absolute percentage error:  0.008799413568678712
Mean squared error:              8.510767696815819
Rood mean squared error:         2.9173220077351454


In [9]:
# Importance score of each feature as reported by the fitted model,
# in the same order as the columns of X
importance = model.feature_importances_

# Map each feature name to its importance score
feature_importance = dict(zip(X.columns, importance))

# Print the scores as percentages, most important feature first.
# The loop variable is called `score` so that it does not overwrite the
# `importance` array above (which would leave a single float in the kernel
# namespace after this cell has run).
# NOTE(review): in the recorded output a single feature (the NO5 day-ahead
# price) carries ~99.7% of the importance -- confirm that this feature would
# actually be available at prediction time.
for feature, score in sorted(feature_importance.items(), key=lambda item: item[1], reverse=True):
    print("{}: {:.2f}%".format(feature, score*100))

Day-ahead Price [EUR/MWh] BZN|NO5: 99.74%
Waste - BZN|NO2: 0.05%
Day-ahead Price [EUR/MWh] BZN|SE3: 0.04%
Other - BZN|SE3: 0.03%
Hydro Run-of-river and poundage - BZN|NO3: 0.02%
Wind Onshore - BZN|NO3: 0.02%
Hydro Run-of-river and poundage - BZN|NO1: 0.01%
Hydro Run-of-river and poundage - BZN|NO5: 0.01%
Hydro Pumped Storage Aggregated- BZN|NO2: 0.01%
Hydro Water Reservoir - BZN|NO3: 0.01%
Hydro Water Reservoir - BZN|SE3: 0.01%
Hydro Water Reservoir - BZN|NO1: 0.01%
Fossil Gas - BZN|NO5: 0.01%
Solar - BZN|SE3: 0.01%
Actual Total Load [MW] - BZN|NO5: 0.01%
Hydro Water Reservoir - BZN|NO2: 0.00%
Actual Total Load [MW] - BZN|SE3: 0.00%
Wind Onshore - BZN|SE3: 0.00%
CBF BZN|NO5 > BZN|NO1 [MW]: 0.00%
CBF BZN|NO2 > BZN|NO1 [MW]: 0.00%
Stored Energy Value Water Reservoirs and Hydro Storage Plants [MWh] - BZN|SE3: 0.00%
Wind Onshore - BZN|NO1: 0.00%
Day-ahead Price [EUR/MWh] BZN|NO3: 0.00%
CBF BZN|NO3 > BZN|NO1 [MW]: 0.00%
Wind Onshore - BZN|NO2: 0.00%
Nuclear - BZN|SE3: 0.00%
CBF BZN|NO1 > BZ

In [None]:
# Scatter the model's predictions against the true test-set values,
# using an explicit figure/axes pair instead of the pyplot state machine
fig, ax = plt.subplots()
ax.plot(y_test, predicted_values, 'o')
ax.set_xlabel('Actual values')
ax.set_ylabel('Predicted values')
plt.show()