In [11]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import math

In [12]:
# Load the data
data = pd.read_csv('../datasets/complete_data/df.csv')

# Change column names to match naming convention
data = data.rename(columns={'start MTU (UTC)': 'ds', 'Day-ahead Price [EUR/MWh] BZN|NO1': 'y'})

data['ds'] = pd.to_datetime(data['ds'])
data.set_index('ds', inplace=True)

In [13]:
# Split the data into training and testing sets
train, test = train_test_split(data, test_size=0.2, random_state=0)

In [14]:
# Lagged features
def create_lagged_dataset(df):
    # creating a copy of the dataframe
    lagged_df = df.copy()

    # Adding lagged features for target variable
    lagged_df['y'] = lagged_df['y'].shift(-1)

    # Dropping the last row containing NaN values
    lagged_df.dropna(inplace=True)

    # Creating a dataframe with lagged features with 24 steps for each of the original features
    lagged_df = pd.concat([lagged_df.shift(i) for i in range(24)], axis=1)

    # Removing the NaN rows that have been created in the beginning of the dataset
    lagged_df.dropna(inplace=True)

    return lagged_df

In [15]:
# Create lagged dataset
lagged_train = create_lagged_dataset(train)
lagged_test = create_lagged_dataset(test)

# Split the training and testing sets into X_train , y_train , X_test , and y_test
X_train = lagged_train.drop(['y'], axis=1)
y_train = lagged_train['y']

X_test = lagged_test.drop(['y'], axis=1)
y_test = lagged_test['y']

# Print the shape of the training, validation and testing datasets
print("Training set shape:  ", X_train.shape, y_train.shape)
print("Testing set shape:   ", X_test.shape, y_test.shape)

Training set shape:   (6984, 840) (6984, 24)
Testing set shape:    (1728, 840) (1728, 24)


In [16]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2]
}

# Create an instance of the GradientBoostingRegressor class
model = GradientBoostingRegressor()

# Create an instance of the GridSearchCV class
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print(best_params)

ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Skole\6. Semester\Bachelor\Time-series-energy-price-prediction-bidding-zone-N01\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Skole\6. Semester\Bachelor\Time-series-energy-price-prediction-bidding-zone-N01\venv\Lib\site-packages\sklearn\ensemble\_gb.py", line 437, in fit
    y = column_or_1d(y, warn=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\Skole\6. Semester\Bachelor\Time-series-energy-price-prediction-bidding-zone-N01\venv\Lib\site-packages\sklearn\utils\validation.py", line 1202, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (5587, 24) instead.

--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Skole\6. Semester\Bachelor\Time-series-energy-price-prediction-bidding-zone-N01\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Skole\6. Semester\Bachelor\Time-series-energy-price-prediction-bidding-zone-N01\venv\Lib\site-packages\sklearn\ensemble\_gb.py", line 437, in fit
    y = column_or_1d(y, warn=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\Skole\6. Semester\Bachelor\Time-series-energy-price-prediction-bidding-zone-N01\venv\Lib\site-packages\sklearn\utils\validation.py", line 1202, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (5588, 24) instead.


In [None]:
# Train the model using the best hyperparameters
model = GradientBoostingRegressor(**best_params)
model.fit(X_train, y_train)

In [None]:
# Make a prediction on the testing set
predicted_values = model.predict(X_test)

# Calculate the mean absolute error (MAE) between the predicted and actual values
mae = mean_absolute_error(y_test, predicted_values)
print("Mean absolute error:            ", mae)

# Calculate the mean absolute percentage error (MAE) between the predicted and actual values
mape = mean_absolute_percentage_error(y_test, predicted_values)
print("Mean absolute percentage error: ", mape)

# Calculating the mean squared error (MSE) between the predicted and actual values
mse = mean_squared_error(y_test, predicted_values)
print("Mean squared error:             ", mse)

# Calculating the root mean squared error (RMSE) between the predicted and actual values
rmse = math.sqrt(mse)
print("Rood mean squared error:        ", rmse)

In [None]:
# Get feature importance
importance = model.feature_importances_

# Create a dictionary of feature names and importance scores
feature_importance = dict(zip(X_train.columns, importance))

# Print the feature importance scores in descending order
for feature, importance in sorted(feature_importance.items(), key=lambda x: x[1], reverse=True):
    print("{}: {:.2f}%".format(feature, importance*100))

In [None]:
# Plot the predicted values against the actual values
plt.plot(y_test, predicted_values, 'o')
plt.xlabel('Actual values')
plt.ylabel('Predicted values')
plt.show()