In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Read the raw CSV, rename the timestamp and day-ahead price columns to the
# short names 'ds' and 'y' used by the rest of the notebook, and parse 'ds'
# into pandas datetimes.
data = (
    pd.read_csv('../datasets/complete_data/df.csv')
    .rename(columns={'start MTU (UTC)': 'ds', 'Day-ahead Price [EUR/MWh] BZN|NO1': 'y'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
)

In [None]:
# Derive one calendar feature column per datetime component of 'ds'
# (year, month, day of month, hour), added to `data` in that order.
for part in ('year', 'month', 'day', 'hour'):
    data[part] = getattr(data['ds'].dt, part)



In [None]:
# Lagged features
def create_lagged_dataset(df, n_steps=24):
    """Build a supervised-learning frame of lagged copies of ``df``.

    The 'y' column is first shifted one row up, so that on every row it holds
    the value of the *next* row (the one-step-ahead target). The frame is then
    concatenated side by side with ``n_steps`` shifted copies of itself
    (shifts 0, 1, ..., n_steps - 1), so each output row carries the current
    row plus the ``n_steps - 1`` rows before it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a 'y' column. It is not modified.
    n_steps : int, default 24
        Number of shifted copies to concatenate, including the unshifted one.

    Returns
    -------
    pandas.DataFrame
        Frame with ``n_steps * len(df.columns)`` columns. The column labels of
        ``df`` are repeated once per shift, in shift order, so labels are NOT
        unique: selecting by a bare label such as 'y' returns every copy.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1.

    Notes
    -----
    Rows containing a NaN in *any* column are dropped, both after the target
    shift and after the concatenation. That removes the last input row (no
    next value), the first ``n_steps - 1`` rows (incomplete history) and any
    row that already had missing values in ``df``.
    """
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")

    # Work on a copy so the caller's frame is left untouched
    shifted = df.copy()

    # Move the target one row up: row t now holds y(t + 1)
    shifted['y'] = shifted['y'].shift(-1)

    # The final row has no next value (NaN) and is removed here, together
    # with any other row that contains a NaN
    shifted = shifted.dropna()

    # Shift i places the values of row t - i on row t
    lagged_df = pd.concat([shifted.shift(i) for i in range(n_steps)], axis=1)

    # The first rows lack a complete history and contain NaN; drop them
    return lagged_df.dropna()

In [None]:
# Build the lagged modelling frame from the prepared data
lagged_df = create_lagged_dataset(df=data)

In [None]:
# Split the dataset into training, validation and testing sets.
#
# create_lagged_dataset() concatenates shifted copies of the frame side by
# side, so every column label (including 'y' and 'ds') occurs once per lag.
# Selecting by the bare label therefore returns ALL copies: lagged_df['y']
# would be a multi-column frame (a multi-output target) and
# drop(['y', 'ds']) would throw away every lagged price. Columns are selected
# by position instead and given unique "<name>_lag<k>" labels, where k is the
# k-th occurrence of the label, i.e. the shift that produced it.
labels = pd.Series(lagged_df.columns.astype(str))
lag_index = labels.groupby(labels).cumcount()
unique_labels = (labels + '_lag' + lag_index.astype(str)).to_numpy()

is_timestamp = (labels == 'ds').to_numpy()
is_target = ((labels == 'y') & (lag_index == 0)).to_numpy()
is_feature = ~(is_timestamp | is_target)

# Features: everything except the timestamps and the target itself. The
# lagged 'y' copies stay in: the lag-0 'y' holds the next row's price, so
# 'y_lag1' is the current price, 'y_lag2' the previous one, and so on.
X = lagged_df.loc[:, is_feature].copy()
X.columns = unique_labels[is_feature]

# Target: the single unshifted 'y' column (one-step-ahead price)
y = lagged_df.loc[:, is_target].iloc[:, 0]

# shuffle=False keeps chronological order: first 80% train, next 10%
# validation, last 10% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0, shuffle=False)

# Print the shape of the training, validation and testing sets
print("Training set shape:   ", X_train.shape, y_train.shape)
print("Validation set shape: ", X_val.shape, y_val.shape)
print("Testing set shape:    ", X_test.shape, y_test.shape)

In [None]:
# Instantiate a linear regression and fit it on the training split in one
# step (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output
model

In [None]:
# Make a prediction on the testing set
predicted_values = model.predict(X_test)

# Calculate the mean absolute error (MAE) between the predicted and actual values
mae = mean_absolute_error(y_test, predicted_values)
print("Mean absolute error:            ", mae)

# Calculate the mean absolute percentage error (MAPE) between the predicted and actual values.
# Note: scikit-learn reports MAPE as a fraction (0.2 means 20%), not as a percentage.
mape = mean_absolute_percentage_error(y_test, predicted_values)
print("Mean absolute percentage error: ", mape)

# Calculate the mean squared error (MSE) between the predicted and actual values
mse = mean_squared_error(y_test, predicted_values)
print("Mean squared error:             ", mse)

# Calculate the root mean squared error (RMSE) between the predicted and actual values
rmse = math.sqrt(mse)
print("Root mean squared error:        ", rmse)


In [None]:
# Get the absolute values of the coefficients.
# When the model was fitted on several target columns, coef_ has shape
# (n_targets, n_features); average over the targets so that there is exactly
# one score per feature. Without this, zip() below would pair feature names
# with whole coefficient rows and the sort / number formatting would raise.
coefficients = np.abs(model.coef_)
if coefficients.ndim > 1:
    coefficients = coefficients.mean(axis=0)

# Pair feature names with importance scores. A list of pairs is used instead
# of a dict so that features sharing the same column label are not collapsed
# into a single entry.
feature_importance = list(zip(X.columns, coefficients))

# Print the feature importance in descending order.
# NOTE: the printed number is |coefficient| * 100, not a share of a total, and
# the features are not standardised, so magnitudes of features on different
# scales are not directly comparable.
for feature, importance in sorted(feature_importance, key=lambda item: item[1], reverse=True):
    print("{}: {:.2f}%".format(feature, importance*100))

In [None]:
# Overlay the model's predictions on the actual test-set values
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test.values, label='Actual', color='blue')
ax.plot(predicted_values, label='Predicted', color='red')
ax.set_title('Actual vs Predicted')
ax.set_ylabel('Price [EUR/MWh]')
ax.legend(loc='upper left')
plt.show()