In [219]:
import os
import sys
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)
utils_dir = os.path.join(parent_dir, "src", "utils")
sys.path.append(utils_dir)
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from process_data import process_data
from weighted_accuracy import weighted_accuracy_scorer
from plot_learning_curves import plot_learning_curves

In [220]:
X_path= os.path.join("..", "data","input", "X_train_Wwou3IE.csv")
X_preprocessed = pd.read_csv(X_path, delimiter=',')
y_path= os.path.join("..", "data","input", "y_train_jJtXgMX.csv")
y_preprocessed = pd.read_csv(y_path, delimiter=',')

In [221]:
cold_rate = {
    1: 100,  # January
    2: 90,   # February
    3: 70,   # March
    4: 50,   # April
    5: 30,   # May
    6: 10,   # June
    7: 0,    # July
    8: 5,    # August
    9: 20,   # September
    10: 40,  # October
    11: 60,  # November
    12: 80   # December
}

X_preprocessed['DELIVERY_START'] = pd.to_datetime(X_preprocessed['DELIVERY_START'], utc = True)

X_preprocessed['month'] = X_preprocessed['DELIVERY_START'].dt.month
X_preprocessed['cold_rate'] = X_preprocessed['month'].map(cold_rate)
X_preprocessed.drop('month', axis=1, inplace=True)

X_preprocessed

Unnamed: 0,DELIVERY_START,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,cold_rate
0,2022-01-01 01:00:00+00:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.000000,,100
1,2022-01-01 02:00:00+00:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.000000,,100
2,2022-01-01 03:00:00+00:00,45158.0,3386.0,11487.0,44118.0,3288.0,0.0,44.291112,0.000000,,100
3,2022-01-01 04:00:00+00:00,44779.0,3386.0,11487.0,44118.0,3447.0,0.0,36.127588,0.000000,,100
4,2022-01-01 05:00:00+00:00,45284.0,3386.0,11487.0,44118.0,3679.0,0.0,30.983023,0.000000,,100
...,...,...,...,...,...,...,...,...,...,...,...
10600,2023-03-29 17:00:00+00:00,50814.0,3386.0,11952.0,38320.0,7552.0,651.0,247.408490,7.821622,108.11,70
10601,2023-03-29 18:00:00+00:00,50628.0,3386.0,11952.0,38320.0,8338.0,109.0,155.795012,2.534054,125.66,70
10602,2023-03-29 19:00:00+00:00,48201.0,3386.0,11952.0,38320.0,9115.0,0.0,126.884684,0.000000,138.01,70
10603,2023-03-29 20:00:00+00:00,47967.0,3386.0,11952.0,38320.0,9636.0,0.0,156.669189,0.000000,136.74,70


In [222]:
X = process_data(X_preprocessed.copy(deep=True), "predicted_spot_price", None, "standard")
y = process_data(y_preprocessed.copy(deep=True), None, None, None)

In [223]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [224]:
model = LinearRegression()
model.fit(X_train, y_train)

In [225]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print(f"Training R^2: {train_r2:.2f}, Training MSE: {train_mse:.2f}")
print(f"Testing R^2: {test_r2:.2f}, Testing MSE: {test_mse:.2f}")

Training R^2: 0.00, Training MSE: 1554.19
Testing R^2: 0.01, Testing MSE: 998.38


In [226]:
#plot_learning_curves(model, X_train, y_train, X_test, y_test)

In [227]:
model = LinearRegression()
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert scores to positive to represent Mean Squared Error (MSE)
mse_scores = -scores

# Calculate the root mean squared error (RMSE) from MSE
rmse_scores = np.sqrt(mse_scores)

print("Scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Standard deviation:", rmse_scores.std())

Scores: [23.91247996 49.98194333 57.22693659 27.09167125 16.98709076]
Mean RMSE: 35.04002437915392
Standard deviation: 15.674354785500233


In [228]:
from sklearn.linear_model import Ridge
model = Ridge(alpha = 1000)
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert scores to positive to represent Mean Squared Error (MSE)
mse_scores = -scores

# Calculate the root mean squared error (RMSE) from MSE
rmse_scores = np.sqrt(mse_scores)

print("Scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Standard deviation:", rmse_scores.std())

Scores: [23.70459668 49.7431715  57.14180623 27.0065     16.94964291]
Mean RMSE: 34.90914346170718
Standard deviation: 15.651689902173006


In [229]:
from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)
X_poly = poly_features.transform(X)
X_test_poly = poly_features.transform(X_test)
model = LinearRegression()
model.fit(X_train_poly, y_train)
y_train_pred = model.predict(X_train_poly)
y_test_pred = model.predict(X_test_poly)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print(f"Training R^2: {train_r2:.2f}, Training MSE: {train_mse:.2f}")
print(f"Testing R^2: {test_r2:.2f}, Testing MSE: {test_mse:.2f}")

Training R^2: 0.02, Training MSE: 1524.58
Testing R^2: 0.01, Testing MSE: 1002.14


In [230]:
#plot_learning_curves(model, X_train, y_train, X_test, y_test)

In [231]:
model = LinearRegression()
scores = cross_val_score(model, X_poly, y, cv=5, scoring='neg_mean_squared_error')

# Convert scores to positive to represent Mean Squared Error (MSE)
mse_scores = -scores

# Calculate the root mean squared error (RMSE) from MSE
rmse_scores = np.sqrt(mse_scores)

print("Scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Standard deviation:", rmse_scores.std())

Scores: [29.33080911 51.35939985 61.05343202 29.3070203  20.57925275]
Mean RMSE: 38.32598280545683
Standard deviation: 15.255216614675247


In [232]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Assuming X_train, y_train, X_test, and y_test are already defined

# Transforming the data to include polynomial features
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Initializing the Ridge regression model with regularization strength alpha
# Note: You can adjust the alpha value to see how it affects the model's performance
alpha = 100000000  # This is a common starting point for regularization strength
ridge_model = Ridge(alpha=alpha)

# Fitting the model to the polynomial features
ridge_model.fit(X_train_poly, y_train)

# Predicting on both training and testing sets
y_train_pred = ridge_model.predict(X_train_poly)
y_test_pred = ridge_model.predict(X_test_poly)

# Calculating R^2 and Mean Squared Error (MSE) for both training and testing sets
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print(f"Training R^2: {train_r2:.2f}, Training MSE: {train_mse:.2f}")
print(f"Testing R^2: {test_r2:.2f}, Testing MSE: {test_mse:.2f}")

Training R^2: 0.00, Training MSE: 1560.98
Testing R^2: -0.00, Testing MSE: 1011.16


In [233]:
ridge_model = Ridge(alpha=alpha)
scores = cross_val_score(ridge_model, X_poly, y, cv=5, scoring='neg_mean_squared_error')

# Convert scores to positive to represent Mean Squared Error (MSE)
mse_scores = -scores

# Calculate the root mean squared error (RMSE) from MSE
rmse_scores = np.sqrt(mse_scores)

print("Scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Standard deviation:", rmse_scores.std())

Scores: [23.41873371 49.33468324 57.09980814 27.09802576 16.9203883 ]
Mean RMSE: 34.77432783160903
Standard deviation: 15.601726242580124


In [234]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Create a decision tree regressor model
tree_reg = DecisionTreeRegressor(max_depth=5)  # You can adjust the max_depth as needed

# Fit the model to the training data
tree_reg.fit(X_train, y_train)

# Predict on the training set and the test set
y_train_pred = tree_reg.predict(X_train)
y_test_pred = tree_reg.predict(X_test)

# Calculate the performance metrics
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Print the performance metrics
print(f"Training RMSE: {train_rmse:.2f}, Training R^2: {train_r2:.2f}")
print(f"Testing RMSE: {test_rmse:.2f}, Testing R^2: {test_r2:.2f}")


Training RMSE: 38.19, Training R^2: 0.07
Testing RMSE: 30.86, Testing R^2: 0.06


In [235]:
tree_reg = DecisionTreeRegressor(max_depth=2)
scores = cross_val_score(tree_reg, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert scores to positive to represent Mean Squared Error (MSE)
mse_scores = -scores

# Calculate the root mean squared error (RMSE) from MSE
rmse_scores = np.sqrt(mse_scores)

print("Scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Standard deviation:", rmse_scores.std())

Scores: [23.42558606 49.70112953 57.06211571 27.09356241 16.90664129]
Mean RMSE: 34.83780700124896
Standard deviation: 15.662548126499201


In [236]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

"""# Generate a synthetic regression dataset
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=42)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
"""

# Initialize the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model on the training data
gbr.fit(X_train, y_train)

# Predict on the testing data
y_pred = gbr.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")


  y = column_or_1d(y, warn=True)


Mean Squared Error (MSE): 746.17
R-squared (R2): 0.26


In [237]:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

"""# Generate a synthetic regression dataset
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=42)
"""
# Initialize the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Perform 5-fold cross-validation and compute the cross-validation scores
# Note: By default, cross_val_score uses R^2 as the score to evaluate. 
# For MSE, we need to specify 'neg_mean_squared_error' as the scoring parameter,
# and later convert it to positive MSE scores.
cv_scores = cross_val_score(gbr, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE scores to positive values
mse_scores = -cv_scores

# Calculate the root mean squared error (RMSE) from MSE scores
rmse_scores = np.sqrt(mse_scores)

print("Cross-validation RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Standard deviation of RMSE:", rmse_scores.std())


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-validation RMSE scores: [27.61757548 52.37860743 57.79683512 28.08778445 19.66052006]
Mean RMSE: 37.108264504939
Standard deviation of RMSE: 15.080227796317239


More complex model

In [238]:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Initialize the Gradient Boosting Regressor with increased complexity
gbr_complex = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)

# Perform 5-fold cross-validation and compute the cross-validation scores
cv_scores_complex = cross_val_score(gbr_complex, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE scores to positive values
mse_scores_complex = -cv_scores_complex

# Calculate the root mean squared error (RMSE) from MSE scores
rmse_scores_complex = np.sqrt(mse_scores_complex)

print("Cross-validation RMSE scores:", rmse_scores_complex)
print("Mean RMSE:", rmse_scores_complex.mean())
print("Standard deviation of RMSE:", rmse_scores_complex.std())


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-validation RMSE scores: [34.23703866 54.29118488 58.16062986 31.05137532 22.09896375]
Mean RMSE: 39.967838495161764
Standard deviation of RMSE: 13.912425926595034


Easier model

In [239]:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Initialize the Gradient Boosting Regressor with increased complexity
gbr_complex = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=2, random_state=42)

# Perform 5-fold cross-validation and compute the cross-validation scores
cv_scores_complex = cross_val_score(gbr_complex, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE scores to positive values
mse_scores_complex = -cv_scores_complex

# Calculate the root mean squared error (RMSE) from MSE scores
rmse_scores_complex = np.sqrt(mse_scores_complex)

print("Cross-validation RMSE scores:", rmse_scores_complex)
print("Mean RMSE:", rmse_scores_complex.mean())
print("Standard deviation of RMSE:", rmse_scores_complex.std())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-validation RMSE scores: [24.75300424 50.09622651 57.48569086 27.27478572 17.07339804]
Mean RMSE: 35.33662107489588
Standard deviation of RMSE: 15.61395975016201


From the more complex model and the easier model, we deduce if we were overfitting or not : We were overfitting

In [240]:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Initialize the Gradient Boosting Regressor with increased complexity
gbr_complex = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=2, random_state=42)

# Perform 5-fold cross-validation and compute the cross-validation scores
cv_scores_complex = cross_val_score(gbr_complex, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE scores to positive values
mse_scores_complex = -cv_scores_complex

# Calculate the root mean squared error (RMSE) from MSE scores
rmse_scores_complex = np.sqrt(mse_scores_complex)

print("Cross-validation RMSE scores:", rmse_scores_complex)
print("Mean RMSE:", rmse_scores_complex.mean())
print("Standard deviation of RMSE:", rmse_scores_complex.std())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Cross-validation RMSE scores: [24.75300424 50.09622651 57.48569086 27.27478572 17.07339804]
Mean RMSE: 35.33662107489588
Standard deviation of RMSE: 15.61395975016201


In [241]:
from sklearn.model_selection import GridSearchCV

# Parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting stages to be run
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate shrinks the contribution of each tree
    'max_depth': [3, 4, 5],  # Maximum depth of the individual regression estimators
    'min_samples_split': [2, 4, 6],  # The minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 3],  # The minimum number of samples required to be at a leaf node
}

# Initialize the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(random_state=42)

# Initialize the Grid Search model
grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit the Grid Search model
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score (negative MSE): ", grid_search.best_score_)

# Predict on the testing data using the best found parameters
y_pred = grid_search.predict(X_test)

# Evaluate the model with the best parameters
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE) with best parameters: {mse:.2f}")
print(f"R-squared (R2) with best parameters: {r2:.2f}")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [None]:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
import numpy as np

# Initialize the Support Vector Regressor
svr = SVR(kernel='linear')

# Perform 5-fold cross-validation and compute the cross-validation scores
# Note: By default, cross_val_score uses R^2 as the score to evaluate. 
# For MSE, we need to specify 'neg_mean_squared_error' as the scoring parameter,
# and later convert it to positive MSE scores.
cv_scores = cross_val_score(svr, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE scores to positive values
mse_scores = -cv_scores

# Calculate the root mean squared error (RMSE) from MSE scores
rmse_scores = np.sqrt(mse_scores)

print("Cross-validation RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
print("Standard deviation of RMSE:", rmse_scores.std())