# Predicting wind energy production

This notebook aims at predicting the energy produced by wind turbines.

It uses weather data extracted from the Météo-France numerical models, as well as historical production data provided by RTE.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Regional weather series used as model inputs ("si10" in the file name is presumably
# the 10 m wind speed — TODO confirm).
# NOTE(review): the variable says "ssrd", which looks like a leftover from the solar
# notebook; the name is kept because later cells reference it.
filename_ssrd_regions = "./all_si10_regions.parquet"
# Daily aggregated RTE production history (2014-2024 according to the file name).
# NOTE(review): "preduction" looks like a typo for "production"; kept because later
# cells reference this name.
filename_energy_preduction = "../../clean_datasets/rte_agg_daily_2014_2024.csv"

In [None]:
# Load the regional weather series (one column per region).
df_ssrd_regions = pd.read_parquet(filename_ssrd_regions)

# Sanitise the column names: spaces, apostrophes and hyphens become underscores and
# everything is lower-cased, so that each name can be used as a term in a model formula.
df_ssrd_regions.columns = [
    col.replace(" ", "_").replace("'", "_").replace("-", "_").lower()
    for col in df_ssrd_regions.columns
]
# Captured before `days_from_start` is added below, so it lists the region columns only.
region_names = df_ssrd_regions.columns

# Daily resampling also inserts empty (NaN) rows for the days missing from the source.
df_ssrd_regions = df_ssrd_regions.resample("D").mean()
df_ssrd_regions.plot(figsize=(15, 10))

# Linear time-trend feature: number of whole days elapsed since the first row.
# Computed with vectorised index arithmetic instead of a Python loop over every timestamp.
df_ssrd_regions["days_from_start"] = (
    df_ssrd_regions.index - df_ssrd_regions.index[0]
).days
df_ssrd_regions.head()

In [None]:
# Production history: keep only the wind ("Eolien") and solar ("Solaire") columns.
df_energy_preduction = pd.read_csv(filename_energy_preduction, index_col=0).loc[
    :, ["Eolien", "Solaire"]
]
# Parse the index into datetimes so that it can be aligned with the weather data.
df_energy_preduction.index = pd.to_datetime(df_energy_preduction.index)
# Show both ends of the series.
df_energy_preduction.head(), df_energy_preduction.tail()

In [None]:
# Display the index (presumably to check that the dates were parsed and which period
# is covered — this cell has no effect on later cells).
df_energy_preduction.index

In [None]:
# align the indexes of the two dataframes
# Align the two dataframes on their indexes: the inner join keeps only the dates
# that are present in both of them.
data = pd.concat(
    objs=[df_ssrd_regions, df_energy_preduction],
    axis="columns",
    join="inner",
)
data.head()

In [None]:
from statsmodels.formula.api import ols

# split test for time series
from sklearn.model_selection import TimeSeriesSplit

# Modeling

Four models are tested:
- Only the overall mean wind speed (no regional detail)
- Only the regional wind speeds
- Overall mean wind speed + time
- Regional wind speeds + time

In [None]:
# Response variable and explanatory variables (one column per region).
endog_var = "Eolien"
exo_vars = region_names
# Aggregate feature: row-wise unweighted mean of the regional columns.
data["mean_wind"] = data.loc[:, exo_vars].mean(axis="columns")

In [None]:
# Time-series cross-validation: 30 splits, each one tested on the 3 rows that follow
# its training rows (3 days when the index is daily).
tscv = TimeSeriesSplit(n_splits=30, test_size=3)  # testing on a 3-day forecast

In [None]:
def test_model(formula="Eolien ~ mean_wind", df=None, splitter=None):
    """Cross-validate an OLS formula over time and collect the MAPE of every fold.

    For each split, the model is fitted on the training rows and scored on the
    test rows with the mean absolute percentage error (MAPE).

    Parameters
    ----------
    formula : str
        Model formula of the form ``"<response> ~ <terms>"``. The response must be
        a plain column name because it is also used to look up the true values
        (it used to be hard-coded to ``"Eolien"``).
    df : pandas.DataFrame, optional
        Frame holding the response and the explanatory columns. Defaults to the
        notebook-level ``data``.
    splitter : object with a ``split(frame)`` method, optional
        Cross-validation splitter yielding ``(train_index, test_index)`` pairs.
        Defaults to the notebook-level ``tscv``.

    Returns
    -------
    tuple
        ``(mapes, first_test_index, first_model, last_test_index, last_model)``
        where ``mapes`` is the list of per-fold MAPE values, and the other items
        are the test positions and fitted results of the first and last folds.

    Raises
    ------
    ValueError
        If the splitter yields no fold at all.
    """
    frame = data if df is None else df
    cv = tscv if splitter is None else splitter
    # Score against the column named on the left-hand side of the formula, so the
    # function stays correct for formulas whose response is not "Eolien".
    response = formula.split("~", 1)[0].strip()

    fold_mapes = []
    first_test_index = first_model = None
    last_test_index = last_model = None
    for train_index, test_index in cv.split(frame):
        test_rows = frame.iloc[test_index]
        fitted = ols(formula, data=frame.iloc[train_index]).fit()
        if first_model is None:
            first_test_index, first_model = test_index, fitted
        # Tracked explicitly instead of relying on the loop variables after the loop.
        last_test_index, last_model = test_index, fitted

        actual = test_rows[response]
        error = actual - fitted.predict(test_rows)
        # MAPE divides by the magnitude of the true value.
        fold_mapes.append((error.abs() / actual.abs()).mean())

    if first_model is None:
        raise ValueError("the splitter did not yield any train/test split")
    return fold_mapes, first_test_index, first_model, last_test_index, last_model


# Model 1: production explained by the mean wind feature only.
formula_1 = "Eolien ~ mean_wind"
(
    mod_1_mape,
    first_test_index,
    first_model_1,
    last_test_index,
    last_model_1,
) = test_model(formula_1)

In [None]:
# True production over the whole period, overlaid with the predictions of the first
# and of the last cross-validation fold on their respective test rows.
first_fold_pred = first_model_1.predict(data.iloc[first_test_index])
last_fold_pred = last_model_1.predict(data.iloc[last_test_index])

ax = data.plot(y="Eolien", label="True")
first_fold_pred.plot(ax=ax, label="First Test Predicted")
last_fold_pred.plot(ax=ax, label="Last Test Predicted")
ax.legend()

In [None]:
# Histogram of the per-fold MAPE values of model 1.
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.set_title("MAPE distribution for model 1")
ax.hist(x=mod_1_mape, bins=20)
ax.set_xlabel("MAPE")

In [None]:
# Model 2: one regressor per region instead of the single mean wind feature.
formula_2 = "Eolien ~ " + " + ".join(exo_vars)
print(formula_2)
(
    mod_2_mape,
    first_test_index,
    first_model_2,
    last_test_index,
    last_model_2,
) = test_model(formula=formula_2)

In [None]:
# Histogram of the per-fold MAPE values of model 2.
fig, ax = plt.subplots()
ax.hist(mod_2_mape, bins=20)
ax.set_title("MAPE distribution for model 2")
# Name the x-axis so that the figure can be read on its own.
ax.set_xlabel("MAPE")

In [None]:
# Models 3 and 4: models 1 and 2 with the `days_from_start` time trend added.
formula_3 = f"{formula_1} + days_from_start"
formula_4 = f"{formula_2} + days_from_start"

(
    mod_3_mape,
    first_test_index,
    first_model_3,
    last_test_index,
    last_model_3,
) = test_model(formula=formula_3)
(
    mod_4_mape,
    first_test_index,
    first_model_4,
    last_test_index,
    last_model_4,
) = test_model(formula=formula_4)

In [None]:
# display the MAPE distribution for all models (KDE)
# Display the MAPE distribution of all models (kernel density estimates) on one axis.
fig, ax = plt.subplots()
all_mapes = (mod_1_mape, mod_2_mape, mod_3_mape, mod_4_mape)
for model_number, fold_mapes in enumerate(all_mapes, start=1):
    pd.Series(fold_mapes).plot.kde(ax=ax, label=f"Model {model_number}")
ax.set_title("MAPE distribution for all models")
ax.legend()

In [None]:
# print mean MAPE for all models
# Print the mean MAPE of every model, formatted as a percentage.
mape_by_model = {
    1: mod_1_mape,
    2: mod_2_mape,
    3: mod_3_mape,
    4: mod_4_mape,
}
for model_number, fold_mapes in mape_by_model.items():
    print(f"Model {model_number} mean MAPE: {np.mean(fold_mapes):.2%}")

# Conclusion

In contrast with the photovoltaic power prediction, the wind power results are more consistent with the expected trend:
- using regional features is better than using the global wind value (even with the time trend added to the global value)
- adding the time trend to the model improves performance

The mean performance of model 4 (12.5% error) is quite good!

In [None]:
# Mean of the wind ("Eolien") and solar ("Solaire") production columns over the
# aligned period, to compare the weight of the two sources.
data[["Eolien", "Solaire"]].mean()

As wind turbine production is around twice as high as solar production, the performance of the wind energy prediction model matters more for the overall performance of the project.