# Prédiction de la production ENR

Ce notebook présente l'analyse de la production ENR à partir des données météo.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Path of the RTE CSV loaded below (daily aggregate, judging by the file name — TODO confirm).
filename_rte = "clean_datasets/rte_agg_daily_2014_2024.csv"
# Path of the solar/wind weather CSV; it is not read anywhere in the visible part of this notebook.
filename_weather = "clean_datasets/solar_wind_data.csv"

In [None]:
# Load the RTE table and index it by its "Date" column, parsed as datetimes.
data = pd.read_csv(filename_rte).set_index("Date")
data.index = pd.to_datetime(data.index)
# Drop every row containing a missing value (zeros are left untouched),
# then discard the last remaining row.
data = data.dropna().iloc[:-1]
data

In [None]:
# Print the column names, dtypes and non-null counts of the cleaned table.
data.info()

In [None]:
# Bar chart of the mean of every remaining column, largest first.
# The TEMPO label and the two weather columns are excluded beforehand.
data.drop(columns=["Type_de_jour_TEMPO", "sun", "wind"]).mean().sort_values(
    ascending=False
).plot.bar(figsize=(15, 5))

## Some investigations

In [None]:
# Time series of the "Eolien" and "Solaire" columns on a single axis.
ax1 = data[["Eolien", "Solaire"]].plot()
ax1.set(xlabel="Date", ylabel="MW")
ax1.set_title("Wind and Solar Power daily Production in France")

In [None]:
# Wind speed on the left y-axis, solar flux on a secondary (right) y-axis.
ax1 = data["wind"].plot()
data["sun"].plot(ax=ax1, secondary_y=True)
# pandas attaches the twin axis it created to the primary axis as ``right_ax``.
ax_sun = ax1.right_ax

# Each y-label is set on its own axis. Previously both labels were applied to
# the same (secondary) axis, so the wind-speed label was overwritten and the
# left axis stayed unlabelled.
ax1.set_xlabel("Date")
ax1.set_ylabel("wind speed (m/s)", c="b")
ax_sun.set_ylabel("Solar Flux (J/m^2)", c="orange")
ax1.set_title("Wind speed and Solar Flux in France")

## Regardons les corrélations entre les données météo et la production ENR

In [None]:
# Scatter plot of the "Solaire" production against the "sun" weather column.
X_sun, Y_pv = data["sun"], data["Solaire"]

fig, ax1 = plt.subplots()
ax1.plot(X_sun, Y_pv, "o", label="Data")
ax1.set(
    xlabel="Daily Solar irradiance (J/m^2)",
    ylabel="Daily Solar production (MWh)",
    title="Solar production vs Solar irradiance",
)
# Anchor both axes at zero.
ax1.set_ylim(bottom=0)
ax1.set_xlim(left=0)
ax1.grid()
# Remove the top and right borders of the frame.
ax1.spines["top"].set_visible(False)
ax1.spines["right"].set_visible(False)

In [None]:
# Scatter plot of the "Eolien" production against the "wind" weather column.
# NOTE: the names X_sun / Y_pv are carried over from the solar cell; here they
# hold the wind speed and the wind production respectively.
X_sun, Y_pv = data["wind"], data["Eolien"]

fig, ax1 = plt.subplots()
ax1.plot(X_sun, Y_pv, "o", label="Data")
ax1.set(
    xlabel="Daily mean wind speed (m/s)",
    ylabel="Daily Wind production (MWh)",
    title="Eolian production vs Wind speed",
)
# Anchor both axes at zero.
ax1.set_ylim(bottom=0)
ax1.set_xlim(left=0)
ax1.grid()
# Remove the top and right borders of the frame.
ax1.spines["top"].set_visible(False)
ax1.spines["right"].set_visible(False)

In [None]:
# Fossil generation: sum of the gas, coal and fuel-oil columns.
data["Fossile"] = data["Gaz"] + data["Charbon"] + data["Fioul"]
# Everything that is neither wind nor solar: fossil, nuclear, hydro,
# bio-energies and the "Pompage" column.
data["nonENR"] = (
    data["Fossile"]
    + data["Nucléaire"]
    + data["Hydraulique"]
    + data["Bioénergies"]
    + data["Pompage"]
)
# In this notebook "ENR" covers wind and solar only.
data["ENR"] = data["Eolien"] + data["Solaire"]

# Prediction of the ENR production

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

In [None]:
# Date used to split the data into a training part and a test part.
train_date = "2023-12-31"

In [None]:
# Chronological split: rows strictly before train_date are used for training,
# rows on or after train_date (the split day included) for testing.
data_train = data[data.index < train_date]
data_test = data[data.index >= train_date]

## Prédiction du solaire



In [None]:
def fit_model(data_train, data_test, features, target):
    """Fit a linear regression on the training data and evaluate it.

    Parameters
    ----------
    data_train, data_test
        Tables indexable by column label (the notebook passes pandas
        DataFrames).
    features
        List of column labels used as regressors.
    target
        Label of the column to predict.

    Returns
    -------
    tuple
        ``(model, mape, rmse, score)`` where ``model`` is the fitted
        ``LinearRegression``, ``mape`` and ``rmse`` are computed on the test
        data, and ``score`` is ``model.score`` evaluated on the TRAINING data.
    """
    regressor = LinearRegression().fit(data_train[features], data_train[target])

    expected = data_test[target]
    predicted = regressor.predict(data_test[features])

    test_mape = mean_absolute_percentage_error(expected, predicted)
    test_rmse = np.sqrt(mean_squared_error(expected, predicted))
    # Note: this is the fit quality on the training set, not a test metric.
    train_score = regressor.score(data_train[features], data_train[target])
    return regressor, test_mape, test_rmse, train_score

In [None]:
# Linear model predicting the "Solaire" production from the "sun" column.
model_sun, mape_sun, rmse_sun, score_sun = fit_model(
    data_train, data_test, ["sun"], "Solaire"
)

# "Score" is model.score on the training split; the value printed as "MARE" is
# the mean absolute percentage error on the test split (see fit_model).
print(f"Score: {score_sun}")
print(f"MARE: {mape_sun}")

In [None]:
def plot_model(data_train, data_test, model, features, target):
    """Plot the quality of a fitted model on two side-by-side panels.

    Left panel: predicted versus true values of ``target`` for the training
    and test data. Right panel: the full ``target`` time series with the test
    predictions overlaid and a dashed vertical line at the train/test split.

    This function also reads two notebook globals: ``data`` (the full table,
    used for the "Data" curve) and ``train_date`` (position of the split line).

    Parameters
    ----------
    data_train, data_test
        Training and test tables (pandas DataFrames in this notebook).
    model
        Fitted estimator exposing ``predict``.
    features
        List of column labels fed to ``model.predict``.
    target
        Label of the predicted column; also used in titles and axis labels.
    """
    fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(15, 5))
    ax1.scatter(data_train[target], model.predict(data_train[features]), label="train")
    ax1.scatter(data_test[target], model.predict(data_test[features]), label="test")
    ax1.legend()
    ax1.set_xlabel(f"True {target} production")
    ax1.set_ylabel(f"Predicted {target} production")
    ax1.set_title(f"{target} production prediction \n Train vs Test")

    # Scale factor applied before plotting; the y-label below reports GWh
    # (presumably the columns are in MWh — TODO confirm).
    factor = 1e-3
    ax2.plot(data.index, data[target] * factor, label="Data")
    ax2.plot(
        data_test.index, model.predict(data_test[features]) * factor, label="Prediction"
    )

    # Pass the timestamp itself and let the date axis convert it. The previous
    # manual conversion to "days since 1970-01-01" only matched matplotlib's
    # date numbers under the default epoch setting.
    split_date = pd.to_datetime(train_date)
    ax2.axvline(split_date, color="k", linestyle="--", label="Train/Test split date")
    ax2.legend()
    ax2.set_title(f"Prediction de la production {target} quotidienne")
    ax2.set_ylabel(f"{target} production (GWh)")
    fig.autofmt_xdate()


# Diagnostic plots for the solar model.
plot_model(data_train, data_test, model_sun, ["sun"], "Solaire")

## conclusion

La production solaire est sous-estimée en 2024, ce qui est certainement lié à une croissance du nombre d'installations.

## Wind

In [None]:
# Linear model predicting the "Eolien" production from the "wind" column.
model_wind, mape_wind, rmse_wind, score_wind = fit_model(
    data_train, data_test, ["wind"], "Eolien"
)

# "Score" is model.score on the training split; the value printed as "MARE" is
# the mean absolute percentage error on the test split (see fit_model).
print(f"Score: {score_wind}")
print(f"MARE: {mape_wind}")

In [None]:
# Diagnostic plots for the wind model.
plot_model(data_train, data_test, model_wind, ["wind"], "Eolien")

# Somme des ENR

Peut-on fitter directement la somme des deux ?

In [None]:
# Linear model predicting the combined "ENR" column from both weather columns.
model_enr, mape_enr, rmse_enr, score_enr = fit_model(
    data_train, data_test, ["sun", "wind"], "ENR"
)

# "Score" is model.score on the training split; the value printed as "MARE" is
# the mean absolute percentage error on the test split (see fit_model).
print(f"Score: {score_enr}")
print(f"MARE: {mape_enr}")

In [None]:
# Diagnostic plots for the combined wind + solar model.
plot_model(data_train, data_test, model_enr, ["sun", "wind"], "ENR")

## conclusion

La production éolienne est bien plus importante que la production solaire, donc on retrouve principalement la même performance que le modèle vent seul.

Ce n'est pas si mauvais, étant donné la faible qualité des données d'entrée.

# Prediction of the fossil energy from weather data


In [None]:
# Predict the "Fossile" column from the weather columns plus "Prévision_J-1".
# The same feature list is used for fitting and for plotting.
_fossile_features = ["sun", "wind", "Prévision_J-1"]

model_fossile, mape_fossile, rmse_fossile, score_fossile = fit_model(
    data_train, data_test, _fossile_features, "Fossile"
)

print(f"Score: {score_fossile}")
print(f"MARE: {mape_fossile}")

plot_model(data_train, data_test, model_fossile, _fossile_features, "Fossile")

In [None]:
# Predict the "nonENR" column from the weather columns plus "Prévision_J-1".
# The same feature list is used for fitting and for plotting.
_nonenr_features = ["sun", "wind", "Prévision_J-1"]

model_nonenr, mape_nonenr, rmse_nonenr, score_nonenr = fit_model(
    data_train, data_test, _nonenr_features, "nonENR"
)

print(f"Score: {score_nonenr}")
print(f"MARE: {mape_nonenr}")

plot_model(data_train, data_test, model_nonenr, _nonenr_features, "nonENR")

## Conclusion

La prévision des énergies fossiles est très mauvaise, en particulier par rapport à la prédiction de la production non-ENR.
Cela est en particulier dû à l'importance du nucléaire et de l'hydraulique dans le mix électrique français, deux sources d'énergie décarbonées mais contrôlables (indépendantes de la météo).

## Discussion

Possibilités pour améliorer la prédiction :

1. Le nucléaire et l'hydraulique peuvent avoir une certaine inertie dans leur production. Inclure un décalage temporel (lag) peut potentiellement permettre de mieux prédire leur production future.
2. La prédiction ENR est elle-même mauvaise... Pour l'améliorer, il peut être intéressant de réduire la zone géographique d'agrégation, par exemple au niveau de la région, afin de prendre en compte les variations locales de la météo.


# Prédiction des classes Tempo

In [None]:
# Flag weekend rows: 1 when the index weekday is 5 or 6 (Saturday/Sunday in
# pandas' Monday=0 convention), 0 otherwise.
data["weekend"] = data.index.weekday.isin([5, 6]).astype(int)

In [None]:
import seaborn as sns

# Pairwise scatter plots of the weather columns and "Prévision_J-1" for
# non-weekend rows only, coloured by TEMPO day type.
sns.pairplot(
    data.loc[
        data["weekend"] == 0,
        ["wind", "sun", "Prévision_J-1", "Type_de_jour_TEMPO"],
    ],
    hue="Type_de_jour_TEMPO",
)

In [None]:
from sklearn.svm import SVC

In [None]:
# Classification setup: integer encoding of the TEMPO colours, input columns
# and the chronological split date.
code = {"BLEU": 0, "BLANC": 1, "ROUGE": 2}
features = ["sun", "wind", "Prévision_J-1", "weekend"]
train_date = "2023-12-31"

data["target"] = data["Type_de_jour_TEMPO"].map(code)

# Only non-weekend rows are kept on both sides of the split.
data_train = data.loc[(data.index < train_date) & (data["weekend"] == 0)]
data_test = data.loc[(data.index >= train_date) & (data["weekend"] == 0)]

display(data_train[features + ["target"]])
display(data_test["target"].value_counts())

# Support-vector classifier with class weights balanced by class frequency.
model_tempo = SVC(class_weight="balanced")
model_tempo.fit(data_train[features], data_train["target"])

In [None]:
# Score of the balanced SVC on the test split, as returned by model.score.
score = model_tempo.score(data_test[features], data_test["target"])
print("Score: ", score)

In [None]:
# Confusion matrix of the balanced SVC on the test split.
prevision = model_tempo.predict(data_test[features])
true_class = data_test["target"]

# Rows are the true classes and columns the predicted ones. The integer codes
# (0, 1, 2) double as positional indices into the table.
performance_matrix = pd.DataFrame(
    index=["True Blue", "True White", "True Red"],
    columns=["Pred Blue", "Pred White", "Pred Red"],
)
for code_true in code.values():
    for code_pred in code.values():
        performance_matrix.iloc[code_true, code_pred] = (
            (prevision == code_pred) & (true_class == code_true)
        ).sum()

In [None]:
# Display the confusion matrix (rows: true class, columns: predicted class).
performance_matrix

In [None]:
# Same SVC without class re-weighting.
model_tempo_unbalanced = SVC(class_weight=None).fit(
    data_train[features], data_train["target"]
)
score = model_tempo_unbalanced.score(data_test[features], data_test["target"])
print("Score: ", score)

# Refill the existing performance_matrix in place. true_class is the test
# target defined when the matrix was first built.
prevision = model_tempo_unbalanced.predict(data_test[features])
for code_true in code.values():
    for code_pred in code.values():
        performance_matrix.iloc[code_true, code_pred] = (
            (prevision == code_pred) & (true_class == code_true)
        ).sum()
performance_matrix

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree without class re-weighting.
model_tempo_tree = DecisionTreeClassifier(class_weight=None).fit(
    data_train[features], data_train["target"]
)
score = model_tempo_tree.score(data_test[features], data_test["target"])
print("Score: ", score)

# Refill the existing performance_matrix in place. true_class is the test
# target defined when the matrix was first built.
prevision = model_tempo_tree.predict(data_test[features])
for code_true in code.values():
    for code_pred in code.values():
        performance_matrix.iloc[code_true, code_pred] = (
            (prevision == code_pred) & (true_class == code_true)
        ).sum()
performance_matrix

In [None]:
# Decision tree with class weights balanced by class frequency.
model_tempo_tree_balanced = DecisionTreeClassifier(class_weight="balanced").fit(
    data_train[features], data_train["target"]
)
score = model_tempo_tree_balanced.score(data_test[features], data_test["target"])
print("Score: ", score)

# Refill the existing performance_matrix in place. true_class is the test
# target defined when the matrix was first built.
prevision = model_tempo_tree_balanced.predict(data_test[features])
for code_true in code.values():
    for code_pred in code.values():
        performance_matrix.iloc[code_true, code_pred] = (
            (prevision == code_pred) & (true_class == code_true)
        ).sum()
performance_matrix

In [None]:
# "Cheating" variant: the balanced decision tree is fed the measured ENR
# production instead of the weather columns.
# NOTE: this rebinds model_tempo_tree_balanced, replacing the previous model.
features_cheating = ["ENR", "Prévision_J-1"]
model_tempo_tree_balanced = DecisionTreeClassifier(class_weight="balanced").fit(
    data_train[features_cheating], data_train["target"]
)
score = model_tempo_tree_balanced.score(
    data_test[features_cheating], data_test["target"]
)
print("Score: ", score)

# Refill the existing performance_matrix in place. true_class is the test
# target defined when the matrix was first built.
prevision = model_tempo_tree_balanced.predict(data_test[features_cheating])
for code_true in code.values():
    for code_pred in code.values():
        performance_matrix.iloc[code_true, code_pred] = (
            (prevision == code_pred) & (true_class == code_true)
        ).sum()
performance_matrix

# Conclusion

La classification supervisée des jours tempo ne marche pas si directement que ça.
Lorsqu'on regarde le pairplot, on voit que les classes sont très mélangées. 

De plus, il y a beaucoup plus de jours bleus que de jours rouges et blancs (jeu de données déséquilibré).

Ainsi, si on ne corrige pas le déséquilibre des classes, le modèle SVM ne prédit que des jours bleus. Si on le corrige, le modèle SVM prédit beaucoup trop de jours rouges.

Un modèle de type Decision Tree est un peu plus nuancé, mais avec un score pas si bon que ça.

Même en trichant, et en utilisant les véritables données de production ENR mesurées, la prédiction laisse à désirer...

## Discussion

Une solution serait d'implémenter directement le modèle utilisé par RTE : https://www.services-rte.com/files/live/sites/services-rte/files/pdf/20160106_Methode_de_choix_des_jours_Tempo.pdf