# Other General Data Prep

## 2025 Regional Data Prep

### Consumption Data - "reg_2025_cons.csv"

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
df_reg_2025 = pd.read_csv(r"C:\Users\Henri\Documents\GitHub\Predi_Conso_Elec_Region\Predi_Conso_Elec_Region\data\eco2mix-regional-tr_2025_reg.csv", sep=";", encoding="utf-8")

In [None]:
df_filtered = df_original[(df_original['Région'] == 'Auvergne-Rhône-Alpes') & (df_original['Datetime'].dt.year >= 2023)].copy()

In [3]:
# Use the "date_heure" field as the timestamp. Convert it to datetime.
df_reg_2025["Datetime"] = pd.to_datetime(df_reg_2025["Date - Heure"], utc=True)

# Select only the needed columns (e.g., datetime, consommation, and optionally region)
df_reg_2025 = df_reg_2025[["Datetime", "Consommation (MW)", "Région"]].copy()

# Sort by datetime
df_reg_2025.sort_values("Datetime", inplace=True)

# Then remove the timezone information (e.g., "+00:00")
df_reg_2025["Datetime"] = df_reg_2025["Datetime"].dt.tz_convert("Europe/Paris")

df_reg_2025.set_index("Datetime", inplace=True)


In [None]:
df_reg_2025.to_csv("reg_2025_cons.csv")

: 

### Temperature Data - "reg_2025_temperature.csv"

In [1]:
import pandas as pd
import os

#### Concatenating Monthly Real Time Values (01, 02, 03 2025)

In [2]:

data_dir = 'C:\\Users\\Henri\\Documents\\Data Science Bootcamp\\Projet File Rouge\\Energy\\MeteoFrance\\MeteoFrance\\synop_2025'

df_list = []

for filename in os.listdir(data_dir):
    if filename.endswith('.csv'):
        file_path = os.path.join(data_dir, filename)
        df = pd.read_csv(file_path)
        df_list.append(df)

combined_df = pd.concat(df_list, ignore_index=True)

combined_df.to_csv('2025_weather_data.csv', index=False)

In [None]:
directory = os.path.dirname('2025_weather_data.csv')
print(directory)




In [None]:
file_path = os.path.abspath('2025_weather_data.csv')
print(f"Full path : {file_path}")

Full path : c:\Users\Henri\Documents\GitHub\Predi_Conso_Elec_Region\Predi_Conso_Elec_Region\notebooks\2025_weather_data.csv


In [31]:
wdata_2025 = pd.read_csv('2025_weather_data.csv',  sep=";")

In [25]:
wdata_2025.describe

<bound method NDFrame.describe of        numer_sta            date    pmer  tend cod_tend   dd         ff  \
0           7005  20250101000000  102190  -160        8  200   2.500000   
1           7015  20250101000000  102060  -220        8  200  11.900000   
2           7020  20250101000000  101980  -150        7  200  15.000000   
3           7027  20250101000000  102340  -160        8  190   8.600000   
4           7037  20250101000000  102480  -140        8  210   9.800000   
...          ...             ...     ...   ...      ...  ...        ...   
45600      81401  20250407120000  101310   200        3  250   0.700000   
45601      81405  20250407120000  101310   210        3  140   2.500000   
45602      81408  20250407120000  101320   260        2  200   1.400000   
45603      81415  20250407120000      mq   230        2   40   1.400000   
45604      89642  20250407120000   97470    40        1  140   9.300000   

                t          td   u  ... nnuage2 ctype2 hnuage2 nnu

#### Processing Dataset

##### Maladapted code for current dictionary format

In [None]:
from utils.dictionaries import weather_stations

wdata_2025.rename(columns={'numer_sta':"Station_ID"}, inplace=True)

wdata_2025["Station_ID"] = wdata_2025["Station_ID"].astype(str).str.zfill(5)

wdata_2025["Region"] = wdata_2025["Station_ID"].map(lambda x: weather_stations.get(str(x), ("Unknown", "Unknown"))[0])
wdata_2025["Station Name"] = wdata_2025["Station_ID"].map(lambda x: weather_stations.get(str(x), ("Unknown", "Unknown"))[1])


##### Fixed code for current dictionary format

In [None]:
from utils.dictionaries import weather_stations

# Step 1: Build a flat mapping from Station_ID to (Region, Station Name)
station_lookup = {}
for region, stations in weather_stations.items():
    for station in stations:
        station_id = station["ID"]
        station_name = station["Nom"]
        station_lookup[station_id] = (region, station_name)

# Step 2: Normalize station IDs in your dataset
wdata_2025.rename(columns={'numer_sta': "Station_ID"}, inplace=True)
wdata_2025["Station_ID"] = wdata_2025["Station_ID"].astype(str).str.zfill(5)

# Step 3: Map region and station name from the new lookup dictionary
wdata_2025["Region"] = wdata_2025["Station_ID"].map(lambda x: station_lookup.get(x, ("Unknown", "Unknown"))[0])
wdata_2025["Station Name"] = wdata_2025["Station_ID"].map(lambda x: station_lookup.get(x, ("Unknown", "Unknown"))[1])


In [34]:

wdata_2025.to_csv("updated_weather_data.csv", index=False)

In [33]:
selected_columns = ["Station_ID", "Station Name", "Region", "date", "t"]
wdata_filtered = wdata_2025[selected_columns].copy()
print(wdata_filtered.head())

  Station_ID     Station Name           Region            date           t
0      07005        ABBEVILLE  Hauts-de-France  20250101000000  281.150000
1      07015    LILLE-LESQUIN  Hauts-de-France  20250101000000  280.950000
2      07020  PTE DE LA HAGUE        Normandie  20250101000000  284.050000
3      07027   CAEN-CARPIQUET        Normandie  20250101000000  280.350000
4      07037       ROUEN-BOOS        Normandie  20250101000000  279.950000


In [35]:
wdata_filtered.describe

<bound method NDFrame.describe of       Station_ID     Station Name           Region            date           t
0          07005        ABBEVILLE  Hauts-de-France  20250101000000  281.150000
1          07015    LILLE-LESQUIN  Hauts-de-France  20250101000000  280.950000
2          07020  PTE DE LA HAGUE        Normandie  20250101000000  284.050000
3          07027   CAEN-CARPIQUET        Normandie  20250101000000  280.350000
4          07037       ROUEN-BOOS        Normandie  20250101000000  279.950000
...          ...              ...              ...             ...         ...
45600      81401          Unknown          Unknown  20250407120000  298.250000
45601      81405          Unknown          Unknown  20250407120000  300.450000
45602      81408          Unknown          Unknown  20250407120000  300.150000
45603      81415          Unknown          Unknown  20250407120000  297.750000
45604      89642          Unknown          Unknown  20250407120000  257.350000

[45605 rows x 5 c

In [36]:
import numpy as np

In [37]:
# Replace "mq" with NaN
wdata_filtered["t"] = wdata_filtered["t"].replace("mq", np.nan)

In [38]:
wdata_filtered.isna().sum()

Station_ID         0
Station Name       0
Region             0
date               0
t               1622
dtype: int64

In [39]:
nan_by_region = wdata_filtered[wdata_filtered["t"].isna()].groupby("Region").size()

In [40]:
print(nan_by_region)

Region
Auvergne-Rhône-Alpes            7
Bretagne                       49
Normandie                      53
Occitanie                       1
Provence-Alpes-Côte d'Azur    761
Unknown                       751
dtype: int64


In [41]:
nan_by_station = wdata_filtered[wdata_filtered["t"].isna()].groupby("Station Name").size()

In [42]:
print(nan_by_station)

Station Name
BELLE ILE-LE TALUT     49
CAP CEPET             761
MONTELIMAR              7
PTE DE LA HAGUE        53
TOULOUSE-BLAGNAC        1
Unknown               751
dtype: int64


In [43]:
len(wdata_filtered)

45605

In [44]:
# Convert temperature from Kelvin to Celsius (ignoring NaN values)
wdata_filtered["t"] = wdata_filtered["t"].astype(float) - 273.15

In [45]:
print(wdata_filtered.head())

  Station_ID     Station Name           Region            date     t
0      07005        ABBEVILLE  Hauts-de-France  20250101000000   8.0
1      07015    LILLE-LESQUIN  Hauts-de-France  20250101000000   7.8
2      07020  PTE DE LA HAGUE        Normandie  20250101000000  10.9
3      07027   CAEN-CARPIQUET        Normandie  20250101000000   7.2
4      07037       ROUEN-BOOS        Normandie  20250101000000   6.8


In [46]:
# Convert to datetime format
wdata_filtered["Datetime"] = pd.to_datetime(wdata_filtered["date"], format="%Y%m%d%H%M%S")

In [47]:
wdata_filtered.head()

Unnamed: 0,Station_ID,Station Name,Region,date,t,Datetime
0,7005,ABBEVILLE,Hauts-de-France,20250101000000,8.0,2025-01-01
1,7015,LILLE-LESQUIN,Hauts-de-France,20250101000000,7.8,2025-01-01
2,7020,PTE DE LA HAGUE,Normandie,20250101000000,10.9,2025-01-01
3,7027,CAEN-CARPIQUET,Normandie,20250101000000,7.2,2025-01-01
4,7037,ROUEN-BOOS,Normandie,20250101000000,6.8,2025-01-01


In [48]:
# Drop "date" column
wdata_filtered = wdata_filtered.drop(columns=["date"])

In [49]:
# Rename the month column for clarity
wdata_filtered.rename(columns={"Region": "Région"}, inplace=True)

In [50]:
# Convert "t" to numeric, coerce non-numeric values to NaN
wdata_filtered["t"] = pd.to_numeric(wdata_filtered["t"], errors="coerce")

# Check for non-numeric values
print(wdata_filtered["t"].isna().sum())  # Count NaN values


1622


In [51]:
# Group by "Région" and "Datetime" and compute the mean for "t"
wdata_grouped = wdata_filtered.groupby(["Région", "Datetime"], as_index=False)["t"].mean()

# Verify the output
print(wdata_grouped.head(10))


                 Région            Datetime       t
0  Auvergne-Rhône-Alpes 2025-01-01 00:00:00  -2.775
1  Auvergne-Rhône-Alpes 2025-01-01 03:00:00  -2.925
2  Auvergne-Rhône-Alpes 2025-01-01 06:00:00  -2.300
3  Auvergne-Rhône-Alpes 2025-01-01 09:00:00   2.300
4  Auvergne-Rhône-Alpes 2025-01-01 12:00:00   8.550
5  Auvergne-Rhône-Alpes 2025-01-01 15:00:00  11.050
6  Auvergne-Rhône-Alpes 2025-01-01 18:00:00   6.000
7  Auvergne-Rhône-Alpes 2025-01-01 21:00:00   5.500
8  Auvergne-Rhône-Alpes 2025-01-02 00:00:00   5.075
9  Auvergne-Rhône-Alpes 2025-01-02 03:00:00   4.375


In [52]:
# Ensure 'Datetime' is present and not an index
if "Datetime" not in wdata_grouped.columns:
    wdata_grouped.reset_index(inplace=True)  # Move 'Datetime' back as a column if it's an index

# Convert 'Datetime' to proper datetime format
wdata_grouped["Datetime"] = pd.to_datetime(wdata_grouped["Datetime"])

# Remove multi-index issues by resetting everything
wdata_grouped = wdata_grouped.reset_index(drop=True)  # Ensure a clean dataframe

# Set Datetime as index (for resampling)
wdata_grouped.set_index("Datetime", inplace=True)

# Perform resampling per region
wdata_resampled = (
    wdata_grouped.groupby("Région", group_keys=False)  # Group by region but avoid inserting it twice
    .resample("15min")  # Resample to 15-minute intervals
    .interpolate(method="linear")  # Interpolate missing values
)

# Reset index while ensuring 'Région' doesn't get duplicated
wdata_resampled.reset_index(inplace=True)

# Verify results
print(wdata_resampled.head(100))


              Datetime                Région         t
0  2025-01-01 00:00:00  Auvergne-Rhône-Alpes -2.775000
1  2025-01-01 00:15:00                   NaN -2.787500
2  2025-01-01 00:30:00                   NaN -2.800000
3  2025-01-01 00:45:00                   NaN -2.812500
4  2025-01-01 01:00:00                   NaN -2.825000
..                 ...                   ...       ...
95 2025-01-01 23:45:00                   NaN  5.110417
96 2025-01-02 00:00:00  Auvergne-Rhône-Alpes  5.075000
97 2025-01-02 00:15:00                   NaN  5.016667
98 2025-01-02 00:30:00                   NaN  4.958333
99 2025-01-02 00:45:00                   NaN  4.900000

[100 rows x 3 columns]


  .interpolate(method="linear")  # Interpolate missing values


In [53]:
# Fill NaN values in "Région" column using forward fill
wdata_resampled["Région"] = wdata_resampled["Région"].ffill()

# Verify that NaN values are filled
print(wdata_resampled["Région"].isna().sum())  # Should return 0 if all NaNs are filled
print(wdata_resampled.head(10))  # Check results


0
             Datetime                Région       t
0 2025-01-01 00:00:00  Auvergne-Rhône-Alpes -2.7750
1 2025-01-01 00:15:00  Auvergne-Rhône-Alpes -2.7875
2 2025-01-01 00:30:00  Auvergne-Rhône-Alpes -2.8000
3 2025-01-01 00:45:00  Auvergne-Rhône-Alpes -2.8125
4 2025-01-01 01:00:00  Auvergne-Rhône-Alpes -2.8250
5 2025-01-01 01:15:00  Auvergne-Rhône-Alpes -2.8375
6 2025-01-01 01:30:00  Auvergne-Rhône-Alpes -2.8500
7 2025-01-01 01:45:00  Auvergne-Rhône-Alpes -2.8625
8 2025-01-01 02:00:00  Auvergne-Rhône-Alpes -2.8750
9 2025-01-01 02:15:00  Auvergne-Rhône-Alpes -2.8875


In [54]:
wdata_resampled.to_csv("reg_2025_temperature.csv", index=False)

## Checking for elec consumption data for PACA Region in raw dataset

## Processing 2023 and 2024 PACA consumption data 

In [1]:
import pandas as pd

In [35]:
df_paca = pd.read_csv(r"C:\Users\Henri\Documents\GitHub\Predi_Conso_Elec_Region\Predi_Conso_Elec_Region\data\eco2mix-regional-cons-def.csv", sep=';', encoding="utf-8")

  df_paca = pd.read_csv(r"C:\Users\Henri\Documents\GitHub\Predi_Conso_Elec_Region\Predi_Conso_Elec_Region\data\eco2mix-regional-cons-def.csv", sep=';', encoding="utf-8")


In [36]:
df_paca = df_paca[df_paca["Région"] == "Provence-Alpes-Côte d'Azur"].copy()

In [37]:
df_paca["Datetime"] = pd.to_datetime(df_paca["Date - Heure"], utc=True)

In [18]:
df_paca.columns

Index(['Code INSEE région', 'Région', 'Nature', 'Date', 'Heure',
       'Date - Heure', 'Consommation (MW)', 'Thermique (MW)', 'Nucléaire (MW)',
       'Eolien (MW)', 'Solaire (MW)', 'Hydraulique (MW)', 'Pompage (MW)',
       'Bioénergies (MW)', 'Ech. physiques (MW)', 'Stockage batterie',
       'Déstockage batterie', 'Eolien terrestre', 'Eolien offshore',
       'TCO Thermique (%)', 'TCH Thermique (%)', 'TCO Nucléaire (%)',
       'TCH Nucléaire (%)', 'TCO Eolien (%)', 'TCH Eolien (%)',
       'TCO Solaire (%)', 'TCH Solaire (%)', 'TCO Hydraulique (%)',
       'TCH Hydraulique (%)', 'TCO Bioénergies (%)', 'TCH Bioénergies (%)',
       'Column 30', 'Datetime'],
      dtype='object')

In [38]:
df_paca = df_paca[["Région", "Datetime", "Consommation (MW)"]]

In [40]:
df_paca["Datetime"] = pd.to_datetime(df_paca["Datetime"]).dt.tz_convert("Europe/Paris").copy()

In [41]:
df_paca = df_paca[df_paca["Datetime"].dt.year >= 2023].copy()

In [42]:
df_paca.head()

Unnamed: 0,Région,Datetime,Consommation (MW)
2103557,Provence-Alpes-Côte d'Azur,2023-01-01 00:00:00+01:00,4724.0
2103564,Provence-Alpes-Côte d'Azur,2023-01-01 00:30:00+01:00,4392.0
2103583,Provence-Alpes-Côte d'Azur,2023-01-01 01:00:00+01:00,4372.0
2103593,Provence-Alpes-Côte d'Azur,2023-01-01 01:30:00+01:00,4392.0
2103603,Provence-Alpes-Côte d'Azur,2023-01-01 02:00:00+01:00,4516.0


In [43]:
len(df_paca)

1488

In [44]:
df_paca.tail()

Unnamed: 0,Région,Datetime,Consommation (MW)
2121354,Provence-Alpes-Côte d'Azur,2023-01-31 21:30:00+01:00,6088.0
2121360,Provence-Alpes-Côte d'Azur,2023-01-31 22:00:00+01:00,5827.0
2121379,Provence-Alpes-Côte d'Azur,2023-01-31 22:30:00+01:00,5878.0
2121389,Provence-Alpes-Côte d'Azur,2023-01-31 23:00:00+01:00,6176.0
2121406,Provence-Alpes-Côte d'Azur,2023-01-31 23:30:00+01:00,6139.0


In [57]:
df_paca_2024 = pd.read_csv(r"C:\Users\Henri\Documents\GitHub\Predi_Conso_Elec_Region\Predi_Conso_Elec_Region\data\eco2mix-regional-tr.csv", sep=";", encoding="utf-8")

In [58]:
len(df_paca_2024)

758652

In [None]:
df_paca_2024 = df_paca_2024[df_paca_2024["Région"] == "Provence-Alpes-Côte d'Azur"].copy()

In [None]:
df_paca_2024["Datetime"] = pd.to_datetime(df_paca_2024["Date - Heure"], utc=True)

In [63]:
df_paca_2024 = df_paca_2024.sort_values(by="Datetime").reset_index(drop=True)

In [64]:
df_paca_2024.head()

Unnamed: 0,Code INSEE région,Région,Nature,Date,Heure,Date - Heure,Consommation (MW),Thermique (MW),Nucléaire (MW),Eolien (MW),...,TCO Eolien (%),TCH Eolien (%),TCO Solaire (%),TCH Solaire (%),TCO Hydraulique (%),TCH Hydraulique (%),TCO Bioénergies (%),TCH Bioénergies (%),Column 68,Datetime
0,93,Provence-Alpes-Côte d'Azur,Données temps réel,2023-02-01,00:00,2023-02-01T00:00:00+01:00,5902.0,533.0,,42.0,...,0.71,43.3,0.0,0.0,30.19,54.45,1.54,29.93,,2023-01-31 23:00:00+00:00
1,93,Provence-Alpes-Côte d'Azur,Données temps réel,2023-02-01,00:15,2023-02-01T00:15:00+01:00,5807.0,616.0,,41.0,...,0.71,42.27,0.0,0.0,30.69,54.45,1.57,29.93,,2023-01-31 23:15:00+00:00
2,93,Provence-Alpes-Côte d'Azur,Données temps réel,2023-02-01,00:30,2023-02-01T00:30:00+01:00,5660.0,663.0,,39.0,...,0.69,40.21,0.0,0.0,30.87,53.38,1.61,29.93,,2023-01-31 23:30:00+00:00
3,93,Provence-Alpes-Côte d'Azur,Données temps réel,2023-02-01,00:45,2023-02-01T00:45:00+01:00,5632.0,707.0,,39.0,...,0.69,40.21,0.0,0.0,29.65,51.02,1.63,30.26,,2023-01-31 23:45:00+00:00
4,93,Provence-Alpes-Côte d'Azur,Données temps réel,2023-02-01,01:00,2023-02-01T01:00:00+01:00,5539.0,772.0,,38.0,...,0.69,39.18,0.0,0.0,30.44,51.51,1.7,30.92,,2023-02-01 00:00:00+00:00


In [66]:
df_paca_2024 =df_paca_2024[["Région", "Datetime", "Consommation (MW)"]]

In [68]:
df_paca_2024["Datetime"] = pd.to_datetime(df_paca_2024["Datetime"]).dt.tz_convert("Europe/Paris").copy()

In [69]:
df_paca_2024 = df_paca_2024[df_paca_2024["Datetime"].dt.year < 2025].copy()

In [70]:
df_paca_2024.tail()

Unnamed: 0,Région,Datetime,Consommation (MW)
67195,Provence-Alpes-Côte d'Azur,2024-12-31 22:45:00+01:00,5471.0
67196,Provence-Alpes-Côte d'Azur,2024-12-31 23:00:00+01:00,5596.0
67197,Provence-Alpes-Côte d'Azur,2024-12-31 23:15:00+01:00,5876.0
67198,Provence-Alpes-Côte d'Azur,2024-12-31 23:30:00+01:00,5791.0
67199,Provence-Alpes-Côte d'Azur,2024-12-31 23:45:00+01:00,5718.0


In [71]:
df_paca.to_csv(r"paca_cons_2023.csv", index=False)

In [72]:
df_paca_2024.to_csv(r"paca_cons_2024.csv", index=False)

## Concatenating 2023 and 2024 PACA consumption data

In [148]:
import pandas as pd
import os

# Define the filenames
csv_files = [
    r"paca_cons_2023.csv",
    r"paca_cons_2024.csv"
]


In [149]:
# Load and concatenate them
dfs = [pd.read_csv(file, parse_dates=["Datetime"]) for file in csv_files]

In [150]:
paca_cons_2023_2024 = pd.concat(dfs)

# Optional: sort by time
paca_cons_2023_2024 = paca_cons_2023_2024.sort_values(by="Datetime").reset_index(drop=True)

In [151]:
paca_cons_2023_2024.columns

Index(['Région', 'Datetime', 'Consommation (MW)'], dtype='object')

In [152]:
paca_cons_2023_2024["Datetime"] = pd.to_datetime(paca_cons_2023_2024["Datetime"], utc=True).copy()

In [153]:
paca_cons_2023_2024["Datetime"] = pd.to_datetime(paca_cons_2023_2024["Datetime"]).dt.tz_convert("Europe/Paris").copy()

In [154]:
paca_cons_2023_2024.head()

Unnamed: 0,Région,Datetime,Consommation (MW)
0,Provence-Alpes-Côte d'Azur,2023-01-01 00:00:00+01:00,4724.0
1,Provence-Alpes-Côte d'Azur,2023-01-01 00:30:00+01:00,4392.0
2,Provence-Alpes-Côte d'Azur,2023-01-01 01:00:00+01:00,4372.0
3,Provence-Alpes-Côte d'Azur,2023-01-01 01:30:00+01:00,4392.0
4,Provence-Alpes-Côte d'Azur,2023-01-01 02:00:00+01:00,4516.0


## Checking for and removing duplicates

In [None]:
# Check for duplicates in the 'DateTime' column
duplicate_check = paca_cons_2023_2024.duplicated(subset="Datetime")
print(paca_cons_2023_2024[duplicate_check])  # Display duplicate rows if any


                           Région                   Datetime  \
6585   Provence-Alpes-Côte d'Azur  2023-03-26 03:00:00+02:00   
6587   Provence-Alpes-Côte d'Azur  2023-03-26 03:15:00+02:00   
6589   Provence-Alpes-Côte d'Azur  2023-03-26 03:30:00+02:00   
6591   Provence-Alpes-Côte d'Azur  2023-03-26 03:45:00+02:00   
42201  Provence-Alpes-Côte d'Azur  2024-03-31 03:00:00+02:00   
42203  Provence-Alpes-Côte d'Azur  2024-03-31 03:15:00+02:00   
42205  Provence-Alpes-Côte d'Azur  2024-03-31 03:30:00+02:00   
42207  Provence-Alpes-Côte d'Azur  2024-03-31 03:45:00+02:00   

       Consommation (MW)  
6585                 0.0  
6587                 0.0  
6589                 0.0  
6591                 0.0  
42201             3788.0  
42203             3992.0  
42205             3809.0  
42207             3766.0  


In [None]:
# Check for duplicates within each region
duplicates = paca_cons_2023_2024.duplicated(subset=['Datetime'], keep=False)
if duplicates.any():
    print("Duplicate rows detected:")
    print(paca_cons_2023_2024[duplicates])


Duplicate rows detected:
                           Région                   Datetime  \
6584   Provence-Alpes-Côte d'Azur  2023-03-26 03:00:00+02:00   
6585   Provence-Alpes-Côte d'Azur  2023-03-26 03:00:00+02:00   
6586   Provence-Alpes-Côte d'Azur  2023-03-26 03:15:00+02:00   
6587   Provence-Alpes-Côte d'Azur  2023-03-26 03:15:00+02:00   
6588   Provence-Alpes-Côte d'Azur  2023-03-26 03:30:00+02:00   
6589   Provence-Alpes-Côte d'Azur  2023-03-26 03:30:00+02:00   
6590   Provence-Alpes-Côte d'Azur  2023-03-26 03:45:00+02:00   
6591   Provence-Alpes-Côte d'Azur  2023-03-26 03:45:00+02:00   
42200  Provence-Alpes-Côte d'Azur  2024-03-31 03:00:00+02:00   
42201  Provence-Alpes-Côte d'Azur  2024-03-31 03:00:00+02:00   
42202  Provence-Alpes-Côte d'Azur  2024-03-31 03:15:00+02:00   
42203  Provence-Alpes-Côte d'Azur  2024-03-31 03:15:00+02:00   
42204  Provence-Alpes-Côte d'Azur  2024-03-31 03:30:00+02:00   
42205  Provence-Alpes-Côte d'Azur  2024-03-31 03:30:00+02:00   
42206  Provence

In [157]:
# Check for duplicates within each region
duplicates = paca_cons_2023_2024.duplicated(subset=['Datetime'], keep=False)
if duplicates.any():
    print("Duplicate rows detected:")
    print(paca_cons_2023_2024[duplicates])


In [156]:
# Drop the second occurrence of each duplicated datetime
paca_cons_2023_2024 = paca_cons_2023_2024[~paca_cons_2023_2024.duplicated(subset=['Datetime'], keep='first')]


In [None]:
paca_cons_2023_2024.columns

Index(['Datetime', 'Région', 'Consommation (MW)'], dtype='object')

## Checking the number of "0.0" consumption values in PACA dataframe

In [None]:
zero_sum = paca_cons_2023_2024[paca_cons_2023_2024['Consommation (MW)'] == 0.0].shape[0]
print(f"Number of zero consumption entries: {zero_sum}")

Number of zero consumption entries: 139


In [146]:
zero_sum = paca_cons_2023_2024[paca_cons_2023_2024['Consommation (MW)'] == 0.0].shape[0]
print(f"Number of zero consumption entries: {zero_sum}")

Number of zero consumption entries: 143


In [None]:
len(paca_cons_2023_2024)

68680

In [136]:
# Convert Datetime column to date only (if it's not already)
paca_cons_2023_2024["Date"] = paca_cons_2023_2024["Datetime"].dt.date

# Filter rows where consumption is 0.0
zero_consumption = paca_cons_2023_2024[paca_cons_2023_2024["Consommation (MW)"] == 0.0]

# Count how many zero-consumption entries occurred per day
zero_counts_by_day = zero_consumption.groupby("Date").size().reset_index(name="Zero_Counts")

# Sort to see days with the most zeros first (optional)
zero_counts_by_day = zero_counts_by_day.sort_values("Zero_Counts", ascending=False)

# Display the result
print(zero_counts_by_day)


          Date  Zero_Counts
51  2023-07-11           40
36  2023-05-29           21
50  2023-07-07           16
2   2023-03-26            8
8   2023-04-09            3
20  2023-04-28            2
22  2023-04-30            2
21  2023-04-29            2
52  2023-07-16            2
27  2023-05-07            2
18  2023-04-26            2
4   2023-03-31            1
5   2023-04-05            1
9   2023-04-10            1
13  2023-04-14            1
12  2023-04-13            1
3   2023-03-29            1
1   2023-03-25            1
10  2023-04-11            1
6   2023-04-07            1
7   2023-04-08            1
0   2023-03-24            1
19  2023-04-27            1
15  2023-04-21            1
16  2023-04-23            1
17  2023-04-25            1
14  2023-04-15            1
11  2023-04-12            1
25  2023-05-03            1
24  2023-05-02            1
29  2023-05-18            1
30  2023-05-20            1
32  2023-05-24            1
31  2023-05-23            1
33  2023-05-25      

In [141]:
# Convert to datetime if needed
paca_data = paca_cons_2023_2024[paca_cons_2023_2024["Région"] == "Provence-Alpes-Côte d'Azur"].copy()
paca_data["Datetime"] = pd.to_datetime(paca_data["Datetime"])

# Filter for July 11, 2023 where consumption is zero
zero_paca_july11 = paca_data[
    (paca_data["Datetime"].dt.date == pd.to_datetime("2023-07-11").date()) &
    (paca_data["Consommation (MW)"] == 0.0)
]

# Show timestamps
print(zero_paca_july11[["Datetime", "Consommation (MW)"]])


                       Datetime  Consommation (MW)
16848 2023-07-11 00:00:00+02:00                0.0
16849 2023-07-11 00:15:00+02:00                0.0
16850 2023-07-11 00:30:00+02:00                0.0
16851 2023-07-11 00:45:00+02:00                0.0
16852 2023-07-11 01:00:00+02:00                0.0
16853 2023-07-11 01:15:00+02:00                0.0
16854 2023-07-11 01:30:00+02:00                0.0
16855 2023-07-11 01:45:00+02:00                0.0
16856 2023-07-11 02:00:00+02:00                0.0
16857 2023-07-11 02:15:00+02:00                0.0
16858 2023-07-11 02:30:00+02:00                0.0
16859 2023-07-11 02:45:00+02:00                0.0
16860 2023-07-11 03:00:00+02:00                0.0
16861 2023-07-11 03:15:00+02:00                0.0
16862 2023-07-11 03:30:00+02:00                0.0
16863 2023-07-11 03:45:00+02:00                0.0
16864 2023-07-11 04:00:00+02:00                0.0
16865 2023-07-11 04:15:00+02:00                0.0
16866 2023-07-11 04:30:00+02:00

## Counting the number of 0 values in the training data for each region

In [138]:
merged_full_df = pd.read_csv(r"C:\Users\Henri\Documents\GitHub\Predi_Conso_Elec_Region\Predi_Conso_Elec_Region\data\merged_full.csv", parse_dates=["Datetime"])

In [139]:
merged_full = merged_full_df[merged_full_df["Datetime"].dt.year >= 2023].copy()

In [142]:
# List of regions (excluding PACA)
regions = [
    "Nouvelle-Aquitaine", "Occitanie", "Île-de-France", "Auvergne-Rhône-Alpes", 
    "Grand Est", "Normandie", "Bretagne", "Hauts-de-France", "Pays de la Loire",
    "Centre-Val de Loire", "Bourgogne-Franche-Comté"
]

# Filter only for these regions
filtered_data = merged_full[merged_full["Région"].isin(regions)].copy()

# Convert to datetime if necessary
filtered_data["Datetime"] = pd.to_datetime(filtered_data["Datetime"])

# Filter zero values and extract the date part
zero_data = filtered_data[filtered_data["Consommation (MW)"] == 0.0].copy()
zero_data["Date"] = zero_data["Datetime"].dt.date

# Count zero values per day per region
zero_counts = zero_data.groupby(["Région", "Date"]).size().reset_index(name="Zero_Counts")

# View the result
print(zero_counts)


                   Région        Date  Zero_Counts
0    Auvergne-Rhône-Alpes  2023-03-26            8
1    Auvergne-Rhône-Alpes  2023-04-10            2
2    Auvergne-Rhône-Alpes  2023-04-11            1
3    Auvergne-Rhône-Alpes  2023-04-13            1
4    Auvergne-Rhône-Alpes  2023-04-14            1
..                    ...         ...          ...
317         Île-de-France  2023-07-06            1
318         Île-de-France  2023-07-07           16
319         Île-de-France  2023-07-11           40
320         Île-de-France  2023-07-16            2
321         Île-de-France  2023-12-01            1

[322 rows x 3 columns]


In [143]:
pd.set_option("display.max_rows", None)

print(zero_counts)

                      Région        Date  Zero_Counts
0       Auvergne-Rhône-Alpes  2023-03-26            8
1       Auvergne-Rhône-Alpes  2023-04-10            2
2       Auvergne-Rhône-Alpes  2023-04-11            1
3       Auvergne-Rhône-Alpes  2023-04-13            1
4       Auvergne-Rhône-Alpes  2023-04-14            1
5       Auvergne-Rhône-Alpes  2023-04-15            1
6       Auvergne-Rhône-Alpes  2023-04-21            1
7       Auvergne-Rhône-Alpes  2023-04-23            1
8       Auvergne-Rhône-Alpes  2023-04-25            1
9       Auvergne-Rhône-Alpes  2023-04-26            2
10      Auvergne-Rhône-Alpes  2023-04-27            1
11      Auvergne-Rhône-Alpes  2023-04-28            2
12      Auvergne-Rhône-Alpes  2023-04-29            1
13      Auvergne-Rhône-Alpes  2023-04-30            2
14      Auvergne-Rhône-Alpes  2023-05-02            1
15      Auvergne-Rhône-Alpes  2023-05-03            1
16      Auvergne-Rhône-Alpes  2023-05-06            1
17      Auvergne-Rhône-Alpes

## Further processing data

In [144]:
paca_cons_2023_2024.reset_index(inplace=True)

In [158]:
pd.to_datetime(paca_cons_2023_2024["Datetime"], utc=True)
paca_cons_2023_2024.set_index("Datetime", inplace=True)
paca_cons_2023_2024 = paca_cons_2023_2024.resample("15min").interpolate(method="linear")
paca_cons_2023_2024.reset_index(inplace=True)

  paca_cons_2023_2024 = paca_cons_2023_2024.resample("15min").interpolate(method="linear")


In [159]:
paca_cons_2023_2024.describe

<bound method NDFrame.describe of                        Datetime                      Région  Consommation (MW)
0     2023-01-01 00:00:00+01:00  Provence-Alpes-Côte d'Azur        4724.000000
1     2023-01-01 00:15:00+01:00                         NaN        4558.000000
2     2023-01-01 00:30:00+01:00  Provence-Alpes-Côte d'Azur        4392.000000
3     2023-01-01 00:45:00+01:00                         NaN        4382.000000
4     2023-01-01 01:00:00+01:00  Provence-Alpes-Côte d'Azur        4372.000000
5     2023-01-01 01:15:00+01:00                         NaN        4382.000000
6     2023-01-01 01:30:00+01:00  Provence-Alpes-Côte d'Azur        4392.000000
7     2023-01-01 01:45:00+01:00                         NaN        4454.000000
8     2023-01-01 02:00:00+01:00  Provence-Alpes-Côte d'Azur        4516.000000
9     2023-01-01 02:15:00+01:00                         NaN        4447.500000
10    2023-01-01 02:30:00+01:00  Provence-Alpes-Côte d'Azur        4379.000000
11    2023-01-01 0

In [160]:
# Fill NaN values in "Région" column using forward fill
paca_cons_2023_2024["Région"] = paca_cons_2023_2024["Région"].ffill()

In [162]:
paca_temp_2023_2024 = merged_full[
    (merged_full["Région"] == "Provence-Alpes-Côte d'Azur") & 
    (merged_full["t"])
].copy()


In [163]:
pd.reset_option("display.max_rows")

In [164]:
paca_temp_2023_2024.describe

<bound method NDFrame.describe of                    Datetime                      Région          t  \
2454887 2023-01-01 00:00:00  Provence-Alpes-Côte d'Azur  11.800000   
2454888 2023-01-01 00:15:00  Provence-Alpes-Côte d'Azur  11.750000   
2454889 2023-01-01 00:30:00  Provence-Alpes-Côte d'Azur  11.700000   
2454890 2023-01-01 00:45:00  Provence-Alpes-Côte d'Azur  11.650000   
2454891 2023-01-01 01:00:00  Provence-Alpes-Côte d'Azur  11.600000   
...                     ...                         ...        ...   
2525047 2024-12-31 20:00:00  Provence-Alpes-Côte d'Azur   5.083333   
2525048 2024-12-31 20:15:00  Provence-Alpes-Côte d'Azur   5.037500   
2525049 2024-12-31 20:30:00  Provence-Alpes-Côte d'Azur   4.991667   
2525050 2024-12-31 20:45:00  Provence-Alpes-Côte d'Azur   4.945833   
2525051 2024-12-31 21:00:00  Provence-Alpes-Côte d'Azur   4.900000   

         Consommation (MW)  Day-ahead (EUR/MWh)  DayOfWeek  IsWeekend  \
2454887                NaN               1.0600     

In [166]:
paca_cons_2023_2024.describe

<bound method NDFrame.describe of                        Datetime                      Région  Consommation (MW)
0     2023-01-01 00:00:00+01:00  Provence-Alpes-Côte d'Azur             4724.0
1     2023-01-01 00:15:00+01:00  Provence-Alpes-Côte d'Azur             4558.0
2     2023-01-01 00:30:00+01:00  Provence-Alpes-Côte d'Azur             4392.0
3     2023-01-01 00:45:00+01:00  Provence-Alpes-Côte d'Azur             4382.0
4     2023-01-01 01:00:00+01:00  Provence-Alpes-Côte d'Azur             4372.0
...                         ...                         ...                ...
70171 2024-12-31 22:45:00+01:00  Provence-Alpes-Côte d'Azur             5471.0
70172 2024-12-31 23:00:00+01:00  Provence-Alpes-Côte d'Azur             5596.0
70173 2024-12-31 23:15:00+01:00  Provence-Alpes-Côte d'Azur             5876.0
70174 2024-12-31 23:30:00+01:00  Provence-Alpes-Côte d'Azur             5791.0
70175 2024-12-31 23:45:00+01:00  Provence-Alpes-Côte d'Azur             5718.0

[70176 rows x 3 c

In [165]:
print(len(paca_cons_2023_2024))
print(len(paca_temp_2023_2024))

70176
70165


In [None]:
paca_temp_2023_2024["Datetime"] = paca_temp_2023_2024["Datetime"].dt.tz_localize(None).copy()

NonExistentTimeError: 2023-03-26 02:00:00

In [175]:
paca_temp_2023_2024.drop(["Consommation (MW)"], axis=1, inplace=True)

In [171]:
paca_cons_2023_2024["Datetime"] = paca_cons_2023_2024["Datetime"].dt.tz_localize(None).copy()

In [176]:
paca_full = pd.merge(paca_cons_2023_2024, paca_temp_2023_2024, on=["Région", "Datetime"], how="inner")

In [177]:
paca_full.to_csv(r"paca_full.csv", index=False)

## New Weather API data (predictive // Open-Weather API)

In [55]:
%pip install openmeteo-requests
%pip install requests-cache retry-requests numpy pandas

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting openmeteo-requests
  Downloading openmeteo_requests-1.4.0-py3-none-any.whl.metadata (9.7 kB)
Collecting openmeteo-sdk>=1.4.0 (from openmeteo-requests)
  Downloading openmeteo_sdk-1.19.0-py3-none-any.whl.metadata (935 bytes)
Collecting flatbuffers==25.2.10 (from openmeteo-sdk>=1.4.0->openmeteo-requests)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Downloading openmeteo_requests-1.4.0-py3-none-any.whl (6.0 kB)
Downloading openmeteo_sdk-1.19.0-py3-none-any.whl (15 kB)
Downloading flatbuffers-25.2.10-py2.py3-none-any.whl (30 kB)
Installing collected packages: flatbuffers, openmeteo-sdk, openmeteo-requests
Successfully installed flatbuffers-25.2.10 openmeteo-requests-1.4.0 openmeteo-sdk-1.19.0



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: C:\Users\Henri\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting requests-cache
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting retry-requests
  Downloading retry_requests-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting cattrs>=22.2 (from requests-cache)
  Downloading cattrs-24.1.3-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache)
  Downloading url_normalize-2.2.0-py3-none-any.whl.metadata (4.9 kB)
Downloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
Downloading retry_requests-2.0.0-py3-none-any.whl (15 kB)
Downloading cattrs-24.1.3-py3-none-any.whl (66 kB)
Downloading url_normalize-2.2.0-py3-none-any.whl (14 kB)
Installing collected packages: url-normalize, cattrs, retry-requests, requests-cache
Successfully installed cattrs-24.1.3 requests-cache-1.2.1 retry-requests-2.0.0 url-normalize-2.2.0



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: C:\Users\Henri\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
from utils.dictionaries import weather_coordinates
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry

# Setup API client
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# === Choose region ===
region_name = "Auvergne-Rhône-Alpes"
cities = weather_coordinates[region_name]

# === Iterate over cities in the region ===
all_forecasts = []

for location in cities:
    params = {
        "latitude": location["latitude"],
        "longitude": location["longitude"],
        "hourly": "temperature_2m",
        "models": "meteofrance_seamless"
    }
    responses = openmeteo.weather_api("https://api.open-meteo.com/v1/forecast", params=params)

    # First response (only one per call in this case)
    response = responses[0]

    hourly = response.Hourly()
    hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()

    hourly_data = {
        "city": location["city"],
        "latitude": location["latitude"],
        "longitude": location["longitude"],
        "Datetime": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left"
        ),
        "temperature_2m": hourly_temperature_2m
    }

    df_city = pd.DataFrame(hourly_data)
    all_forecasts.append(df_city)

# Combine all city forecasts
df_region_forecast = pd.concat(all_forecasts).reset_index(drop=True)


In [None]:
print(df_region_forecast.head())

   city   latitude  longitude                  datetime  temperature_2m
0  Lyon  45.764043   4.835659 2025-04-07 00:00:00+00:00       11.691500
1  Lyon  45.764043   4.835659 2025-04-07 01:00:00+00:00       10.691500
2  Lyon  45.764043   4.835659 2025-04-07 02:00:00+00:00        9.591499
3  Lyon  45.764043   4.835659 2025-04-07 03:00:00+00:00        8.941500
4  Lyon  45.764043   4.835659 2025-04-07 04:00:00+00:00        8.191500


In [59]:
df_region_forecast.describe

<bound method NDFrame.describe of                  city   latitude  longitude                  datetime  \
0                Lyon  45.764043   4.835659 2025-04-07 00:00:00+00:00   
1                Lyon  45.764043   4.835659 2025-04-07 01:00:00+00:00   
2                Lyon  45.764043   4.835659 2025-04-07 02:00:00+00:00   
3                Lyon  45.764043   4.835659 2025-04-07 03:00:00+00:00   
4                Lyon  45.764043   4.835659 2025-04-07 04:00:00+00:00   
..                ...        ...        ...                       ...   
667  Clermont-Ferrand  45.777222   3.087025 2025-04-13 19:00:00+00:00   
668  Clermont-Ferrand  45.777222   3.087025 2025-04-13 20:00:00+00:00   
669  Clermont-Ferrand  45.777222   3.087025 2025-04-13 21:00:00+00:00   
670  Clermont-Ferrand  45.777222   3.087025 2025-04-13 22:00:00+00:00   
671  Clermont-Ferrand  45.777222   3.087025 2025-04-13 23:00:00+00:00   

     temperature_2m  
0         11.691500  
1         10.691500  
2          9.591499  
3

In [61]:
df_region_forecast.columns

Index(['city', 'latitude', 'longitude', 'datetime', 'temperature_2m'], dtype='object')

In [60]:
df_region_forecast.isna().sum()

city                0
latitude            0
longitude           0
datetime            0
temperature_2m    212
dtype: int64

In [63]:
df_region_forecast = df_region_forecast.rename(columns={"temperature_2m": "t"}).copy()

In [65]:
# Average temperature across cities at each datetime
df_avg = df_region_forecast.groupby("datetime")["t"].mean().reset_index()

In [67]:
df_avg["Région"] = "Auvergne-Rhône-Alpes"

In [69]:
df_avg = df_avg.rename(columns={"datetime": "Datetime"}).copy()

In [72]:
df_avg.describe

<bound method NDFrame.describe of                                    t                Région
Datetime                                                  
2025-04-07 00:00:00+00:00  11.424625  Auvergne-Rhône-Alpes
2025-04-07 01:00:00+00:00  10.587125  Auvergne-Rhône-Alpes
2025-04-07 02:00:00+00:00   9.737124  Auvergne-Rhône-Alpes
2025-04-07 03:00:00+00:00   8.974625  Auvergne-Rhône-Alpes
2025-04-07 04:00:00+00:00   8.037125  Auvergne-Rhône-Alpes
...                              ...                   ...
2025-04-13 19:00:00+00:00        NaN  Auvergne-Rhône-Alpes
2025-04-13 20:00:00+00:00        NaN  Auvergne-Rhône-Alpes
2025-04-13 21:00:00+00:00        NaN  Auvergne-Rhône-Alpes
2025-04-13 22:00:00+00:00        NaN  Auvergne-Rhône-Alpes
2025-04-13 23:00:00+00:00        NaN  Auvergne-Rhône-Alpes

[168 rows x 2 columns]>

In [71]:
# Set Datetime as index (for resampling)
df_avg.set_index("Datetime", inplace=True)

# Perform resampling per region
wdata_resampled = (
    df_avg.groupby("Région", group_keys=False)  # Group by region but avoid inserting it twice
    .resample("15min")  # Resample to 15-minute intervals
    .interpolate(method="linear")  # Interpolate missing values
)

# Reset index while ensuring 'Région' doesn't get duplicated
wdata_resampled.reset_index(inplace=True)

  .interpolate(method="linear")  # Interpolate missing values


In [74]:
# Fill NaN values in "Région" column using forward fill
wdata_resampled["Région"] = wdata_resampled["Région"].ffill()

In [75]:
wdata_resampled.describe

<bound method NDFrame.describe of                      Datetime          t                Région
0   2025-04-07 00:00:00+00:00  11.424625  Auvergne-Rhône-Alpes
1   2025-04-07 00:15:00+00:00  11.215250  Auvergne-Rhône-Alpes
2   2025-04-07 00:30:00+00:00  11.005875  Auvergne-Rhône-Alpes
3   2025-04-07 00:45:00+00:00  10.796500  Auvergne-Rhône-Alpes
4   2025-04-07 01:00:00+00:00  10.587125  Auvergne-Rhône-Alpes
..                        ...        ...                   ...
664 2025-04-13 22:00:00+00:00  19.643250  Auvergne-Rhône-Alpes
665 2025-04-13 22:15:00+00:00  19.643250  Auvergne-Rhône-Alpes
666 2025-04-13 22:30:00+00:00  19.643250  Auvergne-Rhône-Alpes
667 2025-04-13 22:45:00+00:00  19.643250  Auvergne-Rhône-Alpes
668 2025-04-13 23:00:00+00:00  19.643250  Auvergne-Rhône-Alpes

[669 rows x 3 columns]>

## Concatenating latest weather predictions into reg_2025_temperature.csv

In [None]:
from utils.dictionaries import weather_coordinates, region_abbr_caps_dict
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry

# Setup API client
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

def regional_temperature_prediction(region_name):
    """Fetches and processes the temperature forecast for a given French region"""
    cities = weather_coordinates[region_name]
    all_forecasts = []

    for location in cities:
        params = {
            "latitude": location["latitude"],
            "longitude": location["longitude"],
            "hourly": "temperature_2m",
            "models": "meteofrance_seamless"
        }
        responses = openmeteo.weather_api("https://api.open-meteo.com/v1/forecast", params=params)
        response = responses[0]

        hourly = response.Hourly()
        hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()

        hourly_data = {
            "city": location["city"],
            "latitude": location["latitude"],
            "longitude": location["longitude"],
            "Datetime": pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left"
            ),
            "temperature_2m": hourly_temperature_2m
        }

        df_city = pd.DataFrame(hourly_data)
        all_forecasts.append(df_city)

    # === Processing ===
    df_region_forecast = pd.concat(all_forecasts).reset_index(drop=True)
    df_region_forecast = df_region_forecast.rename(columns={"temperature_2m": "t"})

    # Average temperature across cities
    df_avg = df_region_forecast.groupby("Datetime")["t"].mean().reset_index()
    df_avg["Région"] = region_name

    # Resample to 15-minute intervals and interpolate
    df_avg.set_index("Datetime", inplace=True)
    wdata_resampled = (
        df_avg.groupby("Région", group_keys=False)
        .resample("15min")
        .interpolate(method="linear")
    )
    wdata_resampled.reset_index(inplace=True)
    wdata_resampled["Région"] = wdata_resampled["Région"].ffill()

    return wdata_resampled


In [None]:
# Store all regional predictions
all_regions_temp = []

for region in region_abbr_caps_dict:
    print(f"🔄 Fetching forecast for {region}...")
    df_region = regional_temperature_prediction(region)
    all_regions_temp.append(df_region)

# Combine all into one big dataframe
df_national_temp_forecast = pd.concat(all_regions_temp).reset_index(drop=True)

# Optional: Save to CSV
df_national_temp_forecast.to_csv("regional_temp_forecast.csv", index=False)


🔄 Fetching forecast for Nouvelle-Aquitaine...
🔄 Fetching forecast for Occitanie...
🔄 Fetching forecast for Île-de-France...
🔄 Fetching forecast for Auvergne-Rhône-Alpes...
🔄 Fetching forecast for Grand Est...
🔄 Fetching forecast for Bretagne...


  .interpolate(method="linear")
  .interpolate(method="linear")
  .interpolate(method="linear")
  .interpolate(method="linear")
  .interpolate(method="linear")
  .interpolate(method="linear")
  .interpolate(method="linear")


🔄 Fetching forecast for Provence-Alpes-Côte d'Azur...
🔄 Fetching forecast for Hauts-de-France...


  .interpolate(method="linear")


🔄 Fetching forecast for Pays de la Loire...


  .interpolate(method="linear")


🔄 Fetching forecast for Centre-Val de Loire...


  .interpolate(method="linear")


🔄 Fetching forecast for Bourgogne-Franche-Comté...


  .interpolate(method="linear")


OSError: Cannot save file into a non-existent directory: 'data'

In [3]:
df_national_temp_forecast.describe

<bound method NDFrame.describe of                       Datetime          t                   Région
0    2025-04-07 00:00:00+00:00  12.421249       Nouvelle-Aquitaine
1    2025-04-07 00:15:00+00:00  12.230625       Nouvelle-Aquitaine
2    2025-04-07 00:30:00+00:00  12.040000       Nouvelle-Aquitaine
3    2025-04-07 00:45:00+00:00  11.849375       Nouvelle-Aquitaine
4    2025-04-07 01:00:00+00:00  11.658751       Nouvelle-Aquitaine
...                        ...        ...                      ...
7354 2025-04-13 22:00:00+00:00  18.699249  Bourgogne-Franche-Comté
7355 2025-04-13 22:15:00+00:00  18.699249  Bourgogne-Franche-Comté
7356 2025-04-13 22:30:00+00:00  18.699249  Bourgogne-Franche-Comté
7357 2025-04-13 22:45:00+00:00  18.699249  Bourgogne-Franche-Comté
7358 2025-04-13 23:00:00+00:00  18.699249  Bourgogne-Franche-Comté

[7359 rows x 3 columns]>

In [4]:
df_national_temp_forecast.isna().sum()

Datetime    0
t           0
Région      0
dtype: int64

In [5]:
# Optional: Save to CSV
df_national_temp_forecast.to_csv("regional_temp_forecast.csv", index=False)

### modify regional_temperature_function with the following code in order to log time at which forecast was pulled

In [None]:
from datetime import datetime

def regional_temperature_prediction(region_name):
    ...
    retrieval_time = pd.Timestamp.now(tz="UTC")  # You can change to local timezone if preferred

    df_avg["Région"] = region_name
    df_avg["Forecast_Pulled_At"] = retrieval_time  # ⬅️ log when the data was pulled

    ...
    wdata_resampled.reset_index(inplace=True)
    wdata_resampled["Région"] = wdata_resampled["Région"].ffill()
    wdata_resampled["Forecast_Pulled_At"] = wdata_resampled["Forecast_Pulled_At"].ffill()

    return wdata_resampled


# Recommended Directory Structure

project/
│
├── api/
│   ├── get_temperature_forecast.py
│   ├── get_consumption_data.py
│
├── predictions/
│   ├── run_single_prediction.py
│   ├── run_day.py
│
├── evaluation/
│   ├── evaluate_all_predictions.py
│   ├── aggregate_metrics.py
│
├── app/
│   ├── app.py  ← Your Flask app or Streamlit app
│
├── utils/
│   ├── helpers.py
│   ├── dictionaries.py
│
├── data/
│   ├── real/
│   ├── predicted/
│
└── jobs/
    ├── schedule_weather_pull.py
    ├── schedule_predictions.py
