In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
sns.set(style="whitegrid")  # nice background for plots

warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', None)        # Afficher toutes les lignes
pd.set_option('display.max_columns', None)  

In [3]:
df_champs=pd.read_csv("Av_Champs_Elysees.csv", sep=";")

In [4]:

# Trier par date et heure croissante
def order_by_date(df_champs):
    df_champs = df_champs.sort_values(by='Date et heure de comptage')

# Si tu veux réinitialiser les index après le tri
    df_champs = df_champs.reset_index(drop=True)
    
    return(df_champs)

df_champs=order_by_date(df_champs)

In [5]:
import pandas as pd

def create_datetime_features(df, datetime_col='Date et heure de comptage'):
    """
    Convertit une colonne en datetime et crée des colonnes supplémentaires :
    - day : date sans heure
    - hour : heure
    - year : année
    - month : mois
    - weekday : jour de la semaine (0=lundi, 6=dimanche)
    - is_
    end : True si samedi ou dimanche

    Parameters:
        df : pandas.DataFrame
        datetime_col : str, nom de la colonne datetime
    Returns:
        df : pandas.DataFrame avec nouvelles colonnes
    """

    # Convertir en datetime (UTC pour homogénéité)
    df[datetime_col] = pd.to_datetime(df[datetime_col], errors='coerce', utc=True)

    # Extraire features
    df['date'] = df[datetime_col].dt.date
    df['hour'] = df[datetime_col].dt.hour
    df['year'] = df[datetime_col].dt.year
    df['month'] = df[datetime_col].dt.month
    df['weekday'] = df[datetime_col].dt.weekday  # 0=lundi, 6=dimanche
    df['is_weekend'] = df['weekday'] >= 5

    # Aperçu pour vérification
    print(df[[datetime_col, 'date', 'hour', 'year', 'month', 'weekday', 'is_weekend']].head())
    print(df.dtypes)
    
    return df

# Exemple d'utilisation
df_champs = create_datetime_features(df_champs)


  Date et heure de comptage        date  hour  year  month  weekday  \
0 2024-10-01 03:00:00+00:00  2024-10-01     3  2024     10        1   
1 2024-10-01 04:00:00+00:00  2024-10-01     4  2024     10        1   
2 2024-10-01 05:00:00+00:00  2024-10-01     5  2024     10        1   
3 2024-10-01 06:00:00+00:00  2024-10-01     6  2024     10        1   
4 2024-10-01 07:00:00+00:00  2024-10-01     7  2024     10        1   

   is_weekend  
0       False  
1       False  
2       False  
3       False  
4       False  
Identifiant arc                            int64
Libelle                                   object
Date et heure de comptage    datetime64[ns, UTC]
Débit horaire                            float64
Taux d'occupation                        float64
Etat trafic                               object
Identifiant noeud amont                    int64
Libelle noeud amont                       object
Identifiant noeud aval                     int64
Libelle noeud aval                  

In [6]:
def vacances_by_zone(df):
    vacances = pd.read_csv('vacances.csv', parse_dates=['date'])

    # Renommer pour uniformité si nécessaire
    vacances = vacances.rename(columns={
        'vacances_zone_a': 'Vacances Zone A',
        'vacances_zone_b': 'Vacances Zone B',
        'vacances_zone_c': 'Vacances Zone C',
        'nom_vacances': 'Nom Vacances'
    })

    # Créer une colonne "Vacances Toutes Zones"
    vacances['Vacances Toutes Zones'] = vacances[['Vacances Zone A', 'Vacances Zone B', 'Vacances Zone C']].any(axis=1)

    # --- Fusionner avec ton DataFrame principal df ---

    vacances['date'] = pd.to_datetime(vacances['date']).dt.date

    df = df.merge(vacances, on='date',how='left')
    
    return(df)

df_champs=vacances_by_zone(df_champs)

In [7]:
import pandas as pd

def add_school_holidays_paris(df, date_col='Date et heure de comptage'):
    """
    Ajoute une colonne indicatrice 'Vacances Scolaires' pour Paris (zone C),
    incluant vacances hiver, printemps, été, Toussaint et Noël pour 2024 et 2025.
    """
    df[date_col] = pd.to_datetime(df[date_col])

    # Vacances scolaires pour Paris (zone C) sous forme de Series
    vacances_2024_2025 = pd.Series(pd.date_range('2024-10-19', '2024-11-04').tolist() +
                                   pd.date_range('2024-12-21', '2025-01-06').tolist() +
                                   pd.date_range('2025-02-15', '2025-03-03').tolist() +
                                   pd.date_range('2025-04-12', '2025-04-28').tolist() +
                                   pd.date_range('2025-07-05', '2025-09-01').tolist() +
                                   pd.date_range('2025-10-18', '2025-11-03').tolist() +
                                   pd.date_range('2025-12-20', '2026-01-05').tolist()
                                  )

    # Colonne indicatrice
    df['Vacances Scolaires Paris'] = df[date_col].dt.date.isin(vacances_2024_2025.dt.date).astype(int)

    return df

# Exemple d'utilisation
df_champs = add_school_holidays_paris(df_champs)


In [8]:
import holidays
def create_holidays(df_champs):
# Initialize France holidays
    fr_holidays = holidays.France(years=df_champs['date'].apply(lambda x: x.year).unique())

    # Add a new column 'is_holiday': True if the day is a French public holiday
    df_champs['is_holiday'] = df_champs['date'].apply(lambda x: x in fr_holidays)
    return(df_champs)

df_champs=create_holidays(df_champs)




In [9]:
# Create a column describing the type of day
def day_type(row):
    if row['is_holiday']:
        return 'Public Holiday'
    else:
        return 'Normal Day'

df_champs['day_type'] = df_champs.apply(day_type, axis=1)


In [10]:
import numpy as np

def add_cyclic_features(df):
    """
    Ajoute des colonnes sin/cos pour les features cycliques : heure, jour de semaine, mois.
    """

    # Heure (0-23)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    # Jour de la semaine (0-6)
    df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
    df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7)

    # Mois (1-12)
    df['month_sin'] = np.sin(2 * np.pi * (df['month']-1) / 12)
    df['month_cos'] = np.cos(2 * np.pi * (df['month']-1) / 12)

    # Optionnel : jour de l'année (1-365/366)
    df['day_of_year'] = df['Date et heure de comptage'].dt.dayofyear
    df['dayofyear_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['dayofyear_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)

    return df

# Exemple d'utilisation
df_champs = add_cyclic_features(df_champs)

# Vérification
df_champs[['hour', 'hour_sin', 'hour_cos', 'weekday', 'weekday_sin', 'weekday_cos',
           'month', 'month_sin', 'month_cos', 'day_of_year', 'dayofyear_sin', 'dayofyear_cos']].head()


Unnamed: 0,hour,hour_sin,hour_cos,weekday,weekday_sin,weekday_cos,month,month_sin,month_cos,day_of_year,dayofyear_sin,dayofyear_cos
0,3,0.707107,0.7071068,1,0.781831,0.62349,10,-1.0,-1.83697e-16,275,-0.999769,0.021516
1,4,0.866025,0.5,1,0.781831,0.62349,10,-1.0,-1.83697e-16,275,-0.999769,0.021516
2,5,0.965926,0.258819,1,0.781831,0.62349,10,-1.0,-1.83697e-16,275,-0.999769,0.021516
3,6,1.0,6.123234000000001e-17,1,0.781831,0.62349,10,-1.0,-1.83697e-16,275,-0.999769,0.021516
4,7,0.965926,-0.258819,1,0.781831,0.62349,10,-1.0,-1.83697e-16,275,-0.999769,0.021516


In [11]:
df_champs.columns

Index(['Identifiant arc', 'Libelle', 'Date et heure de comptage',
       'Débit horaire', 'Taux d'occupation', 'Etat trafic',
       'Identifiant noeud amont', 'Libelle noeud amont',
       'Identifiant noeud aval', 'Libelle noeud aval', 'Etat arc',
       'Date debut dispo data', 'Date fin dispo data', 'geo_point_2d',
       'geo_shape', 'date', 'hour', 'year', 'month', 'weekday', 'is_weekend',
       'Vacances Zone A', 'Vacances Zone B', 'Vacances Zone C', 'Nom Vacances',
       'Vacances Toutes Zones', 'Vacances Scolaires Paris', 'is_holiday',
       'day_type', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos',
       'month_sin', 'month_cos', 'day_of_year', 'dayofyear_sin',
       'dayofyear_cos'],
      dtype='object')

In [12]:
import pandas as pd

# Assure-toi que la colonne est bien en datetime
df_champs['Date et heure de comptage'] = pd.to_datetime(df_champs['Date et heure de comptage'], utc=True)

# Trier chronologiquement
df_champs = df_champs.sort_values('Date et heure de comptage')

# Mettre la colonne de date comme index temporairement
df_champs = df_champs.set_index('Date et heure de comptage')

# Interpolation temporelle
df_champs['Débit horaire'] = df_champs['Débit horaire'].interpolate(method='time')
df_champs['Taux d\'occupation'] = df_champs['Taux d\'occupation'].interpolate(method='time')



In [13]:
df_meteo_1 = pd.read_csv("open-meteo-48.86N2.34E50m(1).csv",sep=",",header=2)
df_meteo_2 = pd.read_csv("open-meteo-48.87N2.33E50m.csv",sep=",",header=2)
df_meteo = pd.concat([df_meteo_1,df_meteo_2],axis=0)
df_meteo = df_meteo.drop_duplicates()
df_meteo = df_meteo.drop(columns=['precipitation_probability (%)'])
def merge_meteo(df_champs):
    df_champs['date']=pd.to_datetime(df_champs['date'])

    df_meteo["time"] = pd.to_datetime(df_meteo["time"])  
    
    df_champs=df_champs.merge(df_meteo,right_on='time',left_on='date',how='left')
    return(df_champs)
df_champs=merge_meteo(df_champs)

In [14]:
df_champs.head()

Unnamed: 0,Identifiant arc,Libelle,Débit horaire,Taux d'occupation,Etat trafic,Identifiant noeud amont,Libelle noeud amont,Identifiant noeud aval,Libelle noeud aval,Etat arc,Date debut dispo data,Date fin dispo data,geo_point_2d,geo_shape,date,hour,year,month,weekday,is_weekend,Vacances Zone A,Vacances Zone B,Vacances Zone C,Nom Vacances,Vacances Toutes Zones,Vacances Scolaires Paris,is_holiday,day_type,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos,day_of_year,dayofyear_sin,dayofyear_cos,time,temperature_2m (°C),wind_speed_10m (km/h),relative_humidity_2m (%),precipitation (mm),cloud_cover (%),weather_code (wmo code),surface_pressure (hPa),apparent_temperature (°C),rain (mm)
0,4264,AV_Champs_Elysees,259.0,2.99667,Fluide,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177...",2024-10-01,3,2024,10,1,False,False,False,False,,False,0,False,Normal Day,0.707107,0.7071068,0.781831,0.62349,-1.0,-1.83697e-16,275,-0.999769,0.021516,2024-10-01,13.7,11.8,80.0,0.0,100.0,3.0,1007.3,12.1,
1,4264,AV_Champs_Elysees,330.0,4.98167,Fluide,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177...",2024-10-01,4,2024,10,1,False,False,False,False,,False,0,False,Normal Day,0.866025,0.5,0.781831,0.62349,-1.0,-1.83697e-16,275,-0.999769,0.021516,2024-10-01,13.7,11.8,80.0,0.0,100.0,3.0,1007.3,12.1,
2,4264,AV_Champs_Elysees,618.0,7.005,Fluide,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177...",2024-10-01,5,2024,10,1,False,False,False,False,,False,0,False,Normal Day,0.965926,0.258819,0.781831,0.62349,-1.0,-1.83697e-16,275,-0.999769,0.021516,2024-10-01,13.7,11.8,80.0,0.0,100.0,3.0,1007.3,12.1,
3,4264,AV_Champs_Elysees,1018.0,11.30722,Fluide,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177...",2024-10-01,6,2024,10,1,False,False,False,False,,False,0,False,Normal Day,1.0,6.123234000000001e-17,0.781831,0.62349,-1.0,-1.83697e-16,275,-0.999769,0.021516,2024-10-01,13.7,11.8,80.0,0.0,100.0,3.0,1007.3,12.1,
4,4264,AV_Champs_Elysees,1275.0,14.72667,Fluide,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177...",2024-10-01,7,2024,10,1,False,False,False,False,,False,0,False,Normal Day,0.965926,-0.258819,0.781831,0.62349,-1.0,-1.83697e-16,275,-0.999769,0.021516,2024-10-01,13.7,11.8,80.0,0.0,100.0,3.0,1007.3,12.1,


In [15]:
print("Nombre de NaN restants :", df_champs['Débit horaire'].isna().sum())


Nombre de NaN restants : 0


In [16]:
df_champs[df_champs["cloud_cover (%)"].isna()].tail()

Unnamed: 0,Identifiant arc,Libelle,Débit horaire,Taux d'occupation,Etat trafic,Identifiant noeud amont,Libelle noeud amont,Identifiant noeud aval,Libelle noeud aval,Etat arc,Date debut dispo data,Date fin dispo data,geo_point_2d,geo_shape,date,hour,year,month,weekday,is_weekend,Vacances Zone A,Vacances Zone B,Vacances Zone C,Nom Vacances,Vacances Toutes Zones,Vacances Scolaires Paris,is_holiday,day_type,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos,day_of_year,dayofyear_sin,dayofyear_cos,time,temperature_2m (°C),wind_speed_10m (km/h),relative_humidity_2m (%),precipitation (mm),cloud_cover (%),weather_code (wmo code),surface_pressure (hPa),apparent_temperature (°C),rain (mm)


In [17]:
df_champs['Taux d\'occupation'].value_counts()

Taux d'occupation
20.350560    3
22.827780    3
3.557220     3
3.935560     3
18.073890    3
2.760000     3
19.598890    3
3.657780     3
3.478890     3
15.046110    3
20.936670    3
4.891670     3
4.066110     3
18.415000    3
23.978890    3
17.633330    3
0.000000     3
17.125560    3
5.317220     3
8.036670     3
9.478340     3
1.840560     3
10.971670    3
6.452780     3
7.848330     3
23.277220    3
14.129440    3
3.860000     3
9.216110     3
23.570000    2
8.910000     2
4.395560     2
27.226670    2
4.257780     2
15.032220    2
12.542220    2
23.385000    2
14.261670    2
19.742780    2
3.091110     2
17.202220    2
5.639450     2
5.678890     2
21.881670    2
4.523330     2
11.887780    2
17.623890    2
11.406110    2
23.948330    2
3.036110     2
11.274440    2
6.066670     2
4.477780     2
9.360560     2
2.991670     2
5.202780     2
4.615000     2
11.552220    2
4.520000     2
18.967220    2
15.994450    2
26.040560    2
17.277220    2
17.063890    2
18.028890    2
2.28222

In [18]:
df_champs.columns

Index(['Identifiant arc', 'Libelle', 'Débit horaire', 'Taux d'occupation',
       'Etat trafic', 'Identifiant noeud amont', 'Libelle noeud amont',
       'Identifiant noeud aval', 'Libelle noeud aval', 'Etat arc',
       'Date debut dispo data', 'Date fin dispo data', 'geo_point_2d',
       'geo_shape', 'date', 'hour', 'year', 'month', 'weekday', 'is_weekend',
       'Vacances Zone A', 'Vacances Zone B', 'Vacances Zone C', 'Nom Vacances',
       'Vacances Toutes Zones', 'Vacances Scolaires Paris', 'is_holiday',
       'day_type', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos',
       'month_sin', 'month_cos', 'day_of_year', 'dayofyear_sin',
       'dayofyear_cos', 'time', 'temperature_2m (°C)', 'wind_speed_10m (km/h)',
       'relative_humidity_2m (%)', 'precipitation (mm)', 'cloud_cover (%)',
       'weather_code (wmo code)', 'surface_pressure (hPa)',
       'apparent_temperature (°C)', 'rain (mm)'],
      dtype='object')

In [19]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

def build_lstm_model(input_shape):
    """Petit LSTM rapide pour test de features."""
    model = Sequential([
        LSTM(32, input_shape=input_shape, return_sequences=False),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

def create_sequences(X, y, seq_length=24):
    """Crée des séquences glissantes de longueur seq_length."""
    X_seq, y_seq = [], []
    for i in range(seq_length, len(X)):
        X_seq.append(X[i-seq_length:i])
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

def forward_feature_selection(df, candidate_features, target='Débit horaire', max_features=5):
    """
    Sélection progressive de features (forward selection) avec LSTM.
    Teste toutes les features seules, puis toutes les combinaisons avec la feature retenue.
    """
    # Pour accélérer
    df = df.dropna(subset=[target]).sample(frac=0.3, random_state=42)
    
    selected_features = []
    remaining_features = candidate_features.copy()
    seq_length = 24
    patience = 2

    print("\n🚀 DÉMARRAGE DE LA SÉLECTION DE FEATURES 🚀\n")

    while len(selected_features) < max_features and remaining_features:
        best_rmse = np.inf
        best_feature = None

        # On teste chaque feature candidate avec celles déjà sélectionnées
        for f in remaining_features:
            current_features = selected_features + [f]
            print(f"🔹 Test combinaison : {current_features}")

            # Normalisation
            scaler_X = MinMaxScaler()
            scaler_y = MinMaxScaler()
            X_scaled = scaler_X.fit_transform(df[current_features])
            y_scaled = scaler_y.fit_transform(df[[target]])

            # Séquences
            X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length)
            train_size = int(len(X_seq) * 0.8)
            X_train, X_test = X_seq[:train_size], X_seq[train_size:]
            y_train, y_test = y_seq[:train_size], y_seq[train_size:]

            # LSTM rapide
            model = build_lstm_model((seq_length, X_train.shape[2]))
            es = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

            # Entraînement court
            model.fit(
                X_train, y_train,
                epochs=5,
                batch_size=32,
                validation_data=(X_test, y_test),
                shuffle=False,
                verbose=0,
                callbacks=[es]
            )

            # Évaluation
            y_pred = model.predict(X_test, verbose=0)
            y_test_inv = scaler_y.inverse_transform(y_test)
            y_pred_inv = scaler_y.inverse_transform(y_pred)
            rmse = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))

            print(f"   RMSE = {rmse:.3f}")

            if rmse < best_rmse:
                best_rmse = rmse
                best_feature = f

        # On ajoute la meilleure feature et on retire de la liste des candidates
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
        print(f"✅ Feature sélectionnée : {best_feature}")
        print(f"🏆 Combinaison optimale actuelle : {selected_features}\n")

    print(f"\n✅ FIN DE LA SÉLECTION ✅")
    print(f"Meilleures features finales : {selected_features}")
    return selected_features


ImportError: Traceback (most recent call last):
  File "c:\Jimmy\Centrale\3A\Mention\datathon BCG\.venv\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: Une routine d’initialisation d’une bibliothèque de liens dynamiques (DLL) a échoué.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [340]:
# Liste de toutes les features candidates
candidate_features = [
    'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos',
    'dayofyear_sin', 'dayofyear_cos', 'is_weekend', 'is_holiday',
    'Vacances Scolaires Paris', 'temperature_2m (°C)', 'wind_speed_10m (km/h)',
    'relative_humidity_2m (%)', 'precipitation (mm)', 'cloud_cover (%)',
    'weather_code (wmo code)', 'surface_pressure (hPa)', 'apparent_temperature (°C)','Débit horaire'
]

# Sélection rapide
best_features = forward_feature_selection(df_champs, candidate_features, target='Débit horaire', max_features=8)



🚀 DÉMARRAGE DE LA SÉLECTION DE FEATURES 🚀

🔹 Test combinaison : ['hour_sin']
   RMSE = 287.647
🔹 Test combinaison : ['hour_cos']
   RMSE = 292.366
🔹 Test combinaison : ['weekday_sin']
   RMSE = 287.002
🔹 Test combinaison : ['weekday_cos']
   RMSE = 287.471
🔹 Test combinaison : ['month_sin']
   RMSE = 290.046
🔹 Test combinaison : ['month_cos']
   RMSE = 287.369
🔹 Test combinaison : ['dayofyear_sin']
   RMSE = 287.501
🔹 Test combinaison : ['dayofyear_cos']
   RMSE = 290.016
🔹 Test combinaison : ['is_weekend']
   RMSE = 287.131
🔹 Test combinaison : ['is_holiday']
   RMSE = 286.780
🔹 Test combinaison : ['Vacances Scolaires Paris']
   RMSE = 286.426
🔹 Test combinaison : ['temperature_2m (°C)']
   RMSE = 287.643
🔹 Test combinaison : ['wind_speed_10m (km/h)']
   RMSE = 288.291
🔹 Test combinaison : ['relative_humidity_2m (%)']
   RMSE = 288.832
🔹 Test combinaison : ['precipitation (mm)']
   RMSE = 286.302
🔹 Test combinaison : ['cloud_cover (%)']
   RMSE = 289.033
🔹 Test combinaison : ['weathe

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# On choisit le target et les features
target ='Débit horaire'
features = [
    'hour_sin', 'hour_cos', 
    'weekday_sin', 'weekday_cos', 
    'month_sin', 'month_cos', 
    'dayofyear_sin', 'dayofyear_cos',
    'is_weekend', 'is_holiday',
    
     #'Vacances Zone A', 'Vacances Zone B', 'Vacances Zone C','Vacances Toutes Zones'
     'Vacances Scolaires Paris',
     'temperature_2m (°C)', 'wind_speed_10m (km/h)',
     'precipitation (mm)', 'cloud_cover (%)'
]

#features=['precipitation (mm)', 'is_holiday', 'Vacances Scolaires Paris', 'wind_speed_10m (km/h)', 'surface_pressure (hPa)', 'temperature_2m (°C)', 'dayofyear_cos']

# On supprime les lignes où le target est NaN
df_model = df_champs.copy()

# Normalisation des features

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()



X_scaled = scaler_X.fit_transform(df_model[features])
y_scaled = scaler_y.fit_transform(df_model[[target]])


# Convertir en séquences pour LSTM
def create_sequences(X, y, seq_length=24):
    X_seq, y_seq = [], []
    for i in range(seq_length, len(X)):
        X_seq.append(X[i-seq_length:i])
        y_seq.append(y[i])
    return np.array(X_seq), np.array(y_seq)

seq_length = 168  # utilise les 24 heures précédentes pour prédire l'heure suivante
X_seq, y_seq = create_sequences(X_scaled, y_scaled, seq_length=seq_length)

# Attention au data leakage : split temporel, pas aléatoire
train_size = int(len(X_seq) * 0.9)
X_train, X_test = X_seq[:train_size], X_seq[train_size:]
y_train, y_test = y_seq[:train_size], y_seq[train_size:]


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

model = Sequential()
model.add(LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1))  # prédiction du target

model.compile(optimizer='adam', loss='mse')



history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
    shuffle=False
)


Epoch 1/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 86ms/step - loss: 0.0224 - val_loss: 0.0082
Epoch 2/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 128ms/step - loss: 0.0126 - val_loss: 0.0063
Epoch 3/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 201ms/step - loss: 0.0102 - val_loss: 0.0064
Epoch 4/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 189ms/step - loss: 0.0089 - val_loss: 0.0054
Epoch 5/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 141ms/step - loss: 0.0080 - val_loss: 0.0052
Epoch 6/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 160ms/step - loss: 0.0076 - val_loss: 0.0054
Epoch 7/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 117ms/step - loss: 0.0073 - val_loss: 0.0042
Epoch 8/10
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 180ms/step - loss: 0.0070 - val_loss: 0.0040
Epoch 9/10
[1m23

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Prédiction sur le jeu de test
y_pred = model.predict(X_test)

# On repasse en vraies unités (débit horaire)
y_test_inv = scaler_y.inverse_transform(y_test)
y_pred_inv = scaler_y.inverse_transform(y_pred)

# Calcul du RMSE
rmse = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))

# Moyenne réelle du débit horaire (sur le test)
mean_target = np.mean(y_test_inv)

# Rapport relatif
relative_error = rmse / mean_target * 100

print(f"✅ RMSE sur le jeu de test : {rmse:.2f}")
print(f"📊 Moyenne du débit horaire (test) : {mean_target:.2f}")
print(f"⚖️ Erreur relative (RMSE / moyenne) : {relative_error:.2f}%")


NameError: name 'model' is not defined

In [None]:
def LSTM_72h(df_champs, features, target):
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed, RepeatVector
    from sklearn.preprocessing import MinMaxScaler
    import numpy as np

    # Copie du dataframe
    df_model = df_champs.copy()

    # Normalisation
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    features_with_target = features + [target]

    X_scaled = scaler_X.fit_transform(df_model[features_with_target])
    y_scaled = scaler_y.fit_transform(df_model[[target]])


    # Création des séquences
    def create_sequences_seq2seq(X, y, seq_length=168, pred_length=72):
        X_seq, y_seq = [], []
        for i in range(seq_length, len(X) - pred_length + 1):
            X_seq.append(X[i-seq_length:i])
            y_seq.append(y[i:i+pred_length])  # séquence de sortie de 72 pas
        return np.array(X_seq), np.array(y_seq)

    seq_length = 168  # 7 jours passés
    pred_length = 72  # 3 jours futurs
    X_seq, y_seq = create_sequences_seq2seq(X_scaled, y_scaled, seq_length, pred_length)

    # Split temporel
    train_size = int(len(X_seq) * 0.9)
    X_train, X_test = X_seq[:train_size], X_seq[train_size:]
    y_train, y_test = y_seq[:train_size], y_seq[train_size:]

    # ✅ Architecture correcte pour un seq2seq LSTM
    model = Sequential()
    model.add(LSTM(128, activation='tanh', input_shape=(seq_length, X_train.shape[2]), return_sequences=False))
    model.add(Dropout(0.3))
    model.add(RepeatVector(pred_length))  # répète le contexte pour chaque futur pas
    model.add(LSTM(64, activation='tanh', return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(1)))  # 72 sorties (1 par heure future)


    model.compile(optimizer='adam', loss='mse')

    # Entraînement
    history = model.fit(
        X_train, y_train,
        epochs=10,
        batch_size=32,
        validation_data=(X_test, y_test),
        shuffle=False,
        verbose=1
    )

    # ✅ Retourner tout ce qui sera utile
    return model, history, scaler_X, scaler_y, X_test, y_test



In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# On choisit le target et les features
target ='Débit horaire'
features = [
    'hour_sin', 'hour_cos', 
    'weekday_sin', 'weekday_cos', 
    'month_sin', 'month_cos', 
    'dayofyear_sin', 'dayofyear_cos',
    'is_weekend', 'is_holiday',
    
     #'Vacances Zone A', 'Vacances Zone B', 'Vacances Zone C','Vacances Toutes Zones'
     'Vacances Scolaires Paris',
     'temperature_2m (°C)', 'wind_speed_10m (km/h)',
     'precipitation (mm)', 'cloud_cover (%)'
]

# Appel du modèle
model, history, scaler_X, scaler_y, X_test, y_test = LSTM_72h(df_champs, features, target)

# Prédiction sur le jeu de test
y_pred = model.predict(X_test)

# ⚠️ Les sorties ont une dimension (samples, 72, 1)
# On les aplatit pour comparer directement sur toutes les heures prédites
y_pred_flat = y_pred.reshape(-1, 1)
y_test_flat = y_test.reshape(-1, 1)

# Dénormalisation
y_pred_inv = scaler_y.inverse_transform(y_pred_flat)
y_test_inv = scaler_y.inverse_transform(y_test_flat)

# Calcul du RMSE global sur toutes les prédictions
rmse = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))

# Moyenne réelle du target sur le test
mean_target = np.mean(y_test_inv)

# Erreur relative
relative_error = rmse / mean_target * 100

print(f"✅ RMSE global (72h multi-step) : {rmse:.2f}")
print(f"📊 Moyenne du target (test) : {mean_target:.2f}")
print(f"⚖️ Erreur relative : {relative_error:.2f}%")


Epoch 1/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 137ms/step - loss: 0.0249 - val_loss: 0.0137
Epoch 2/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 132ms/step - loss: 0.0190 - val_loss: 0.0127
Epoch 3/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 137ms/step - loss: 0.0183 - val_loss: 0.0125
Epoch 4/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 146ms/step - loss: 0.0180 - val_loss: 0.0121
Epoch 5/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 143ms/step - loss: 0.0178 - val_loss: 0.0121
Epoch 6/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 130ms/step - loss: 0.0176 - val_loss: 0.0121
Epoch 7/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 126ms/step - loss: 0.0175 - val_loss: 0.0121
Epoch 8/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 160ms/step - loss: 0.0173 - val_loss: 0.0121
Epoch 9/10
[1m2

In [None]:
import torch
import numpy as np
import pandas as pd
from toto.data.util.dataset import MaskedTimeseries
from toto.inference.forecaster import TotoForecaster
from toto.model.toto import Toto
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# ============================
# 1️⃣  Préparation des données
# ============================

target = 'Débit horaire'
features = [
    'hour_sin', 'hour_cos', 
    'weekday_sin', 'weekday_cos', 
    'month_sin', 'month_cos', 
    'dayofyear_sin', 'dayofyear_cos',
    'is_weekend', 'is_holiday', 'is_school_holiday'
]

# Copie + tri temporel
df_model = df_champs.copy().sort_values('Date et heure de comptage')

# Suppression des NaN + interpolation sur le target
df_model[target] = df_model[target].interpolate(method='time')
df_model = df_model.dropna(subset=features + [target])

# Normalisation (important pour stabilité numérique)
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(df_model[features])
y_scaled = scaler_y.fit_transform(df_model[[target]])

# =========================================
# 2️⃣  Construction de la série temporelle
# =========================================

# On concatène features + target en une série unique
series = np.hstack([X_scaled, y_scaled])
series = torch.tensor(series.T, dtype=torch.float32)  # (n_vars, time_steps)

# Paramètres temporels
time_interval_seconds = torch.full((series.shape[0],), 3600)  # 1h par pas
timestamp_seconds = torch.arange(series.shape[1]) * 3600
timestamp_seconds = timestamp_seconds.repeat(series.shape[0], 1)

# Masques requis par Toto
inputs = MaskedTimeseries(
    series=series,
    padding_mask=torch.ones_like(series, dtype=torch.bool),
    id_mask=torch.zeros_like(series),
    timestamp_seconds=timestamp_seconds,
    time_interval_seconds=time_interval_seconds,
)

# ================================
# 3️⃣  Chargement du modèle Toto
# ================================

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"✅ Utilisation du device : {DEVICE}")

toto = Toto.from_pretrained("Datadog/Toto-Open-Base-1.0").to(DEVICE)
toto.compile()
forecaster = TotoForecaster(toto.model)

# =======================================
# 4️⃣  Prédiction (ex. sur les 24 prochaines heures)
# =======================================

prediction_length = 24  # heures à prédire
forecast = forecaster.forecast(
    inputs,
    prediction_length=prediction_length,
    num_samples=256,
    samples_per_batch=256,
)

# ===================================
# 5️⃣  Post-traitement et évaluation
# ===================================

median_pred = forecast.median[-1].detach().cpu().numpy()  # dernière série
pred_inv = scaler_y.inverse_transform(median_pred.reshape(-1, 1))

# Récupère la vraie valeur observée sur les dernières heures
y_true = df_model[target].values[-len(pred_inv):]
rmse = np.sqrt(mean_squared_error(y_true, pred_inv))
mean_target = np.mean(y_true)
relative_error = rmse / mean_target * 100

print(f"✅ RMSE Toto (24h) : {rmse:.2f}")
print(f"📊 Moyenne du débit horaire (test) : {mean_target:.2f}")
print(f"⚖️ Erreur relative (RMSE / moyenne) : {relative_error:.2f}%")


✅ Utilisation du device : cpu


config.json:   0%|          | 0.00/582 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

RuntimeError: [enforce fail at alloc_cpu.cpp:116] data. DefaultCPUAllocator: not enough memory: you tried to allocate 14118027264 bytes.