In [1]:
import sys
import os
sys.path.append("..") 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.xgb_forecasting import XGBoostModel
from src.lstm_forecasting import LSTMTimeSeries
from sklearn.metrics import mean_squared_error
from src.preprocess import *
from src.add_features import *
# Pour ne pas afficher de warnings inutiles
import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

In [2]:
champs_elysees_df = pd.read_csv('../data/traffic/champs_elysees.csv', sep=";")
convention_df = pd.read_csv('../data/traffic/convention.csv', sep=";")
sts_peres_df = pd.read_csv('../data/traffic/sts_peres.csv', sep=";")

In [3]:
# Champs-Elysees
df_champs_lstm = pipeline(champs_elysees_df, window=3, fill_hours=True,fillna=True)
# fill_hours=True → add missing hourly timestamps to ensure continuous time sequences (important for LSTM)  
# fillna=True → fill missing (NaN) values in the existing data

# Sts peres
sts_peres_df = treat_nan_sts_peres(sts_peres_df) # sts_peres has some months missing in the middle, we decided to aggregate the data before and after the gap
df_sts_peres_xgb = pipeline(sts_peres_df,window=3,fill_hours=False,fillna=True)

# Convention
convention_df = treat_nan_convention(convention_df) # convention has a lot of missing data in the begining, we decided to only keep some data
df_convention_xgb = pipeline(convention_df,window=3,fill_hours=False,fillna=True)

In [4]:
df_test_champs_2025, df_test_convention_2025, df_test_peres_2025 = create_test_dataset(champs_elysees_df,convention_df,sts_peres_df)


KeyError: 'date'

after examination :

    Champs-elysees --> LSTM

    convention --> xgb

    sts peres --> xgb

In [7]:
targets = ['Débit horaire', "Taux d'occupation"]

features_xgb = [
        'hour_sin', 'hour_cos', 
        'weekday_sin', 'weekday_cos', 
        'month_sin', 'month_cos', 
        'dayofyear_sin', 'dayofyear_cos',
        'is_weekend', 'is_holiday',
        'Vacances Scolaires Paris', 
        'temperature_2m (°C)', 'wind_speed_10m (km/h)',
        'precipitation (mm)', 'cloud_cover (%)',
        'Débit horaire_outlier_high', 'Débit horaire_outlier_low', 'Débit horaire_special_event',
        "Taux d'occupation_outlier_high", "Taux d'occupation_outlier_low", "Taux d'occupation_special_event"]

df_xgb_sts_peres = df_sts_peres_xgb.copy()

lags_hours = [72, 168]  # 3 days and 1 week lags
df_xgb_sts_peres, features_xgb_sts_peres = create_lag_features(df_xgb_sts_peres, targets=targets, lags_hours=lags_hours)

xgb_model_sts_peres = XGBoostModel(df_xgb_sts_peres, features_xgb_sts_peres, targets)
xgb_model_sts_peres.full_train()

Full training completed on all data.


{'Débit horaire': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.05, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=200,
              n_jobs=None, num_parallel_tree=None, ...),
 "Taux d'occupation": XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
      

In [10]:
targets = ['Débit horaire', "Taux d'occupation"]

features_xgb = [
        'hour_sin', 'hour_cos', 
        'weekday_sin', 'weekday_cos', 
        'month_sin', 'month_cos', 
        'dayofyear_sin', 'dayofyear_cos',
        'is_weekend', 'is_holiday',
        'Vacances Scolaires Paris', 
        'temperature_2m (°C)', 'wind_speed_10m (km/h)',
        'precipitation (mm)', 'cloud_cover (%)',
        'Débit horaire_outlier_high', 'Débit horaire_outlier_low', 'Débit horaire_special_event',
        "Taux d'occupation_outlier_high", "Taux d'occupation_outlier_low", "Taux d'occupation_special_event"]

df_xgb_convention = df_convention_xgb.copy()

lags_hours = [72, 168]  # 3 days and 1 week lags
df_xgb_convention, features_xgb_convention = create_lag_features(df_xgb_convention, targets=targets, lags_hours=lags_hours)

xgb_model_convention = XGBoostModel(df_xgb_convention, features_xgb_convention, targets)
xgb_model_convention.full_train()

Full training completed on all data.


{'Débit horaire': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.05, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=200,
              n_jobs=None, num_parallel_tree=None, ...),
 "Taux d'occupation": XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
      

In [11]:
targets = ['Débit horaire', "Taux d'occupation"]

features_champs = [
        'hour_sin', 'hour_cos', 
        'weekday_sin', 'weekday_cos', 
        'month_sin', 'month_cos', 
        'dayofyear_sin', 'dayofyear_cos',
        'is_weekend', 'is_holiday',
        'Vacances Scolaires Paris', 
        'temperature_2m (°C)', 'wind_speed_10m (km/h)',
        'precipitation (mm)', 'cloud_cover (%)',
        'Débit horaire_outlier_high', 'Débit horaire_outlier_low', 'Débit horaire_special_event',
        "Taux d'occupation_outlier_high", "Taux d'occupation_outlier_low", "Taux d'occupation_special_event"]

In [12]:
final_eval_size = 72
seq_length = 168

train_df_champs = df_champs_lstm.copy()

lstm_model_champs = LSTMTimeSeries(
    features=features_champs,
    targets=targets,
    seq_length=seq_length,
    pred_length=final_eval_size
)

lstm_model_champs.train_final(train_df_champs, epochs=30, batch_size=32)

Epoch 1/30
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 68ms/step - loss: 0.5676 - mae: 0.5523 - mape: 208.1865
Epoch 2/30
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 67ms/step - loss: 0.3695 - mae: 0.4330 - mape: 193.5289
Epoch 3/30
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 69ms/step - loss: 0.3120 - mae: 0.3930 - mape: 181.7106
Epoch 4/30
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 92ms/step - loss: 0.2689 - mae: 0.3645 - mape: 168.2106
Epoch 5/30
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 109ms/step - loss: 0.2672 - mae: 0.3629 - mape: 165.7587
Epoch 6/30
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 118ms/step - loss: 0.2498 - mae: 0.3491 - mape: 158.7778
Epoch 7/30
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 100ms/step - loss: 0.2140 - mae: 0.3261 - mape: 151.1414
Epoch 8/30
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3