In [101]:
import sys
import os
import pandas as pd
import numpy as np

In [102]:
champs_elysees_df = pd.read_csv('../data/traffic/champs_elysees.csv', sep=";")
convention_df = pd.read_csv('../data/traffic/convention.csv', sep=";")
sts_peres_df = pd.read_csv('../data/traffic/sts_peres.csv', sep=";")

Could be interresting to have the aval and amount data...

In [103]:
weather_df = pd.read_csv('../data/weather/weather_paris.csv')
holidays_df = pd.read_csv('../data/events/french_holidays.csv', sep=";")

There is a big sport event in the test data so could be good to add public event data but I can't find it. I might have to create it from scratch or scrap it on the web.

# 1. Preprocessing

In [104]:
champs_elysees_df

Unnamed: 0,Identifiant arc,Libelle,Date et heure de comptage,Débit horaire,Taux d'occupation,Etat trafic,Identifiant noeud amont,Libelle noeud amont,Identifiant noeud aval,Libelle noeud aval,Etat arc,Date debut dispo data,Date fin dispo data,geo_point_2d,geo_shape
0,4264,AV_Champs_Elysees,2024-12-09T05:00:00+01:00,199.0,2.20945,Fluide,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177..."
1,4264,AV_Champs_Elysees,2024-12-09T06:00:00+01:00,235.0,2.28778,Fluide,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177..."
2,4264,AV_Champs_Elysees,2024-12-09T09:00:00+01:00,1041.0,11.63222,Fluide,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177..."
3,4264,AV_Champs_Elysees,2025-09-02T09:00:00+02:00,1139.0,28.39222,Pré-saturé,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Ouvert,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177..."
4,4264,AV_Champs_Elysees,2025-04-03T11:00:00+02:00,1138.0,16.94889,Pré-saturé,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Ouvert,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8622,4264,AV_Champs_Elysees,2024-10-30T03:00:00+01:00,322.0,3.50500,Fluide,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177..."
8623,4264,AV_Champs_Elysees,2024-10-31T08:00:00+01:00,,,Inconnu,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177..."
8624,4264,AV_Champs_Elysees,2024-10-31T07:00:00+01:00,,,Inconnu,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177..."
8625,4264,AV_Champs_Elysees,2024-10-31T02:00:00+01:00,501.0,6.26167,Fluide,2294,Av_Champs_Elysees-Washington,2293,Av_Champs_Elysees-Berri,Invalide,1996-10-10,2023-01-01,"48.87153587897718, 2.3017227924560624","{""coordinates"": [[2.3009951475338775, 48.87177..."


In [None]:
class Preprocessor:
    def __init__(self, df):
        self.df = df
    
    def create_datetime_features(self, df: pd.DataFrame, holidays_df: pd.DataFrame, datetime_col: str ='Date et heure de comptage') -> pd.DataFrame:
        """
        Converts a column to datetime and creates additional columns:
            - date: date without timestamp
            - hour: time
            - year: year
            - month: month
            - weekday: day of the week (0=Monday, 6=Sunday)
            - is_weekend: True if Saturday or Sunday
            - is_holiday: Binary public and school holidays
            - hour_sin, hour_cos: cyclic encoding (daily seasonality)
            - weekday_sin, weekday_cos: cyclic encoding (weekly seasonality)
            - month_sin, month_cos: cyclic encoding (weekly seasonality)


            Parameters:
                df: pandas.DataFrame, street dataset
                datetime_col: str, name of the datetime column
                holidays_df: pandas.DataFrame, French public holidays dataset
            Returns:
                df: pandas.DataFrame with new columns

        """

        # Convert to datetime (not in UTC to keep winter and summer french time)
        df['Date et heure de comptage'] = pd.to_datetime(df['Date et heure de comptage'], errors='coerce', utc=True)

        # Convert to tz-naive (Paris local time)
        df['Date et heure de comptage'] = df['Date et heure de comptage'].dt.tz_convert('Europe/Paris').dt.tz_localize(None)


        # Extract features
        df['date'] = df[datetime_col].dt.date
        df['hour'] = df[datetime_col].dt.hour
        df['year'] = df[datetime_col].dt.year
        df['month'] = df[datetime_col].dt.month
        df['weekday'] = df[datetime_col].dt.weekday  # 0=lundi, 6=dimanche
        df['is_weekend'] = df['weekday'] >= 5

        # Add cyclic features
        self.add_cyclic_features(df)

        # Add holidays
        self.add_holidays(df, holidays_df)

        return df

    def add_cyclic_features(self, df):
        """
        Ajoute des colonnes sin/cos pour les features cycliques : heure, jour de semaine, mois.
        """

        # Heure (0-23)
        df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
        df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

        # Jour de la semaine (0-6)
        df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
        df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7)

        # Mois (1-12)
        df['month_sin'] = np.sin(2 * np.pi * (df['month']-1) / 12)
        df['month_cos'] = np.cos(2 * np.pi * (df['month']-1) / 12)

        # Optionnel : jour de l'année (1-365/366)
        df['day_of_year'] = df['Date et heure de comptage'].dt.dayofyear
        df['dayofyear_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
        df['dayofyear_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    
    def add_holidays(self, df: pd.DataFrame, holidays_df: pd.DataFrame):
        """
        Adds columns indicating whether a date is within French school holidays.

        Parameters:
            df (pd.DataFrame): main dataset
            holidays_df (pd.DataFrame): vacation periods dataset
            datetime_col (str): datetime column in df

        Returns:
            pd.DataFrame: updated df with 'is_holiday'
        """
        df['is_school_holiday'] = False
        
        # Convert to datetime not in UTC
        holidays_df['Date de début'] = pd.to_datetime(holidays_df['Date de début'], utc=False, errors='coerce').dt.date
        holidays_df['Date de fin'] = pd.to_datetime(holidays_df['Date de fin'], utc=False, errors='coerce').dt.date

        # Only keep Zone C (Paris)
        holidays_df = holidays_df[holidays_df['Zones'] == 'Zone C']

        for _, row in holidays_df.iterrows():
            mask = (df["date"] >= row['Date de début']) & (df["date"] <= row['Date de fin'])
            df.loc[mask, 'is_school_holiday'] = True
        
    def fill_nan(self, df: pd.DataFrame):
        df['Date et heure de comptage'] = pd.to_datetime(df['Date et heure de comptage'], utc=False)

        # Trier chronologiquement
        df = df.sort_values('Date et heure de comptage')

        # Mettre la colonne de date comme index temporairement
        df = df.set_index('Date et heure de comptage')

        # Interpolation temporelle
        df['Débit horaire'] = df['Débit horaire'].interpolate(method='time')
        df['Taux d\'occupation'] = df['Taux d\'occupation'].interpolate(method='time')
        return df
    
    def add_weather(self, df: pd.DataFrame, weather_df: pd.DataFrame, datetime_col: str = 'Date et heure de comptage'):
        """
        Adds weather features to the traffic dataframe by merging on datetime.

        Parameters:
            df (pd.DataFrame): traffic dataset
            weather_df (pd.DataFrame): weather dataset, must have 'time' column
            datetime_col (str): datetime column in traffic df

        Notes:
            - Assumes weather_df['time'] and df[datetime_col] are compatible datetimes
            - Merge is done on datetime rounded to hour
        """

        # Convert weather time to datetime (if not already)
        weather_df['time'] = pd.to_datetime(weather_df['time'], errors='coerce', utc=False)

        # Ensure traffic datetime is also datetime
        df[datetime_col] = pd.to_datetime(df[datetime_col], errors='coerce', utc=False)

        # Optional: round both to hour for exact matching
        df['hour_time'] = df[datetime_col].dt.floor('H')
        weather_df['hour_time'] = weather_df['time'].dt.floor('H')

        # Merge on the floored hour
        df = df.merge(weather_df, on='hour_time', how='left', suffixes=('', '_weather'))

        # Drop helper column if you want
        df.drop(columns=['hour_time'], inplace=True)

        return df
    
    def preprocess_all(self, holidays_df: pd.DataFrame, weather_df: pd.DataFrame, datetime_col: str='Date et heure de comptage'):
        """
        Runs all preprocessing steps:
        - Fill NaN
        - Create datetime features
        - Add weather data
        """
        # Create datetime features and holidays
        self.create_datetime_features(self.df, holidays_df)

        # Merge weather data
        self.add_weather(self.df, weather_df, datetime_col=datetime_col)

        # # Fill missing traffic values
        # self.df = self.fill_nan(self.df)
        return self.df    

In [170]:
df = champs_elysees_df.copy()

In [171]:
champs_elysees_df["Date et heure de comptage"].describe()

count                          8627
unique                         8627
top       2024-12-09T05:00:00+01:00
freq                              1
Name: Date et heure de comptage, dtype: object

In [172]:
preprocessor = Preprocessor(df)
preprocessed_df = preprocessor.preprocess_all(holidays_df=holidays_df, weather_df=weather_df)

  df['hour_time'] = df[datetime_col].dt.floor('H')
  weather_df['hour_time'] = weather_df['time'].dt.floor('H')


In [173]:
preprocessed_df.describe()

Unnamed: 0,Identifiant arc,Date et heure de comptage,Débit horaire,Taux d'occupation,Identifiant noeud amont,Identifiant noeud aval,hour,year,month,weekday,hour_sin,hour_cos,weekday_sin,weekday_cos,month_sin,month_cos,day_of_year,dayofyear_sin,dayofyear_cos,hour_time
count,8627.0,8627,8078.0,8063.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627.0,8627
mean,4264.0,2025-04-23 10:24:18.027123968,738.236073,15.300116,2294.0,2293.0,11.506665,2024.801321,6.607743,3.003246,-0.000388,-0.001047072,0.002486,-0.009274,-0.052001,-0.0075619,185.670453,-0.052241,0.009361,2025-04-23 10:24:18.027123968
min,4264.0,2024-10-01 05:00:00,0.0,0.0,2294.0,2293.0,0.0,2024.0,1.0,0.0,-1.0,-1.0,-0.974928,-0.900969,-1.0,-1.0,1.0,-0.999991,-0.999963,2024-10-01 05:00:00
25%,4264.0,2025-01-21 10:30:00,532.0,7.38167,2294.0,2293.0,6.0,2025.0,4.0,1.0,-0.707107,-0.7071068,-0.781831,-0.900969,-0.866025,-0.5,93.0,-0.867456,-0.632103,2025-01-21 10:30:00
50%,4264.0,2025-04-23 08:00:00,807.0,15.15667,2294.0,2293.0,12.0,2025.0,7.0,3.0,0.0,-1.83697e-16,0.0,-0.222521,0.0,-1.83697e-16,196.0,-0.008607,0.073095,2025-04-23 08:00:00
75%,4264.0,2025-08-03 03:30:00,944.0,21.565,2294.0,2293.0,17.0,2025.0,10.0,5.0,0.707107,0.7071068,0.781831,0.62349,0.5,0.5,281.0,0.699458,0.570242,2025-08-03 03:30:00
max,4264.0,2025-11-02 00:00:00,2190.0,77.54556,2294.0,2293.0,23.0,2025.0,12.0,6.0,1.0,1.0,0.974928,1.0,1.0,1.0,366.0,0.999991,1.0,2025-11-02 00:00:00
std,0.0,,286.353911,9.233488,0.0,0.0,6.919718,0.399029,3.419584,1.9914,0.707122,0.707173,0.706649,0.707581,0.739653,0.6710197,104.421066,0.744334,0.665782,


In [174]:
preprocessed_df.columns

Index(['Identifiant arc', 'Libelle', 'Date et heure de comptage',
       'Débit horaire', 'Taux d'occupation', 'Etat trafic',
       'Identifiant noeud amont', 'Libelle noeud amont',
       'Identifiant noeud aval', 'Libelle noeud aval', 'Etat arc',
       'Date debut dispo data', 'Date fin dispo data', 'geo_point_2d',
       'geo_shape', 'date', 'hour', 'year', 'month', 'weekday', 'is_weekend',
       'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'month_sin',
       'month_cos', 'day_of_year', 'dayofyear_sin', 'dayofyear_cos',
       'is_school_holiday', 'hour_time'],
      dtype='object')