In [14]:
import pandas as pd
import  numpy as np
import datetime
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [15]:
# Read the source CSV into a DataFrame; the path is relative to the
# directory the notebook is executed from.
csv_path = '../csv/sncf_clean.csv'
df = pd.read_csv(csv_path)

In [16]:
# Remove the columns listed below before any aggregation takes place.
dropped_columns = ['restitution_date', 'nom', 'gare', 'type', 'nature']
df = df.drop(columns=dropped_columns)

In [17]:
# Keep only the rows whose `year` is 2022 or earlier.
up_to_2022 = df["year"] <= 2022
df = df[up_to_2022]

In [18]:
# One row per distinct combination of the calendar keys; every remaining
# column is summed within each group, and the keys come back as columns.
calendar_keys = ['date', 'year', 'month', 'week', 'day', 'day_of_week']
df1 = (
    df.groupby(calendar_keys)
    .sum()
    .reset_index()
)

In [19]:
# Parse the `date` column into pandas datetimes so date arithmetic works on it.
df1 = df1.assign(date=pd.to_datetime(df1["date"]))

In [20]:
def add_lag_features(daily, n_lags=3, target="number"):
    """Return a copy of ``daily`` with lagged copies of ``target`` appended.

    For each ``lag`` in ``1..n_lags`` a column named ``T-{lag}`` is added
    holding the value ``target`` had ``lag`` calendar days earlier. The lag
    is computed by shifting the ``date`` column and joining on it (not by
    shifting rows), so a missing day yields ``NaN`` instead of silently
    borrowing the value of a different date.

    Parameters
    ----------
    daily : pandas.DataFrame
        Frame with a datetime ``date`` column and a ``target`` column.
    n_lags : int, default 3
        Number of lag columns to create.
    target : str, default "number"
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        ``daily`` plus the ``T-1`` … ``T-n_lags`` columns, ordered by
        ``date`` with a fresh 0..n-1 index.
    """
    lagged = daily.copy()
    # Only the join key and the target are needed on the right-hand side;
    # carrying any other column into repeated merges would create clashing
    # `_x`/`_y` suffixed duplicates.
    history = daily[["date", target]]
    for lag in range(1, n_lags + 1):
        # Vectorised date shift: the value observed on day D is attached to
        # day D + lag, i.e. it becomes that day's "T-lag" feature.
        shifted = history.assign(
            date=history["date"] + pd.Timedelta(days=lag)
        ).rename(columns={target: f"T-{lag}"})
        lagged = lagged.merge(shifted, how="left", on="date")
    # TimeSeriesSplit splits by row position, so make the chronological order
    # explicit rather than relying on the ordering produced by the upstream
    # groupby (which ran while `date` was still unparsed).
    return lagged.sort_values("date", ignore_index=True)


df = add_lag_features(df1)

In [21]:
# Time-ordered cross-validation: 5 folds, each skipping 3 samples between the
# end of the training window and the start of the test window.
splits = TimeSeriesSplit(
    gap=3,
    n_splits=5,
)

In [22]:
# Preprocessing, one transformer per column group. Columns that appear in
# neither group are dropped (default `remainder` of make_column_transformer).

# Numeric group: rescaled with MinMaxScaler.
int_transformer = MinMaxScaler()
int_columns = [
    "year",
    "T-1",
    "T-2",
    "T-3",
]

# Calendar group: one-hot encoded; a category not seen during fit is encoded
# as all zeros instead of raising.
categoricals_transformers = OneHotEncoder(handle_unknown='ignore')
categoricals_columns = [
    "month",
    "week",
    "day_of_week",
]

transformers = make_column_transformer(
    (int_transformer, int_columns),
    (categoricals_transformers, categoricals_columns),
)

In [28]:
# Gradient-boosted regressor chained after the column preprocessing.
model = XGBRegressor(
    n_estimators=10000,
    max_depth=16,
    max_leaves=0,
    learning_rate=0.0001,
    random_state=1,
)
pipeline = make_pipeline(transformers, model)

# Everything except `number` is offered as input; `number` is the target.
features = df.drop(columns=["number"])
target = df["number"]

cv_results = cross_validate(
    pipeline,
    features,
    target,
    cv=splits,
    scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
)

# The scorers return negated errors, so flip the sign to get positive
# per-fold MAE / RMSE before summarising them.
mae = -cv_results["test_neg_mean_absolute_error"]
rmse = -cv_results["test_neg_root_mean_squared_error"]
print(f"mae: {mae.mean():.2f} +/- {mae.std():.2f}")
print(f"rmse: {rmse.mean():.2f} +/- {rmse.std():.2f}")

mae: 4.33 +/- 1.17
rmse: 5.62 +/- 1.40
