In [1]:
import pandas as pd
import  numpy as np
import datetime
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [2]:
# Read the input CSV (path is relative to the notebook's working directory).
df = pd.read_csv("../csv/sncf_clean.csv")

In [3]:
# Remove the columns that the rest of the notebook does not use.
df = df.drop(columns=["restitution_date", "nom", "gare", "type", "nature"])

In [4]:
# Keep only the rows whose year is 2022 or earlier.
df = df[df["year"] <= 2022]

In [5]:
# Collapse to one row per calendar day by summing the remaining columns;
# the grouping keys are kept as regular columns.
df1 = df.groupby(
    ["date", "year", "month", "week", "day", "day_of_week"],
    as_index=False,
).sum()

In [6]:
# Parse the date column into pandas datetimes so day offsets can be applied later.
df1 = df1.assign(date=pd.to_datetime(df1["date"]))

In [7]:
def create_t_x(data, n_lags=3):
    """Append lagged copies of the ``number`` column as ``T-1`` ... ``T-n`` features.

    For every lag ``i`` in ``1..n_lags`` a copy of ``data`` (without its
    calendar columns) has its ``date`` moved ``i`` days forward and is then
    left-merged back onto the result on ``date``. The row for day ``d``
    therefore receives, in column ``T-i``, the ``number`` recorded for day
    ``d - i``; when no such day exists the value is ``NaN``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``date`` (values that support adding a timedelta),
        ``number`` and the calendar columns ``year``, ``month``, ``day``,
        ``week`` and ``day_of_week``. The input is not modified.
    n_lags : int, default 3
        Number of lag columns to create. The default reproduces the original
        hard-coded behaviour (``T-1``, ``T-2``, ``T-3``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with ``n_lags`` extra columns ``T-1`` ... ``T-n``.

    Raises
    ------
    ValueError
        If ``n_lags`` is negative.
    KeyError
        If one of the calendar columns is missing from ``data``.
    """
    if n_lags < 0:
        raise ValueError(f"n_lags must be non-negative, got {n_lags}")

    calendar_columns = ["year", "month", "day", "week", "day_of_week"]
    # The calendar-free view is identical for every lag, so build it once.
    base = data.drop(columns=calendar_columns)

    last_data = data.copy()
    for i in range(1, n_lags + 1):
        lagged = base.rename(columns={"number": f"T-{i}"}).assign(
            # Vectorised shift: moving the date forward by i days makes the
            # merge pick up the value observed i days before each row's date.
            date=base["date"] + pd.Timedelta(days=i)
        )
        last_data = last_data.merge(lagged, how="left", on="date")
    return last_data

In [8]:
# Build the modelling frame: the daily data plus the lagged T-x columns.
last_df = create_t_x(data=df1)

In [9]:
# Time-ordered cross-validation: 5 expanding-window folds, with 3 samples
# excluded between the end of each training set and the start of its test set.
# NOTE(review): gap=3 presumably mirrors the three T-x lag columns - confirm.
splits = TimeSeriesSplit(
    n_splits=5,
    gap=3,
)

In [10]:
# Columns rescaled to [0, 1] and the transformer applied to them.
int_columns = ["year", "T-1", "T-2", "T-3"]
int_transformer = MinMaxScaler()

# Columns that are one-hot encoded; categories unseen during fit are ignored
# at transform time instead of raising.
categoricals_columns = ["month", "week", "day_of_week"]
categoricals_transformers = OneHotEncoder(handle_unknown="ignore")

# Combined preprocessing step; columns not listed above are dropped
# (the ColumnTransformer default).
transformers = make_column_transformer(
    (int_transformer, int_columns),
    (categoricals_transformers, categoricals_columns),
)

In [11]:
def evaluate(model, X, y, cv):
    """Cross-validate ``model`` behind the shared preprocessing and report errors.

    The module-level ``transformers`` object is chained in front of ``model``
    with ``make_pipeline``; the resulting pipeline is scored with
    ``cross_validate`` using MAE and RMSE.

    Parameters
    ----------
    model : estimator
        Regressor placed as the last step of the pipeline.
    X : features passed through to ``cross_validate``.
    y : target passed through to ``cross_validate``.
    cv : cross-validation splitter or spec passed through to ``cross_validate``.

    Returns
    -------
    tuple
        ``(mean MAE, mean RMSE)`` across the folds. The mean and standard
        deviation of both metrics are also printed.
    """
    estimator = make_pipeline(transformers, model)
    scores = cross_validate(
        estimator,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )

    # The scorers return negated errors; flip the sign to get positive values.
    mae = -scores["test_neg_mean_absolute_error"]
    rmse = -scores["test_neg_root_mean_squared_error"]

    print(
        f"Mean Absolute Error:     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )

    mean_mae, mean_rmse = mae.mean(), rmse.mean()
    return mean_mae, mean_rmse

In [12]:
# Gradient-boosted regressor with a fixed seed for repeatable runs.
# NOTE(review): 10000 trees at learning_rate=0.0001 is a slow, heavily shrunk
# configuration - confirm it is intentional before tuning further.
model = XGBRegressor(
    n_estimators=10000,
    max_depth=16,
    max_leaves=0,
    learning_rate=0.0001,
    random_state=1,
)

# Features are every column except the target "number".
mae, rmse = evaluate(
    model,
    last_df.drop(columns=["number"]),
    last_df["number"],
    cv=splits,
)

Mean Absolute Error:     4.331 +/- 1.169
Root Mean Squared Error: 5.622 +/- 1.400
