In [None]:
%pip install scikit-learn

In [None]:
%pip install xgboost

In [None]:
%pip install lightgbm

In [1]:
import pandas as pd
import sklearn
from xgboost import XGBRegressor, XGBRFRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

## Load Data

In [14]:
df = pd.read_parquet('../data/cayzn_train.parquet')

## Define preprocessing stages

## Feature selection

In [15]:
#Selection des variables

# Comme dejà encodé, on ne vas pas utiliser ces features
data = df.drop(columns=["sale_date", "departure_date"])

# On pourra éventuellement laissé de côté "destination_public_holiday" car constamment == 0

data = data.drop(columns=["destination_current_public_holiday"])


## Feature engineering

In [27]:
## On a observer une plus forte demande en mai (jours férié), juin (également), septembre(rentrée) et decembre (fêtes)
# on peut créer une varaible "haute saison"
data['haute_saison'] = data['sale_day_x'].apply(lambda x: 1 if x in [4, 5, 8, 11] else 0)

data['is_public_holiday_near'] = (
    (data["origin_days_to_next_public_holiday"] <= 10)
)

# on peut regrouper les varaibles Od_number_of_similar_X_hours car elles sont très corrélé
data['total_similar_trains'] = data['od_number_of_similar_2_hours'] + data['od_number_of_similar_4_hours'] + data['od_number_of_similar_12_hours']
data['od_similar_low_density'] = data['total_similar_trains'] <= 8
data['od_similar_medium_density'] = data['total_similar_trains'].between(8, 12, inclusive="right")
data['od_similar_high_density'] = data['total_similar_trains'] > 12
## we dont encode the 'high' density to prevent from colinearity

# également créer une feature "is holiday"
data['is_public_holiday'] = (
    (data['origin_current_school_holiday'] == 1) &
    (data['destination_current_school_holiday'] == 1)
)
data.drop(columns=['origin_current_school_holiday', 'destination_current_school_holiday'], inplace=True)


In [28]:
# On encode uniquement les stations, car les date sont déjà encodés

# Creation de la pipeline
stationColumns = ['origin_station_name', 'destination_station_name']
allStations = pd.concat([data[stationColumns[0]], data[stationColumns[1]]])
stationEncoder = (sklearn.preprocessing.OneHotEncoder(sparse_output=False).fit(pd.DataFrame(allStations)))

numericalColumns = data.drop(columns=["demand"]).select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = sklearn.compose.ColumnTransformer(
    transformers = [
        ('originEncoder', stationEncoder, [stationColumns[0]]),
        ('destinationEncoder', stationEncoder, [stationColumns[1]]),
        ('standardscaler', sklearn.preprocessing.StandardScaler(), numericalColumns)
    ],
    remainder="passthrough"
)

RFRpipeline = sklearn.pipeline.Pipeline(
    steps= [
        ('preprocessing', preprocessor),
        ('regressor', sklearn.ensemble.RandomForestRegressor(random_state=0, verbose=True, n_jobs=-1))
    ]
)

XGBpipeline = sklearn.pipeline.Pipeline(
    steps= [
        ('preprocessing', preprocessor),
        ('regressor', XGBRegressor(random_state=0, ))
    ]
)
LightGBMpipeline = sklearn.pipeline.Pipeline( ## pertinent pour sa capacité à gérer les données données en très gros volumes, pas nécessaire de faire de la réduction de dimension car déjà le but de l'algo
    steps= [
        ('preprocessing', preprocessor),
        ('regressor', LGBMRegressor(random_state=0))
    ]
)

## Prédiction

In [29]:
# Création du jeu de test et de validation
y = data.demand
X = data.drop(columns=["demand"])

X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=0,
    
)

## Training with basic models and feature

In [None]:
# RANDOM FOREST REGRESSOR
RFRpipeline.fit(X_train, y_train)
print(RFRpipeline.score(X_val, y_val))

In [19]:
# XGBOOST REGRESSOR
XGBpipeline.fit(X_train, y_train)
print(XGBpipeline.score(X_val, y_val))

0.8162800073623657


In [None]:
# LightGBM REGRESSOR
LightGBMpipeline.fit(X_train, y_train)
print(LightGBMpipeline.score(X_val, y_val))

## Commentraire :
* XGBoost, RFR, LightGBM ont des performances équivalentes, bien que XGBoost et LightGBM soient plus rapides à l'entrainement.
* La sélection de variable à l'aide de l'importance (tant RFR que XGBoost) s'avère inefficace
* Reste donc à utiliser la CV pour voir qui est véritablement le meilleur modèle, et trouver d'autre manière de sélectionner les variables, où d'en créer de nouvelles

# Introduce Feature selection

## Selection using RFR feature importance

In [None]:
#compute feature importance
importances = RFRpipeline.named_steps['regressor'].feature_importances_
feature_names = RFRpipeline.named_steps['preprocessing'].get_feature_names_out()
# Create a DataFrame for better visualization
importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)
# Display the feature importances
importances_df

In [None]:
import re
## Select a subset of features that gather a given % of the importance
importanceRatio = 0.95
cumulative_importance = importances_df['Importance'].cumsum()
RFR_feature_subset = importances_df[cumulative_importance <= importanceRatio]['Feature'].tolist()
RFR_feature_subset = [re.search(r'__(.+)$', feature).group(1) for feature in RFR_feature_subset]

RFR_subset_numerical_columns = list(set(RFR_feature_subset) & set(numericalColumns))

has_destination_station = False
has_origin_station = False
for feature in RFR_feature_subset:
    if re.match(r"(.*)origin_station_name_(.*)", feature):
        RFR_feature_subset.remove(feature)
        if not has_origin_station:
            RFR_feature_subset.append('origin_station_name')
            has_origin_station = True
    if re.match(r"(.*)destination_station_name_(.*)", feature):
        RFR_feature_subset.remove(feature)
        if not has_destination_station:
            RFR_feature_subset.append('destination_station_name')
            has_destination_station = True
            
print(f"Selected features: {RFR_feature_subset}")

    

In [None]:
## Apply feature selection to the data

X_rfr_subset = X[RFR_feature_subset]
X_train_rfr_subset, X_val_rfr_subset, y_train_rfr_subset, y_val_rfr_subset = sklearn.model_selection.train_test_split(
    X_rfr_subset,
    y,
    train_size=0.7,
    random_state=0
)

rfr_subset_preprocessor = sklearn.compose.ColumnTransformer(
    transformers = [
        ('standardscaler', sklearn.preprocessing.StandardScaler(), RFR_subset_numerical_columns),
        ('originEncoder', stationEncoder, [stationColumns[0]]) if 'origin_station_name' in RFR_feature_subset else ('originEncoder', 'passthrough', []),
        ('destinationEncoder', stationEncoder, [stationColumns[1]]) if 'destination_station_name' in RFR_feature_subset else ('destinationEncoder', 'passthrough', [])
    ],
    remainder="passthrough"
)

RFR_subset_pipeline = sklearn.pipeline.Pipeline(
    steps= [
        ('preprocessing', rfr_subset_preprocessor),
        ('regressor', sklearn.ensemble.RandomForestRegressor(random_state=0, verbose=True, n_jobs=-1))
    ]
)
XGB_subset_pipeline = sklearn.pipeline.Pipeline(
    steps= [
        ('preprocessing', rfr_subset_preprocessor),
        ('regressor', XGBRegressor())
    ]
)

## Prediction with RFR selected features

In [None]:
# RANDOM FOREST REGRESSOR
RFR_subset_pipeline.fit(X_train_rfr_subset, y_train_rfr_subset)
print(RFR_subset_pipeline.score(X_val_rfr_subset, y_val_rfr_subset))

In [None]:
## Prediction with XGBOOST selected features
XGB_subset_pipeline.fit(X_train_rfr_subset, y_train_rfr_subset)
print(XGB_subset_pipeline.score(X_val_rfr_subset, y_val_rfr_subset))

## Feature selection with XGBoost

In [20]:
## Retrieve feature importance from XGBoost
XGB_importance = XGBpipeline.named_steps['regressor'].feature_importances_
XGB_features = XGBpipeline.named_steps['preprocessing'].get_feature_names_out()
XGB_feature_importance = pd.DataFrame(
    {
        "feature" : XGB_features,
        "importance" : XGB_importance
     }
).sort_values(by="importance", ascending=False)
XGB_feature_importance

Unnamed: 0,feature,importance
24,standardscaler__od_travel_time_minutes,0.299827
31,standardscaler__sale_day_x,0.192791
33,standardscaler__sale_week,0.047853
23,standardscaler__od_origin_year,0.039804
20,standardscaler__od_origin_time,0.039514
29,standardscaler__price,0.039051
26,standardscaler__origin_current_school_holiday,0.028614
18,standardscaler__od_number_of_similar_4_hours,0.023924
37,standardscaler__total_similar_trains,0.021756
15,standardscaler__od_destination_time,0.021713


In [24]:
import re
## Select a subset of features that gather a given % of the importance
importanceRatio = 0.98
XGB_cumulative_importance = XGB_feature_importance['importance'].cumsum()
XGB_feature_subset = XGB_feature_importance[XGB_cumulative_importance <= importanceRatio]['feature'].tolist()
XGB_feature_subset = [re.search(r'__(.+)$', feature).group(1) for feature in XGB_feature_subset]

XGB_subset_numerical_Columns = list(set(XGB_feature_subset) & set(numericalColumns))

# Remove the station name features and add the original feature names

has_destination_station = False
has_origin_station = False
for feature in XGB_feature_subset:
    if re.match(r"(.*)origin_station_name_(.*)", feature):
        XGB_feature_subset.remove(feature)
        if not has_origin_station:
            XGB_feature_subset.append('origin_station_name')
            has_origin_station = True
            
for feature in XGB_feature_subset:
    if re.match(r"(.*)destination_station_name_(.*)", feature):
        XGB_feature_subset.remove(feature)
        if not has_destination_station:
            XGB_feature_subset.append('destination_station_name')
            has_destination_station = True
            
print(f"Selected features: {XGB_feature_subset}")

Selected features: ['od_travel_time_minutes', 'sale_day_x', 'sale_week', 'od_origin_year', 'od_origin_time', 'price', 'origin_current_school_holiday', 'od_number_of_similar_4_hours', 'total_similar_trains', 'od_destination_time', 'od_number_of_similar_12_hours', 'origin_station_name_cpe', 'od_origin_week', 'destination_days_to_next_school_holiday', 'od_origin_weekday', 'od_origin_month', 'sale_month', 'od_similar_medium_density', 'destination_days_to_next_public_holiday', 'origin_days_to_next_school_holiday', 'sale_year', 'od_number_of_similar_2_hours', 'sale_day', 'origin_station_name', 'destination_station_name']


In [26]:
## Apply feature selection to the data
X_XGB_subset = X[XGB_feature_subset]
X_train_XGB_subset, X_val_XGB_subset, y_train_XGB_subset, y_val_XGB_subset = sklearn.model_selection.train_test_split(
    X_XGB_subset,
    y,
    train_size=0.7,
    random_state=0
)
XGB_subset_preprocessor = sklearn.compose.ColumnTransformer(
    transformers = [
        ('standardscaler', sklearn.preprocessing.StandardScaler(), XGB_subset_numerical_Columns),
        ('originEncoder', stationEncoder, [stationColumns[0]]) if 'origin_station_name' in XGB_feature_subset else ('originEncoder', 'passthrough', []),
        ('destinationEncoder', stationEncoder, [stationColumns[1]]) if 'destination_station_name' in XGB_feature_subset else ('destinationEncoder', 'passthrough', [])
    ],
    remainder="passthrough"
)

XGB_XGB_subset_pipeline = sklearn.pipeline.Pipeline(
    steps= [
        ('preprocessing', XGB_subset_preprocessor),
        ('regressor', XGBRegressor())
    ]
)
RFR_XGB_subset_pipeline = sklearn.pipeline.Pipeline(
    steps= [
        ('preprocessing', XGB_subset_preprocessor),
        ('regressor', sklearn.ensemble.RandomForestRegressor(random_state=0, verbose=True, n_jobs=-1, n_estimators=25))
    ]
)

LigthtGBM_XGB_subset_pipeline = sklearn.pipeline.Pipeline(
    steps= [
        ('preprocessing', XGB_subset_preprocessor),
        ('regressor', LGBMRegressor())
    ]
)

KeyError: "['origin_station_name_cpe'] not in index"

## Prediction with XGBoost selected features

In [None]:
# RANDOM FOREST REGRESSOR
RFR_XGB_subset_pipeline.fit(X_train_XGB_subset, y_train_XGB_subset)
print(RFR_XGB_subset_pipeline.score(X_val_XGB_subset, y_val_XGB_subset))

In [None]:
# XGBOOST REGRESSOR
XGB_subset_pipeline.fit(X_train_XGB_subset, y_train_XGB_subset)
print(XGB_subset_pipeline.score(X_val_XGB_subset, y_val_XGB_subset))

## Autre pistes de feature engineering

## Comparaison des modèles en CV

In [30]:
xgb_param_grid = {
    
}

lgbm_param_grid = {
}

xgb_search = GridSearchCV(XGBpipeline, xgb_param_grid, cv=10, n_jobs=-1, verbose=1)
lgbm_search = GridSearchCV(LightGBMpipeline, lgbm_param_grid, cv=10, n_jobs=-1, verbose=1)

# Fit on your data (replace df and target column as needed)

xgb_search.fit(X, y)
lgbm_search.fit(X, y)

print("XGBoost best RMSE:", xgb_search.best_score_)
print("XGBoost best params:", xgb_search.best_params_)

print("LightGBM best RMSE:", lgbm_search.best_score_)
print("LightGBM best params:", lgbm_search.best_params_)



Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.158167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1082
[LightGBM] [Info] Number of data points in the train set: 632841, number of used features: 40
[LightGBM] [Info] Start training from score 3.000149
XGBoost best RMSE: 0.7309605121612549
XGBoost best params: {}
LightGBM best RMSE: 0.750381616416935
LightGBM best params: {}


## test des réseaux de neuronnes

## Definition de la structure

In [None]:
import torch.nn as nn
import torch.optim as optim

class DemandPredictor(nn.Module):
    def __init__(self, input_dim):
        super(DemandPredictor, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),       # 1er bloc
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(128, 64),              # 2e bloc
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(64, 32),               # 3e bloc
            nn.ReLU(),

            nn.Linear(32, 1)                 # Sortie
        )

    def forward(self, x):
        return self.model(x)
    
model = DemandPredictor(input_dim=X.shape[1])
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)



## NN training loop

In [None]:
y_val

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Split train/val
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X, y, test_size=0.3, random_state=0)

y_val = y_val.reset_index(drop=True)
y_train = y_train.reset_index(drop = True)

# Preprocessing
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
# Tensors
train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
val_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Instancier le modèle
model = DemandPredictor(input_dim=X_train.shape[1])
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Boucle d'entraînement
n_epochs = 50
for epoch in range(n_epochs):
    model.train()
    train_loss = 0
    for xb, yb in train_loader:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item()

    print(f"Epoch {epoch+1}/{n_epochs} | Train Loss: {train_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f}")
