### Imports

In [1]:
import os
import sys
# Make the parent of the current working directory importable so `src.*` resolves.
# The original took dirname of the *string* '__file__', which is always '',
# so the inserted path is simply the absolute form of '..'.
project_root = os.path.abspath(os.pardir)
sys.path.insert(0, project_root)

In [2]:
# %pip install -r ../requirements.txt

In [None]:
from src.encoding.encoders import *
from src.models.sklearn_api_models_config import get_model
from src.experiments.base_experiment import BaseExperiment
import logging
import datetime as dt
from typing import List, Union, Optional
import pathlib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from src.datasets.base_tabular_dataset import BaseTabularDataset
from src.encoding.tools import create_encoding_pipeline
from src.experiments.features_selection import get_features, explore_features
from src.models.sklearn_api_model import Model, ModelTree
import src.features as ft
import mlflow.sklearn
import mlflow
import mlflow.data.pandas_dataset
from mlflow.models import infer_signature
import matplotlib.pyplot as plt
%matplotlib widget
import cudf as cd



1 GPU(s) detected.


### Config

In [5]:
# Root logger shared by the notebook and passed to the project classes below.
# The stream handler is only installed if the root logger has no handler yet
# (standard `logging.basicConfig` behaviour); the level is forced to INFO either way.
LOG_FORMAT = "%(name)s %(asctime)s: %(levelname)s: %(message)s"
logging.basicConfig(
    level=logging.INFO,
    encoding="utf-8",
    format=LOG_FORMAT,
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [6]:
# Project root = parent of the current working directory, as a pathlib.Path.
# (dirname of the literal string '__file__' is '', hence the plain os.pardir.)
root_dir = pathlib.Path(os.path.abspath(os.pardir))

##### Encoding Pipeline

In [7]:
# Encoding scheme handed to create_encoding_pipeline().
# Outer keys look like column kinds, inner keys like the representation to produce
# ('as_number' / 'as_category') -- presumably interpreted that way by the helper.
# Each leaf lists the imputers, then the encoders.
number_scheme = {
    'as_number': {
        'imputers': [imputers.SimpleImputer(strategy='mean')],
        'encoders': [ne.StandardScaler()],
    },
}

category_scheme = {
    'as_category': {
        'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
        'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
    },
}

datetime_scheme = {
    'as_number': {
        'imputers': [de.DateFeatureExtractor()],
        'encoders': [ne.CyclicalFeatures(drop_original=True)],
    },
    'as_category': {
        'imputers': [de.DateFeatureExtractor(dtype='category')],
        'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
    },
}

boolean_scheme = {
    'as_number': {
        'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
        'encoders': [ne.BooleanEncoder()],
    },
}

encoders_dict = {
    'number': number_scheme,
    'category': category_scheme,
    'datetime': datetime_scheme,
    'boolean': boolean_scheme,
}

In [8]:
# Create the encoding pipeline
pipeline = create_encoding_pipeline(encoders_dict)

Creating encoding pipeline


In [9]:
pipeline

##### Dataset

In [10]:
# Configuration used when fetching the raw data.
# Dates are written day-first (DD-MM-YYYY): '31-12-2023' cannot be read any other way.
fetch_config = dict(
    data_start='01-01-2019',
    data_stop='31-12-2023',
    data_dir=root_dir / 'data',
    # Alternative set of locations:
    # locations=['CHU Dijon', 'CH Beaune', 'CH Semur', 'CH Chatillon Montbard', 'CH privé Dijon', 'CH Langres', 'CH Chaumont', 'HNFC', 'CHU Besançon'],
    locations=['CH AGGLOMERATION MONTARGOISE'],
)

In [11]:
# Feature classes passed to BaseTabularDataset(features_classes=...).
# The commented entries are feature sources that are currently switched off.
ars_features_class = [
    ft.HospitalFeatures,
    ft.AirQualityFeatures,
    ft.EpidemiologicalFeatures,
    # ft.FireFightersFeatures(include_calls=False),
    # ft.GoogleTrendFeatures,
    ft.MeteorologicalFeatures,
    ft.SociologicalFeatures,
    ft.SportsCompetitionFeatures,
    ft.TrafficFeatures,
]

In [12]:
# Configuration passed to the dataset as `getter_config`.
# NOTE: 'axis' is intentionally left commented out here, so the key is absent from the dict.
get_dataset_config = dict(
    from_date='01-01-2019',
    to_date='30-12-2023',
    locations=['CH AGGLOMERATION MONTARGOISE'],
    # locations=['CHU Dijon', 'CH Beaune', 'CH Semur', 'CH Chatillon Montbard', 'CH privé Dijon'],
    # axis='rows',
    shift=range(1, 8, 1),
    # rolling_window=[7, 14, 31, 365],
    freq='1D',
    split_config={'test_size': 0.2, 'val_size': 0.2, 'shuffle': False},
    create_X_y=True,
    encoding_pipeline=pipeline,
    targets_names=['nb_hospit_np_from_ED_adults'],
    targets_shift=-3,
    targets_rolling_window=3,
    targets_history_shifts=range(1, 8, 1),
    # targets_history_rolling_windows=[7, 14, 31, 365],
    targets_locations=['CH AGGLOMERATION MONTARGOISE'],
    drop_constant_thr=1.0,
    data_dir=root_dir / 'data',
)

In [13]:
from src.tools.utils import supprimer_fichier_feather_recursif

In [14]:
# Directories scanned by supprimer_fichier_feather_recursif; the recorded output of
# this cell shows it deleting the per-feature `.feather` files found under ../data.
dossiers_a_parcourir = ["../data"]
supprimer_fichier_feather_recursif(dossiers_a_parcourir)

Deleting files...
Fichier supprimé: ../data/features/sociologicalfeatures/data_CH AGGLOMERATION MONTARGOISE.feather
Fichier supprimé: ../data/features/sportscompetitionfeatures/data_CH AGGLOMERATION MONTARGOISE.feather
Fichier supprimé: ../data/features/airqualityfeatures/data_CH AGGLOMERATION MONTARGOISE.feather
Fichier supprimé: ../data/features/trafficfeatures/data_CH AGGLOMERATION MONTARGOISE.feather
Fichier supprimé: ../data/features/epidemiologicalfeatures/data_CH AGGLOMERATION MONTARGOISE.feather
Fichier supprimé: ../data/features/hospitalfeatures/data_CH AGGLOMERATION MONTARGOISE.feather
Fichier supprimé: ../data/features/meteorologicalfeatures/data_CH AGGLOMERATION MONTARGOISE.feather


In [15]:
# Build the dataset. According to the log output of this cell, construction fetches every
# feature source and then computes the train/val/test sets and their encodings.
arsTabularDataset = BaseTabularDataset(
    features_classes=ars_features_class,
    logger=logger,
    fetch_config=fetch_config,
    getter_config=get_dataset_config,
)

root 2024-12-04 20:06:09,001: INFO: Initialisation de la classe BaseTabularDataset
root 2024-12-04 20:06:09,007: INFO: Initialisation des features
root 2024-12-04 20:06:09,008: INFO: Fetching dataset
root 2024-12-04 20:06:09,569: INFO: Fetching hospitalfeatures's data for CH AGGLOMERATION MONTARGOISE...
root 2024-12-04 20:06:09,571: INFO: Intégration de la target
root 2024-12-04 20:06:09,572: INFO:   - Chargement des données de CH AGGLOMERATION MONTARGOISE depuis le fichier Excel
root 2024-12-04 20:06:09,579: INFO:             nb_emmergencies
date                       
2019-01-01              202
2019-01-02              264
2019-01-03              248
2019-01-04              207
2019-01-05              184
...                     ...
2023-12-27              237
2023-12-28              192
2023-12-29              216
2023-12-30              198
2023-12-31              171

[1826 rows x 1 columns]
root 2024-12-04 20:06:09,586: INFO:             nb_emmergencies  nb_hospit_np_from_ED_chil

Unnamed: 0_level_0,nb_emmergencies,nb_hospit_np_from_ED_children,nb_hospit_np_from_ED_adults
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,202,0,10
2019-01-02,264,0,10
2019-01-03,248,0,10
2019-01-04,207,0,10
2019-01-05,184,0,10
...,...,...,...
2023-12-27,237,0,26
2023-12-28,192,0,23
2023-12-29,216,0,22
2023-12-30,198,0,16


root 2024-12-04 20:06:09,896: INFO: Fetching airqualityfeatures's data for CH AGGLOMERATION MONTARGOISE...
root 2024-12-04 20:06:09,897: INFO: On regarde la qualité de l'air
root 2024-12-04 20:06:09,908: INFO: On s'intéresse aux codes : FR34013, FR34017, FR34029
root 2024-12-04 20:06:09,909: INFO: On relit le dataframe d'archive de l'air
root 2024-12-04 20:06:09,913: INFO: Fin de la gestion de la qualité de l'air en 0.02 s.


Unnamed: 0_level_0,O3_FR34017,O3_FR34029,PM10_FR34013,PM10_FR34029,PM25_FR34029,NO2_FR34013,NO2_FR34029
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01,74.0,74.0,10.0,9.0,5.2,10.0,5.3
2018-01-02,53.0,59.0,12.0,9.0,5.2,23.0,6.2
2018-01-03,68.0,69.0,13.0,10.0,5.2,17.0,7.2
2018-01-04,59.0,64.0,5.0,2.0,5.2,16.0,4.3
2018-01-05,62.0,67.0,10.0,5.0,5.2,33.0,9.4
...,...,...,...,...,...,...,...
2024-04-17,59.0,66.0,8.0,5.0,3.3,20.0,4.9
2024-04-18,58.0,64.0,8.0,6.0,4.3,29.0,6.8
2024-04-19,63.0,68.0,9.0,6.0,3.7,20.0,5.7
2024-04-20,68.0,73.0,11.0,9.0,5.3,18.0,3.3


root 2024-12-04 20:06:10,208: INFO: Fetching epidemiologicalfeatures's data for CH AGGLOMERATION MONTARGOISE...
root 2024-12-04 20:06:10,210: INFO: On s'occupe de l'incidence des maladies d'après Sentinelles
root 2024-12-04 20:06:10,211: INFO:   - on regarde l'incidence de grippe pour la région CENTRE
root 2024-12-04 20:06:10,222: INFO:     Pour la dernière date connue ('18/11/2024', semaine 202447), l'incidence était de 1311
root 2024-12-04 20:06:10,231: INFO:     La première date connue était '29/10/1984', semaine 198444
root 2024-12-04 20:06:10,238: INFO:   - on regarde l'incidence de diarrhee pour la région CENTRE
root 2024-12-04 20:06:10,246: INFO:     Pour la dernière date connue ('18/11/2024', semaine 202447), l'incidence était de 788
root 2024-12-04 20:06:10,254: INFO:     La première date connue était '03/12/1990', semaine 199049
root 2024-12-04 20:06:10,261: INFO:   - on regarde l'incidence de varicelle pour la région CENTRE
root 2024-12-04 20:06:10,268: INFO:     Pour la der

Unnamed: 0_level_0,inc_grippe,inc_diarrhee,inc_varicelle,inc_ira
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,864,3939,869,0
2019-01-02,864,3939,869,0
2019-01-03,864,3939,869,0
2019-01-04,864,3939,869,0
2019-01-05,864,3939,869,0
...,...,...,...,...
2023-12-27,5387,3215,228,8554
2023-12-28,5387,3215,228,8554
2023-12-29,5387,3215,228,8554
2023-12-30,5387,3215,228,8554


root 2024-12-04 20:06:10,626: INFO: Fetching meteorologicalfeatures's data for CH AGGLOMERATION MONTARGOISE...


Unnamed: 0_level_0,meteo_tavg,meteo_tmin,meteo_tmax,meteo_prcp,meteo_snow,meteo_wdir,meteo_wspd,meteo_pres
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-01,10.4,6.0,14.0,0.3,0.0,348.0,8.2,1016.1
2019-01-02,10.4,6.0,14.0,0.3,0.0,348.0,8.2,1016.1
2019-01-03,10.4,6.0,14.0,0.3,0.0,348.0,8.2,1016.1
2019-01-04,10.4,6.0,14.0,0.3,0.0,348.0,8.2,1016.1
2019-01-05,10.4,6.0,14.0,0.3,0.0,348.0,8.2,1016.1
...,...,...,...,...,...,...,...,...
2023-12-27,8.8,5.0,12.0,0.0,0.0,192.0,13.6,1017.7
2023-12-28,9.8,9.0,10.0,1.3,0.0,218.0,19.5,1018.4
2023-12-29,9.8,8.0,12.0,0.4,0.0,223.0,21.8,1017.6
2023-12-30,9.7,8.0,12.0,0.3,0.0,200.0,16.0,1015.0


root 2024-12-04 20:06:10,956: INFO: Fetching sociologicalfeatures's data for CH AGGLOMERATION MONTARGOISE...
root 2024-12-04 20:06:10,958: INFO: On s'occupe des variables de vacances
root 2024-12-04 20:06:10,958: INFO: On récupère la liste des jours fériés
root 2024-12-04 20:06:10,959: INFO: On l'intègre au dataframe
root 2024-12-04 20:06:10,966: INFO: On s'occupe des vacances en tant que tel
root 2024-12-04 20:06:11,455: INFO: Variables de vacances intégrées
root 2024-12-04 20:06:11,456: INFO: On s'occupe des variables de confinement
root 2024-12-04 20:06:11,472: INFO: Variables de confinement intégrées
root 2024-12-04 20:06:11,473: INFO: On s'occupe des variables de Ramadan
root 2024-12-04 20:06:11,479: INFO: Intégration du déménagement de l'HNFC
root 2024-12-04 20:06:11,481: INFO: Intégration du COVID


Unnamed: 0_level_0,bankHolidays,eveBankHolidays,holidays,borderHolidays,confinement1,confinement2,couvrefeux,ramadan,before_HNFC_moving,during_HNFC_moving,after_HNFC_moving,before_COVID,during_COVID,after_COVID
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-01-01,True,False,True,False,False,False,False,False,False,False,True,True,False,False
2019-01-02,False,False,True,False,False,False,False,False,False,False,True,True,False,False
2019-01-03,False,False,True,False,False,False,False,False,False,False,True,True,False,False
2019-01-04,False,False,True,False,False,False,False,False,False,False,True,True,False,False
2019-01-05,False,False,True,False,False,False,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-27,False,False,True,False,False,False,False,False,False,False,True,False,False,True
2023-12-28,False,False,True,False,False,False,False,False,False,False,True,False,False,True
2023-12-29,False,False,True,False,False,False,False,False,False,False,True,False,False,True
2023-12-30,False,False,True,False,False,False,False,False,False,False,True,False,False,True


root 2024-12-04 20:06:11,873: INFO: Fetching sportscompetitionfeatures's data for CH AGGLOMERATION MONTARGOISE...
root 2024-12-04 20:06:11,875: INFO: Intégration des données de football
root 2024-12-04 20:06:11,888: INFO: Données de football intégrées


Unnamed: 0_level_0,foot
date,Unnamed: 1_level_1
2019-01-01,0
2019-01-02,1
2019-01-03,0
2019-01-04,0
2019-01-05,0
...,...
2023-12-27,0
2023-12-28,0
2023-12-29,0
2023-12-30,0


root 2024-12-04 20:06:12,182: INFO: Fetching trafficfeatures's data for CH AGGLOMERATION MONTARGOISE...
root 2024-12-04 20:06:12,183: INFO: Intégration des données de trafic


Unnamed: 0_level_0,nb_accidents
date,Unnamed: 1_level_1
2019-01-01,1.0
2019-01-02,0.0
2019-01-03,0.0
2019-01-04,0.0
2019-01-05,1.0
...,...
2023-12-27,0.0
2023-12-28,0.0
2023-12-29,0.0
2023-12-30,0.0


root 2024-12-04 20:06:12,550: INFO: Getting the dataset from 01-01-2019 to 30-12-2023 for CH AGGLOMERATION MONTARGOISE
root 2024-12-04 20:06:12,820: INFO: Getting data for hospitalfeatures from 2019-01-01 00:00:00 to 2023-12-30 00:00:00, at a 1D frequency
root 2024-12-04 20:06:12,827: INFO: Augmentation des features...
root 2024-12-04 20:06:12,834: INFO: Getting data for airqualityfeatures from 2019-01-01 00:00:00 to 2023-12-30 00:00:00, at a 1D frequency
root 2024-12-04 20:06:12,843: INFO: Augmentation des features...
root 2024-12-04 20:06:12,860: INFO: Getting data for epidemiologicalfeatures from 2019-01-01 00:00:00 to 2023-12-30 00:00:00, at a 1D frequency
root 2024-12-04 20:06:12,865: INFO: Augmentation des features...
root 2024-12-04 20:06:12,876: INFO: Getting data for meteorologicalfeatures from 2019-01-01 00:00:00 to 2023-12-30 00:00:00, at a 1D frequency
root 2024-12-04 20:06:12,885: INFO: Augmentation des features...
root 2024-12-04 20:06:12,908: INFO: Getting data for socio

Dropped columns with zero variance: ['nb_hospit_np_from_ED_children']
Dropped columns with zero variance: ['meteo_snow']
            inc_varicelle%%J-1  O3_FR34017%%J-1  nb_emmergencies%%J-3  \
date                                                                    
2019-01-03               869.0             62.0                 202.0   
2019-01-04               869.0             42.0                 202.0   
2019-01-05               869.0             38.0                 264.0   
2019-01-06               869.0             51.0                 248.0   
2019-01-07               869.0             42.0                 207.0   
...                        ...              ...                   ...   
2023-12-20                 0.0             39.0                 158.0   
2023-12-21                 0.0             49.0                 251.0   
2023-12-22                 0.0             63.0                 210.0   
2023-12-23                 0.0             69.0                 177.0   
202

root 2024-12-04 20:06:13,026: INFO: Calculating train/val/test sets and encodings...
root 2024-12-04 20:06:13,213: INFO: 30 features not encoded (same unit as target)


Dropped 0 constant columns from both sets: set()
X shape: (1162, 265), y shape: (1162, 1)
[ColumnTransformer] .... (1 of 4) Processing pipeline-1, total=   0.0s
[ColumnTransformer] .... (2 of 4) Processing pipeline-3, total=   0.0s
[ColumnTransformer] .... (3 of 4) Processing pipeline-4, total=   0.1s
[ColumnTransformer] .... (4 of 4) Processing pipeline-5, total=   0.0s


In [None]:
# Notebook-level alias on the dataset's full data frame (displayed in the next cell).
df = arsTabularDataset.data

# # Identify the columns that contain NaN values
# cols_with_nan = df.columns[df.isna().any()].tolist()
# print("Colonnes contenant des NaN:", cols_with_nan)

# # Show, for each of those columns, the index labels of the rows that contain NaN
# nan_indices = {col: df[df[col].isna()].index.tolist() for col in cols_with_nan}
# nan_indices

In [None]:
df

In [None]:
# df.to_csv(fetch_config['data_dir'] / f'datasets/full_dataset_{get_dataset_config['locations']}.csv')
# df_target = arsTabularDataset.data[arsTabularDataset.targets_names]
# df_target.to_csv(fetch_config['data_dir'] / f'datasets/full_dataset_{get_dataset_config['locations']}_targets.csv')

##### Model

In [None]:
# print(arsTabularDataset.data.columns.to_list())
# Define the model parameters
from src.models.obectives import *
# Parameters handed to get_model(params=...) for the XGBoost regressor.
# Commented entries are alternatives that were tried and are kept for reference.
model_params = dict(
    # tree_method='approx',
    # tree_method='hist',
    early_stopping_rounds=10,
    # eval_set=[(arsTabularDataset.enc_X_val, arsTabularDataset.y_val)],  # TODO: to be set in the experiment's run method
    verbosity=1,
    n_estimators=10000,
    learning_rate=0.1,
    min_child_weight=5,
    subsample=0.1,
    sampling_method='gradient_based',
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    colsample_bynode=0.7,
    # huber_slope=1.0,
    # quantile_alpha=np.array([0.5]),
    objective='reg:squarederror',
    # tweedie_variance_power=2,
    # alpha=10,
    # multi_strategy='one_output_per_tree',
    # multi_strategy='multi_output_tree',
)

In [None]:
# Names of the metrics passed to get_model(test_metrics=...).
metrics = [
    'mae', 'mse', 'rmse', 'w_rmse', 'pw_rmse',
    'msle', 'rmsle', 'r2', 'mqe', 'msse',
    'max_error', 'explained_variance',
]

In [None]:
# Build the model: `n` is forwarded to get_model together with ensemble_method='voting'
# and is reused further down to size the per-estimator config lists.
n = 2
model_kwargs = dict(
    model_type='xgboost',
    name='exogeneous_regressor',
    device='cuda',
    task_type='regression',
    test_metrics=metrics,
    params=model_params,
    n=n,
    ensemble_method='voting',
)
model = get_model(**model_kwargs)

In [None]:
# import copy
# endogeneous_model = copy.deepcopy(exogenous_model)
# endogeneous_model.set_params({'name':'endogeneous_regressor'})
# model = ModelVoting(models=[exogenous_model, endogeneous_model], loss=metrics)

In [None]:
# Create the experiment; its name is the comma-separated list of target names.
ars_experiment = BaseExperiment(
    logger=logger,
    dataset=arsTabularDataset,
    model=model,
    name=', '.join(arsTabularDataset.targets_names),
)

In [None]:
# Hyper-parameter grid stored under 'grid_params' in the model config below.
grid_params = dict(
    max_depth=[3, 4, 5, 7, 9],
    gamma=[0.01, 0.05, 0.1, 0.2, 0.3],
)

In [None]:
# Keyword arguments stored under 'fit_params' in the model config ('eval_set' is added later).
fit_params = dict(verbose=1)

In [None]:
# One entry per ensemble member. Every entry is the *same* dict object (no copy),
# so a later in-place change to grid_params / fit_params is visible through these lists.
grid_params_list = [grid_params] * n
fit_params_list = [fit_params] * n

In [None]:
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# Config passed as **kwargs to model.fit for the single-model case.
model_config = {
    "optimization": "grid",
    "grid_params": grid_params,
    "fit_params": fit_params,
    'cv_folds': 10,
}
# Config variant with per-estimator lists and a time-series aware CV splitter.
# NOTE(review): the name looks like a typo of `voting_config`; it is not referenced
# anywhere else in the visible notebook -- confirm before renaming.
voting_congig = {
    "optimization": "grid",
    "grid_params_list": grid_params_list,
    "fit_params_list": fit_params_list,
    'cv_folds': TimeSeriesSplit(),
}

### Start run

In [None]:
# Feature-selection switch read by the `if find_best_features:` cell below:
#   False / empty -> keep all features (the selection cell is skipped);
#   True          -> use the hard-coded feature list of that cell;
#   list          -> use exactly these feature names;
#   str           -> use this single feature name.
# Any other truthy value makes that cell raise ValueError.
find_best_features = False

In [None]:
# Open the MLflow run (closed by mlflow.end_run() further down) and derive its artifact directory.
run_name = 'run_' + str(ars_experiment.run_nb)
run = mlflow.start_run(run_name=run_name, log_system_metrics=True)
run_dir = pathlib.Path(ars_experiment.dir_runs / f'{run.info.run_id}/artifacts/')
ars_experiment.logger.info("Running the experiment...")

In [None]:
# ars_experiment.dataset.get_dataset(**get_dataset_config)
# Some fit_params can only be set once the datasets exist: eval_set.
# model_config['fit_params'] is the very same dict object as `fit_params`, so this update
# is also visible through `fit_params` and every entry of `fit_params_list`.
# NOTE(review): each eval_set element built here is a pair of pairs
# ((X_train, y_train), (X_val, y_val)), whereas the feature-selection cell builds plain
# (X_val, y_val) tuples -- confirm which shape Model.fit expects.
model_config['fit_params'].update({'eval_set': [((ars_experiment.dataset.enc_X_train, ars_experiment.dataset.y_train[target]), (
    ars_experiment.dataset.enc_X_val, ars_experiment.dataset.y_val[target])) for target in ars_experiment.dataset.targets_names]})

# Store the full data frame as a table artifact of the active run.
mlflow.log_table(data=ars_experiment.dataset.data,
                artifact_file='datasets/full_dataset.json')


In [None]:
# Optional feature selection: restrict the dataset to `selected_features`, rebuild it,
# log it, and refresh eval_set. Skipped entirely when find_best_features is falsy.
if find_best_features:
    if isinstance(find_best_features, bool):
        # selected_features = ars_experiment.get_important_features(dataset=dataset, model=ars_experiment.model, model_config=model_config)
        # Hard-coded selection. Two earlier candidate lists used to be assigned here and
        # were overwritten by this one before ever being read; they have been removed.
        # NOTE(review): these names target 'CHU Dijon' / nb_emmergencies, which does not
        # match the location and target configured above -- confirm before enabling.
        selected_features = ['nb_emmergencies_CHU Dijon', 'nb_emmergencies_CHU Dijon%%J-1',
        'nb_emmergencies_CHU Dijon%%J-2', 'nb_emmergencies_CHU Dijon%%J-3',
        'nb_emmergencies_CHU Dijon%%J-4', 'nb_emmergencies_CHU Dijon%%J-5',
        'nb_emmergencies_CHU Dijon%%J-6', 'nb_emmergencies_CHU Dijon%%J-7',
        'nb_emmergencies_CHU Dijon%%J-8', 'nb_emmergencies_CHU Dijon%%J-9',
        'nb_emmergencies_CHU Dijon%%J-10', 'nb_emmergencies_CHU Dijon%%J-11',
        'nb_emmergencies_CHU Dijon%%J-12', 'nb_emmergencies_CHU Dijon%%J-13',
        'nb_emmergencies_CHU Dijon%%mean_7J',
        'nb_emmergencies_CHU Dijon%%mean_14J', 'inc_diarrhee', 'inc_ira',
        'inc_diarrhee%%J-1', 'inc_diarrhee%%J-2', 'inc_diarrhee%%J-3',
        'inc_ira%%J-1', 'inc_ira%%J-2', 'inc_ira%%J-3', 'inc_ira%%J-4',
        'inc_ira%%J-5', 'inc_ira%%J-6', 'inc_ira%%J-8', 'inc_ira%%J-9',
        'inc_ira%%J-10', 'inc_ira%%J-11', 'inc_ira%%J-12', 'inc_ira%%J-13',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-6',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-7',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-8',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-9',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-10',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-11',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-12',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-13',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%mean_7J%%J-6',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%mean_14J%%J-6',
        'date##week##cat##target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J']
    elif isinstance(find_best_features, list):
        # Work on a copy so that appending 'location' below never mutates the caller's list.
        selected_features = list(find_best_features)
    elif isinstance(find_best_features, str):
        selected_features = [find_best_features]
    else:
        raise ValueError('find_best_features must be a bool, a list or a string')

    # 'axis' is optional in get_dataset_config (it is commented out above), so read it
    # with .get() instead of raising KeyError when it is absent.
    axis = get_dataset_config.get('axis')
    if axis == 'columns':
        for loc in get_dataset_config['locations']:
            # Locations may be plain strings or objects exposing `.name`.
            loc_name = getattr(loc, 'name', loc)
            selected_features = [feat if feat.endswith(loc_name) else feat + '_' + loc_name
                                 for feat in selected_features]
    # NOTE(review): `dataset.columns` is kept as written; confirm that BaseTabularDataset
    # exposes `.columns` (as opposed to `.data.columns`).
    elif axis == 'rows' and 'location' not in ars_experiment.dataset.columns.to_list():
        selected_features.append('location')

    # selected_features = dataset.enc_X_train.columns.to_list()
    get_dataset_config['features_names'] = selected_features
    ars_experiment.logger.info(
        'Features selected: {}'.format(selected_features))
    # Rebuild the dataset with the restricted feature set and log it.
    ars_experiment.dataset.get_dataset(**get_dataset_config)
    mlflow.log_table(data=ars_experiment.dataset.data,
                    artifact_file='datasets/full_dataset_feature_selection.json')
    # Refresh eval_set so it points at the re-encoded validation set.
    model_config['fit_params']['eval_set'] = [
        (ars_experiment.dataset.enc_X_val, ars_experiment.dataset.y_val[target]) for target in ars_experiment.dataset.targets_names]


In [None]:

# Store each split as a table artifact of the active run.
mlflow.log_table(data=ars_experiment.dataset.train_set,
                 artifact_file='datasets/train_set.json')
mlflow.log_table(data=ars_experiment.dataset.val_set,
                 artifact_file='datasets/val_set.json')
mlflow.log_table(data=ars_experiment.dataset.test_set,
                 artifact_file='datasets/test_set.json')

# Register the same splits as MLflow dataset inputs, tagged with their role.
train_dataset = mlflow.data.pandas_dataset.from_pandas(
    ars_experiment.dataset.train_set)
val_dataset = mlflow.data.pandas_dataset.from_pandas(
    ars_experiment.dataset.val_set)
test_dataset = mlflow.data.pandas_dataset.from_pandas(
    ars_experiment.dataset.test_set)

mlflow.log_input(dataset=train_dataset, context='training')
mlflow.log_input(dataset=val_dataset, context='validation')
mlflow.log_input(dataset=test_dataset, context='testing')

# Log the dataset configuration with locations reduced to their names.
# get_dataset_config declares locations as plain strings, for which `loc.name` raises
# AttributeError; getattr() accepts both strings and objects exposing `.name`.
dataset_config_log = get_dataset_config.copy()
for location_key in ('locations', 'targets_locations'):
    dataset_config_log[location_key] = [
        getattr(loc, 'name', loc) for loc in dataset_config_log[location_key]]
mlflow.log_params(dataset_config_log)


In [None]:
# Log the search grid, one MLflow parameter per grid key.
mlflow.log_params({f'grid_{key}': value
                   for key, value in model_config['grid_params'].items()})
# mlflow.log_params(model_config['params'])
# 'eval_set' holds whole data frames; logging it as a run parameter would only store a
# truncated repr (or exceed MLflow's value-length limit), so it is left out of the log.
loggable_fit_params = {key: value
                       for key, value in model_config['fit_params'].items()
                       if key != 'eval_set'}
mlflow.log_params(loggable_fit_params)
mlflow.log_param('optimization', model_config['optimization'])
# The training features are converted to a cuDF frame before fitting; model_config is
# forwarded unchanged (including eval_set) as keyword arguments.
ars_experiment.model.fit(cd.DataFrame(ars_experiment.dataset.enc_X_train),
                         ars_experiment.dataset.y_train, **model_config)
ars_experiment.logger.info("Model fitted.")


In [None]:
# Log the fitted model's parameters. Callables (custom objective / eval metric) are
# replaced by their function name so the logged value is readable.
params = ars_experiment.model.get_params(deep=True)
for callable_key in ('objective', 'eval_metric'):
    # .get(): a model that does not expose one of these keys is logged as-is instead of
    # failing with KeyError.
    candidate = params.get(callable_key)
    if callable(candidate):
        params[callable_key] = candidate.__name__
mlflow.log_params(params=params)


In [None]:
# Predict on the experiment's dataset, then log the predictions and the scores to the active run.
evaluated_dataset = ars_experiment.dataset
y_pred = ars_experiment.predict(evaluated_dataset)
mlflow.log_table(
    data=y_pred,
    artifact_file='datasets/pred.json',
)
scores = ars_experiment.score(evaluated_dataset)
mlflow.log_metrics(scores)
print(scores)


In [None]:
# Log the fitted model with a signature inferred from the encoded test features and the
# predictions, then log and display the prediction plot.
signature = infer_signature(
    ars_experiment.dataset.enc_X_test,
    y_pred,
)
mlflow.sklearn.log_model(
    ars_experiment.model,
    "model",
    signature=signature,
)
figure, ax = ars_experiment.plot(ars_experiment.dataset, y_pred, scores)
mlflow.log_figure(figure, 'predictions.png')
plt.show()


In [None]:
# Log the prediction-error figure for the test targets.
error_fig = ars_experiment.model.get_prediction_error_display(
    y=ars_experiment.dataset.y_test,
    y_pred=y_pred,
)
mlflow.log_figure(error_fig, 'errors.png')

# Bump the run counter used to name the next run, then close the active MLflow run.
ars_experiment.run_nb = ars_experiment.run_nb + 1
mlflow.end_run()

In [None]:
import xgboost as xgb

In [None]:
# Unwrap the fitted objects for inspection.
# `model` is rebound to the experiment's model here; recursive_prediction() further down
# reads this notebook-level name.
model: Model = ars_experiment.model
# Estimator exposed by the wrapper as `best_estimator_`.
best_estimator: xgb.XGBModel = model.best_estimator_
# Underlying Booster, used below for the per-feature importance scores.
booster: xgb.Booster = best_estimator.get_booster()


In [None]:
# Print every XGBoost parameter of the best estimator.
# get_xgb_params() is evaluated once instead of twice per printed line.
print("Model Parameters:")
for param, value in best_estimator.get_xgb_params().items():
    print(f"{param}: {value}")


In [None]:
print("\nFeature Importances:")
print(best_estimator.feature_importances_)

In [None]:
xgb.plot_importance(best_estimator, importance_type='cover', max_num_features=30)

In [None]:
xgb.plot_tree(best_estimator)

In [None]:
# Example of how to rank the features after an XGBoost fit: combine the booster's
# 'gain', 'cover' and 'weight' scores into a single weighted importance.
# NOTE: this rebinds the notebook-level name `df` (previously the dataset's data frame).
importance_gain = booster.get_score(importance_type='gain')
importance_cover = booster.get_score(importance_type='cover')
importance_weight = booster.get_score(importance_type='weight')

df_gain = pd.DataFrame.from_dict(importance_gain, orient='index', columns=['gain'])
df_cover = pd.DataFrame.from_dict(importance_cover, orient='index', columns=['cover'])
df_weight = pd.DataFrame.from_dict(importance_weight, orient='index', columns=['weight'])

# Outer-join the three score tables on the feature name; a feature missing from one of
# them gets 0 for that score. (This join/fill used to be executed twice in a row.)
df = (
    df_gain
    .join(df_cover, how='outer')
    .join(df_weight, how='outer')
    .fillna(0)
)

# Normalise each score so it sums to 1 over all features.
for score_name in ('gain', 'cover', 'weight'):
    df[f'{score_name}_norm'] = df[score_name] / df[score_name].sum()

# Weights of the three normalised scores in the combined importance (they sum to 1).
w_gain = 0.5
w_cover = 0.3
w_weight = 0.2

df['importance'] = (df['gain_norm'] * w_gain) + (df['cover_norm'] * w_cover) + (df['weight_norm'] * w_weight)

# Most important feature first; rank 1 is the highest importance.
df = df.sort_values(by='importance', ascending=False)
df['rank'] = df['importance'].rank(ascending=False)

# Turn the feature-name index into a regular 'feature' column.
df = df.reset_index().rename(columns={'index': 'feature'})
# print(df[['feature', 'gain', 'cover', 'weight', 'importance', 'rank']])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Features ordered from most to least important.
df_sorted = df.sort_values(by='importance', ascending=False)

# # Select only the 30 first features (based on sorted order)
# df_sorted = df_sorted.nlargest(30, 'importance')

# Horizontal bar chart of the combined importance, one bar per feature.
importance_fig, importance_ax = plt.subplots(figsize=(20, 8))
importance_ax.barh(df_sorted['feature'], df_sorted['importance'], color='skyblue')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
importance_ax.set_title('Feature Importance')

# Write the numeric importance just to the right of each bar.
for bar_position, bar_value in enumerate(df_sorted['importance']):
    importance_ax.text(bar_value + 0.0005, bar_position, f'{bar_value:.5f}', va='center')
importance_fig.tight_layout()
plt.show()

In [None]:
df_sorted

In [None]:
df_sorted.to_csv("feature_importance.csv")

In [None]:
# Short notebook-level aliases for the encoded features and the targets of each split.
split_source = ars_experiment.dataset
X_train, X_test, X_val = (
    split_source.enc_X_train,
    split_source.enc_X_test,
    split_source.enc_X_val,
)
y_train, y_test, y_val = (
    split_source.y_train,
    split_source.y_test,
    split_source.y_val,
)

In [None]:
def recursive_prediction(dataset: "BaseTabularDataset", estimator=None,
                         history_col_template: str = 'target_nb_vers_hospit%J+3%mean_3J%%J-{lag}',
                         n_history_days: int = 6):
    """Predict the test set recursively, feeding predictions back as target history.

    For every test date `i` (from the `n_history_days`-th one onwards) the days
    `i - n_history_days` ... `i - 1` are predicted in chronological order. The prediction
    made for day `i - j` is written into the target-history column
    `history_col_template.format(lag=j - k)` of every later day `i - k` (k < j) before
    that day is predicted. The prediction made on the row of day `i - 1` is the value
    reported for `i`.

    Each date works on its own copy of `dataset.enc_X_test`, so the dataset is left
    untouched and overwrites made for one date never leak into the next one.

    Args:
        dataset: object exposing `enc_X_test` (encoded test features) and `y_test`, both
            indexed by consecutive daily timestamps.
        estimator: fitted object with a `predict(frame)` method. Defaults to the
            notebook-level `model`.
        history_col_template: name of the target-history columns, with a `{lag}`
            placeholder for the number of days of lag.
            NOTE(review): the default reproduces the original hard-coded name, which uses
            single '%' separators and a 'nb_vers_hospit' target, unlike the '%%'-separated
            column names seen elsewhere in the notebook. Assigning to a column that does
            not exist silently creates it -- confirm the real column name.
        n_history_days: number of previous days that are re-predicted for each date.

    Returns:
        DataFrame indexed like `dataset.y_test` with one float column 'preds'
        (NaN for the first `n_history_days` dates).
    """
    if estimator is None:
        estimator = model  # notebook-level model, as in the original implementation
    one_day = dt.timedelta(days=1)

    predictions = pd.DataFrame(index=dataset.y_test.index)
    predictions['preds'] = float('nan')
    for i in predictions.index[n_history_days:]:
        # Real copy: the rows below are overwritten with predicted history values.
        df_copy = dataset.enc_X_test.copy()

        # Replace the real history values with predicted ones, oldest day first.
        for j in range(n_history_days, 0, -1):
            # .loc[[label]] keeps a one-row frame with the original column dtypes.
            prediction = estimator.predict(df_copy.loc[[i - j * one_day]])[0]
            for k in range(j - 1, 0, -1):
                # The prediction for day i-j is the (j-k)-day-old history of day i-k.
                df_copy.loc[i - k * one_day, history_col_template.format(lag=j - k)] = prediction
            if j == 1:
                # Last history day: nothing left to propagate, this is the forecast kept for i.
                print(f'prediction pour le {i}: {prediction}')
                predictions.loc[i, 'preds'] = prediction

    return predictions

In [None]:
preds = recursive_prediction(ars_experiment.dataset)

In [None]:
ars_experiment.plot(ars_experiment.dataset, preds)

In [None]:
import shap

# Load the SHAP JavaScript bundle needed by the interactive plots.
shap.initjs()

# Build the tree explainer once; both names are kept because later cells use either one.
explainer_xgb = shap.TreeExplainer(best_estimator)
explainer = explainer_xgb

# SHAP values of the training features, followed by the global summary plot.
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train)

In [None]:
# `shap.plots.beeswarm` only accepts a `shap.Explanation`; the raw array returned by
# `explainer.shap_values(...)` makes it raise a TypeError. Calling the explainer itself
# returns an Explanation (values + feature data + names) for the same training features.
# NOTE(review): this recomputes the SHAP values, and assumes a single-output model -
# for a multi-output model, select one output of the Explanation before plotting.
shap.plots.beeswarm(explainer(X_train))

In [None]:
# Interactive force plot: contributions of the SHAP values around the explainer's expected value.
shap.plots.force(
    base_value=explainer_xgb.expected_value,
    shap_values=shap_values,
)

In [None]:
# Run the experiment
# ars_experiment.run(dataset_config=get_dataset_config, model_config=model_config, find_best_features=True)

In [None]:
# y_pred = ars_experiment.predict(ars_experiment.dataset)
# y_true = ars_experiment.dataset.y_test

In [None]:
# ars_experiment.model.get_prediction_error_display(y_true, y_pred)

In [None]:
# X_train.to_csv("X_train.csv", index=False)
# X_test.to_csv("X_test.csv", index=False)
# X_val.to_csv("X_val.csv", index=False)
# y_train.to_csv("y_train.csv", index=False)
# y_test.to_csv("y_test.csv", index=False)
# y_val.to_csv("y_val.csv", index=False)

In [None]:
# model = ars_experiment.model

In [None]:
# dataset = arsTabularDataset

In [None]:
# dataset.enc_data.head()

In [None]:
# df = dataset.enc_data
# df

In [None]:
# from scipy.cluster import hierarchy
# from scipy.spatial.distance import squareform
# from scipy.stats import spearmanr
# import matplotlib.pyplot as plt
# import numpy as np

# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
# corr = spearmanr(X).correlation

# # Ensure the correlation matrix is symmetric
# corr = (corr + corr.T) / 2
# np.fill_diagonal(corr, 1)

# # We convert the correlation matrix to a distance matrix before performing
# # hierarchical clustering using Ward's linkage.
# distance_matrix = 1 - np.abs(corr)
# dist_linkage = hierarchy.ward(squareform(distance_matrix))
# dendro = hierarchy.dendrogram(
#     dist_linkage, labels=X.columns.to_list(), ax=ax1, leaf_rotation=90
# )
# dendro_idx = np.arange(0, len(dendro["ivl"]))

# ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]])
# ax2.set_xticks(dendro_idx)
# ax2.set_yticks(dendro_idx)
# ax2.set_xticklabels(dendro["ivl"], rotation="vertical")
# ax2.set_yticklabels(dendro["ivl"])
# fig.tight_layout()

In [None]:
# from shap_select import shap_select

In [None]:
# selected_features_df = shap_select(model.best_estimator_, X_val, y_val, task="regression", threshold=0.05)

In [None]:
# X_train

In [None]:
# bst = model.best_estimator_.get_booster()

In [None]:
# importance_gain = bst.get_score(importance_type='gain')
# importance_cover = bst.get_score(importance_type='cover')
# importance_weight = bst.get_score(importance_type='weight')

In [None]:
# df_cover = pd.DataFrame(importance_cover, index=[0]).T
# df_gain = pd.DataFrame(importance_gain, index=[0]).T
# df_weight = pd.DataFrame(importance_weight, index=[0]).T

In [None]:
# df_cover.shape

In [None]:
# df_cover.sort_values(by=0).plot(kind="barh", figsize=(15,20))
# df_gain.sort_values(by=0).plot(kind="barh", figsize=(15,20))
# df_weight.sort_values(by=0).plot(kind="barh", figsize=(15,20))

In [None]:
# model.shapley_additive_explanation(X_test, outname='shap_b', dir_output='.', mode="beeswarm", figsize=(50, 25))

In [None]:
# from collections import defaultdict

# cluster_ids = hierarchy.fcluster(dist_linkage, 1, criterion="distance")
# cluster_id_to_feature_ids = defaultdict(list)
# for idx, cluster_id in enumerate(cluster_ids):
#     cluster_id_to_feature_ids[cluster_id].append(idx)
# selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
# selected_features_names = X.columns[selected_features]
# X_train_sel = X_train[selected_features_names]
# X_test_sel = X_test[selected_features_names]
# X_val_sel = X_val[selected_features_names]
# model_config['fit_params'].update({'eval_set': [(X_val_sel, y_val[target]) for target in ars_experiment.dataset.targets_names]})
# model.fit(X_train_sel, y_train, **model_config)
# print(
#     "Baseline accuracy on test data with features removed:"
#     f" {model.score(X_test_sel, y_test):.2}"
# )


In [None]:
# import matplotlib

# from sklearn.inspection import permutation_importance
# from sklearn.utils.fixes import parse_version


# def plot_permutation_importance(clf, X, y, ax):
#     result = permutation_importance(clf, X, y, n_repeats=10, random_state=42)
#     perm_sorted_idx = result.importances_mean.argsort()

#     # `labels` argument in boxplot is deprecated in matplotlib 3.9 and has been
#     # renamed to `tick_labels`. The following code handles this.
#     tick_labels_parameter_name = (
#         "tick_labels"
#         if parse_version(matplotlib.__version__) >= parse_version("3.9")
#         else "labels"
#     )
#     tick_labels_dict = {tick_labels_parameter_name: X.columns[perm_sorted_idx]}
#     ax.boxplot(result.importances[perm_sorted_idx].T, vert=False, **tick_labels_dict)
#     ax.axvline(x=0, color="k", linestyle="--")
#     return ax


In [None]:
# fig, ax = plt.subplots(figsize=(7, 6))
# plot_permutation_importance(model, X_test_sel, y_test, ax)
# ax.set_title("Permutation Importances on selected subset of features\n(test set)")
# ax.set_xlabel("Decrease in accuracy score")
# ax.figure.tight_layout()
# plt.show()

In [None]:
# Get the first row of X_train_sel
# X_train_sel[:1]

In [None]:
# y_train[:1]

In [None]:
# model.predict(X_train_sel[:1])

In [None]:
# import shap
# shap.initjs()
# explainer_xgb = shap.TreeExplainer(model.best_estimator_)
# single_explanation = explainer_xgb.shap_values(X_train_sel[:1])
# shap.summary_plot(single_explanation, X_test_sel, plot_type="bar")


In [None]:
# X_train.columns

In [None]:
# model = ars_experiment.model

In [None]:
# X_train = ars_experiment.dataset.enc_X_train
# X_test = ars_experiment.dataset.enc_X_test
# X_val = ars_experiment.dataset.enc_X_val

# y_train = ars_experiment.dataset.y_train
# y_test = ars_experiment.dataset.y_test
# y_val = ars_experiment.dataset.y_val

In [None]:
# explainer_xgb = shap.TreeExplainer(model.best_estimator_)

In [None]:
# single_explanation = explainer_xgb.shap_values(X_train[:1])

In [None]:
# shap_values_xgb = explainer_xgb.shap_values(X_test[:50])

In [None]:
# shap.dependence_plot('inc_grippe', shap_values_xgb, X_train_sel)

In [None]:
# shap_values_xgb