### Imports

In [1]:
import os
import sys
# Put the parent of the current working directory at the front of the import
# path so that the `src` package can be imported from this notebook.
sys.path.insert(0, os.path.abspath(os.pardir))

In [2]:
# %pip install -r ../requirements.txt

In [3]:
from src.encoding.encoders import *
from src.encoding.tools import create_encoding_pipeline
from src.models.sklearn_api_model import save_object, Model
from src.models.sklearn_api_models_config import get_model
from src.datasets.base_tabular_dataset import BaseTabularDataset
from src.experiments.base_experiment import BaseExperiment
import src.features as ft
import logging
import pandas as pd
import pathlib
import numpy as np

In [4]:
import logging
import os
import sys
import datetime as dt
from typing import List, Union, Optional
import pathlib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from src.datasets.base_tabular_dataset import BaseTabularDataset
from src.encoding.tools import create_encoding_pipeline
from src.experiments.features_selection import get_features, explore_features
from src.models.sklearn_api_model import Model, ModelTree
import src.features as ft
import mlflow.sklearn
import mlflow
import mlflow.data.pandas_dataset
from mlflow.models import infer_signature
import os
import matplotlib.pyplot as plt
%matplotlib widget
import cudf as cd
import numpy as np
import re


### Config

In [5]:
# Root logger shared by all modules: INFO level, messages sent to a stream handler
logging.basicConfig(
    level=logging.INFO,
    encoding="utf-8",
    format="%(name)s %(asctime)s: %(levelname)s: %(message)s",
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [6]:
# Root directory of the project: the parent of the current working directory,
# stored as a pathlib.Path so that sub-paths can be built with `/`.
root_dir = pathlib.Path(os.path.abspath(os.pardir))

##### Encoding Pipeline

In [7]:
# Encoding scheme handed to create_encoding_pipeline().
# Layout: column dtype -> variant name -> {'imputers': [...], 'encoders': [...]}.
encoders_dict = {
    'number': {
        'as_number': {
            'imputers': [imputers.SimpleImputer(strategy='mean')],
            'encoders': [ne.StandardScaler()],
        },
    },
    'category': {
        'as_category': {
            'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
            'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
        },
    },
    # Datetime columns get two variants: one numeric, one categorical
    'datetime': {
        'as_number': {
            'imputers': [de.DateFeatureExtractor()],
            'encoders': [ne.CyclicalFeatures(drop_original=True)],
        },
        'as_category': {
            'imputers': [de.DateFeatureExtractor(dtype='category')],
            'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
        },
    },
    'boolean': {
        'as_number': {
            'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
            'encoders': [ne.BooleanEncoder()],
        },
    },
}

In [8]:
# Create the encoding pipeline from the scheme described by encoders_dict
pipeline = create_encoding_pipeline(encoders_dict)

Creating encoding pipeline


In [9]:
# Display the encoding pipeline (notebook rich output)
pipeline

##### Dataset

In [10]:
# Configuration used to fetch the data: period, storage directory and locations
fetch_config = {
    'data_start': '01-01-2019',
    'data_stop': '31-12-2023',
    'data_dir': root_dir / 'data',
    # "locations": ['CHU Dijon', 'CH Beaune', 'CH Semur', 'CH Chatillon Montbard', 'CH privé Dijon', 'CH Langres', 'CH Chaumont', 'HNFC', 'CHU Besançon']
    'locations': ['CHU Dijon'],
}

In [11]:
# Feature classes (not instances) that make up the dataset
ars_features_class = [
    ft.HospitalFeatures,
    ft.AirQualityFeatures,
    ft.EpidemiologicalFeatures,
    # ft.FireFightersFeatures(include_calls=False),
    ft.GoogleTrendFeatures,
    ft.MeteorologicalFeatures,
    ft.SociologicalFeatures,
    ft.SportsCompetitionFeatures,
    ft.TrafficFeatures,
]

In [12]:
# Select the target columns to be predicted.
# The variable name (spelled "colomns") is referenced by dataset_config below,
# so it is kept as is; the commented lines are alternative targets.
target_colomns = ['nb_emmergencies']
# target_colomns = ['nb_vers_hospit']
# target_colomns = ['nb_hospit_np_adults%%J+1%%mean_7J']

In [13]:
# Splitting scheme used to create the train / validation / test sets
split_config = {
    'test_size': 0.2,
    'val_size': 0.2,
    'shuffle': False,
}

In [None]:
# Configuration of the dataset, later unpacked into get_dataset(**dataset_config)
dataset_config = {
    # Period and locations
    'from_date': '01-01-2019',
    'to_date': '30-12-2023',
    'locations': ['CHU Dijon'],
    # 'locations': ['CHU Dijon', 'CH Beaune', 'CH Semur', 'CH Chatillon Montbard', 'CH privé Dijon'],
    'axis': 'rows',
    # History of the features
    'shift': range(1, 8),
    'rolling_window': [7, 14, 31, 365],
    'freq': '1D',
    # Splitting and encoding
    'split_config': split_config,
    'create_X_y': True,
    'encoding_pipeline': pipeline,
    # Targets and their history
    'targets_names': target_colomns,
    'targets_shift': -3,
    'targets_rolling_window': 3,
    'targets_history_shifts': range(1, 8),
    'targets_history_rolling_windows': [7, 14, 31, 365],
    'targets_locations': ['CHU Dijon'],
    'drop_constant_thr': 1.0,
    'data_dir': root_dir / 'data',
}

In [15]:
# Create the dataset: the data are fetched from the source, then get_dataset()
# fills the X / y attributes of the different sets together with their encodings.
arsTabularDataset = BaseTabularDataset(
    features_classes=ars_features_class,
    logger=logger,
    getter_config=dataset_config,
    fetch_config=fetch_config,
)

root 2024-11-13 18:42:45,094: INFO: Initialisation de la classe BaseTabularDataset
root 2024-11-13 18:42:45,099: INFO: Initialisation des features
root 2024-11-13 18:42:45,100: INFO: Fetching dataset
root 2024-11-13 18:42:45,276: INFO: hospitalfeatures's data already fetched for CHU Dijon
root 2024-11-13 18:42:45,449: INFO: airqualityfeatures's data already fetched for CHU Dijon
root 2024-11-13 18:42:45,640: INFO: epidemiologicalfeatures's data already fetched for CHU Dijon
root 2024-11-13 18:42:45,816: INFO: googletrendfeatures's data already fetched for CHU Dijon
root 2024-11-13 18:42:46,008: INFO: meteorologicalfeatures's data already fetched for CHU Dijon
root 2024-11-13 18:42:46,192: INFO: sociologicalfeatures's data already fetched for CHU Dijon
root 2024-11-13 18:42:46,418: INFO: sportscompetitionfeatures's data already fetched for CHU Dijon
root 2024-11-13 18:42:46,592: INFO: trafficfeatures's data already fetched for CHU Dijon
root 2024-11-13 18:42:46,776: INFO: Getting the da

Dropped 92 constant columns from both sets: {'PM10_FR26094', 'NO2_FR26094', 'PM25_FR26094', 'trend_épilepsie', 'PM10_FR26010', 'NO2_FR26010'}
X shape: (1162, 1382), y shape: (1162, 1)
[ColumnTransformer] .... (1 of 5) Processing pipeline-1, total=   0.0s
[ColumnTransformer] .... (2 of 5) Processing pipeline-2, total=   0.0s
[ColumnTransformer] .... (3 of 5) Processing pipeline-3, total=   0.0s
[ColumnTransformer] .... (4 of 5) Processing pipeline-4, total=   0.0s
[ColumnTransformer] .... (5 of 5) Processing pipeline-5, total=   0.0s


In [16]:
df = arsTabularDataset.enc_data

# Columns that contain at least one NaN
nan_mask = df.isna()
cols_with_nan = df.columns[nan_mask.any()].tolist()
print("Colonnes contenant des NaN:", cols_with_nan)

# For each of those columns, the index labels of the rows where the value is NaN
nan_indices = {
    col: df.index[nan_mask[col].to_numpy()].tolist()
    for col in cols_with_nan
}
nan_indices

Colonnes contenant des NaN: []


{}

In [17]:
# Display the encoded dataset (pandas truncates the rendering of large frames)
df

Unnamed: 0_level_0,meteo_wspd%%J-6,trend_stress%%J-5,trend_douleur abdominale%%J-2,trend_fracture%%std_14J,trend_urgence médicale%%std_7J,trend_démence%%J-5,trend_saignements%%mean_31J,trend_gastro-entérite%%mean_7J,trend_fracture%%J-1,trend_schizophrénie%%J-2,...,nb_accidents%%J-6,nb_emmergencies%%std_365J,nb_emmergencies,nb_emmergencies%%mean_31J,nb_accidents%%mean_7J,nb_emmergencies%%J-4,nb_accidents%%mean_14J,nb_accidents%%J-5,nb_emmergencies%%mean_365J,nb_accidents%%J-1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-03,-1.006782,-0.190018,-0.081982,-0.459817,-0.174225,-0.058098,-0.293615,-0.07785,-0.116422,-0.053938,...,0.0,26.347167,186,207.096774,0.285714,202.0,0.5,0.0,192.476712,0.0
2019-01-04,-1.006782,-0.190018,-0.081982,-0.459817,-0.174225,-0.058098,-0.293615,-0.07785,-0.116422,-0.053938,...,0.0,26.347167,186,207.096774,0.285714,202.0,0.5,0.0,192.476712,0.0
2019-01-05,-1.006782,-0.190018,-0.081982,-0.459817,-0.174225,-0.058098,-0.293615,-0.07785,-0.116422,-0.053938,...,0.0,26.347167,204,207.096774,0.285714,202.0,0.5,0.0,192.476712,0.0
2019-01-06,-1.006782,-0.190018,-0.081982,-0.459817,-0.174225,-0.058098,-0.293615,-0.07785,-0.116422,-0.053938,...,0.0,26.347167,182,207.096774,0.285714,198.0,0.5,0.0,192.476712,0.0
2019-01-07,-1.006782,-0.190018,-0.081982,-0.459817,-0.174225,-0.058098,-0.293615,-0.07785,-0.116422,-0.053938,...,0.0,26.347167,194,207.096774,0.285714,186.0,0.5,0.0,192.476712,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-20,0.023389,-0.190018,-0.081982,-0.459817,-0.174225,-0.058098,-0.293615,-0.07785,-0.116422,-0.053938,...,0.0,18.901087,183,179.548387,0.000000,193.0,0.0,0.0,176.602740,0.0
2023-12-21,-1.287738,-0.190018,-0.081982,-0.459817,-0.174225,-0.058098,-0.293615,-0.07785,-0.116422,-0.053938,...,0.0,18.892883,187,179.612903,0.000000,148.0,0.0,0.0,176.586301,0.0
2023-12-22,-0.257567,-0.190018,-0.081982,-0.459817,-0.174225,-0.058098,-0.293615,-0.07785,-0.116422,-0.053938,...,0.0,18.867387,182,179.064516,0.000000,165.0,0.0,0.0,176.556164,0.0
2023-12-23,-1.231547,-0.190018,-0.081982,-0.459817,-0.174225,-0.058098,-0.293615,-0.07785,-0.116422,-0.053938,...,0.0,18.801829,198,180.032258,0.000000,185.0,0.0,0.0,176.487671,0.0


In [18]:
# dataset.data.to_csv(fetch_config['data_dir'] / 'dataset.csv')

##### Model

In [19]:
# print(arsTabularDataset.data.columns.to_list())
# Only the custom objective is needed from this module, so it is imported by
# name rather than through a wildcard import that hides where names come from.
from src.models.obectives import weighted_rmse_obj

# Model parameters, passed to get_model(..., params=model_params).
# The commented entries are alternatives that were tried or are kept for reference.
model_params = {
    # 'tree_method': 'approx',
    # 'tree_method': 'hist',
    'early_stopping_rounds': 10,
    # 'eval_set': [(arsTabularDataset.enc_X_val, arsTabularDataset.y_val)], # TODO: to be set in the experiment's run method
    'verbosity': 1,
    'n_estimators': 10000,
    'learning_rate': 0.1,
    'min_child_weight': 5,
    # 'huber_slope': 1.0,
    # 'quantile_alpha': np.array([0.5]),
    'objective': weighted_rmse_obj,
    # 'tweedie_variance_power': 2,
    # 'alpha': 10
    # 'multi_strategy': 'one_output_per_tree',
    # 'multi_strategy': 'multi_output_tree'
}

In [20]:
# Names of the metrics passed to get_model(test_metrics=...)
metrics = [
    'mae', 'mse', 'rmse', 'w_rmse', 'pw_rmse', 'msle',
    'rmsle', 'r2', 'mqe', 'msse', 'max_error', 'explained_variance',
]

In [21]:
# Create the model: an XGBoost regressor configured for a CUDA device
model = get_model(
    model_type='xgboost',
    name='XGBoost',
    device='cuda',
    task_type='regression',
    test_metrics=metrics,
    params=model_params,
)

<function weighted_rmse at 0x72765356e560>


In [22]:
# Create the experiment that ties the dataset and the model together
ars_experiment = BaseExperiment(
    logger=logger,
    dataset=arsTabularDataset,
    model=model,
)

In [23]:
# Model fitting config: hyper-parameter values explored by the grid search
grid_params = {
    'max_depth': [3, 4, 5, 7, 9],
    'gamma': [0.01, 0.05, 0.1, 0.2, 0.3],
}

In [24]:
# Keyword arguments for the fit call; 'eval_set' is added later in the notebook
fit_params = {'verbose': 1}

In [25]:
# Complete fitting configuration, unpacked into model.fit(..., **model_config)
model_config = {
    "optimization": "grid",
    "grid_params": grid_params,
    "fit_params": fit_params,
}

### Start run

In [26]:
# When True, the feature-selection cell below rebuilds the dataset with a reduced feature list
find_best_features = True

In [27]:
# A previous execution that stopped on an error leaves its run open, and
# mlflow.start_run() refuses to start a second, non-nested run while one is
# active. Close any such leftover run so that this cell can be re-executed.
if mlflow.active_run() is not None:
    mlflow.end_run(status='KILLED')

run = mlflow.start_run(
    run_name='run_' + str(ars_experiment.run_nb),
    log_system_metrics=True,
)
# Artifacts directory of this run, under the experiment's runs directory
run_dir = pathlib.Path(ars_experiment.dir_runs / f'{run.info.run_id}/artifacts/')

2024/11/13 18:42:49 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


In [28]:
ars_experiment.logger.info("Running the experiment...")

# Some fit_params can only be initialised once the datasets exist: eval_set.
# One (X_val, y_val[target]) pair is built per target.
validation_sets = [
    (ars_experiment.dataset.enc_X_val, ars_experiment.dataset.y_val[target])
    for target in ars_experiment.dataset.targets_names
]
model_config['fit_params']['eval_set'] = validation_sets

# Store the full dataset as a table artifact of the run
mlflow.log_table(
    data=ars_experiment.dataset.data,
    artifact_file='datasets/full_dataset.json',
)

root 2024-11-13 18:42:49,681: INFO: Running the experiment...


In [29]:
if find_best_features:
    # selected_features = ars_experiment.get_important_features(dataset=dataset, model=ars_experiment.model, model_config=model_config)
    # Hand-picked feature subset. Two earlier candidate lists used to be built
    # here and were then overwritten by this one before being used; that dead
    # code has been removed.
    # NOTE(review): the recorded run of this cell stops on a KeyError
    # ("... not in index"). Every name it reports either starts with
    # 'nb_emmergencies_CHU Dijon' or uses a lag beyond J-7, whereas
    # dataset_config sets axis='rows' and shift=range(1, 8). Confirm that these
    # names match the columns actually produced by get_dataset().
    selected_features = [
        'nb_emmergencies_CHU Dijon', 'nb_emmergencies_CHU Dijon%%J-1',
        'nb_emmergencies_CHU Dijon%%J-2', 'nb_emmergencies_CHU Dijon%%J-3',
        'nb_emmergencies_CHU Dijon%%J-4', 'nb_emmergencies_CHU Dijon%%J-5',
        'nb_emmergencies_CHU Dijon%%J-6', 'nb_emmergencies_CHU Dijon%%J-7',
        'nb_emmergencies_CHU Dijon%%J-8', 'nb_emmergencies_CHU Dijon%%J-9',
        'nb_emmergencies_CHU Dijon%%J-10', 'nb_emmergencies_CHU Dijon%%J-11',
        'nb_emmergencies_CHU Dijon%%J-12', 'nb_emmergencies_CHU Dijon%%J-13',
        'nb_emmergencies_CHU Dijon%%mean_7J',
        'nb_emmergencies_CHU Dijon%%mean_14J', 'inc_diarrhee', 'inc_ira',
        'inc_diarrhee%%J-1', 'inc_diarrhee%%J-2', 'inc_diarrhee%%J-3',
        'inc_ira%%J-1', 'inc_ira%%J-2', 'inc_ira%%J-3', 'inc_ira%%J-4',
        'inc_ira%%J-5', 'inc_ira%%J-6', 'inc_ira%%J-8', 'inc_ira%%J-9',
        'inc_ira%%J-10', 'inc_ira%%J-11', 'inc_ira%%J-12', 'inc_ira%%J-13',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-6',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-7',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-8',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-9',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-10',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-11',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-12',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-13',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%mean_7J%%J-6',
        'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%mean_14J%%J-6',
        'date##week##cat##target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J']

    if dataset_config['axis'] == 'columns':
        # NOTE(review): several names above already contain '_CHU Dijon', and
        # this appends the suffix at the very end of the name (after any
        # '%%...' part) — verify this is the intended naming for axis='columns'.
        selected_features = [feat + '_CHU Dijon' for feat in selected_features]
    else:
        selected_features.append('location')
    # selected_features = dataset.enc_X_train.columns.to_list()
    dataset_config['features_names'] = selected_features
    ars_experiment.logger.info(
        'Features selected: {}'.format(selected_features))
    # Rebuild the dataset restricted to the selected features
    ars_experiment.dataset.get_dataset(**dataset_config)
    mlflow.log_table(data=ars_experiment.dataset.data,
                    artifact_file='datasets/full_dataset_feature_selection.json')
    # The validation set changed with the dataset: refresh eval_set accordingly
    model_config['fit_params']['eval_set'] = [
        (ars_experiment.dataset.enc_X_val, ars_experiment.dataset.y_val[target]) for target in ars_experiment.dataset.targets_names]


root 2024-11-13 18:42:50,143: INFO: Features selected: ['nb_emmergencies_CHU Dijon', 'nb_emmergencies_CHU Dijon%%J-1', 'nb_emmergencies_CHU Dijon%%J-2', 'nb_emmergencies_CHU Dijon%%J-3', 'nb_emmergencies_CHU Dijon%%J-4', 'nb_emmergencies_CHU Dijon%%J-5', 'nb_emmergencies_CHU Dijon%%J-6', 'nb_emmergencies_CHU Dijon%%J-7', 'nb_emmergencies_CHU Dijon%%J-8', 'nb_emmergencies_CHU Dijon%%J-9', 'nb_emmergencies_CHU Dijon%%J-10', 'nb_emmergencies_CHU Dijon%%J-11', 'nb_emmergencies_CHU Dijon%%J-12', 'nb_emmergencies_CHU Dijon%%J-13', 'nb_emmergencies_CHU Dijon%%mean_7J', 'nb_emmergencies_CHU Dijon%%mean_14J', 'inc_diarrhee', 'inc_ira', 'inc_diarrhee%%J-1', 'inc_diarrhee%%J-2', 'inc_diarrhee%%J-3', 'inc_ira%%J-1', 'inc_ira%%J-2', 'inc_ira%%J-3', 'inc_ira%%J-4', 'inc_ira%%J-5', 'inc_ira%%J-6', 'inc_ira%%J-8', 'inc_ira%%J-9', 'inc_ira%%J-10', 'inc_ira%%J-11', 'inc_ira%%J-12', 'inc_ira%%J-13', 'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-6', 'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-

KeyError: "['target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-12', 'nb_emmergencies_CHU Dijon%%J-1', 'inc_ira%%J-10', 'nb_emmergencies_CHU Dijon%%J-8', 'inc_ira%%J-9', 'inc_ira%%J-13', 'nb_emmergencies_CHU Dijon%%mean_7J', 'nb_emmergencies_CHU Dijon%%J-6', 'nb_emmergencies_CHU Dijon%%J-11', 'nb_emmergencies_CHU Dijon%%J-3', 'inc_ira%%J-11', 'nb_emmergencies_CHU Dijon', 'nb_emmergencies_CHU Dijon%%J-10', 'nb_emmergencies_CHU Dijon%%J-7', 'nb_emmergencies_CHU Dijon%%J-4', 'nb_emmergencies_CHU Dijon%%J-13', 'nb_emmergencies_CHU Dijon%%J-9', 'inc_ira%%J-8', 'inc_ira%%J-12', 'target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J%%J-13', 'nb_emmergencies_CHU Dijon%%J-2', 'nb_emmergencies_CHU Dijon%%J-12', 'nb_emmergencies_CHU Dijon%%mean_14J', 'nb_emmergencies_CHU Dijon%%J-5'] not in index"

In [None]:
# Store each split as a JSON table artifact of the active run
mlflow.log_table(data=ars_experiment.dataset.train_set,
                 artifact_file='datasets/train_set.json')
mlflow.log_table(data=ars_experiment.dataset.val_set,
                 artifact_file='datasets/val_set.json')
mlflow.log_table(data=ars_experiment.dataset.test_set,
                 artifact_file='datasets/test_set.json')

# Wrap the same splits as MLflow datasets and attach them to the run as inputs
train_dataset = mlflow.data.pandas_dataset.from_pandas(
    ars_experiment.dataset.train_set)
val_dataset = mlflow.data.pandas_dataset.from_pandas(
    ars_experiment.dataset.val_set)
test_dataset = mlflow.data.pandas_dataset.from_pandas(
    ars_experiment.dataset.test_set)

mlflow.log_input(dataset=train_dataset, context='training')
mlflow.log_input(dataset=val_dataset, context='validation')
mlflow.log_input(dataset=test_dataset, context='testing')

# Log the dataset configuration on a shallow copy, with locations reduced to
# their names. dataset_config lists locations as plain strings, which have no
# `.name` attribute; objects exposing `.name` are still supported.
dataset_config_log = dataset_config.copy()
dataset_config_log['locations'] = [
    getattr(loc, 'name', loc) for loc in dataset_config_log.pop('locations')]
dataset_config_log['targets_locations'] = [
    getattr(loc, 'name', loc) for loc in dataset_config_log.pop('targets_locations')]
mlflow.log_params(dataset_config_log)

In [None]:
# Grid-search values, prefixed with 'grid_' so they cannot be confused with
# the parameters of the fitted model
mlflow.log_params({f'grid_{key}': value
                   for key, value in model_config['grid_params'].items()})
# mlflow.log_params(model_config['params'])
# 'eval_set' holds the validation data themselves (it is added to fit_params
# earlier in this notebook), not a setting: keep it out of the logged params.
mlflow.log_params({key: value
                   for key, value in model_config['fit_params'].items()
                   if key != 'eval_set'})
mlflow.log_param('optimization', model_config['optimization'])

In [None]:
# Fit the model; the encoded training features are converted to a cuDF
# DataFrame for the call, and model_config is unpacked as keyword arguments.
ars_experiment.model.fit(
    cd.DataFrame(ars_experiment.dataset.enc_X_train),
    ars_experiment.dataset.y_train,
    **model_config,
)
ars_experiment.logger.info("Model fitted.")

In [None]:
params = ars_experiment.model.get_params(deep=True)
# A custom objective or evaluation metric can be a callable; it is logged by
# name. A missing key or a None value is left untouched, and a callable without
# `__name__` falls back to its repr.
for _param_key in ('objective', 'eval_metric'):
    _param_value = params.get(_param_key)
    if callable(_param_value):
        params[_param_key] = getattr(_param_value, '__name__', repr(_param_value))
mlflow.log_params(params=params)

In [None]:
# Compute the predictions of the experiment on its dataset and store them as a
# table artifact of the run
y_pred = ars_experiment.predict(ars_experiment.dataset)
mlflow.log_table(data=y_pred, artifact_file='datasets/pred.json')

In [None]:
# Score the experiment on its dataset, log the scores as run metrics and show them
scores = ars_experiment.score(ars_experiment.dataset)
mlflow.log_metrics(scores)
print(scores)

In [None]:
# Infer the model signature from the encoded test features and the predictions,
# then log the model under the "model" artifact path
signature = infer_signature(ars_experiment.dataset.enc_X_test, y_pred)
mlflow.sklearn.log_model(ars_experiment.model, "model", signature=signature)

In [None]:
# Plot the predictions with their scores, attach the figure to the run and show it
figure, ax = ars_experiment.plot(ars_experiment.dataset, y_pred, scores)
mlflow.log_figure(figure, 'predictions.png')
plt.show()

In [None]:
# Build the prediction-error figure from the test targets and the predictions,
# and attach it to the run
error_fig = ars_experiment.model.get_prediction_error_display(y=ars_experiment.dataset.y_test, y_pred=y_pred)
mlflow.log_figure(error_fig, 'errors.png')


In [None]:
# Increment the run counter (used to name the next run) and close the MLflow run.
# Note: re-executing this cell increments the counter again.
ars_experiment.run_nb += 1
mlflow.end_run()

In [None]:
# Run the experiment
# ars_experiment.run(dataset_config=dataset_config, model_config=model_config, find_best_features=True)

In [None]:
# X_train = ars_experiment.dataset.enc_X_train
# X_test = ars_experiment.dataset.enc_X_test
# X_val = ars_experiment.dataset.enc_X_val

# y_train = ars_experiment.dataset.y_train
# y_test = ars_experiment.dataset.y_test
# y_val = ars_experiment.dataset.y_val

In [None]:
# y_pred = ars_experiment.predict(ars_experiment.dataset)
# y_true = ars_experiment.dataset.y_test

In [None]:
# ars_experiment.model.get_prediction_error_display(y_true, y_pred)

In [None]:
# X_train.to_csv("X_train.csv", index=False)
# X_test.to_csv("X_test.csv", index=False)
# X_val.to_csv("X_val.csv", index=False)
# y_train.to_csv("y_train.csv", index=False)
# y_test.to_csv("y_test.csv", index=False)
# y_val.to_csv("y_val.csv", index=False)

In [None]:
# model = ars_experiment.model

In [None]:
# dataset = arsTabularDataset

In [None]:
# dataset.enc_data.head()

In [None]:
# df = dataset.enc_data
# df

In [None]:
# from scipy.cluster import hierarchy
# from scipy.spatial.distance import squareform
# from scipy.stats import spearmanr
# import matplotlib.pyplot as plt
# import numpy as np

# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
# corr = spearmanr(X).correlation

# # Ensure the correlation matrix is symmetric
# corr = (corr + corr.T) / 2
# np.fill_diagonal(corr, 1)

# # We convert the correlation matrix to a distance matrix before performing
# # hierarchical clustering using Ward's linkage.
# distance_matrix = 1 - np.abs(corr)
# dist_linkage = hierarchy.ward(squareform(distance_matrix))
# dendro = hierarchy.dendrogram(
#     dist_linkage, labels=X.columns.to_list(), ax=ax1, leaf_rotation=90
# )
# dendro_idx = np.arange(0, len(dendro["ivl"]))

# ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]])
# ax2.set_xticks(dendro_idx)
# ax2.set_yticks(dendro_idx)
# ax2.set_xticklabels(dendro["ivl"], rotation="vertical")
# ax2.set_yticklabels(dendro["ivl"])
# fig.tight_layout()

In [None]:
# from shap_select import shap_select

In [None]:
# selected_features_df = shap_select(model.best_estimator_, X_val, y_val, task="regression", threshold=0.05)

In [None]:
# X_train

In [None]:
# bst = model.best_estimator_.get_booster()

In [None]:
# importance_gain = bst.get_score(importance_type='gain')
# importance_cover = bst.get_score(importance_type='cover')
# importance_weight = bst.get_score(importance_type='weight')

In [None]:
# df_cover = pd.DataFrame(importance_cover, index=[0]).T
# df_gain = pd.DataFrame(importance_gain, index=[0]).T
# df_weight = pd.DataFrame(importance_weight, index=[0]).T

In [None]:
# df_cover.shape

In [None]:
# df_cover.sort_values(by=0).plot(kind="barh", figsize=(15,20))
# df_gain.sort_values(by=0).plot(kind="barh", figsize=(15,20))
# df_weight.sort_values(by=0).plot(kind="barh", figsize=(15,20))

In [None]:
# model.shapley_additive_explanation(X_test, outname='shap_b', dir_output='.', mode="beeswarm", figsize=(50, 25))

In [None]:
# from collections import defaultdict

# cluster_ids = hierarchy.fcluster(dist_linkage, 1, criterion="distance")
# cluster_id_to_feature_ids = defaultdict(list)
# for idx, cluster_id in enumerate(cluster_ids):
#     cluster_id_to_feature_ids[cluster_id].append(idx)
# selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
# selected_features_names = X.columns[selected_features]
# X_train_sel = X_train[selected_features_names]
# X_test_sel = X_test[selected_features_names]
# X_val_sel = X_val[selected_features_names]
# model_config['fit_params'].update({'eval_set': [(X_val_sel, y_val[target]) for target in ars_experiment.dataset.targets_names]})
# model.fit(X_train_sel, y_train, **model_config)
# print(
#     "Baseline accuracy on test data with features removed:"
#     f" {model.score(X_test_sel, y_test):.2}"
# )


In [None]:
# import matplotlib

# from sklearn.inspection import permutation_importance
# from sklearn.utils.fixes import parse_version


# def plot_permutation_importance(clf, X, y, ax):
#     result = permutation_importance(clf, X, y, n_repeats=10, random_state=42)
#     perm_sorted_idx = result.importances_mean.argsort()

#     # `labels` argument in boxplot is deprecated in matplotlib 3.9 and has been
#     # renamed to `tick_labels`. The following code handles this.
#     tick_labels_parameter_name = (
#         "tick_labels"
#         if parse_version(matplotlib.__version__) >= parse_version("3.9")
#         else "labels"
#     )
#     tick_labels_dict = {tick_labels_parameter_name: X.columns[perm_sorted_idx]}
#     ax.boxplot(result.importances[perm_sorted_idx].T, vert=False, **tick_labels_dict)
#     ax.axvline(x=0, color="k", linestyle="--")
#     return ax


In [None]:
# fig, ax = plt.subplots(figsize=(7, 6))
# plot_permutation_importance(model, X_test_sel, y_test, ax)
# ax.set_title("Permutation Importances on selected subset of features\n(test set)")
# ax.set_xlabel("Decrease in accuracy score")
# ax.figure.tight_layout()
# plt.show()

In [None]:
# get the first line of x train
# X_train_sel[:1]

In [None]:
# y_train[:1]

In [None]:
# model.predict(X_train_sel[:1])

In [None]:
# import shap
# shap.initjs()
# explainer_xgb = shap.TreeExplainer(model.best_estimator_)
# single_explanation = explainer_xgb.shap_values(X_train_sel[:1])
# shap.summary_plot(single_explanation, X_test_sel, plot_type="bar")


In [None]:
# X_train.columns

In [None]:
# model = ars_experiment.model

In [None]:
# X_train = ars_experiment.dataset.enc_X_train
# X_test = ars_experiment.dataset.enc_X_test
# X_val = ars_experiment.dataset.enc_X_val

# y_train = ars_experiment.dataset.y_train
# y_test = ars_experiment.dataset.y_test
# y_val = ars_experiment.dataset.y_val

In [None]:
# explainer_xgb = shap.TreeExplainer(model.best_estimator_)

In [None]:
# single_explanation = explainer_xgb.shap_values(X_train[:1])

In [None]:
# shap_values_xgb = explainer_xgb.shap_values(X_test[:50])

In [None]:
# shap.plots.beeswarm(shap_values=shap_values_xgb)

In [None]:
# shap.plots.force(explainer_xgb.expected_value, shap_values_xgb)

In [None]:
# shap.dependence_plot('inc_grippe', shap_values_xgb, X_train_sel)

In [None]:
# shap_values_xgb