In [1]:
import os
import sys
# Put the parent of the current working directory at the front of the import
# path so the `src` package imported below can be found. `__file__` does not
# exist in a notebook, so the location is taken relative to the working directory.
sys.path.insert(0, os.path.abspath(os.pardir))

In [2]:
# Uncomment to install the project dependencies into the kernel's environment:
# %pip install -r ../requirements.txt

In [3]:
from src.encoding.encoders import *
from src.encoding.tools import create_encoding_pipeline
from src.models.sklearn_models import save_object, Model
from src.models.sklearn_models_config import get_model
from src.datasets.base_tabular_dataset import BaseTabularDataset
from src.experiments.base_experiment import BaseExperiment
import src.features as ft
import logging

# No import in this notebook binds `mlflow` explicitly; the call below only
# worked if the name leaked in through another import (presumably the wildcard
# import from `src.encoding.encoders`). Import it explicitly so the cell does
# not depend on that side effect.
import mlflow

# Point MLflow at the tracking server that receives the experiment runs.
# NOTE(review): the saved run output links to http://127.0.0.1:8080 — confirm
# that this URI is the one actually used by the experiment.
mlflow.set_tracking_uri("http://localhost:5000")

In [4]:
# Root logger at INFO level, emitting timestamped records through a stream handler.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(
    level=logging.INFO,
    encoding="utf-8",
    format="%(asctime)s: %(levelname)s: %(message)s",
    datefmt='%Y/%m/%d %H:%M:%S',
    handlers=[logging.StreamHandler()],
)

# Parent of the notebook's working directory (`__file__` is not defined in a notebook).
root_dir = os.path.abspath(os.pardir)

# Shared configuration object passed to the feature classes, the dataset and
# the experiment further down.
config = ft.Config({
    'max_nan': 15,
    "departement": "21",
    "root_dir": root_dir,
    "start": '01-01-2019',
    "stop": '31-12-2023',
    "logger": logger,
    "step_unit": 'days',
    "step_value": 1,
    "shift": 0,
    "rolling_window": 0,
    "etablissement": "CHU Dijon",
    'region': 'BOURGOGNE',
})

In [5]:
# Feature sources used to build the dataset. Entries are either feature classes
# or already-configured instances (where non-default options are needed).
ars_features_class = [
    ft.AirQualityFeatures,
    ft.HopitalFeatures(
        config=config,
        include_emmergency_arrivals=True,
        include_nb_hospit=True,
    ),
    ft.EpidemiologicalFeatures,
    ft.FireFightersFeatures(config=config, include_calls=False),
    ft.GoogleTrendFeatures,
    ft.MeteorologicalFeatures,
    ft.SociologicalFeatures,
    ft.SportsCompetitionFeatures,
    ft.TrafficFeatures,
]

2024/09/13 18:15:03: INFO: Initialisation de la classe HopitalFeatures
2024/09/13 18:15:03: INFO: Initialisation de la classe FireFightersFeatures


In [6]:
# Tabular dataset whose target column is 'Total_CHU Dijon'
# ('nb_vers_hospit' was left out of the target list in the original cell).
arsTabularDataset = BaseTabularDataset(
    target_colomns=['Total_CHU Dijon'],
    config=config,
    features_class=ars_features_class,
)

# Fetch the data from the feature sources and save it. Do this only once; if a
# smaller dataset is needed, use the `get_dataset` method instead.
arsTabularDataset.fetch_data(save=True)

2024/09/13 18:15:03: INFO: Initialisation de la classe BaseTabularDataset_Total_CHU Dijon
2024/09/13 18:15:03: INFO: Initialisation de la classe AirQualityFeatures
2024/09/13 18:15:03: INFO: Initialisation de la classe EpidemiologicalFeatures
2024/09/13 18:15:03: INFO: Initialisation de la classe GoogleTrendFeatures
2024/09/13 18:15:03: INFO: Initialisation de la classe MeteorologicalFeatures
2024/09/13 18:15:03: INFO: Initialisation de la classe SociologicalFeatures
2024/09/13 18:15:03: INFO: Initialisation de la classe SportsCompetitionFeatures
2024/09/13 18:15:03: INFO: Initialisation de la classe TrafficFeatures
2024/09/13 18:15:03: INFO: Fetching data for BaseTabularDataset_Total_CHU Dijon...
2024/09/13 18:15:03: INFO: Fetching data from AirQualityFeatures({'max_nan': 15, 'departement': '21', 'root_dir': '/home/maxime/Documents/WORKSPACES/forecasting_models', 'start': '01-01-2019', 'stop': '31-12-2023', 'logger': <RootLogger root (INFO)>, 'step_unit': 'days', 'step_value': 1, 'shi

Dijon


2024/09/13 18:15:07: INFO: Variables de vacances intégrées
2024/09/13 18:15:07: INFO: On s'occupe des variables de confinement
2024/09/13 18:15:07: INFO: Variables de confinement intégrées
2024/09/13 18:15:07: INFO: On s'occupe des variables de Ramadan
2024/09/13 18:15:07: INFO: Saving data for SociologicalFeatures...
2024/09/13 18:15:07: INFO: Fetching data from SportsCompetitionFeatures({'max_nan': 15, 'departement': '21', 'root_dir': '/home/maxime/Documents/WORKSPACES/forecasting_models', 'start': '01-01-2019', 'stop': '31-12-2023', 'logger': <RootLogger root (INFO)>, 'step_unit': 'days', 'step_value': 1, 'shift': 0, 'rolling_window': 0, 'etablissement': 'CHU Dijon', 'region': 'BOURGOGNE'})
2024/09/13 18:15:07: INFO: Fetching data for SportsCompetitionFeatures...
2024/09/13 18:15:07: INFO: Intégration des données de football
2024/09/13 18:15:07: INFO: Données de football intégrées
2024/09/13 18:15:07: INFO: Saving data for SportsCompetitionFeatures...
2024/09/13 18:15:07: INFO: Fetc

In [7]:
# Keyword parameters forwarded to the XGBoost regressor.
# TODO: 'eval_set' ([(arsTabularDataset.enc_X_val, arsTabularDataset.y_val)])
#       is to be set in the experiment's run method.
# Untested options kept for reference:
#   'multi_strategy': 'one_output_per_tree' / 'multi_output_tree'
model_params = dict(
    early_stopping_rounds=10,
    verbosity=0,
    n_estimators=10000,
    learning_rate=0.1,
    min_child_weight=5,
)

# Build the model wrapper from the project's model registry.
model = get_model(
    model_type='xgboost',
    name='XGBRegressor',
    device='cuda',
    task_type='regression',
    test_metrics='rmse',
    with_metric='w_rmse',
    params=model_params,
)

In [8]:
# Experiment tying together the dataset, the model and the shared configuration.
ars_experiment = BaseExperiment(
    dataset=arsTabularDataset,
    model=model,
    config=config,
)

In [9]:
# Candidate values for the grid search (passed through `model_config` below).
grid_params = dict(max_depth=[3, 5, 7, 9, 11])

In [10]:
# Extra keyword arguments for fitting (passed through `model_config` below).
fit_params = dict(verbose=0)

In [11]:
# Encoding recipe handed to `create_encoding_pipeline`.
# First level: kind of column ('number', 'category', 'datetime');
# second level: how the column is treated ('as_number' / 'as_category');
# each leaf lists the imputers and then the encoders to apply.
encoders_dict = {
    'number': {
        'as_number': {
            'imputers': [imputers.SimpleImputer(strategy='mean')],
            'encoders': [ne.StandardScaler()],
        },
    },
    'category': {
        'as_category': {
            'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
            # Alternatives that were tried and disabled:
            #   ne.TargetEncoder(target_type='continuous-multioutput')
            #   ne.TargetEncoder(target_type='continuous')
            'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
        },
    },
    'datetime': {
        'as_number': {
            'imputers': [de.DateFeatureExtractor()],
            'encoders': [ne.CyclicalFeatures(drop_original=True)],
        },
        'as_category': {
            'imputers': [de.DateFeatureExtractor(dtype='category')],
            # Alternative that was tried and disabled:
            #   ne.TargetEncoder(target_type='continuous')
            'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
        },
    },
}

In [12]:
# tes = ne.MultiTargetEncoder(drop_invariant=True, return_df=True)


In [13]:
# tes.set_output(transform='pandas')

In [14]:
# Train/validation/test split settings (nested into `dataset_config` below).
split_config = dict(test_size=0.2, val_size=0.2, shuffle=False)

In [15]:
# Options used when building the dataset for the experiment run.
dataset_config = {
    'from_date': '01-01-2019',
    'to_date': '31-12-2023',
    'shift': 7,
    'rolling_window': [7, 14],
    'freq': '1YE',
    'split_config': split_config,
}

# Options for the model: grid-search optimisation with the candidate values
# and fit arguments defined in the earlier cells.
model_config = {
    "optimization": "grid",
    "grid_params": grid_params,
    "fit_params": fit_params,
}

# Encoding pipeline built from the encoder recipe defined above.
encoding_pipeline = create_encoding_pipeline(encoders_dict=encoders_dict)

Creating encoding pipeline


In [16]:
# Run the experiment with the dataset, model and encoding settings prepared above.
dataset = ars_experiment.run(
    dataset_config=dataset_config,
    model_config=model_config,
    encoding_pipeline=encoding_pipeline,
    find_best_features=True,
)

2024/09/13 18:15:08 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2024/09/13 18:15:08: INFO: Running the experiment...
2024/09/13 18:15:08: INFO: Augmentation des features...
2024/09/13 18:15:08: INFO: Aggregating data by data type...
2024/09/13 18:15:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_29 at: http://127.0.0.1:8080/#/experiments/397958626087620787/runs/652073be21ed46a894d0a7a5d78056f3.
2024/09/13 18:15:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/397958626087620787.
2024/09/13 18:15:34 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...


[FeatureUnion]  (step 1 of 4) Processing columntransformer-1, total=   0.1s
[FeatureUnion]  (step 2 of 4) Processing columntransformer-2, total=   0.0s
[FeatureUnion]  (step 3 of 4) Processing columntransformer-3, total=   0.0s
[FeatureUnion]  (step 4 of 4) Processing columntransformer-4, total=   0.1s


2024/09/13 18:15:34 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [17]:
# Pull the dataset held by the experiment and expose a few of its attributes
# as notebook-level names for inspection.
dataset = ars_experiment.dataset
data, enc_data, train_set = dataset.data, dataset.enc_data, dataset.train_set

In [18]:
# Bind the dataset's `X_train` attribute (presumably the training-split
# features — confirm against BaseTabularDataset) and display it: the bare name
# on the last line makes the notebook render the value.
X_train = dataset.X_train
X_train

In [23]:
# Plot the dataset with a '1YE' frequency, limited to 16 subplots.
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt, so
# no figure is stored in the notebook; the call appears to be slow.
dataset.plot(
    freq='1YE',
    max_subplots=16,
)

2024/09/13 18:16:30: INFO: Aggregating data by data type...


KeyboardInterrupt: 

In [20]:
# Display the `enc_data` value bound in an earlier cell (taken from `dataset.enc_data`).
enc_data

In [21]:
# Rebuild the dataset with `dataset_config`, starting from the experiment's own
# dataset rather than from the `dataset` variable itself. The previous form,
# `dataset = dataset.get_dataset(...)`, fed its own result back in, so each
# re-run of the cell chained another call on whatever the last run returned.
# On a top-to-bottom run `dataset` is `ars_experiment.dataset` at this point,
# so the first execution is unchanged; re-running is now repeatable.
dataset = ars_experiment.dataset.get_dataset(**dataset_config)

2024/09/13 18:15:34: INFO: Augmentation des features...
2024/09/13 18:15:35: INFO: Aggregating data by data type...


In [22]:
# Refresh `enc_data` from the current `dataset` object and display it (the bare
# name on the last line makes the notebook render the value).
enc_data = dataset.enc_data
enc_data