In [1]:
import os
import sys

sys.path.insert(0, os.path.abspath(
    os.path.join(os.path.dirname('__file__'), '..')))
from src.encoding.encoders import *
from src.encoding.tools import create_encoding_pipeline
import pandas as pd
import src.features as ft
from src.datasets.base_tabular_dataset import BaseTabularDataset
import datetime as dt
import logging
import pathlib
# Do not cap the number of columns shown when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Configure the root logger (handed to the project classes further down).
# Note: basicConfig only installs the handler when the root logger has none yet.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(name)s %(asctime)s: %(levelname)s: %(message)s",
    encoding="utf-8",
    level=logging.INFO,
)
logger = logging.getLogger()
# Set the level explicitly so it is INFO even when basicConfig changed nothing
logger.setLevel(logging.INFO)

# Root directory of the project, as a pathlib.Path.
# '__file__' is a plain string literal here, so its dirname is empty and '..'
# is resolved against the current working directory: root_dir is the parent
# of the directory the notebook is run from.
root_dir = pathlib.Path(
    os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
)

# Settings handed to the dataset as `fetch_config` (data fetching)
# NOTE(review): dates look day-first (DD-MM-YYYY) given '31-12-2023' — confirm
# against the parser used by the feature classes.
fetch_config = dict(
    data_start='01-01-2017',
    data_stop='31-12-2023',
    data_dir=root_dir / 'data',  # same folder as dataset_config['data_dir']
    etablissement="CHU Dijon",
    departement="21",
    region='BOURGOGNE',
)

# Feature classes that make up the dataset (classes, not instances, are listed)
ars_features_class = [
    ft.HopitalFeatures, ft.AirQualityFeatures, ft.EpidemiologicalFeatures,
    # ft.FireFightersFeatures(include_calls=False),
    ft.GoogleTrendFeatures, ft.MeteorologicalFeatures,
    ft.SociologicalFeatures, ft.SportsCompetitionFeatures,
    ft.TrafficFeatures,
]

# Target column(s) to predict; the label is spelled exactly as the column
# appears in the data.
# Alternative target: ['nb_vers_hospit']
target_colomns = ['nb_emmergencies_CHU Dijon']

# Encoding scheme turned into a pipeline by create_encoding_pipeline.
# Layout: column kind -> treatment name -> {'imputers': [...], 'encoders': [...]}
# NOTE(review): the top-level keys are presumably dtype selectors — confirm in
# create_encoding_pipeline.
encoders_dict = dict(
    # Numeric columns: fill gaps with the mean, then standardise
    number=dict(
        as_number=dict(
            imputers=[imputers.SimpleImputer(strategy='mean')],
            encoders=[ne.StandardScaler()],
        ),
    ),
    # Categorical columns: fill gaps with the most frequent value, then target-encode
    category=dict(
        as_category=dict(
            imputers=[imputers.SimpleImputer(strategy='most_frequent')],
            encoders=[ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
        ),
    ),
    # Datetime columns are encoded twice: as cyclical numbers and as categories
    datetime=dict(
        as_number=dict(
            imputers=[de.DateFeatureExtractor()],
            encoders=[ne.CyclicalFeatures(drop_original=True)],
        ),
        as_category=dict(
            imputers=[de.DateFeatureExtractor(dtype='category')],
            encoders=[ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
        ),
    ),
)

# Build the encoding pipeline from the scheme above
pipeline = create_encoding_pipeline(encoders_dict)

# Splitting scheme for the sets: test_size and val_size of 0.2 each, no shuffling
split_config = dict(test_size=0.2, val_size=0.2, shuffle=False)

# Dataset configuration, handed to BaseTabularDataset as `getter_config`.
# Entries that are commented out are currently disabled options.
dataset_config = dict(
    from_date='15-01-2019',
    to_date='30-12-2023',
    # shift=range(1, 14, 1),
    # rolling_window=[7, 14],
    freq='1D',
    split_config=split_config,
    create_X_y=True,
    # encoding_pipeline=pipeline,
    targets_names=target_colomns,
    # NOTE(review): judging by the encoded column names ('J+7', 'mean_7J'),
    # these two select a 7-day mean of the target, 7 days ahead — confirm.
    targets_shift=-7,
    targets_rolling_window=7,
    # targets_history_shifts=range(1, 14, 1),
    # targets_history_rolling_windows=[7, 14],
    drop_constant_thr=1.0,
    data_dir=root_dir / 'data',
)

# Build the dataset from the selected feature classes and both configurations
baseTabularDataset = BaseTabularDataset(
    features_class=ars_features_class,
    logger=logger,
    fetch_config=fetch_config,
    getter_config=dataset_config,
)


root 2024-10-24 14:25:15,357: INFO: Initialisation de la classe BaseTabularDataset
root 2024-10-24 14:25:15,360: INFO: Initialisation de la classe hopitalfeatures
root 2024-10-24 14:25:15,361: INFO: Initialisation de la classe airqualityfeatures
root 2024-10-24 14:25:15,362: INFO: Initialisation de la classe epidemiologicalfeatures
root 2024-10-24 14:25:15,363: INFO: Initialisation de la classe googletrendfeatures
root 2024-10-24 14:25:15,364: INFO: Initialisation de la classe meteorologicalfeatures
root 2024-10-24 14:25:15,364: INFO: Initialisation de la classe sociologicalfeatures
root 2024-10-24 14:25:15,365: INFO: Initialisation de la classe sportscompetitionfeatures
root 2024-10-24 14:25:15,365: INFO: Initialisation de la classe trafficfeatures
root 2024-10-24 14:25:15,367: INFO: Fetching data for hopitalfeatures
root 2024-10-24 14:25:15,376: INFO: Fetching data for airqualityfeatures
root 2024-10-24 14:25:15,384: INFO: Fetching data for epidemiologicalfeatures
root 2024-10-24 14:

Creating encoding pipeline
/home/maxime/Documents/WORKSPACES/forecasting_models/data
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Column 'PM10_FR26010' is constant at 15.0 for 100.00% of the rows.
Column 'PM10_FR26094' is constant at 19.0 for 100.00% of the rows.
Column 'PM25_FR26094' is constant at 6.8 for 100.00% of the rows.
Column 'NO2_FR26010' is constant at 6.9 for 100.00% of the rows.
Column 'NO2_FR26094' is constant at 7.0 for 100.00% of the rows.
Dropped constant columns from both sets: ['PM10_FR26010', 'PM10_FR26094', 'PM25_FR26094', 'NO2_FR26010', 'NO2_FR26094']


In [2]:
# Display the encoding pipeline built by create_encoding_pipeline in the first cell
pipeline

In [3]:
# Fit the encoding pipeline on the training split.
# The whole y_train (every target column) is passed in one call, so the pipeline
# is fitted a single time instead of once per target column.
# The trailing semicolon stops the notebook from echoing the object returned by fit.
pipeline.fit(baseTabularDataset.X_train, baseTabularDataset.y_train);

[ColumnTransformer] .... (1 of 4) Processing pipeline-1, total=   0.0s
[ColumnTransformer] .... (2 of 4) Processing pipeline-3, total=   0.0s
[ColumnTransformer] .... (3 of 4) Processing pipeline-4, total=   0.1s
[ColumnTransformer] ..... (4 of 4) Processing remainder, total=   0.0s


In [7]:
# Run the training features through the fitted encoding pipeline
encoded = pipeline.transform(baseTabularDataset.X_train)
# Shorten each column name to the text after its last '__'
# (presumably the prefix added by the pipeline steps — TODO confirm)
encoded.columns = list(map(lambda name: name.split('__')[-1], encoded.columns))
encoded

Unnamed: 0_level_0,nb_emmergencies_CHU Dijon,O3_FR26005,O3_FR26010,PM10_FR26005,PM10_FR26014,PM25_FR26005,NO2_FR26005,NO2_FR26014,inc_grippe,inc_diarrhee,inc_varicelle,inc_ira,meteo_CHU DIJON_0_tavg,meteo_CHU DIJON_0_tmin,meteo_CHU DIJON_0_tmax,meteo_CHU DIJON_0_prcp,meteo_CHU DIJON_0_snow,meteo_CHU DIJON_0_wdir,meteo_CHU DIJON_0_wspd,meteo_CHU DIJON_0_pres,foot,nb_accidents,date##month_sin,date##month_cos,date##day_sin,date##day_cos,date##dayofweek_sin,date##dayofweek_cos,date##quarter_sin,date##quarter_cos,date##week_sin,date##week_cos,date##dayofYear_sin,date##dayofYear_cos,date##month##cat##target_nb_emmergencies_CHU Dijon%%J+7%%mean_7J,date##day##cat##target_nb_emmergencies_CHU Dijon%%J+7%%mean_7J,date##dayofweek##cat##target_nb_emmergencies_CHU Dijon%%J+7%%mean_7J,date##quarter##cat##target_nb_emmergencies_CHU Dijon%%J+7%%mean_7J,date##week##cat##target_nb_emmergencies_CHU Dijon%%J+7%%mean_7J,date##dayofYear##cat##target_nb_emmergencies_CHU Dijon%%J+7%%mean_7J,bankHolidays,eveBankHolidays,holidays,borderHolidays,confinement1,confinement2,couvrefeux,ramadan,before_HNFC_moving,during_HNFC_moving,after_HNFC_moving,before_COVID,during_COVID,after_COVID
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
2019-01-21,1.099193,-1.109083,-0.840250,1.900100,1.913539,2.689014,1.260492,0.244221,3.034792,-0.137427,0.343221,-0.713197,-1.532716,-1.172778,-1.785856,0.941143,0.669075,-1.352003,-0.263216,0.356084,0.531331,-0.699051,0.5,8.660254e-01,-0.897805,-0.440394,0.000000e+00,1.0,1.0,6.123234e-17,0.456629,0.889657,0.352752,0.935717,184.768411,174.888873,175.125435,174.400446,190.746590,178.172139,False,False,False,False,False,False,False,False,False,False,True,True,False,False
2019-01-22,0.549368,-0.440539,-0.628496,0.946033,0.727968,1.835331,0.867855,0.490474,3.034792,-0.137427,0.343221,-0.713197,-1.755468,-1.456655,-1.902167,-0.395308,0.669075,0.181268,-0.206942,-0.678669,2.596766,-0.699051,0.5,8.660254e-01,-0.968077,-0.250653,8.660254e-01,0.5,1.0,6.123234e-17,0.456629,0.889657,0.368763,0.929523,184.768411,175.456671,175.134146,174.400446,190.746590,178.262129,False,False,False,False,False,False,False,False,False,False,True,True,False,False
2019-01-23,-0.131368,-0.524107,-0.332039,-0.008035,-0.062414,0.640174,1.522250,0.736727,3.034792,-0.137427,0.343221,-0.713197,-1.504871,-1.172778,-1.739331,0.153592,1.444785,0.435354,-0.319489,-2.284321,1.667320,-0.699051,0.5,8.660254e-01,-0.998717,-0.050649,8.660254e-01,-0.5,1.0,6.123234e-17,0.456629,0.889657,0.384665,0.923056,184.768411,176.056730,175.147212,174.400446,190.746590,179.042043,False,False,False,False,False,False,False,False,False,False,True,True,False,False
2019-01-24,0.523186,-0.691243,-0.628496,1.354919,1.386619,2.176804,1.522250,0.613600,3.034792,-0.137427,0.343221,-0.713197,-1.699780,-1.283175,-1.855642,-0.085060,1.436705,1.495501,0.505855,-0.797606,0.737875,0.613102,0.5,8.660254e-01,-0.988468,0.151428,1.224647e-16,-1.0,1.0,6.123234e-17,0.456629,0.889657,0.400454,0.916317,184.768411,176.079313,175.157665,174.400446,190.746590,179.018046,False,False,False,False,False,False,False,False,False,False,True,True,False,False
2019-01-25,1.177739,-0.691243,-0.459092,1.354919,1.123158,1.664594,2.961919,1.352360,3.034792,-0.137427,0.343221,-0.713197,-1.713702,-1.850928,-1.599758,-0.395308,1.428625,-0.309378,-1.332413,0.189572,2.390223,-0.699051,0.5,8.660254e-01,-0.937752,0.347305,-8.660254e-01,-0.5,1.0,6.123234e-17,0.456629,0.889657,0.416125,0.909308,184.768411,176.831000,174.864154,174.400446,190.746590,179.048043,False,False,False,False,False,False,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-06,0.654096,0.436926,0.514980,0.400851,0.332777,0.981647,-0.807397,-0.371412,-0.133537,0.415143,1.743055,0.269726,-1.240352,-1.346258,-1.087990,-0.395308,-0.106635,-1.369526,0.543371,0.237147,-0.604658,-0.699051,1.0,6.123234e-17,0.937752,0.347305,-2.449294e-16,1.0,1.0,6.123234e-17,0.875735,0.482792,0.905702,0.423914,153.094213,175.105023,174.812445,174.400446,182.547862,174.296563,False,False,False,False,False,False,False,False,False,False,True,False,False,True
2022-03-07,0.758825,0.186222,0.260874,1.354919,1.650079,1.664594,-0.179178,0.244221,1.368970,2.097532,2.269107,3.541170,-1.268196,-1.346258,-0.971679,-0.395308,-0.106635,-1.246865,0.562129,0.260935,-0.604658,-0.699051,1.0,6.123234e-17,0.988468,0.151428,0.000000e+00,1.0,1.0,6.123234e-17,0.92669,0.375828,0.912846,0.408304,153.094213,175.085667,175.125435,174.400446,173.229859,173.876609,False,False,False,False,False,False,False,False,False,False,True,False,False,True
2022-03-08,0.549368,-0.148051,0.218524,2.990463,3.099111,3.371961,2.045766,1.598613,1.368970,2.097532,2.269107,3.541170,-1.115054,-1.645906,-0.518066,-0.395308,-0.106635,-1.395811,-0.469552,0.189572,-0.604658,0.613102,1.0,6.123234e-17,0.998717,-0.050649,8.660254e-01,0.5,1.0,6.123234e-17,0.92669,0.375828,0.919720,0.392574,153.094213,175.640560,175.134146,174.400446,173.229859,173.696628,False,False,False,False,False,False,False,False,False,False,True,False,False,True
2022-03-09,0.287546,-0.440539,-0.332039,2.854167,2.572190,2.689014,2.438403,1.844866,1.368970,2.097532,2.269107,3.541170,-0.934067,-1.645906,-0.297075,-0.395308,-0.106635,-1.395811,-0.957256,0.617746,-0.604658,-0.699051,1.0,6.123234e-17,0.968077,-0.250653,8.660254e-01,-0.5,1.0,6.123234e-17,0.92669,0.375828,0.926324,0.376728,153.094213,175.701856,175.147212,174.400446,173.229859,173.246678,False,False,False,False,False,False,False,False,False,False,True,False,False,True
