In [1]:
import os
import sys

In [2]:
# Put the parent of the current working directory first on sys.path so the
# `src` package can be imported from inside this notebook.
# NOTE: '__file__' is a quoted string literal here, not the module attribute,
# so os.path.dirname() returns '' and the path is resolved relative to the
# current working directory.
sys.path.insert(0, os.path.abspath(
    os.path.join(os.path.dirname('__file__'), '..')))
# NOTE(review): this wildcard import is presumably what provides the
# `imputers`, `ne` and `de` names used by the encoding scheme further down —
# confirm, and consider importing them explicitly.
from src.encoding.encoders import *
from src.encoding.tools import create_encoding_pipeline
import pandas as pd
import src.features as ft
from src.datasets.base_tabular_dataset import BaseTabularDataset
import datetime as dt
import logging
import pathlib
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [3]:
# Root logger, shared by every module of the project; INFO and above are kept.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Each record is formatted as "<logger name> <timestamp>: <level>: <message>"
# and sent to a stream handler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(name)s %(asctime)s: %(levelname)s: %(message)s",
    encoding="utf-8",
    level=logging.INFO,
)

In [4]:
# Root directory of the project, as an absolute pathlib.Path.
# The dirname of the literal string '__file__' is empty, so this resolves to
# the parent of the current working directory.
_relative_root = os.path.join(os.path.dirname('__file__'), '..')
root_dir = pathlib.Path(os.path.abspath(_relative_root))

In [5]:
# Configuration handed to the dataset for fetching the data.
# Dates are written day-month-year; data_dir points at <root_dir>/data.
fetch_config = dict(
    data_start='01-01-2017',
    data_stop='31-12-2023',
    data_dir=root_dir / 'data',
    etablissement="CHU Dijon",
    departement="21",
    region='BOURGOGNE',
)

In [6]:
# Feature sources used to build the dataset.
# The entries are passed as-is (not called here).
# Currently left out of the selection:
#     ft.FireFightersFeatures(include_calls=False)
ars_features_class = [
    ft.HopitalFeatures,
    ft.AirQualityFeatures,
    ft.EpidemiologicalFeatures,
    ft.GoogleTrendFeatures,
    ft.MeteorologicalFeatures,
    ft.SociologicalFeatures,
    ft.SportsCompetitionFeatures,
    ft.TrafficFeatures,
]

In [7]:
# Columns to be predicted.
# NOTE: the spelling of both the variable and the column name is relied on
# elsewhere in this notebook, so neither is corrected here.
target_colomns = [
    'nb_emmergencies_CHU Dijon',
    # 'nb_vers_hospit',
]

In [8]:
# Encoding scheme used to create the encoding pipeline.
# Layout: {column dtype: {treatment: {'imputers': [...], 'encoders': [...]}}}
# Every entry gets its own freshly created imputer/encoder instances.
encoders_dict = {}

# Numeric columns.
encoders_dict['number'] = {
    'as_number': {
        'imputers': [imputers.SimpleImputer(strategy='mean')],
        'encoders': [ne.StandardScaler()],
    },
}

# Categorical columns.
encoders_dict['category'] = {
    'as_category': {
        'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
        'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
    },
}

# Datetime columns are handled twice: once as numbers, once as categories.
encoders_dict['datetime'] = {
    'as_number': {
        'imputers': [de.DateFeatureExtractor()],
        'encoders': [ne.CyclicalFeatures(drop_original=True)],
    },
    'as_category': {
        'imputers': [de.DateFeatureExtractor(dtype='category')],
        'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
    },
}

# Boolean columns.
encoders_dict['boolean'] = {
    'as_number': {
        'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
        'encoders': [ne.BooleanEncoder()],
    },
}

In [9]:
# Create the encoding pipeline from the encoding scheme `encoders_dict`.
# Running this cell prints "Creating encoding pipeline".
pipeline = create_encoding_pipeline(encoders_dict)

Creating encoding pipeline


In [10]:
# Splitting scheme used to create the train / validation / test sets.
split_config = dict(
    test_size=0.2,
    val_size=0.2,
    shuffle=False,
)

In [11]:
# Configuration of the dataset (passed as `getter_config` when it is built).
# Dates are written day-month-year. Disabled options are kept as comments.
# With targets_shift=-3 and targets_rolling_window=3 the generated target
# column is named '...%%J+3%%mean_3J' in the resulting data.
dataset_config = dict(
    from_date='15-01-2019',
    to_date='30-12-2023',
    # shift=range(1, 14, 1),
    # rolling_window=[7, 14],
    freq='1D',
    split_config=split_config,
    create_X_y=True,
    encoding_pipeline=pipeline,
    targets_names=target_colomns,
    targets_shift=-3,
    targets_rolling_window=3,
    # targets_history_shifts=range(6, 14, 1),
    # targets_history_rolling_windows=[7, 14],
    drop_constant_thr=1.0,
    data_dir=root_dir / 'data',
)

In [12]:
# Build the dataset. Per the log output of this cell, construction fetches the
# data of every feature source and then assembles the dataset for the
# configured date range.
baseTabularDataset = BaseTabularDataset(
    features_class=ars_features_class,
    logger=logger,
    fetch_config=fetch_config,
    getter_config=dataset_config,
)
# Kept for reference: rebuilding the dataset with an explicit list of features.
# print(baseTabularDataset.data.columns.to_list())
# liste = ['O3_FR26005%%mean_7J', 'target_Total_CHU Dijon']
# baseTabularDataset = baseTabularDataset.get_dataset(**dataset_config, features_names=liste)


root 2024-10-24 17:58:41,554: INFO: Initialisation de la classe BaseTabularDataset
root 2024-10-24 17:58:41,556: INFO: Initialisation de la classe hopitalfeatures
root 2024-10-24 17:58:41,556: INFO: Initialisation de la classe airqualityfeatures
root 2024-10-24 17:58:41,557: INFO: Initialisation de la classe epidemiologicalfeatures
root 2024-10-24 17:58:41,557: INFO: Initialisation de la classe googletrendfeatures
root 2024-10-24 17:58:41,557: INFO: Initialisation de la classe meteorologicalfeatures
root 2024-10-24 17:58:41,558: INFO: Initialisation de la classe sociologicalfeatures
root 2024-10-24 17:58:41,558: INFO: Initialisation de la classe sportscompetitionfeatures
root 2024-10-24 17:58:41,558: INFO: Initialisation de la classe trafficfeatures
root 2024-10-24 17:58:41,559: INFO: Fetching data for hopitalfeatures


/home/maxime/Documents/WORKSPACES/forecasting_models/data


root 2024-10-24 17:58:41,718: INFO: Fetching data for airqualityfeatures
root 2024-10-24 17:58:41,787: INFO: Fetching data for epidemiologicalfeatures
root 2024-10-24 17:58:45,436: INFO: Fetching data for googletrendfeatures
root 2024-10-24 17:58:45,447: INFO: Fetching data for meteorologicalfeatures
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


root 2024-10-24 17:58:45,481: INFO: Fetching data for sociologicalfeatures


/home/maxime/Documents/WORKSPACES/forecasting_models/tests
/home/maxime/Documents/WORKSPACES/forecasting_models/src/geolocalisation


root 2024-10-24 17:58:46,192: INFO: Fetching data for sportscompetitionfeatures
root 2024-10-24 17:58:46,207: INFO: Fetching data for trafficfeatures
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


root 2024-10-24 17:58:46,228: INFO: Getting the dataset from 15-01-2019 to 30-12-2023...
root 2024-10-24 17:58:46,229: INFO: Getting data for hopitalfeatures from 2019-01-15 00:00:00 to 2023-12-30 00:00:00, at a 1D frequency
root 2024-10-24 17:58:46,233: INFO: Getting data for airqualityfeatures from 2019-01-15 00:00:00 to 2023-12-30 00:00:00, at a 1D frequency
root 2024-10-24 17:58:46,243: INFO: Getting data for epidemiologicalfeatures from 2019-01-15 00:00:00 to

Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Dropped columns with zero variance: []
Column 'PM10_FR26010' is constant at 15.0 for 100.00% of the rows.
Column 'PM10_FR26094' is constant at 19.0 for 100.00% of the rows.
Column 'PM25_FR26094' is constant at 6.8 for 100.00% of the rows.
Column 'NO2_FR26010' is constant at 6.9 for 100.00% of the rows.
Column 'NO2_FR26094' is constant at 7.0 for 100.00% of the rows.
Dropped 5 constant columns from both sets: ['PM10_FR26010', 'PM10_FR26094', 'PM25_FR26094', 'NO2_FR26010', 'NO2_FR26094']
X shape: (1153, 37), y shape: (1153, 1)
[ColumnTransformer] .... (1 of 4) Processing pipeline-1, total=   0.0s
[ColumnTransformer] .... (2 of 4) Processing pipeline-3, total=   0.0s
[ColumnTransformer] .... (3 of 4) Processing pipeline-

In [13]:
# Grab the table held by the dataset object; leaving `data` as the last
# expression of the cell makes the notebook render it.
data = baseTabularDataset.data
data

Unnamed: 0_level_0,nb_emmergencies_CHU Dijon,O3_FR26005,O3_FR26010,PM10_FR26005,PM10_FR26014,PM25_FR26005,NO2_FR26005,NO2_FR26014,inc_grippe,inc_diarrhee,inc_varicelle,inc_ira,meteo_CHU DIJON_0_tavg,meteo_CHU DIJON_0_tmin,meteo_CHU DIJON_0_tmax,meteo_CHU DIJON_0_prcp,meteo_CHU DIJON_0_snow,meteo_CHU DIJON_0_wdir,meteo_CHU DIJON_0_wspd,meteo_CHU DIJON_0_pres,bankHolidays,eveBankHolidays,holidays,borderHolidays,confinement1,confinement2,couvrefeux,ramadan,before_HNFC_moving,during_HNFC_moving,after_HNFC_moving,before_COVID,during_COVID,after_COVID,foot,nb_accidents,date,target_nb_emmergencies_CHU Dijon%%J+3%%mean_3J
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
2019-01-17,230,53.0,59.0,16.0,13.0,8.7,17.0,20.0,1431,1047,48,0,5.6,1.0,8.5,0.0,0.0,224.0,16.7,1013.4,False,False,False,False,False,False,False,False,False,False,True,True,False,False,17.0,1.0,2019-01-17,194.333333
2019-01-18,221,39.0,52.0,18.0,12.0,7.2,27.0,29.0,1431,1047,48,0,-1.1,-4.8,3.8,0.8,0.0,351.0,9.5,1020.1,False,False,False,False,False,False,False,False,False,False,True,True,False,False,15.0,0.0,2019-01-18,187.000000
2019-01-19,230,19.0,27.0,23.0,22.0,16.0,27.0,28.0,1431,1047,48,0,-3.1,-6.4,-1.6,0.0,10.0,18.0,3.8,1016.0,False,False,False,False,False,False,False,False,False,False,True,True,False,False,35.0,1.0,2019-01-19,195.000000
2019-01-20,211,12.0,15.0,39.0,39.0,32.0,29.0,29.0,1431,1047,48,0,0.6,-2.4,1.3,0.8,10.0,5.0,3.0,1014.5,False,False,False,False,False,False,False,False,False,False,True,True,False,False,18.0,0.0,2019-01-20,221.333333
2019-01-21,217,29.0,41.0,30.0,28.0,23.0,22.0,20.0,3891,745,217,0,0.6,-0.9,1.6,5.6,10.0,13.0,10.2,1020.8,False,False,False,False,False,False,False,False,False,False,True,True,False,False,11.0,0.0,2019-01-21,233.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-20,183,42.0,50.0,12.0,10.0,6.6,15.0,24.0,3419,902,55,6489,5.7,1.9,7.5,1.5,0.0,233.0,11.9,1024.5,False,False,False,False,False,False,False,False,False,False,True,False,False,True,0.0,0.0,2023-12-20,180.333333
2023-12-21,187,54.0,63.0,6.0,5.0,3.3,8.4,18.0,3419,902,55,6489,8.4,5.3,11.0,0.5,0.0,211.0,19.1,1016.9,False,False,False,False,False,False,False,False,False,False,True,False,False,True,0.0,0.0,2023-12-21,176.333333
2023-12-22,182,69.0,69.0,8.0,6.0,3.0,6.8,24.0,3419,902,55,6489,9.2,7.6,11.1,3.3,0.0,260.0,17.8,1018.0,False,False,False,True,False,False,False,False,False,False,True,False,False,True,2.0,0.0,2023-12-22,178.000000
2023-12-23,198,63.0,63.0,5.0,5.0,2.8,6.1,18.0,3419,902,55,6489,7.6,7.1,2.7,0.2,0.0,204.0,14.4,1023.4,False,False,True,False,False,False,False,False,False,False,True,False,False,True,0.0,0.0,2023-12-23,171.333333


In [14]:
# Write every split of the dataset to a CSV file in the current working
# directory. The index is not written (index=False).
# Each pair is (attribute of the dataset object, output file name).
_csv_exports = (
    ('train_set', 'train.csv'),
    ('test_set', 'test.csv'),
    ('val_set', 'val.csv'),
    ('enc_X_train', 'enc_X_train.csv'),
    ('enc_X_test', 'enc_X_test.csv'),
    ('enc_X_val', 'enc_X_val.csv'),
    ('y_train', 'y_train.csv'),
    ('y_test', 'y_test.csv'),
    ('y_val', 'y_val.csv'),
)
for _attribute_name, _csv_name in _csv_exports:
    getattr(baseTabularDataset, _attribute_name).to_csv(_csv_name, index=False)