# Stream Dataset

In [15]:
import os
import sys
# Put the parent of the working directory first on the import path so that the
# `src` package can be imported. ('__file__' is a quoted string in the usual
# notebook idiom, so its dirname is '' and only the '..' component matters.)
sys.path.insert(0, os.path.abspath(os.pardir))

In [16]:
from src.encoding.encoders import *
from src.encoding.tools import create_encoding_pipeline
from src.models.sklearn_api_model import save_object, Model
from src.models.sklearn_api_models_config import get_model
from src.datasets.base_tabular_dataset import BaseTabularDataset
from src.experiments.base_experiment import BaseExperiment
import src.features as ft
import logging
import pandas as pd
import pathlib
import numpy as np
from datetime import datetime as dt
from src.location.location import Location
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')
from sklearn.metrics import mean_absolute_error, mean_squared_error
from pmdarima import auto_arima

In [17]:
# Root directory of the project: the parent of the working directory, as a Path.
# (dirname('__file__') on a string literal is '', so only '..' contributes.)
root_dir = pathlib.Path(os.path.abspath(os.pardir))

In [18]:
# Root logger, shared with every module that receives it, set to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Configure logging once: records go through a StreamHandler and are rendered
# as "<logger name> <timestamp>: <level>: <message>".
logging.basicConfig(
    level=logging.INFO,
    encoding="utf-8",
    format="%(name)s %(asctime)s: %(levelname)s: %(message)s",
    handlers=[logging.StreamHandler()],
)

## Load dataset

In [19]:
# Establishments to load; the trailing comment keeps the other candidate values.
ETABS = ['CHU Dijon'] # ['CHU Dijon', 'CH Beaune', 'CH Semur', 'CH Chatillon Montbard', 'CH privé Dijon']
# Date bounds of the initial dataset. Presumably day-month-year, matching the
# '%d-%m-%Y' format parsed by update() further down — TODO confirm.
DATE_START = '01-01-2019'
DATE_END = '06-12-2020'

In [20]:
# Select the features to be used in the dataset
# NOTE(review): HospitalFeatures is passed as a configured instance while every
# other entry is passed as a class — presumably BaseTabularDataset accepts both
# forms; confirm.
ars_features_class = [
    ft.HospitalFeatures(include_emmergency_arrivals=True, include_nb_hospit=False),
    ft.AirQualityFeatures,
    ft.GoogleTrendFeatures,
    ft.MeteorologicalFeatures,
    ft.SociologicalFeatures,
    ft.PopulationFeatures,
    ft.SportsCompetitionFeatures,
    ft.TrafficFeatures,
    ft.EpidemiologicalFeatures
    ]

In [21]:
# Configuration used when fetching the data: date range, data folder, locations.
fetch_config = dict(
    data_start=DATE_START,
    data_stop=DATE_END,
    data_dir=root_dir / 'data',
    locations=ETABS,
)

In [22]:
# Select the target columns to be predicted
target_columns = ['nb_emmergencies']
# Alternative targets, kept for reference:
# target_columns = ['nb_vers_hospit']
# target_columns = ['Y75T79']

In [23]:
# Encoding scheme handed to create_encoding_pipeline. Layout:
#   column dtype -> treatment name -> {'imputers': [...], 'encoders': [...]}
encoders_dict = {
    # Numeric columns: mean imputation, then standard scaling.
    'number': {
        'as_number': {
            'imputers': [imputers.SimpleImputer(strategy='mean')],
            'encoders': [ne.StandardScaler()],
        },
    },
    # Categorical columns: most-frequent imputation, then target encoding.
    'category': {
        'as_category': {
            'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
            'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
        },
    },
    # Datetime columns get two treatments: extracted date features encoded as
    # cyclical numbers, and extracted as categories then target-encoded.
    'datetime': {
        'as_number': {
            'imputers': [de.DateFeatureExtractor()],
            'encoders': [ne.CyclicalFeatures(drop_original=True)],
        },
        'as_category': {
            'imputers': [de.DateFeatureExtractor(dtype='category')],
            'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
        },
    },
    # Boolean columns: most-frequent imputation, then boolean encoding.
    'boolean': {
        'as_number': {
            'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
            'encoders': [ne.BooleanEncoder()],
        },
    },
}

In [24]:
# Build the encoding pipeline from the scheme declared in encoders_dict
pipeline = create_encoding_pipeline(encoders_dict)

Creating encoding pipeline


In [25]:
# Display the pipeline object to inspect how it was assembled
pipeline

In [26]:
# Define the splitting scheme to create the sets
# test_size / val_size are presumably fractions of the rows, and shuffle=False
# presumably keeps the rows in their original (chronological) order — confirm
# against the dataset class.
split_config = {'test_size': 0.2, 'val_size': 0.2, 'shuffle': False}

In [27]:
# Define the configuration of the dataset
# - 'shift' and 'targets_history_shifts' are range(1, 8, 1), i.e. the values 1..7.
# - 'targets_shift' is -1; update() further down uses its absolute value as a
#   number of days added to the requested end date.
# NOTE(review): 'targets_locations' is hard-coded to ['CHU Dijon'] instead of
# being derived from ETABS; the two only agree while ETABS == ['CHU Dijon'].
dataset_config = {
    'from_date': DATE_START,
    'to_date': DATE_END,
    'locations': ETABS,
    'axis': 'rows',
    'shift': range(1, 8, 1),
    'rolling_window': [7, 14, 31, 365],
    'freq': '1D',
    'split_config': split_config,
    'create_X_y': True,
    'encoding_pipeline': pipeline,
    'targets_names': target_columns,
    'targets_shift': -1,
    'targets_rolling_window': 0,
    'targets_history_shifts': range(1, 8, 1),
    'targets_history_rolling_windows': [7, 14, 31, 365],
    'targets_locations': ['CHU Dijon'],
    'drop_constant_thr': 1.0,
    'data_dir': root_dir / 'data'
    }

In [28]:
# Create the dataset: the data is fetched from the source, then get_dataset()
# fills the attributes (X and y) of the different sets along with their encodings.
arsTabularDataset = BaseTabularDataset(
    features_classes=ars_features_class,
    logger=logger,
    fetch_config=fetch_config,
    getter_config=dataset_config,
)

root 2024-12-10 12:16:52,266: INFO: Initialisation de la classe BaseTabularDataset
root 2024-12-10 12:16:52,302: INFO: Initialisation des features
root 2024-12-10 12:16:52,309: INFO: Fetching dataset
root 2024-12-10 12:16:52,311: INFO: Fetching data for hospitalfeatures at CHU Dijon


hospitalfeatures 2024-12-10 12:16:52,640: INFO: hospitalfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:16:52,667: INFO: Fetching data for airqualityfeatures at CHU Dijon
root 2024-12-10 12:16:52,851: INFO: airqualityfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:16:52,864: INFO: Fetching data for googletrendfeatures at CHU Dijon
root 2024-12-10 12:16:53,026: INFO: googletrendfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:16:53,041: INFO: Fetching data for meteorologicalfeatures at CHU Dijon
root 2024-12-10 12:16:53,211: INFO: meteorologicalfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:16:53,223: INFO: Fetching data for sociologicalfeatures at CHU Dijon
root 2024-12-10 12:16:53,371: INFO: sociologicalfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:16:53,387: INFO: Fetching data for populationfeatures at CHU Dijon
root 2024-12-10 12:16:53,592: INFO: populationfeatures's data already fetched for CHU 

Dropped 164 constant columns from both sets: {'trend_éruption cutanée', 'PM10_FR26094', 'inc_ira', 'NO2_FR26094', 'trend_mal de tête', 'trend_épilepsie', 'trend_palpitations', 'NO2_FR26010', 'trend_vomissements', 'trend_gastro-entérite', 'PM10_FR26010', 'trend_SOS médecin', 'PM25_FR26094'}
X shape: (451, 1734), y shape: (451, 1)
[ColumnTransformer] .... (1 of 6) Processing pipeline-1, total=   0.1s
[ColumnTransformer] .... (2 of 6) Processing pipeline-2, total=   0.0s
[ColumnTransformer] .... (3 of 6) Processing pipeline-3, total=   0.1s
[ColumnTransformer] .... (4 of 6) Processing pipeline-4, total=   0.1s


root 2024-12-10 12:16:59,468: INFO: 47 features not encoded (same unit as target)


[ColumnTransformer] .... (5 of 6) Processing pipeline-5, total=   0.1s
[ColumnTransformer] ..... (6 of 6) Processing remainder, total=   0.0s


In [29]:
# Grab the table held by the dataset and show its (rows, columns) size
df = arsTabularDataset.data
df.shape

(705, 1782)

In [30]:
# Preview the table built for the initial date range
df

Unnamed: 0_level_0,O3_FR26005%%J-3,trend_fièvre%%J-5,trend_douleur%%mean_31J,Y60T64%%mean_31J,meteo_pres%%J-7,meteo_snow%%J-3,trend_entorse%%mean_7J,meteo_snow%%std_365J,trend_médecin%%mean_31J,Y50T54%%std_7J,...,trend_infection cutanée,eveBankHolidays%%J-5,trend_allergie%%J-3,Y_LT5%%J-2,trend_hypothermie%%J-7,trend_éruption cutanée%%J-6,trend_stress%%J-1,trend_noyade%%J-5,trend_douleur%%std_31J,trend_méningite%%mean_31J
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01,38.0,0.0,54.483871,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,0.0,False,0.0,26662.0,0.0,0.0,0.0,0.0,25.015823,2.225806
2019-01-02,31.0,0.0,54.838710,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,0.0,False,0.0,26662.0,0.0,0.0,0.0,0.0,25.246910,2.225806
2019-01-03,26.0,0.0,55.193548,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,0.0,False,0.0,26662.0,0.0,0.0,0.0,0.0,25.268451,2.225806
2019-01-04,36.0,0.0,55.548387,33434.354839,1034.5,0.0,0.0,2.666901,2.387097,0.899735,...,0.0,False,0.0,26660.0,0.0,0.0,0.0,0.0,25.400576,2.225806
2019-01-05,62.0,0.0,55.774194,33434.354839,1034.5,0.0,0.0,2.666901,2.387097,0.899735,...,0.0,False,0.0,26659.0,0.0,84.0,0.0,0.0,25.472481,2.225806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-01,16.0,0.0,51.000000,33712.129032,1026.1,0.0,0.0,0.000000,2.096774,0.000000,...,0.0,False,0.0,26160.0,0.0,0.0,0.0,0.0,16.769019,0.000000
2020-12-02,26.0,0.0,52.354839,33712.000000,1021.1,0.0,0.0,0.000000,2.096774,0.000000,...,0.0,False,0.0,26160.0,0.0,0.0,0.0,0.0,13.975093,0.000000
2020-12-03,24.0,0.0,53.193548,33711.870968,1022.1,0.0,0.0,0.000000,2.096774,0.000000,...,72.0,False,0.0,26160.0,0.0,0.0,0.0,0.0,13.739043,0.000000
2020-12-04,22.0,0.0,52.903226,33711.741935,1018.9,0.0,0.0,0.000000,2.096774,0.000000,...,0.0,False,0.0,26160.0,0.0,0.0,0.0,0.0,13.597438,0.000000


In [31]:
# Snapshot of the column labels before any update, used later to compare columns
col1 = df.columns

## Update dataset

In [32]:
def update(dataset, date_to=None, steps=1):
    """Move the end date of ``dataset`` forward, then re-fetch and rebuild it.

    The new end date is written to both ``dataset.getter_config['to_date']``
    and ``dataset.fetch_config['data_stop']`` before ``fetch_dataset`` and
    ``get_dataset(..., inplace=True)`` are called, so ``dataset`` is modified
    in place. In every case ``abs(getter_config['targets_shift'])`` extra days
    are added on top of the requested end date.

    Parameters
    ----------
    dataset
        Object exposing ``getter_config``, ``fetch_config``, ``data``,
        ``fetch_dataset`` and ``get_dataset`` (here a ``BaseTabularDataset``).
    date_to : str, datetime, pandas.Timestamp or None
        Requested end date. Strings must follow ``'%d-%m-%Y'``; ``datetime``
        and ``Timestamp`` objects are accepted as-is. When ``None``, the end
        date is the last index value of ``dataset.data`` plus ``steps`` days.
    steps : int
        Number of days to add when ``date_to`` is ``None``; ignored otherwise.

    Returns
    -------
    The same ``dataset`` object, after the refresh.

    Raises
    ------
    ValueError
        If ``date_to`` is a string that does not match ``'%d-%m-%Y'``.
    """
    date_format = '%d-%m-%Y'
    # The sign of the target shift is irrelevant here: only its size in days
    # is added to the end date.
    shift = abs(dataset.getter_config['targets_shift'])

    if date_to is None:
        # Relative mode: move forward from the last row currently held.
        end = dataset.data.index[-1] + pd.Timedelta(days=steps + shift)
    else:
        # Absolute mode: strings are parsed, datetime-likes are used directly
        # (previously a non-string value raised TypeError in strptime).
        if isinstance(date_to, str):
            date_to = dt.strptime(date_to, date_format)
        end = pd.Timestamp(date_to) + pd.Timedelta(days=shift)
    date_to = end.strftime(date_format)

    # Both configurations must carry the same end date before refreshing.
    dataset.getter_config['to_date'] = date_to
    dataset.fetch_config['data_stop'] = date_to
    dataset.fetch_dataset(**dataset.fetch_config)
    dataset.get_dataset(**dataset.getter_config, inplace=True)
    return dataset

### Increment until a given date

In [33]:
# Extend the dataset up to an explicit date ('%d-%m-%Y' string), then rebind df
# to the refreshed table
arsTabularDataset = update(arsTabularDataset, date_to='09-12-2024')
df = arsTabularDataset.data

root 2024-12-10 12:17:00,086: INFO: Fetching dataset
root 2024-12-10 12:17:00,094: INFO: Fetching data for hospitalfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
hospitalfeatures 2024-12-10 12:17:00,100: INFO: hospitalfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:17:00,141: INFO: Fetching data for airqualityfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
root 2024-12-10 12:17:00,144: INFO: airqualityfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:17:00,163: INFO: Fetching data for googletrendfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
root 2024-12-10 12:17:00,165: INFO: googletrendfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:17:00,185: INFO: Fetching data for meteorologicalfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
root 2024-12-10 12:17:00,187: INFO: meteorologicalfeatures's 

Dropped 90 constant columns from both sets: {'PM10_FR26094', 'NO2_FR26094', 'trend_épilepsie', 'NO2_FR26010', 'PM10_FR26010', 'PM25_FR26094'}
X shape: (1388, 1808), y shape: (1388, 1)
[ColumnTransformer] .... (1 of 6) Processing pipeline-1, total=   0.2s
[ColumnTransformer] .... (2 of 6) Processing pipeline-2, total=   0.0s
[ColumnTransformer] .... (3 of 6) Processing pipeline-3, total=   0.0s
[ColumnTransformer] .... (4 of 6) Processing pipeline-4, total=   0.1s


root 2024-12-10 12:17:05,771: INFO: 47 features not encoded (same unit as target)


[ColumnTransformer] .... (5 of 6) Processing pipeline-5, total=   0.1s
[ColumnTransformer] ..... (6 of 6) Processing remainder, total=   0.0s


In [34]:
# Preview the table after extending it to the requested date
df

Unnamed: 0,O3_FR26005%%J-3,trend_fièvre%%J-5,trend_douleur%%mean_31J,Y60T64%%mean_31J,meteo_pres%%J-7,meteo_snow%%J-3,trend_entorse%%mean_7J,meteo_snow%%std_365J,trend_médecin%%mean_31J,Y50T54%%std_7J,...,eveBankHolidays%%J-5,trend_allergie%%J-3,Y_LT5%%J-2,trend_hypothermie%%J-7,trend_éruption cutanée%%J-6,trend_stress%%J-1,trend_noyade%%J-5,inc_ira%%J-7,trend_douleur%%std_31J,trend_méningite%%mean_31J
2019-01-01,38.0,0.0,54.483871,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,False,0.0,26662.0,0.0,0.0,0.0,0.0,0.0,25.015823,2.225806
2019-01-02,31.0,0.0,54.838710,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,False,0.0,26662.0,0.0,0.0,0.0,0.0,0.0,25.246910,2.225806
2019-01-03,26.0,0.0,55.193548,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,False,0.0,26662.0,0.0,0.0,0.0,0.0,0.0,25.268451,2.225806
2019-01-04,36.0,0.0,55.548387,33434.354839,1034.5,0.0,0.0,2.666901,2.387097,0.899735,...,False,0.0,26660.0,0.0,0.0,0.0,0.0,0.0,25.400576,2.225806
2019-01-05,62.0,0.0,55.774194,33434.354839,1034.5,0.0,0.0,2.666901,2.387097,0.899735,...,False,0.0,26659.0,0.0,84.0,0.0,0.0,0.0,25.472481,2.225806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-05,58.0,0.0,79.000000,33749.000000,1026.1,0.0,0.0,22.617892,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000
2024-12-06,58.0,0.0,79.000000,33749.000000,1028.8,0.0,0.0,22.617892,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000
2024-12-07,58.0,0.0,79.000000,33749.000000,1029.2,0.0,0.0,22.617892,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000
2024-12-08,58.0,0.0,79.000000,33749.000000,1024.9,0.0,0.0,22.617892,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000


In [35]:
# Columns present after the update but absent from the col1 snapshot
print([column for column in df.columns if column not in col1])

['trend_palpitations%%J-6', 'trend_palpitations%%J-2', 'trend_SOS médecin%%J-7', 'trend_gastro-entérite%%J-1', 'trend_mal de tête%%std_14J', 'trend_mal de tête%%J-5', 'trend_SOS médecin%%J-2', 'trend_SOS médecin%%mean_7J', 'trend_gastro-entérite%%J-2', 'trend_vomissements%%mean_7J', 'trend_mal de tête%%std_31J', 'trend_vomissements%%J-6', 'trend_vomissements%%J-3', 'trend_vomissements%%J-4', 'trend_vomissements%%J-2', 'trend_gastro-entérite%%J-6', 'trend_SOS médecin%%mean_31J', 'trend_gastro-entérite%%std_7J', 'trend_SOS médecin%%std_31J', 'trend_mal de tête%%J-1', 'trend_SOS médecin%%J-3', 'trend_gastro-entérite%%mean_31J', 'trend_gastro-entérite%%J-3', 'trend_mal de tête%%J-3', 'trend_vomissements', 'trend_vomissements%%J-5', 'trend_vomissements%%mean_14J', 'trend_SOS médecin%%J-5', 'trend_SOS médecin%%std_14J', 'trend_mal de tête', 'trend_vomissements%%std_14J', 'inc_ira%%J-6', 'trend_mal de tête%%J-7', 'trend_SOS médecin%%J-4', 'trend_vomissements%%std_31J', 'trend_gastro-entérite%

In [36]:
# Columns listed in the col1 snapshot that are no longer in the updated table
print([column for column in col1 if column not in df.columns])

['date']


### Increment by 1 day

In [37]:
# No date given: update() moves forward from the last row by its default steps=1
arsTabularDataset = update(arsTabularDataset)
df = arsTabularDataset.data
df.shape

root 2024-12-10 12:17:06,484: INFO: Fetching dataset
root 2024-12-10 12:17:06,486: INFO: Fetching data for hospitalfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
hospitalfeatures 2024-12-10 12:17:06,492: INFO: hospitalfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:17:06,509: INFO: Fetching data for airqualityfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
root 2024-12-10 12:17:06,512: INFO: airqualityfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:17:06,530: INFO: Fetching data for googletrendfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
root 2024-12-10 12:17:06,532: INFO: googletrendfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:17:06,549: INFO: Fetching data for meteorologicalfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
root 2024-12-10 12:17:06,550: INFO: meteorologicalfeatures's 

Dropped 90 constant columns from both sets: {'PM10_FR26094', 'NO2_FR26094', 'trend_épilepsie', 'NO2_FR26010', 'PM10_FR26010', 'PM25_FR26094'}
X shape: (1388, 1808), y shape: (1388, 1)
[ColumnTransformer] .... (1 of 6) Processing pipeline-1, total=   0.1s
[ColumnTransformer] .... (2 of 6) Processing pipeline-2, total=   0.0s
[ColumnTransformer] .... (3 of 6) Processing pipeline-3, total=   0.0s
[ColumnTransformer] .... (4 of 6) Processing pipeline-4, total=   0.1s


root 2024-12-10 12:17:11,488: INFO: 47 features not encoded (same unit as target)


[ColumnTransformer] .... (5 of 6) Processing pipeline-5, total=   0.1s
[ColumnTransformer] ..... (6 of 6) Processing remainder, total=   0.0s


(2171, 1856)

In [38]:
# Preview the table after the one-day increment
df

Unnamed: 0,O3_FR26005%%J-3,trend_fièvre%%J-5,trend_douleur%%mean_31J,Y60T64%%mean_31J,meteo_pres%%J-7,meteo_snow%%J-3,trend_entorse%%mean_7J,meteo_snow%%std_365J,trend_médecin%%mean_31J,Y50T54%%std_7J,...,eveBankHolidays%%J-5,trend_allergie%%J-3,Y_LT5%%J-2,trend_hypothermie%%J-7,trend_éruption cutanée%%J-6,trend_stress%%J-1,trend_noyade%%J-5,inc_ira%%J-7,trend_douleur%%std_31J,trend_méningite%%mean_31J
2019-01-01,38.0,0.0,54.483871,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,False,0.0,26662.0,0.0,0.0,0.0,0.0,0.0,25.015823,2.225806
2019-01-02,31.0,0.0,54.838710,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,False,0.0,26662.0,0.0,0.0,0.0,0.0,0.0,25.246910,2.225806
2019-01-03,26.0,0.0,55.193548,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,False,0.0,26662.0,0.0,0.0,0.0,0.0,0.0,25.268451,2.225806
2019-01-04,36.0,0.0,55.548387,33434.354839,1034.5,0.0,0.0,2.666901,2.387097,0.899735,...,False,0.0,26660.0,0.0,0.0,0.0,0.0,0.0,25.400576,2.225806
2019-01-05,62.0,0.0,55.774194,33434.354839,1034.5,0.0,0.0,2.666901,2.387097,0.899735,...,False,0.0,26659.0,0.0,84.0,0.0,0.0,0.0,25.472481,2.225806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-06,58.0,0.0,79.000000,33749.000000,1028.8,0.0,0.0,22.617892,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000
2024-12-07,58.0,0.0,79.000000,33749.000000,1029.2,0.0,0.0,22.617892,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000
2024-12-08,58.0,0.0,79.000000,33749.000000,1024.9,0.0,0.0,22.617892,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000
2024-12-09,58.0,0.0,79.000000,33749.000000,1019.8,0.0,0.0,22.617892,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000


### Increment by n days

In [39]:
# Number of days to move forward from the last row currently in the dataset
n = 4
arsTabularDataset = update(arsTabularDataset, steps=n)
df = arsTabularDataset.data
df.shape

root 2024-12-10 12:17:12,177: INFO: Fetching dataset
root 2024-12-10 12:17:12,185: INFO: Fetching data for hospitalfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
hospitalfeatures 2024-12-10 12:17:12,190: INFO: hospitalfeatures's data already fetched for CHU Dijon


root 2024-12-10 12:17:12,204: INFO: Fetching data for airqualityfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
root 2024-12-10 12:17:12,207: INFO: airqualityfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:17:12,227: INFO: Fetching data for googletrendfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
root 2024-12-10 12:17:12,231: INFO: googletrendfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:17:12,247: INFO: Fetching data for meteorologicalfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
root 2024-12-10 12:17:12,249: INFO: meteorologicalfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:17:12,385: INFO: Fetching data for sociologicalfeatures at CHU Dijon is located at Dijon, 21231, Côte-d'Or, Bourgogne-Franche-Comté
root 2024-12-10 12:17:12,387: INFO: sociologicalfeatures's data already fetched for CHU Dijon
root 2024-12-10 12:17:

Dropped 90 constant columns from both sets: {'PM10_FR26094', 'NO2_FR26094', 'trend_épilepsie', 'NO2_FR26010', 'PM10_FR26010', 'PM25_FR26094'}
X shape: (1392, 1808), y shape: (1392, 1)
[ColumnTransformer] .... (1 of 6) Processing pipeline-1, total=   0.1s
[ColumnTransformer] .... (2 of 6) Processing pipeline-2, total=   0.0s
[ColumnTransformer] .... (3 of 6) Processing pipeline-3, total=   0.0s
[ColumnTransformer] .... (4 of 6) Processing pipeline-4, total=   0.1s


root 2024-12-10 12:17:16,763: INFO: 47 features not encoded (same unit as target)


[ColumnTransformer] .... (5 of 6) Processing pipeline-5, total=   0.1s
[ColumnTransformer] ..... (6 of 6) Processing remainder, total=   0.0s


(2175, 1856)

In [40]:
# Preview the table after the n-day increment
df

Unnamed: 0,O3_FR26005%%J-3,trend_fièvre%%J-5,trend_douleur%%mean_31J,Y60T64%%mean_31J,meteo_pres%%J-7,meteo_snow%%J-3,trend_entorse%%mean_7J,meteo_snow%%std_365J,trend_médecin%%mean_31J,Y50T54%%std_7J,...,eveBankHolidays%%J-5,trend_allergie%%J-3,Y_LT5%%J-2,trend_hypothermie%%J-7,trend_éruption cutanée%%J-6,trend_stress%%J-1,trend_noyade%%J-5,inc_ira%%J-7,trend_douleur%%std_31J,trend_méningite%%mean_31J
2019-01-01,38.0,0.0,54.483871,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,False,0.0,26662.0,0.0,0.0,0.0,0.0,0.0,25.015823,2.225806
2019-01-02,31.0,0.0,54.838710,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,False,0.0,26662.0,0.0,0.0,0.0,0.0,0.0,25.246910,2.225806
2019-01-03,26.0,0.0,55.193548,33434.354839,1034.5,0.0,0.0,2.666901,3.225806,0.899735,...,False,0.0,26662.0,0.0,0.0,0.0,0.0,0.0,25.268451,2.225806
2019-01-04,36.0,0.0,55.548387,33434.354839,1034.5,0.0,0.0,2.666901,2.387097,0.899735,...,False,0.0,26660.0,0.0,0.0,0.0,0.0,0.0,25.400576,2.225806
2019-01-05,62.0,0.0,55.774194,33434.354839,1034.5,0.0,0.0,2.666901,2.387097,0.899735,...,False,0.0,26659.0,0.0,84.0,0.0,0.0,0.0,25.472481,2.225806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-10,58.0,0.0,79.000000,33749.000000,1021.7,0.0,0.0,22.617892,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000
2024-12-11,58.0,0.0,79.000000,33749.000000,,,0.0,,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000
2024-12-12,58.0,0.0,79.000000,33749.000000,,,0.0,,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000
2024-12-13,58.0,0.0,79.000000,33749.000000,,,0.0,,16.000000,0.000000,...,False,0.0,24333.0,0.0,0.0,0.0,0.0,6291.0,0.000000,0.000000


### State of the saved file

In [41]:
# Read the meteorological feature file stored under the data directory for
# 'CHU Dijon' and display it to see which dates it currently holds.
# NOTE(review): the location name is hard-coded in the path rather than taken
# from ETABS.
df_file = pd.read_feather(root_dir / 'data/features/meteorologicalfeatures/data_CHU Dijon.feather')
df_file

Unnamed: 0_level_0,meteo_tavg,meteo_tmin,meteo_tmax,meteo_prcp,meteo_snow,meteo_wdir,meteo_wspd,meteo_pres,meteo_station
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-01-01,5.9,4.4,7.2,0.5,0.0,319.0,6.2,1034.5,
2019-01-02,4.6,2.6,6.6,0.0,0.0,343.0,17.4,1037.0,
2019-01-03,1.3,-2.2,4.4,0.0,0.0,5.0,14.1,1039.1,
2019-01-04,1.4,-1.4,5.1,0.0,0.0,359.0,11.0,1038.4,
2019-01-05,3.2,-1.0,5.0,0.0,0.0,323.0,11.0,1036.4,
...,...,...,...,...,...,...,...,...,...
2024-12-06,7.6,4.0,11.0,5.3,0.0,227.0,13.8,1019.3,
2024-12-07,6.1,4.0,7.9,8.0,0.0,207.0,17.7,1010.2,
2024-12-08,5.3,4.8,5.9,5.4,0.0,278.0,12.7,1006.4,
2024-12-09,4.5,2.8,5.1,5.4,0.0,13.0,23.8,1017.1,
