### Imports

In [1]:
import os
import sys
# Put the parent of the current working directory at the front of the import
# path (presumably so that the `src.*` imports in the following cells resolve).
# '__file__' is deliberately a string literal: its dirname is '', so the path
# is resolved relative to the working directory of the notebook kernel.
sys.path.insert(
    0,
    os.path.abspath(os.path.join(os.path.dirname('__file__'), os.pardir)),
)

In [2]:
# %pip install -r ../requirements.txt

In [3]:
from src.encoding.encoders import *
from src.encoding.tools import create_encoding_pipeline
from src.models.sklearn_api_model import save_object, Model
from src.models.sklearn_api_models_config import get_model
from src.datasets.base_tabular_dataset import BaseTabularDataset
from src.experiments.base_experiment import BaseExperiment
import src.features as ft
import logging
import pandas as pd
import pathlib
import numpy as np

1 GPU(s) detected.


In [4]:
import logging
import os
import sys
import datetime as dt
from typing import List, Union, Optional
import pathlib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from src.datasets.base_tabular_dataset import BaseTabularDataset
from src.encoding.tools import create_encoding_pipeline
from src.experiments.features_selection import get_features, explore_features
from src.models.sklearn_api_model import Model, ModelTree
import src.features as ft
import mlflow.sklearn
import mlflow
import mlflow.data.pandas_dataset
from mlflow.models import infer_signature
import os
import matplotlib.pyplot as plt
%matplotlib widget
import cudf as cd
import numpy as np
import re


### Config

In [5]:
# Configure the root logger (INFO level, "<name> <time>: <level>: <message>"
# format, output through a StreamHandler) and keep a handle on it so it can be
# passed to the project modules.
logging.basicConfig(
    level=logging.INFO,
    encoding="utf-8",
    format="%(name)s %(asctime)s: %(levelname)s: %(message)s",
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger()
# basicConfig is a no-op when the root logger already has handlers, so the
# level is also set explicitly.
logger.setLevel(logging.INFO)

In [6]:
# Root directory of the project, as a pathlib.Path: the parent of the current
# working directory ('__file__' is a string literal here, so its dirname is
# empty and the path is resolved relative to the working directory).
root_dir = pathlib.Path(
    os.path.abspath(os.path.join(os.path.dirname('__file__'), os.pardir))
)

In [7]:
# Empty list; not referenced by any of the visible cells.
# NOTE(review): presumably a placeholder for the full list of hospitals -- confirm whether it is still needed.
ALL_HOSPITALS = []

##### Encoding Pipeline

In [8]:
# Encoding scheme handed to `create_encoding_pipeline`.
# Nesting: first-level key (presumably the column dtype) -> second-level key
# (presumably the representation to encode as) -> {'imputers': [...], 'encoders': [...]}.
encoders_dict = {
    # Numeric columns: mean imputer + standard scaler
    'number': {
        'as_number': {
            'imputers': [imputers.SimpleImputer(strategy='mean')],
            'encoders': [ne.StandardScaler()],
        },
    },
    # Categorical columns: most-frequent imputer + multi-target encoder
    'category': {
        'as_category': {
            'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
            'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
        },
    },
    # Datetime columns are declared twice: once as numbers (cyclical
    # features) and once as categories (multi-target encoder)
    'datetime': {
        'as_number': {
            'imputers': [de.DateFeatureExtractor()],
            'encoders': [ne.CyclicalFeatures(drop_original=True)],
        },
        'as_category': {
            'imputers': [de.DateFeatureExtractor(dtype='category')],
            'encoders': [ne.MultiTargetEncoder(drop_invariant=True, return_df=True)],
        },
    },
    # Boolean columns: most-frequent imputer + boolean encoder
    'boolean': {
        'as_number': {
            'imputers': [imputers.SimpleImputer(strategy='most_frequent')],
            'encoders': [ne.BooleanEncoder()],
        },
    },
}

In [9]:
# Create the encoding pipeline from the scheme declared in `encoders_dict`.
# The resulting object is the one passed as 'encoding_pipeline' in the dataset configuration.
pipeline = create_encoding_pipeline(encoders_dict)

Creating encoding pipeline


In [10]:
# Bare expression: the notebook renders the pipeline object as the cell output.
pipeline

##### Dataset

In [11]:
# Configuration used when fetching the data: start/stop dates, the directory
# holding the data, and the list of locations to fetch.
fetch_config = {
    'data_start': '01-01-2019',
    'data_stop': '31-12-2023',
    'data_dir': root_dir / 'data',
    'locations': [
        'CHU Dijon',
        'CH Beaune',
        'CH Semur',
        'CH Chatillon Montbard',
        'CH privé Dijon',
        'CH Langres',
        'CH Chaumont',
        'HNFC',
        'CHU Besançon',
    ],
}

In [12]:
# Select the features to be used in the dataset.
# The list holds the feature classes themselves (not instances); it is passed
# to BaseTabularDataset through its `features_classes` argument.
ars_features_class = [
    ft.HospitalFeatures,
    ft.AirQualityFeatures,
    ft.EpidemiologicalFeatures,
    # Disabled; note that this entry would be an instance, unlike the other entries:
    # ft.FireFightersFeatures(include_calls=False),
    ft.GoogleTrendFeatures,
    ft.MeteorologicalFeatures,
    ft.SociologicalFeatures,
    ft.SportsCompetitionFeatures,
    ft.TrafficFeatures
    ]

In [13]:
# Select the target columns to be predicted (one active choice; the
# commented-out lines are alternative targets).
# NOTE: the variable name is spelled `target_colomns` (sic). It is referenced
# under that exact name in the dataset configuration ('targets_names'), so any
# rename must be applied in both places.
# target_colomns = ['nb_emmergencies']
target_colomns = ['nb_hospit_np_from_ED_adults']
# target_colomns = ['nb_hospit_np_adults%%J+1%%mean_7J']

In [14]:
# Splitting scheme used to create the sets: sizes of the test and validation
# sets (presumably fractions -- confirm against the dataset class) and
# shuffling disabled.
split_config = {
    'test_size': 0.2,
    'val_size': 0.2,
    'shuffle': False,
}

In [15]:
# Instantiate the dataset from the selected feature classes, the shared logger
# and the fetch configuration. get_dataset() is called afterwards to fill the
# attributes (X and y) of the different sets and their encodings.
arsTabularDataset = BaseTabularDataset(
    features_classes=ars_features_class,
    logger=logger,
    fetch_config=fetch_config,
)

root 2024-11-27 09:51:16,526: INFO: Initialisation de la classe BaseTabularDataset
root 2024-11-27 09:51:16,533: INFO: Initialisation des features
root 2024-11-27 09:51:16,535: INFO: Fetching dataset
root 2024-11-27 09:51:16,799: INFO: hospitalfeatures's data already fetched for CHU Dijon
root 2024-11-27 09:51:17,068: INFO: Fetching hospitalfeatures's data for CH Beaune...
root 2024-11-27 09:51:17,082: INFO: Intégration de la target
root 2024-11-27 09:51:17,083: INFO:   - Chargement des données de CH Beaune depuis le fichier Excel


FileNotFoundError: [Errno 2] No such file or directory: '/home/maxime/Documents/WORKSPACES/forecasting_models/data/features/hospitalfeatures/urgences/exports/export_CH Beaune.csv'

In [None]:
# Export one encoded dataset (and its target columns) per location.
# Make sure the export directory exists first: DataFrame.to_csv does not
# create missing parent directories and would raise OSError otherwise.
datasets_dir = fetch_config['data_dir'] / 'datasets'
datasets_dir.mkdir(parents=True, exist_ok=True)

for location in fetch_config['locations']:
    # Define the configuration of the dataset. It is rebuilt on every
    # iteration so that each get_dataset() call receives fresh list/range
    # objects; only 'locations' and 'targets_locations' vary.
    dataset_config = {
    'from_date': '01-01-2019',
    'to_date': '30-12-2023',
    'locations': [location],
    # 'locations': ['CHU Dijon', 'CH Beaune', 'CH Semur', 'CH Chatillon Montbard', 'CH privé Dijon'],
    # 'axis': 'rows',
    'shift': range(1, 8, 1),
    'rolling_window': [7, 14, 31, 365],
    'freq': '1D',
    'split_config': split_config,
    'create_X_y': True,
    'encoding_pipeline': pipeline,
    'targets_names': target_colomns,
    'targets_shift': -7,
    'targets_rolling_window': 7,
    'targets_history_shifts': range(1, 8, 1),
    'targets_history_rolling_windows': [7, 14, 31, 365],
    'targets_locations': [location],
    'drop_constant_thr': 1.0,
    'data_dir': root_dir / 'data'
    }

    arsTabularDataset.get_dataset(**dataset_config)

    # `enc_data` is presumably the encoded data produced by get_dataset() -- confirm
    df = arsTabularDataset.enc_data

    # Identify the columns that contain NaN values
    cols_with_nan = df.columns[df.isna().any()].tolist()
    print("Colonnes contenant des NaN:", cols_with_nan)

    # For each of those columns, list the index labels of the rows holding a NaN
    nan_indices = {col: df[df[col].isna()].index.tolist() for col in cols_with_nan}
    print(nan_indices)

    # Write the full dataset, then the target columns taken from `.data`
    df.to_csv(datasets_dir / f'full_dataset_{location}.csv')
    df_target = arsTabularDataset.data[arsTabularDataset.targets_names]
    df_target.to_csv(datasets_dir / f'full_dataset_{location}_targets.csv')