# Prédiction et Validation du Modèle d'Hospitalisation

Ce notebook combine la construction du modèle de prédiction et sa validation approfondie pour détecter le surapprentissage.

In [6]:
import pandas as pd
import numpy as np
import os
from pycaret.regression import *
from model_validation import (
    plot_learning_curves,
    evaluate_cross_validation,
    plot_prediction_errors,
    calculate_error_statistics
)
from google.cloud import bigquery

## 1. Chargement et Préparation des Données

In [7]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/antob/Documents/Arctusol/projet_wagon/projet_data_JBN/projet-jbn-data-le-wagon-533639ce801d.json"
        # Initialisation du client BigQuery
client = bigquery.Client()

query = """
SELECT * FROM projet-jbn-data-le-wagon.dbt_medical_analysis_join_total_morbidite.class_join_total_morbidite_sexe_population

"""

df_hospi = client.query(query).to_dataframe()


In [11]:
# Chargement des données
df_hospi_dpt_ens = df_hospi[(df_hospi["niveau"] == "Départements") & (df_hospi["sexe"] == "Ensemble")]
df_hospi_dpt_HF = df_hospi[(df_hospi["niveau"] == "Départements") & (df_hospi["sexe"] != "Ensemble")]
df_hospi_reg_ens = df_hospi[(df_hospi["niveau"] == "Régions") & (df_hospi["sexe"] == "Ensemble")]
df_hospi_reg_HF = df_hospi[(df_hospi["niveau"] == "Régions") & (df_hospi["sexe"] != "Ensemble")]

# Création des données annuelles pour le niveau régional
df_hospi_reg_ens_yr = df_hospi_reg_ens.groupby(['nom_region', 'year',"nom_pathologie"]).agg({
    'nbr_hospi': 'sum',
    'population': 'mean',

}).reset_index()

print("Shape du DataFrame:", df_hospi_reg_ens_yr.shape)
df_hospi_reg_ens_yr.head()

Shape du DataFrame: (14274, 5)


Unnamed: 0,nom_region,year,nom_pathologie,nbr_hospi,population
0,Auvergne-Rhône-Alpes,2018-12-31,Accident vascul. cérébral mal défini,640,7997000.0
1,Auvergne-Rhône-Alpes,2018-12-31,Accouchement unique et spontané,43527,7997000.0
2,Auvergne-Rhône-Alpes,2018-12-31,Affect.inflam.org.génitaux féminins,3174,7997000.0
3,Auvergne-Rhône-Alpes,2018-12-31,Affections aiguës voies respir. sup.,4284,7997000.0
4,Auvergne-Rhône-Alpes,2018-12-31,Affections de la glande thyroïde,3203,7997000.0


## 2. Configuration et Entraînement du Modèle avec PyCaret

In [25]:
# Initialisation de PyCaret with the data and year needs to be transformed from datetime to int

reg_setup = setup(
    data=df_hospi_reg_ens_yr,
    target='nbr_hospi',
    fold_strategy = 'timeseries', 
    numeric_features = ['population'], 
    categorical_features = ['nom_region', 'nom_pathologie'],
    ignore_features = ['year'],
    fold = 10, 
    data_split_shuffle = False,
    fold_shuffle = False,
    transform_target = True, 
    session_id = 123
)

# Comparaison des modèles
best_model = compare_models(n_select=1)

# Création du modèle final
final_model = create_model(best_model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,nbr_hospi
2,Target type,Regression
3,Original data shape,"(14274, 5)"
4,Transformed data shape,"(14274, 16)"
5,Transformed train set shape,"(9991, 16)"
6,Transformed test set shape,"(4283, 16)"
7,Ignore features,1
8,Numeric features,1
9,Categorical features,2


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,1217.733,12587284.251,2849.7208,0.7531,0.7508,1.3026,0.089
rf,Random Forest Regressor,1106.1481,10798262.938,2646.6842,0.7514,0.7056,1.14,0.188
knn,K Neighbors Regressor,1370.8299,22980056.42,3642.0989,0.7176,0.7277,1.1636,0.031
lightgbm,Light Gradient Boosting Machine,1371.3137,15611475.6905,3245.2396,0.7085,0.8435,1.8263,0.091
dt,Decision Tree Regressor,1337.2537,16527000.1399,3260.3426,0.6595,0.7584,1.2497,0.028
et,Extra Trees Regressor,1324.1515,14207695.5449,3164.3594,0.6183,0.8733,2.1416,0.167
ada,AdaBoost Regressor,2065.4141,50060398.5684,5106.1563,0.5188,0.8814,1.5653,0.066
lasso,Lasso Regression,5483.1499,227853596.0504,9902.2401,-0.0666,1.3194,3.4539,0.024
llar,Lasso Least Angle Regression,5483.1499,227853596.0504,9902.2401,-0.0666,1.3194,3.4539,0.025
en,Elastic Net,5584.2516,258072831.8403,10328.3299,-0.1273,1.2944,3.2649,0.024


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,905.8928,3300052.0609,1816.6045,0.9323,0.3033,0.28
1,744.4662,2584156.3162,1607.5311,0.9479,0.2659,0.2275
2,1748.0355,17566209.8577,4191.2063,-0.0361,1.4867,3.8602
3,2361.4627,31961881.8291,5653.484,0.7431,0.5224,0.3655
4,884.6787,3096696.0333,1759.7432,0.95,1.061,1.9254
5,362.5239,687715.6089,829.2862,0.8102,1.2228,2.402
6,2523.7156,37885513.3746,6155.1209,0.873,0.3472,0.2559
7,2229.3334,28065837.4825,5297.72,0.9485,0.5061,0.2956
8,248.5115,483607.4561,695.4189,0.8102,0.4792,0.4789
9,168.7094,241172.4912,491.0932,0.5524,1.3132,2.935


## 3. Validation Approfondie du Modèle

In [26]:
# Obtention des données d'entraînement
X = get_config('X')
y = get_config('y')

# 1. Courbes d'apprentissage
print("Traçage des courbes d'apprentissage...")
plot_learning_curves(final_model, X, y)

# 2. Validation croisée détaillée
print("\nÉvaluation de la validation croisée...")
scores, mae_scores, rmse_scores = evaluate_cross_validation(final_model, X, y)

# 3. Prédictions et analyse des erreurs
predictions = predict_model(final_model)
y_true = predictions['nbr_hospi']
y_pred = predictions['prediction_label']

print("\nAnalyse des erreurs de prédiction...")
plot_prediction_errors(y_true, y_pred)

# 4. Statistiques détaillées
print("\nCalcul des statistiques d'erreur détaillées...")
mae, rmse, r2, mape = calculate_error_statistics(y_true, y_pred)

Traçage des courbes d'apprentissage...


ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\pandas\core\arrays\categorical.py", line 564, in astype
    new_cats = new_cats.astype(dtype=dtype, copy=copy)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Auvergne-Rhône-Alpes'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\sklearn\ensemble\_gb.py", line 659, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1263, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\sklearn\utils\validation.py", line 921, in check_array
    array = array.astype(new_dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\pandas\core\generic.py", line 6534, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\pandas\core\internals\managers.py", line 414, in astype
    return self.apply(
           ^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\pandas\core\internals\managers.py", line 354, in apply
    applied = getattr(b, f)(**kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\pandas\core\internals\blocks.py", line 616, in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\pandas\core\dtypes\astype.py", line 238, in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\pandas\core\dtypes\astype.py", line 180, in astype_array
    values = values.astype(dtype, copy=copy)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\antob\Documents\Arctusol\projet_wagon\projet_data_JBN\.venv\Lib\site-packages\pandas\core\arrays\categorical.py", line 575, in astype
    raise ValueError(msg)
ValueError: Cannot cast object dtype to float32


## 4. Interprétation des Résultats

Analysons les différents indicateurs de validation :

1. **Courbes d'apprentissage** : 
   - Si les courbes d'entraînement et de validation sont proches, le modèle généralise bien
   - Un grand écart indique un surapprentissage

2. **Validation croisée** :
   - La variation des scores entre les folds indique la stabilité du modèle
   - Une grande variation suggère une instabilité

3. **Distribution des erreurs** :
   - Une distribution normale centrée sur 0 est idéale
   - Des biais systématiques sont visibles dans le scatter plot

4. **MAPE** :
   - Donne une idée de l'erreur en pourcentage
   - Plus facile à interpréter pour les utilisateurs métier