# Preprocessing of data from Paireau et al (2022)

In [3]:
import iisignature as isig
import numpy as np
import pandas as pd
import time
from sklearn.linear_model import Lasso, LinearRegression
import sys
from sklearn.model_selection import train_test_split
import os
import torch
import pickle
from sklearn.preprocessing import StandardScaler 
import matplotlib.pyplot as plt
from matplotlib import rc
import matplotlib.pyplot as plt
import seaborn as sns
import pyreadr

sns.set()
rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
rc('text', usetex=True)

This notebook pre-processed the results from Paireau, J., Andronico, A., Hozé, N., Layan, M., Crepey, P., Roumagnac, A., ... & Cauchemez, S. (2022). An ensemble model based on early predictors to forecast COVID-19 health care demand in France. Proceedings of the National Academy of Sciences, 119(18), e2103302119.

Since this article gives results for predicting gross hospital admissions, we need to process the data to obtain predictions on the growth rate. Please start by downloading the file `Ensemble.rds` containing the results of the aformentioned article by following this link: https://gitlab.pasteur.fr/mmmi-pasteur/covid19-ensemble-model/Results .  

In [5]:
result_PNAS = pyreadr.read_r('SET_ENSEMBLE_PATH_HERE')
result_PNAS = result_PNAS[None]

In [6]:
result_PNAS['date']= pd.to_datetime(result_PNAS['date'])

In [10]:
result_PNAS = result_PNAS.loc[result_PNAS['model']=="Ensemble"]
result_PNAS = result_PNAS.loc[result_PNAS['var']=="iHosp"]


(100139, 31)

In [15]:
regions = ["ARA", "BFC", "BRE", "CVL",  "GES", "HDF", "IDF", "NOR",
                      "NAQ", "OCC", "PDL", "PAC"]


results_RF_temp = pd.DataFrame({'date' : result_PNAS['date'].unique()[2:]})

In [18]:
result_PNAS = result_PNAS.drop(['lower_2', 'lower_5',
       'lower_10', 'lower_20', 'lower_30', 'lower_40', 'lower_50', 'lower_60',
       'lower_70', 'lower_80', 'lower_90', 'upper_90', 'upper_80',
       'upper_70', 'upper_60', 'upper_50', 'upper_40', 'upper_30', 'upper_20',
       'upper_10', 'upper_5', 'upper_2', 'model', 'date_proj', 'lower',
       'upper'],axis=1)

In [None]:
for region in regions:
    for horizon in np.arange(1,15):
        result_PNAS_horizon = result_PNAS.loc[result_PNAS['prediction_horizon'] == horizon].copy()
        result_PNAS_horizon_region = result_PNAS_horizon.loc[result_PNAS_horizon['region'] == region].copy()

        var = 'pred'  + str(region) + str(horizon)
        results_PNAS_temp = results_PNAS_temp.assign(value = np.empty((result_PNAS_horizon_region.shape[0]-2)))
        results_PNAS_temp = results_PNAS_temp.rename(columns={'value' : var})

        pred = np.log(result_PNAS_horizon_region['point'][1:].to_numpy()/ \
                     result_PNAS_horizon_region['point'][:-1].to_numpy())


results_RF = results_RF_temp.copy()

In [20]:
results_PNAS_temp = result_PNAS.copy()
results_PNAS_temp = results_PNAS_temp.reset_index()
results_PNAS_temp["pred_r"] = np.empty(results_PNAS_temp.shape[0])

In [22]:
for region in regions:
    for horizon in np.arange(1,15):
        result_PNAS_horizon = results_PNAS_temp.loc[results_PNAS_temp['prediction_horizon'] == horizon].copy()
        result_PNAS_horizon_region = result_PNAS_horizon.loc[result_PNAS_horizon['region'] == region].copy()
        #print(result_PNAS_horizon_region.shape)
        pred = np.log((result_PNAS_horizon_region['point'][1:].to_numpy() \
                       /result_PNAS_horizon_region['point'][:-1].to_numpy()))
        results_PNAS_temp.loc[result_PNAS_horizon_region.index[1:], 'pred_r'] =  pred
        dates = results_PNAS_temp['date'].unique()[horizon + 1 : ]
        results_PNAS_temp.loc[result_PNAS_horizon_region.index[1:], 'date'] = dates


In [24]:
results_PNAS_temp.rename(columns={'prediction_horizon' : horizon})


Unnamed: 0,index,var,region,date,14,point,pred_r
0,287311,iHosp,ARA,2021-03-07,-1.0,124.839526,1.057374e-200
1,287312,iHosp,ARA,2021-03-07,0.0,124.364461,1.057374e-200
2,287313,iHosp,ARA,2021-03-07,1.0,123.892634,1.057374e-200
3,287314,iHosp,ARA,2021-03-07,2.0,123.908161,1.057374e-200
4,287315,iHosp,ARA,2021-03-07,3.0,123.268286,1.057374e-200
...,...,...,...,...,...,...,...
24006,311317,iHosp,PDL,2021-07-05,-1.0,2.129376,5.018987e-200
24007,311318,iHosp,PDL,2021-07-05,0.0,2.113516,5.018987e-200
24008,311319,iHosp,PDL,2021-07-06,1.0,2.099437,1.544741e-01
24009,311320,iHosp,PDL,2021-07-06,-1.0,2.076958,5.069114e-200


In [None]:
results_PNAS_temp.to_csv("SET_RESULT_PATH")