# Generate the Dataset

We want to predict possible future evolutions of the pandemic knowing the parameters of the simulator which more or less have been able to describe the trend of the real data for the past year.
These parameters have been computed in the previous notebook.

In [1]:
import json
import pandas as pd

with open('res/parameters.json', 'r') as json_file:
  j = json.load(json_file)

initial_params = j['initial_params']
intervention_params = j['intervention_params']

## Latin-Hypercube Sampling

* beta
* pop_infected
* init_zone
* actuated_zone
* num_tests

TODO: Description

In [2]:
from pyDOE import lhs
import seaborn as sns
from scipy.stats.distributions import norm

cols = ['beta', 'pop_infected', 'initial_zone', 'actuated_zone', 'tests_percentage']

samples = lhs(n=len(cols), samples=10)
samples = pd.DataFrame(samples, columns=cols)

# Transformations
samples['beta'] = norm(0.016, 0.005).ppf(samples['beta']).clip(1e-4)
samples['pop_infected'] = norm(5e-3, 1.5e-3).ppf(samples['pop_infected']).clip(1e-4) * 400e3
samples['initial_zone'] = [ int(4 * v) for v in samples['initial_zone'] ]
samples['actuated_zone'] = [ int(4 * v) for v in samples['actuated_zone'] ]
samples['tests_percentage'] = samples['tests_percentage'] * 0.01

# TODO: Visualization
# sns.histplot(samples['pop_infected']);
samples.describe()

Unnamed: 0,beta,pop_infected,initial_zone,actuated_zone,tests_percentage
count,10.0,10.0,10.0,10.0,10.0
mean,0.015924,2030.499911,1.4,1.6,0.004978
std,0.004973,712.724457,1.173788,1.173788,0.003041
min,0.007563,772.945729,0.0,0.0,0.000998
25%,0.012804,1628.859986,0.25,1.0,0.002806
50%,0.015975,2040.859429,1.5,1.5,0.004619
75%,0.018583,2413.578165,2.0,2.75,0.007461
max,0.023677,3380.718409,3.0,3.0,0.009466


We will consider a shorter time interval, in particular, around a month centered in today.

In [3]:
from util import data
import covasim as cv

pop_size = 400e3
pop_scale = 10
df = data.get_regional_data(scaling_factor=4.46e6/pop_size)

Covasim 2.0.3 (2021-03-11) — © 2021 by IDM


In [22]:
time_interval = 15 # days
result_cols = ['n_severe', 'n_critical', 'cum_diagnoses', 'cum_deaths']
sim_days = range(0, time_interval * 2 + 1)
data_cols = [col + str(day) for day in sim_days for col in result_cols]

Unnamed: 0,n_severe0,n_critical0,cum_diagnoses0,cum_deaths0,n_severe1,n_critical1,cum_diagnoses1,cum_deaths1,n_severe2,n_critical2,...,cum_diagnoses28,cum_deaths28,n_severe29,n_critical29,cum_diagnoses29,cum_deaths29,n_severe30,n_critical30,cum_diagnoses30,cum_deaths30


In [30]:
from util import interventions

initial_params['start_day'] = df['date'].iloc[-1] - pd.Timedelta(time_interval, unit='D')
initial_params['end_day'] = df['date'].iloc[-1] + pd.Timedelta(time_interval, unit='D')

data = []
for _, s in samples.iterrows():
    initial_params['pop_infected'] = s['pop_infected']
    initial_params['beta'] = s['beta']
    intervs = interventions.SamplingInterventions.get_all(
        init_zone=s['initial_zone'],
        actuated_zone=s['actuated_zone'],
        num_tests=samples['tests_percentage']*pop_size,
        actuation_day=time_interval
    )
    sim = cv.Sim(pars=initial_params, interventions=intervs)
    sim.run()
    raw = [result.values for result in sim.results[result_cols]]
    raw = [item for daily in zip(*raw) for item in daily]
    row = pd.Series(raw)
    data.append(row)
data


[0        0.000000
 1        0.000000
 2        0.000000
 3        0.000000
 4        0.000000
           ...    
 119     94.184075
 120    432.988128
 121    112.256181
 122      0.000000
 123    102.202373
 Length: 124, dtype: float64,
 0        0.000000
 1        0.000000
 2        0.000000
 3        0.000000
 4        0.000000
           ...    
 119     34.261402
 120    171.992678
 121     42.998170
 122      0.000000
 123     34.261402
 Length: 124, dtype: float64,
 0         0.0
 1         0.0
 2         0.0
 3         0.0
 4         0.0
         ...  
 119      80.0
 120    3480.0
 121     710.0
 122       0.0
 123     110.0
 Length: 124, dtype: float64,
 0        0.000000
 1        0.000000
 2        0.000000
 3        0.000000
 4        0.000000
           ...    
 119     41.511811
 120    740.000000
 121    100.000000
 122      0.000000
 123     61.511811
 Length: 124, dtype: float64,
 0        0.000000
 1        0.000000
 2        0.000000
 3        0.000000
 4        0.

Finally, we can collect the results in a dataframe.

In [14]:
#dataset = pd.???
#dataset.to_csv('res/dataset.csv')
#dataset


Unnamed: 0,n_severe0,n_critical0,cum_diagnoses0,cum_deaths0,n_severe1,n_critical1,cum_diagnoses1,cum_deaths1,n_severe2,n_critical2,...,cum_diagnoses28,cum_deaths28,n_severe29,n_critical29,cum_diagnoses29,cum_deaths29,n_severe30,n_critical30,cum_diagnoses30,cum_deaths30
