In [None]:
import time
import json
import numpy as np
import pandas as pd
import covasim as cv
from pyDOE import lhs
from src.data import get_regional_data
from src.interventions import get_sampling_interventions

# Data Generation

## Initial Configuration

Once the simulator is calibrated, we would like to use it to predict possible future evolutions of the pandemic.

Given that *Covasim* needs a warm-up period to reach a more stable pandemic condition, we generate the dataset as follows:
* we run a simulation for a total of *zones_starting_day $+$ num_zones $\cdot$ time_interval days*
* we discard the first *zones_starting_day* days (251, in our case), meaning that we will collect data from 01/11/2020 on, which is when the evolution of the pandemic started having a behaviour more similar to the one we have right now (i.e., after the summer period)
* we aggregate the data on a time window of size *2 $\cdot$ time_interval* (6 weeks, in our case), in which we consider the first half of the data as "inputs", and the last half as "outputs"
  * right in the middle of each window there is a zone change
  * this zone change is used as an input of the surrogate model, together with the other data

In this way, we aim at maximizing the number of samples that we can get from each run, while trying at the same time to cover the data space in a (hopefully) almost uniform way.

In [None]:
# Length, in days, of each zone interval (3 weeks); a sample window spans two of them.
time_interval = 21
# Number of consecutive zones applied during a single simulation run.
num_zones = 8
# Warm-up days at the start of every run whose results are discarded.
zones_starting_day = 251

We can now retrieve the parameters' configuration that we obtained from the calibration step and set the length of the simulation from the previously set values.

In [None]:
# Read the configuration produced by the calibration step.
with open('../res/parameters.json', 'r') as json_file:
    j = json.load(json_file)
intervention_params, initial_params = j['intervention_params'], j['initial_params']

# The run covers the warm-up period plus one `time_interval` for every zone.
initial_params.update(n_days=zones_starting_day + time_interval * num_zones)

# NOTE(review): 4.46e6 is presumably the real population of the region, making this
# the real-to-simulated population ratio -- confirm against `get_regional_data`.
population_ratio = 4.46e6 / initial_params['pop_size']
df = get_regional_data(population_ratio)

# Echo the final simulation parameters as the cell output.
initial_params

## Latin-Hypercube Sampling

We have *4* possible categories for the zones, and each run of the simulator will consider a sequence of *8* zone changes. Thus, the total number of possible sequences is $4^8 = 65536$; at an average of 2 minutes per run, simulating all of them would take about 90 days.

This is clearly not desirable, thus we rely on *Latin-Hypercube Sampling* to generate a subset of *300* sequences which is nevertheless expected to cover the sequence space almost uniformly. At the same average of 2 minutes per run, the generation phase will take about 10 hours.

In [None]:
# Zone colour associated with each quarter of the unit interval.
zone_labels = {0: 'W', 1: 'Y', 2: 'O', 3: 'R'}

# One row per simulation run, one column per zone slot.
samples = pd.DataFrame(lhs(n=num_zones, samples=300))
for col in range(num_zones):
    # Bucket each sampled value into one of 4 equal-width bins, then name the bin.
    bins = samples[col].map(lambda u: int(4 * u))
    samples[col] = bins.map(zone_labels)
samples

We can start the generation script, from which we will collect the data about *hospitalized* individuals and cumulative *diagnosed* cases and *deaths*.

In [None]:
# Run one simulation per sampled zone sequence and collect, for every pair of
# consecutive zone intervals, a flat row: [first interval, second interval, 2 zone colours].
data = []
n_runs = samples.shape[0]
pad = len(str(n_runs))
for idx, zones in samples.iterrows():
    # flush explicitly: the line has no newline, so without it the message may only
    # show up after the (long) simulation has already finished
    print(f'Generating samples for simulation {idx + 1:0{pad}}/{n_runs}', end='', flush=True)
    # perf_counter is monotonic, hence safer than time.time() for measuring durations
    start_time = time.perf_counter()
    intervs = get_sampling_interventions(zones, intervention_params, time_interval)
    # the run index doubles as random seed, so every run uses a different seed
    sim = cv.Sim(pars={**initial_params, 'rand_seed': idx}, interventions=intervs, datafile=df)
    sim.run()
    # stack the three series column-wise into an array of shape (num_days, 3):
    # hospitalized (severe + critical), cumulative diagnoses, cumulative deaths
    result = np.column_stack((
        sim.results['n_severe'].values + sim.results['n_critical'].values,
        sim.results['cum_diagnoses'].values,
        sim.results['cum_deaths'].values,
    ))
    # keep the last num_zones * time_interval days and flatten each zone interval into
    # a single row (day-major: hosp, diag, dead of day 0, then of day 1, ...)
    result = result[-num_zones * time_interval:].reshape(num_zones, -1)
    # pair every interval with the following one and append the two respective zone colours
    data += [
        np.concatenate((inp, out, zones.iloc[z:z + 2]))
        for z, (inp, out) in enumerate(zip(result[:-1], result[1:]))
    ]
    print(f' -- elapsed time: {time.perf_counter() - start_time:.4}s')

## Saving the Dataset

We can now collect the data into a pandas dataframe and store it in a *csv* file to be used to train the surrogate model.

In [None]:
# Feature columns follow the row layout built by the generation loop: for each of the
# 2 * time_interval days, the three measures in the order hosp, diag, dead.
columns = [f'{c}_{d}' for d in range(0, 2 * time_interval) for c in ['hosp', 'diag', 'dead']]

# Let pandas display every column: three measures per day plus the two zone columns
# (the previous limit of 2 * time_interval + 2 ignored the three measures per day).
pd.options.display.max_columns = len(columns) + 2

data = pd.DataFrame(data, columns=columns + ['init_zone', 'actuated_zone'])
# Persist the dataset used to train the surrogate model.
data.to_csv('../res/dataset.csv', index=False)
data