## Read csv files into dataframes

In [6]:
import pandas as pd
import numpy as np
import json
from pathlib import Path

# Folder that holds the input CSV files.
ofcl_data_folder = Path('official-data')

# Location of each individual source file inside that folder.
csv_vac = ofcl_data_folder.joinpath('vac.csv')
csv_vue = ofcl_data_folder.joinpath('vue-densemble.csv')
csv_indic = ofcl_data_folder.joinpath('indicators.csv')
csv_spf = ofcl_data_folder.joinpath('spf.csv')

# Load the files in order, using the first column of each as the row index.
df_vac, df_vue, df_indic, df_spf = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (csv_vac, csv_vue, csv_indic, csv_spf)
)

## Define data types explicitly
Use `pd.Int32Dtype()` to prevent type promotion `int64` -> `float64` when `NaN` is present.

String columns are left untouched: there is no need to cast them to `str`, and casting would turn missing values (`NaN`) into the literal string `'nan'`.

In [7]:
# Target dtype for the columns of `df_spf` that are cast explicitly.
# The `*_date` columns ('vacEhpadUsldPct_date', 'clusters_date') are left
# out on purpose so that they keep the dtype they were read with.
types_spf = dict(
    hospiWeek=pd.Int32Dtype(),
    icuWeek=pd.Int32Dtype(),
    vacEhpadUsldPct=float,
    casesRtPcrCumul=pd.Int32Dtype(),
    casesAntigCumul=pd.Int32Dtype(),
    highVul=pd.Int32Dtype(),
    clusters=pd.Int32Dtype(),
    clusters_ehpad=pd.Int32Dtype(),
)

# `df_vac` and `df_vue` are cast entirely to the nullable 32-bit integer
# dtype, `df_indic` entirely to float, and `df_spf` column by column.
df_vac, df_vue = (
    frame.astype(pd.Int32Dtype())
    for frame in (df_vac, df_vue)
)
df_indic = df_indic.astype(float)
df_spf = df_spf.astype(types_spf)

## Combine columns into `list`

In [8]:
def combine_columns(df, cols):
    '''
    Combine several columns of `df` into one column of lists, in place.

    For every row, the values found in the columns named by `cols` are
    gathered into a list, in the order given by `cols`. When at least one
    of those values is missing, the row gets `NaN` instead of a list.

    Missing values are detected with `isna()` rather than by substituting
    a sentinel string and searching for it afterwards. A genuine `'nan'`
    string is therefore kept as data, and the outcome does not depend on
    the dtypes of the combined columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to modify in place.
    cols : list
        Names of the columns to combine. The combined column keeps the
        name `cols[0]`; the columns `cols[1:]` are dropped from `df`.

    Returns
    -------
    None
        `df` is modified in place.
    '''
    selected = df[cols]

    # True for each row in which at least one of the combined values is missing.
    row_has_na = selected.isna().any(axis=1).tolist()

    # dtype=object keeps every cell as its own Python value, whatever mix of
    # dtypes the selected columns have.
    rows = selected.to_numpy(dtype=object).tolist()

    df[cols[0]] = [
        np.nan if has_na else row
        for row, has_na in zip(rows, row_has_na)
    ]
    df.drop(cols[1:], axis=1, inplace=True)

# Column groups of `df_spf` to fold into a single list-valued column each;
# the first name of every group is the one that survives.
for spf_cols in (
    ['vacEhpadUsldPct', 'vacEhpadUsldPct_date'],
    ['clusters', 'clusters_ehpad', 'clusters_date'],
):
    combine_columns(df_spf, spf_cols)

## Merge dataframes into `df`

In [9]:
# Place the columns of the four dataframes side by side, aligned on their
# row index; `sort=True` asks pandas to sort the resulting row index.
df = pd.concat(
    objs=[df_vac, df_vue, df_indic, df_spf],
    axis='columns',
    sort=True,
)

## Write `df` to json

In [10]:
# beautified json is only for git diff

json_ofcl_data = ofcl_data_folder / 'official-data.json'
json_ofcl_data_beautified = ofcl_data_folder / 'official-data_beautified.json'

# Build the {index: {column: value}} mapping a single time and serialise it
# twice, instead of iterating over every row of `df` once per output file.
# `dropna()` removes the missing values of each row so they are left out of
# the JSON altogether.
records = {index: row.dropna().to_dict() for index, row in df.iterrows()}

json_content = json.dumps(records)
json_content_beautified = json.dumps(records, indent=4)

# Explicit encoding so the written bytes do not depend on the platform's
# default text encoding.
with open(json_ofcl_data, 'w', encoding='utf-8') as f:
    f.write(json_content)
with open(json_ofcl_data_beautified, 'w', encoding='utf-8') as f:
    f.write(json_content_beautified)