In [13]:
import pandas as pd
from pathlib import Path
import os
import numpy as np

Define paths

In [14]:
# Base directory: three levels above the notebook's current working directory
# (presumably the repository root -- depends on where the kernel was started).
gpl_covid_path = Path.cwd().parent.parent.parent

dir_data_interim = gpl_covid_path.joinpath('data', 'interim', 'iran')
dir_data_processed = gpl_covid_path.joinpath('data', 'processed')

# Input files read by this notebook.
path_iran_interim_adm0 = dir_data_interim.joinpath('adm0', 'IRN_interim.csv')
# NOTE(review): unlike the other paths, this one has no 'adm2' sub-directory --
# confirm this matches the layout of the interim data.
path_iran_interim_adm2 = dir_data_interim.joinpath('IRN_interim.csv')

# Output files written by this notebook.
path_iran_processed_adm0 = dir_data_processed.joinpath('adm0', 'IRN_processed.csv')
path_iran_processed_adm2 = dir_data_processed.joinpath('adm2', 'IRN_processed.csv')

In [15]:
# Read both interim tables with the same options; the `date` column is parsed
# into datetimes on load.
adm0_df, adm2_df = (
    pd.read_csv(interim_csv, parse_dates=['date'])
    for interim_csv in (path_iran_interim_adm0, path_iran_interim_adm2)
)

Clean `adm2_df` and `adm0_df` by dropping the `new_*` columns that are not written to the processed files

In [16]:
# Remove the `new_*` columns from each frame; `drop` raises KeyError if any of
# the listed columns is absent.
adm0_df = adm0_df.drop(
    ['new_confirmed_cases', 'new_deaths_national'], axis='columns')
adm2_df = adm2_df.drop(
    ['new_confirmed_cases', 'new_confirmed_cases_imputed'], axis='columns')

In [17]:
def convert_non_monotonic_to_nan(array):
    """Marks values that break non-decreasing order as missing.
    A value is discarded when the next surviving value is smaller than it. The
    pass is repeated over the survivors until they are non-decreasing, so the
    last non-missing value is always kept.
    Args:
        array (array-like [N,]): input values; NaNs already present are treated
            as missing and never cause neighbouring values to be discarded
    Returns:
        numpy.ndarray [N,]: float array with some values marked as missing (NaN),
            all non-missing values are monotonically increasing (non-decreasing)
    Usage:
        >>> convert_non_monotonic_to_nan(np.array([0, 0, 5, 3, 4, 6, 3, 7, 6, 7, 8]))
        np.array([ 0.,  0., np.nan,  3., np.nan, np.nan,  3., np.nan,  6.,  7.,  8.])
    """
    # `np.float` was removed from NumPy; the builtin `float` is the same dtype.
    values = np.asarray(array, dtype=float)
    out_array = np.full(values.shape, np.nan)

    # Start from the positions that actually hold a value. A NaN compares False
    # against everything, so leaving it in would also knock out its valid
    # predecessor (and a trailing NaN would knock out every value).
    keep = np.flatnonzero(~np.isnan(values))

    # Zero or one survivor is trivially monotonic; this also covers empty input.
    while keep.size > 1:
        kept_values = values[keep]
        # Entry i is True when the next survivor is >= survivor i; the final
        # survivor has no successor and is always retained.
        is_monotonic_array = np.append(kept_values[1:] >= kept_values[:-1], True)
        if is_monotonic_array.all():
            break
        keep = keep[is_monotonic_array]

    out_array[keep] = values[keep]
    return out_array

def log_interpolate(array):
    """Interpolates assuming log growth.
    Missing values are filled by linear interpolation of log(value + 0.1) over
    the array index, then exponentiated and rounded to the nearest integer.
    Missing values before the first or after the last observation take the
    value of that nearest observation (the edge behaviour of `np.interp`).
    Args:
        array (array-like [N,]): input array with missing values (NaN); values
            must be greater than -0.1 for the logarithm to be defined
    Returns:
        numpy.ndarray [N,]: int32 array, all missing values will be filled
    Raises:
        ValueError: if the array is non-empty but every value is missing
    Usage:
        >>> log_interpolate(np.array([0, np.nan, 2, np.nan, 4, 6, np.nan, 7, 8]))
        np.array([0, 0, 2, 3, 4, 6, 7, 7, 8])
    """
    # float64 keeps the log/exp round trip exact for observed counts; in float32
    # the round trip can shift values in the millions by one or more.
    values = np.asarray(array, dtype=np.float64)
    if values.size == 0:
        return np.zeros(0, dtype=np.int32)

    observed = ~np.isnan(values)
    if not observed.any():
        raise ValueError("cannot interpolate: all values are missing")

    idx = np.arange(len(values))
    # The 0.1 offset keeps log() finite for zero counts; it is not subtracted
    # afterwards because rounding removes it for integer-valued inputs.
    log_observed = np.log(values[observed] + 1e-1)
    interp_array = np.interp(x=idx, xp=idx[observed], fp=log_observed)
    return np.round(np.exp(interp_array)).astype(np.int32)

def impute_cumulative_array(array):
    """Makes a series cumulative by re-estimating the values that break monotonicity.
    Values that violate non-decreasing order are first marked as missing, then
    every missing value is filled by logarithmic interpolation.
    Args:
        array (array-like [N,]: numpy.ndarray, pandas.Series, etc.): input values
    Returns:
        numpy.ndarray [N,]: all non-monotonic values will be filled by logarithmic interpolation
    Usage:
        >>> impute_cumulative_array(np.array([0, 0, 5, 3, 4, 6, 3, 7, 6, 7, 8]))
        np.array([0, 0, 1, 3, 3, 3, 3, 4, 6, 7, 8])
    """
    with_gaps = convert_non_monotonic_to_nan(np.array(array))
    return log_interpolate(with_gaps)

def impute_cumulative_df(df, src_col, dst_col, groupby_col):
    """Imputes a cumulative column group by group and returns the result.
    Rows are processed in the order they appear within each group, so `df`
    should already be ordered (e.g. chronologically) within every group.
    Args:
        df (pandas.DataFrame): input DataFrame with a cumulative column
        src_col (str): name of cumulative column to impute
        dst_col (str): name of imputed cumulative column; may equal `src_col`
            to overwrite it, and is created if it does not exist yet
        groupby_col (str): name of column containing names of administrative units,
            values should correspond to groups whose values should be accumulating
    Returns:
        pandas.DataFrame: a copy of `df` with a newly imputed column specified by `dst_col`;
            the input frame itself is not modified. Rows whose `groupby_col` value is
            missing are not imputed (they hold -1 when `dst_col` is newly created).
    Raises:
        ValueError: if `src_col` is not a column of `df`
    Usage:
        >>> df = pandas.DataFrame({'cases': [0, 5, 3, 2, 6], 'unit': ['a', 'b', 'a', 'a', 'b']})
        >>> impute_cumulative_df(df, 'cases', 'imputed', 'unit')['imputed'].tolist()
        [0, 5, 0, 2, 6]
    """
    if src_col not in df.columns:
        raise ValueError(f"'{src_col}' not found")

    # Work on a copy so the caller's frame is left untouched, as documented.
    df = df.copy()

    if dst_col not in df.columns:
        # Placeholder, overwritten below for every row that belongs to a group.
        df[dst_col] = -1

    # NaN labels never match `==`, so they would select an empty group; skip them.
    for adm_name in df[groupby_col].dropna().unique():
        in_group = df[groupby_col] == adm_name
        # Write only the destination column instead of re-assigning whole rows.
        df.loc[in_group, dst_col] = impute_cumulative_array(df.loc[in_group, src_col])

    return df

In [18]:
imputed_suffix = "_imputed"
cumulative_prefix = "cum_"

# Source and destination are the same column, so the imputed values overwrite it.
src_col = cumulative_prefix + 'confirmed_cases' + imputed_suffix
dst_col = src_col

# The imputation walks each unit's rows in frame order, so the rows must be in
# chronological order *before* it runs; sorting only afterwards would leave the
# monotonicity check operating on whatever order the CSV happened to have.
adm2_df = impute_cumulative_df(
    adm2_df.sort_values(['date', 'adm2_name']), src_col, dst_col, 'adm2_name')

In [21]:
# Order rows by date first, then by adm2 unit name within each date.
sort_keys = ['date', 'adm2_name']
adm2_df = adm2_df.sort_values(by=sort_keys)

In [22]:
# Template CSV whose columns the Iran frames are compared against below.
template_path = dir_data_processed / '[country]_processed.csv'
template = pd.read_csv(template_path)

In [23]:
# Columns of the adm0 frame that the template does not define (empty set = none).
set(adm0_df.columns).difference(template.columns)

set()

In [24]:
# Columns of the adm2 frame that the template does not define (empty set = none).
set(adm2_df.columns).difference(template.columns)

set()

In [25]:
# Write the adm2 table to its processed location, without the index column.
adm2_df.to_csv(path_or_buf=path_iran_processed_adm2, index=False)

In [26]:
# Write the adm0 table to its processed location, without the index column.
adm0_df.to_csv(path_or_buf=path_iran_processed_adm0, index=False)