## Task 1. Compiling Ebola Data

The `DATA_FOLDER/ebola` folder contains summarized reports of Ebola cases from three countries (Guinea, Liberia and Sierra Leone) during the recent outbreak of the disease in West Africa. For each country, there are daily reports that contain various information about the outbreak in several cities in each country.

Use pandas to import these data files into a single `Dataframe`.
Using this `DataFrame`, calculate for *each country*, the *daily average* per year of *new cases* and *deaths*.
Make sure you handle all the different expressions for *new cases* and *deaths* that are used in the reports.

In [217]:
# Import libraries and initialize file paths for each country
import os
import pandas as pd
import numpy as n
from typing import Generator

def get_file_paths(root: str) -> Generator[str, None, None]:
    """
    Returns the paths for all the files in the subtree of `root`
    """
    for (dir_path, _, file_names) in os.walk(root):
        yield from [os.path.join(dir_path, file_name) for file_name in file_names]

gn_file_paths = [*get_file_paths("./data/guinea_data")]
lr_file_paths = [*get_file_paths("./data/liberia_data")]
sl_file_paths = [*get_file_paths("./data/sl_data")]

In [218]:
def normalize_daily_data(raw_frame: pd.core.frame.DataFrame, variable_col):
    """
    Normalize the dataframe obtained by reading a a daily CSV to have
    one row per (region/date) pair and use columns for each variable.
    """
    # Normalize column names
    raw_frame.columns = map(str.lower, raw_frame.columns)
    
    DATE: str = "date"
    VARIABLE: str = variable_col
    REGION: str = "region"
    VALUE: str = "value"
    
    values_stack = []
    var_names = [] # Index of variables

    # Iterate over each variable (row)
    for index, row in raw_frame.iterrows():
        date = row[DATE]
        var_name = row[VARIABLE]
        if var_name in var_names:
            # Ignore variable: it was already defined
            continue
        var_names.append(var_name)
        
        # Iterate over each region (column)
        for col_name, value in row.iteritems():
            if col_name in {DATE, VARIABLE, "western area combined", "national", "totals"}:
                # Ignore: not a region
                continue
            values_stack.append({DATE: pd.to_datetime(date), REGION: col_name, VARIABLE: var_name, VALUE: value})
            
    stack_frame = pd.DataFrame(values_stack)
    pivoted = stack_frame.set_index([DATE, REGION, VARIABLE]).unstack(VARIABLE)
    # Drop to level ("value") of the column's multi index
    pivoted.columns = [col[1] for col in pivoted.columns] 
    return pivoted


## Compute merged dataframes for each country

In [219]:
def normalize_sl(raw_frame: pd.core.frame.DataFrame):
    """
    Normalize the dataframe obtained by reading a Sierra Leone CSV to have
    one row per (region/date) pair and use columns for each variable.
    """
    return normalize_daily_data(raw_frame, "variable")

sl = pd.concat([normalize_sl(pd.read_csv(path)) for path in sl_file_paths])

In [220]:
def normalize_lr(raw_frame: pd.core.frame.DataFrame):
    """
    Normalize the dataframe obtained by reading a Liberia CSV to have
    one row per (region/date) pair and use columns for each variable.
    """
    return normalize_daily_data(raw_frame, "variable")

lr = pd.concat([normalize_lr(pd.read_csv(path)) for path in lr_file_paths])

In [221]:
def normalize_gn(raw_frame: pd.core.frame.DataFrame):
    """
    Normalize the dataframe obtained by reading a Guinea CSV to have
    one row per (region/date) pair and use columns for each variable.
    """
    return normalize_daily_data(raw_frame, "description")

gn = pd.concat([normalize_gn(pd.read_csv(path)) for path in gn_file_paths])

In [222]:
# Ensure that the column names are lowercase
for df in [gn, sl, lr]:
    df.columns = map(str.lower, df.columns)

In [223]:
# Ensure that the percentages are expressed as floats

import math

def percent_to_float(str_or_nan):
    val = str_or_nan
    if type(str_or_nan) == str:
        str_or_nan = str_or_nan.replace("%", "")
        try:
            val = float(str_or_nan)
        except ValueError:
            val = float('nan')
    return val / 100

gn["fatality rate for confirmed and probables"] = gn["fatality rate for confirmed and probables"].map(percent_to_float)
sl["percent_seen"] = sl["percent_seen"].map(percent_to_float)

## Create columns for deaths and new cases analysis

For the Liberia and Sierra Leone, we select already available that are mostly complete.
We select the confirmed cases and deaths to reduce the uncertainty of the data.
For the Guinea, the daily data is too sparse (missing in too many regions/days) so we work with the delta on cumulated values instead.

In [224]:
# Copy columns to normalize the name
lr["$deaths"] = pd.to_numeric(lr["newly reported deaths"])
lr["$new_cases"] = pd.to_numeric(lr["new case/s (confirmed)"])
# See next cell for Guinea
# gn["$deaths"] = pd.to_numeric(gn["total deaths of confirmed"])
# gn["$new_cases"] = pd.to_numeric(gn["total cases of confirmed"])
sl["$deaths"] = pd.to_numeric(sl["death_confirmed"])
sl["$new_cases"] = pd.to_numeric(sl["new_confirmed"])

In [225]:
# Use delta of cumulated values for Guinea
# We split the dataframe per region, compute the deltas and then merge the results.

regional_data_frames = []

gn["total deaths of confirmed"] = pd.to_numeric(gn["total deaths of confirmed"])
gn["total cases of confirmed"] = pd.to_numeric(gn["total cases of confirmed"])
tmp_gn = gn[[*gn.columns]]
tmp_gn.reset_index(inplace=True)
tmp_gn = tmp_gn.set_index(["region", "date"])
tmp_gn.sort_index(inplace=True)
# tmp_gn is now sorted and indexed by region and date

for region in tmp_gn.index.levels[0]:
    regional_df = tmp_gn.loc[region, :].copy()
    # Compute $deaths and $new_cases as the rate of change of "total deaths of confirmed" and "total cases of confirmed"
    # After the diff, replace the first NaN by the original value (assume that the total is zero at `time == -Inf`)
    regional_df["$deaths"] = regional_df["total deaths of confirmed"].fillna(method="pad").diff().clip(0).fillna(regional_df["total deaths of confirmed"])
    regional_df["$new_cases"] = regional_df["total cases of confirmed"].fillna(method="pad").diff().clip(0).fillna(regional_df["total cases of confirmed"])
    regional_df["region"] = region
    regional_data_frames.append(regional_df)

regional_data_frames[0]

gn = pd.concat(regional_data_frames)
gn.reset_index(inplace=True)
#     gn.drop("index")
gn = gn.set_index(["date", "region"])
gn.sort_index(inplace=True)

In [226]:
# Emit merged data frames to the disk (to better view them)

COUNTRY = "country"
REGION: str = "region"
DATE: str = "date"

# gn.rename(columns={'cumulative cfr': 'cfr'}, inplace=True)

gn.to_csv("./gn.csv")
sl.to_csv("./sl.csv")
lr.to_csv("./lr.csv")

# Anotate the country
gn[COUNTRY] = "gn"
sl[COUNTRY] = "sl"
lr[COUNTRY] = "lr"

# Concat all and reindex by country/region/date
ebola = pd.concat([gn, lr, sl])
ebola.reset_index(inplace=True)
ebola = ebola.set_index([COUNTRY, DATE, REGION])
ebola.to_csv("./ebola.csv")

### Daily average per month of new cases and deaths

In [288]:
def duration_to_days(x):
    return x 

def get_monthly_new_cases_and_deaths(df):
    # Keep only the 2 columns we need
    data = df.reset_index(level=1)[["$deaths", "$new_cases"]]
    # Sum over all the regions for each date
    all_regions = data.dropna().groupby("date").sum()
    # Compute "days since last data-point" (to deal with mi)
    all_regions["days"] = all_regions.index
    all_regions["days"] = all_regions["days"].diff() / pd.Timedelta(1, unit='d')
    all_regions["$premultiplied_daily_deaths"] = all_regions["$deaths"] / all_regions["days"]
    all_regions["$premultiplied_daily_new_cases"] = all_regions["$new_cases"] / all_regions["days"]

    result = all_regions.groupby([all_regions.index.year, all_regions.index.month]).sum()
    # Remove temporary columns
    result = result.drop("$deaths", axis=1).drop("$new_cases", axis=1).drop("days", axis=1)
    result.rename(columns={"$premultiplied_daily_deaths": "Daily deaths"}, inplace=True)
    result.rename(columns={"$premultiplied_daily_new_cases": "Daily new cases"}, inplace=True)
    return result

Unnamed: 0_level_0,Unnamed: 1_level_0,Daily deaths,Daily new cases
date,date,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,8,30.287879,58.166667
2014,9,95.916667,208.25
2014,10,28.0,25.0


## Guinea

In [289]:
get_monthly_new_cases_and_deaths(gn)

Unnamed: 0_level_0,Unnamed: 1_level_0,Daily deaths,Daily new cases
date,date,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,8,30.287879,58.166667
2014,9,95.916667,208.25
2014,10,28.0,25.0


## Liberia

In [203]:
get_monthly_new_cases_and_deaths(lr)

Unnamed: 0_level_0,Unnamed: 1_level_0,$deaths,$new_cases
date,date,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,6,1.857143,2.142857
2014,7,4.272727,1.818182
2014,8,8.666667,8.166667
2014,9,37.608696,6.217391
2014,10,27.285714,1.47619
2014,11,11.5,10.5


## Sierra Leone

In [205]:
get_monthly_new_cases_and_deaths(sl)

Unnamed: 0_level_0,Unnamed: 1_level_0,$deaths,$new_cases
date,date,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,8,331.15,19.6
2014,9,470.642857,36.714286
2014,10,944.888889,58.851852
2014,11,1356.428571,71.785714
2014,12,1622.0,54.333333
