## Task 1. Compiling Ebola Data

The `DATA_FOLDER/ebola` folder contains summarized reports of Ebola cases from three countries (Guinea, Liberia and Sierra Leone) during the recent outbreak of the disease in West Africa. For each country, there are daily reports that contain various information about the outbreak in several cities in each country.

Use pandas to import these data files into a single `DataFrame`.
Using this `DataFrame`, calculate for *each country*, the *daily average* per year of *new cases* and *deaths*.
Make sure you handle all the different expressions for *new cases* and *deaths* that are used in the reports.

In [82]:
import os
import pandas as pd
import numpy as n
from typing import Generator

def get_file_paths(root: str) -> Generator[str, None, None]:
    """
    Lazily yield the path of every file located anywhere below `root`.

    Each path is built by joining a directory reported by `os.walk` with the
    name of one of the files it contains; directories themselves are not
    yielded.
    """
    for current_dir, _sub_dirs, names in os.walk(root):
        for name in names:
            yield os.path.join(current_dir, name)

# Materialize the list of files found under each of the three data folders
gn_file_paths = list(get_file_paths("./data/guinea_data"))
lr_file_paths = list(get_file_paths("./data/liberia_data"))
sl_file_paths = list(get_file_paths("./data/sl_data"))

In [83]:
def normalize_sl(raw_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the dataframe obtained by reading a Sierra Leone CSV to have
    one row per (region/date) pair and use columns for each variable.

    The input holds one row per variable, with a ``date`` column, a
    ``variable`` column and one column per region; column names are matched
    case-insensitively. The ``western area combined`` and ``national``
    columns are skipped. When a variable appears on several rows, only its
    first row is used.

    :param raw_frame: frame as read from the CSV; it is left unmodified.
    :return: frame indexed by ``(region, date)`` with one column per variable.
    """
    DATE: str = "date"
    VARIABLE: str = "variable"
    REGION: str = "region"
    VALUE: str = "value"

    # Columns that do not describe a single region
    non_regions = {DATE, VARIABLE, "western area combined", "national"}

    # Lower-case the column names on a copy, so the caller's frame is untouched
    frame = raw_frame.rename(columns=str.lower)

    records = []
    seen_variables = set()

    # Iterate over each variable (row)
    for _, row in frame.iterrows():
        var_name = row[VARIABLE]
        if var_name in seen_variables:
            # Ignore variable: it was already defined
            continue
        seen_variables.add(var_name)

        # The date is the same for the whole row: parse it only once
        date = pd.to_datetime(row[DATE])

        # Iterate over each region (column).
        # `Series.items()` is used because `iteritems()` was removed in pandas 2.0.
        for col_name, value in row.items():
            if col_name in non_regions:
                # Ignore: not a region
                continue
            records.append({REGION: col_name, DATE: date, VARIABLE: var_name, VALUE: value})

    stack_frame = pd.DataFrame(records)
    pivoted = stack_frame.set_index([REGION, DATE, VARIABLE]).unstack(VARIABLE)
    # Drop the top level ("value") of the columns' multi index
    pivoted.columns = [col[1] for col in pivoted.columns]
    return pivoted

# pd.read_csv(sl_file_paths[13])
# normalize_sl(pd.read_csv(sl_file_paths[13]))
# print(sl_file_paths[13])
# Read every Sierra Leone report, normalize it, and stack the results
sl = pd.concat([normalize_sl(frame) for frame in map(pd.read_csv, sl_file_paths)])

In [84]:
def normalize_lr(raw_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the dataframe obtained by reading a Liberia CSV to have
    one row per (region/date) pair and use columns for each variable.

    The input holds one row per variable, with a ``date`` column, a
    ``variable`` column and one column per region; column names are matched
    case-insensitively. The ``national`` column is skipped. When a variable
    appears on several rows, only its first row is used.

    :param raw_frame: frame as read from the CSV; it is left unmodified.
    :return: frame indexed by ``(region, date)`` with one column per variable.
    """
    DATE: str = "date"
    VARIABLE: str = "variable"
    REGION: str = "region"
    VALUE: str = "value"

    # Columns that do not describe a single region
    non_regions = {DATE, VARIABLE, "national"}

    # Lower-case the column names on a copy, so the caller's frame is untouched
    frame = raw_frame.rename(columns=str.lower)

    records = []
    seen_variables = set()

    # Iterate over each variable (row)
    for _, row in frame.iterrows():
        var_name = row[VARIABLE]
        if var_name in seen_variables:
            # Ignore variable: it was already defined
            continue
        seen_variables.add(var_name)

        # The date is the same for the whole row: parse it only once
        date = pd.to_datetime(row[DATE])

        # Iterate over each region (column).
        # `Series.items()` is used because `iteritems()` was removed in pandas 2.0.
        for col_name, value in row.items():
            if col_name in non_regions:
                # Ignore: not a region
                continue
            records.append({REGION: col_name, DATE: date, VARIABLE: var_name, VALUE: value})

    stack_frame = pd.DataFrame(records)
    pivoted = stack_frame.set_index([REGION, DATE, VARIABLE]).unstack(VARIABLE)
    # Drop the top level ("value") of the columns' multi index
    pivoted.columns = [col[1] for col in pivoted.columns]
    return pivoted

# Read every Liberia report, normalize it, and stack the results
lr = pd.concat([normalize_lr(frame) for frame in map(pd.read_csv, lr_file_paths)])

In [85]:
def normalize_gn(raw_frame: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the dataframe obtained by reading a Guinea CSV to have
    one row per (region/date) pair and use columns for each variable.

    The input holds one row per variable, with a ``date`` column, a
    ``description`` column (the variable name) and one column per region;
    column names are matched case-insensitively. The ``national`` column is
    skipped. When a variable appears on several rows, only its first row is
    used.

    :param raw_frame: frame as read from the CSV; it is left unmodified.
    :return: frame indexed by ``(region, date)`` with one column per variable.
    """
    DATE: str = "date"
    VARIABLE: str = "description"
    REGION: str = "region"
    VALUE: str = "value"

    # Columns that do not describe a single region.
    # NOTE(review): only "national" is excluded as an aggregate here; if the
    # Guinea files name their aggregate column differently (e.g. "totals"),
    # it is currently kept as a region -- confirm against the data.
    non_regions = {DATE, VARIABLE, "national"}

    # Lower-case the column names on a copy, so the caller's frame is untouched
    frame = raw_frame.rename(columns=str.lower)

    records = []
    seen_variables = set()

    # Iterate over each variable (row)
    for _, row in frame.iterrows():
        var_name = row[VARIABLE]
        if var_name in seen_variables:
            # Ignore variable: it was already defined
            continue
        seen_variables.add(var_name)

        # The date is the same for the whole row: parse it only once
        date = pd.to_datetime(row[DATE])

        # Iterate over each region (column).
        # `Series.items()` is used because `iteritems()` was removed in pandas 2.0.
        for col_name, value in row.items():
            if col_name in non_regions:
                # Ignore: not a region
                continue
            records.append({REGION: col_name, DATE: date, VARIABLE: var_name, VALUE: value})

    stack_frame = pd.DataFrame(records)
    pivoted = stack_frame.set_index([REGION, DATE, VARIABLE]).unstack(VARIABLE)
    # Drop the top level ("value") of the columns' multi index
    pivoted.columns = [col[1] for col in pivoted.columns]
    return pivoted

# Read every Guinea report, normalize it, and stack the results
gn = pd.concat([normalize_gn(frame) for frame in map(pd.read_csv, gn_file_paths)])

In [88]:
COUNTRY = "country"
REGION: str = "region"
DATE: str = "date"

# Lower-case the column names of each country table so that shared columns
# end up under the same name once the tables are concatenated
for df in [gn, sl, lr]:
    df.columns = [str.lower(name) for name in df.columns]

# gn.rename(columns={'cumulative cfr': 'cfr'}, inplace=True)

# Dump the per-country tables (written before the country column is added)
gn.to_csv("./gn.csv")
sl.to_csv("./sl.csv")
lr.to_csv("./lr.csv")

# Annotate the country
gn[COUNTRY] = "gn"
sl[COUNTRY] = "sl"
lr[COUNTRY] = "lr"

# Stack the three tables, then move the former (region, date) index back into
# columns so the result can be indexed by country/region/date
ebola = pd.concat([gn, lr, sl]).reset_index().set_index([COUNTRY, REGION, DATE])
ebola.to_csv("./ebola.csv")

Index(['cfr', 'contacts_followed', 'contacts_healthy', 'contacts_ill',
       'contacts_not_seen', 'cum_completed_contacts', 'cum_confirmed',
       'cum_contacts', 'cum_noncase', 'cum_probable', 'cum_suspected',
       'death_confirmed', 'death_probable', 'death_suspected',
       'etc_cum_admission', 'etc_cum_deaths', 'etc_cum_discharges',
       'etc_currently_admitted', 'etc_new_admission', 'etc_new_deaths',
       'etc_new_discharges', 'negative_corpse', 'new_completed_contacts',
       'new_confirmed', 'new_contacts', 'new_negative', 'new_noncase',
       'new_positive', 'new_probable', 'new_samples', 'new_suspected',
       'pending', 'percent_seen', 'population', 'positive_corpse',
       'repeat_samples', 'total_lab_samples', 'country'],
      dtype='object')


## Columns

Here is a list of all the columns used by the 3 countries, and the normalized name used in the main table.

| Normalized | Guinea                                                   | Liberia                                                          | Sierra Leone           | Comment    |
|------------|----------------------------------------------------------|------------------------------------------------------------------|------------------------|------------|
|            | cumulative (confirmed + probable + suspects)             |                                                                  |                        |            |
|            | fatality rate for confirmed and probables                |                                                                  |                        |            |
|            | new admits to cte so far                                 |                                                                  |                        |            |
|            | new cases of confirmed                                   |                                                                  |                        |            |
|            | new cases of confirmed among health workers              |                                                                  |                        |            |
|            | new cases of probables                                   |                                                                  |                        |            |
|            | new cases of suspects                                    |                                                                  |                        |            |
|            | new contacts registered so far                           |                                                                  |                        |            |
|            | new deaths registered                                    |                                                                  |                        |            |
|            | new deaths registered among health workers               |                                                                  |                        |            |
|            | new deaths registered today                              |                                                                  |                        |            |
|            | new deaths registered today (confirmed)                  |                                                                  |                        |            |
|            | new deaths registered today (probables)                  |                                                                  |                        |            |
|            | new deaths registered today (suspects)                   |                                                                  |                        |            |
|            | number of confirmed cases among health workers           |                                                                  |                        |            |
|            | number of contacts followed today                        |                                                                  |                        |            |
|            | number of contacts followed yesterday                    |                                                                  |                        |            |
|            | number of contacts lost to follow up                     |                                                                  |                        |            |
|            | number of contacts out of the track 21 days              |                                                                  |                        |            |
|            | number of contacts out of track                          |                                                                  |                        |            |
|            | number of contacts to follow today                       |                                                                  |                        |            |
|            | number of death of confirmed cases among health workers  |                                                                  |                        |            |
|            | number of deaths of confirmed cases among health workers |                                                                  |                        |            |
|            | number of deaths of probables cases among health workers |                                                                  |                        |            |
|            | number of female confirmed cases                         |                                                                  |                        |            |
|            | number of female probables cases                         |                                                                  |                        |            |
|            | number of female suspects cases                          |                                                                  |                        |            |
|            | number of male confirmed cases                           |                                                                  |                        |            |
|            | number of male probables cases                           |                                                                  |                        |            |
|            | number of male suspects cases                            |                                                                  |                        |            |
|            | number of patients tested                                |                                                                  |                        |            |
|            | number of probables cases among health workers           |                                                                  |                        |            |
|            | number of samples collected                              |                                                                  |                        |            |
|            | number of samples collected today                        |                                                                  |                        |            |
|            | number of samples under test                             |                                                                  |                        |            |
|            | number of suspects cases among health workers            |                                                                  |                        |            |
|            | total pec center today                                   |                                                                  |                        |            |
|            | total pec center today (confirmed)                       |                                                                  |                        |            |
|            | total pec center today (probables)                       |                                                                  |                        |            |
|            | total pec center today (suspects)                        |                                                                  |                        |            |
|            | total case of confirmed among health workers             |                                                                  |                        |            |
|            | total cases of confirmed                                 |                                                                  |                        |            |
|            | total cases of probables                                 |                                                                  |                        |            |
|            | total cases of suspects                                  |                                                                  |                        |            |
|            | total contacts registered from start date                |                                                                  |                        |            |
|            | total deaths (confirmed + probables + suspects)          |                                                                  |                        |            |
|            | total deaths of confirmed                                |                                                                  |                        |            |
|            | total deaths of probables                                |                                                                  |                        |            |
|            | total deaths of suspects                                 |                                                                  |                        |            |
|            | total deaths registered among health workers             |                                                                  |                        |            |
|            | total new cases registered so far                        |                                                                  |                        |            |
|            | total number of admissions to cte                        |                                                                  |                        |            |
|            | total number of exits from cte                           |                                                                  |                        |            |
|            | total number of female cases                             |                                                                  |                        |            |
|            | total number of hospitalized cases in cte                |                                                                  |                        |            |
|            | total number of male cases                               |                                                                  |                        |            |
|            | total of cured in confirmed cases in cte                 |                                                                  |                        |            |
|            | total of deaths in confirmed cases in cte                |                                                                  |                        |            |
|            | total samples tested                                     |                                                                  |                        |            |
|            | total suspected non-class case                           |                                                                  |                        |            |
|            |                                                          | case fatality rate (cfr) - \n confirmed & probable cases         |                        |            |
|            |                                                          | case fatality rate (cfr) - confirmed & probable cases            |                        |            |
|            |                                                          | contacts lost to follow-up                                       |                        |            |
|            |                                                          | contacts seen                                                    |                        |            |
|            |                                                          | contacts who completed 21 day \n follow-up                       |                        |            |
|            |                                                          | contacts who completed 21 day follow-up                          |                        |            |
|            |                                                          | cumulative (confirmed + probable + suspected)                    |                        |            |
|            |                                                          | cumulative (confirmed + probable + suspects)                     |                        |            |
|            |                                                          | cumulative cfr                                                   |                        |            |
|            |                                                          | cumulative admission/isolation                                   |                        |            |
|            |                                                          | cumulative cases among hcw                                       |                        |            |
|            |                                                          | cumulative confirmed, probable and suspected cases               |                        |            |
|            |                                                          | cumulative deaths among hcw                                      |                        |            |
|            |                                                          | currently under follow-up                                        |                        |            |
|            |                                                          | new case/s (probable)                                            |                        |            |
|            |                                                          | new case/s (suspected)                                           |                        |            |
|            |                                                          | new admissions                                                   |                        |            |
|            |                                                          | new case/s (confirmed)                                           |                        |            |
|            |                                                          | newly reported cases in hcw                                      |                        |            |
|            |                                                          | newly reported deaths in hcw                                     |                        |            |
|            |                                                          | newly reported contacts                                          |                        |            |
|            |                                                          | newly reported deaths                                            |                        |            |
|            |                                                          | specimens collected                                              |                        |            |
|            |                                                          | specimens pending for testing                                    |                        |            |
|            |                                                          | total case/s (probable)                                          |                        |            |
|            |                                                          | total case/s (suspected)                                         |                        |            |
|            |                                                          | total number of confirmed cases \n of guinean nationality        |                        |            |
|            |                                                          | total number of confirmed cases \n of sierra leonean nationality |                        |            |
|            |                                                          | total number of confirmed cases of guinean nationality           |                        |            |
|            |                                                          | total number of confirmed cases of sierra leonean nationality    |                        |            |
|            |                                                          | total case/s (confirmed)                                         |                        |            |
|            |                                                          | total confirmed cases                                            |                        |            |
|            |                                                          | total contacts listed                                            |                        |            |
|            |                                                          | total death/s in confirmed cases                                 |                        |            |
|            |                                                          | total death/s in confirmed, \n probable, suspected cases         |                        |            |
|            |                                                          | total death/s in confirmed,  probable, suspected cases           |                        |            |
|            |                                                          | total death/s in confirmed, probable, suspected cases            |                        |            |
|            |                                                          | total death/s in probable cases                                  |                        |            |
|            |                                                          | total death/s in suspected cases                                 |                        |            |
|            |                                                          | total discharges                                                 |                        |            |
|            |                                                          | total no. currently in treatment \n units                        |                        |            |
|            |                                                          | total no. currently in treatment units                           |                        |            |
|            |                                                          | total probable cases                                             |                        |            |
|            |                                                          | total specimens tested                                           |                        |            |
|            |                                                          | total suspected cases                                            |                        |            |
|            |                                                          |                                                                  | cfr                    |            |
|            |                                                          |                                                                  | contacts_followed      |            |
|            |                                                          |                                                                  | contacts_healthy       |            |
|            |                                                          |                                                                  | contacts_ill           |            |
|            |                                                          |                                                                  | contacts_not_seen      |            |
|            |                                                          |                                                                  | cum_completed_contacts |            |
|            |                                                          |                                                                  | cum_confirmed          |            |
|            |                                                          |                                                                  | cum_contacts           |            |
|            |                                                          |                                                                  | cum_noncase            |            |
|            |                                                          |                                                                  | cum_probable           |            |
|            |                                                          |                                                                  | cum_suspected          |            |
|            |                                                          |                                                                  | death_confirmed        |            |
|            |                                                          |                                                                  | death_probable         |            |
|            |                                                          |                                                                  | death_suspected        |            |
|            |                                                          |                                                                  | etc_cum_admission      |            |
|            |                                                          |                                                                  | etc_cum_deaths         |            |
|            |                                                          |                                                                  | etc_cum_discharges     |            |
|            |                                                          |                                                                  | etc_currently_admitted |            |
|            |                                                          |                                                                  | etc_new_admission      |            |
|            |                                                          |                                                                  | etc_new_deaths         |            |
|            |                                                          |                                                                  | etc_new_discharges     |            |
|            |                                                          |                                                                  | negative_corpse        |            |
|            |                                                          |                                                                  | new_completed_contacts |            |
|            |                                                          |                                                                  | new_confirmed          |            |
|            |                                                          |                                                                  | new_contacts           |            |
|            |                                                          |                                                                  | new_negative           |            |
|            |                                                          |                                                                  | new_noncase            |            |
|            |                                                          |                                                                  | new_positive           |            |
|            |                                                          |                                                                  | new_probable           |            |
|            |                                                          |                                                                  | new_samples            |            |
|            |                                                          |                                                                  | new_suspected          |            |
|            |                                                          |                                                                  | pending                |            |
|            |                                                          |                                                                  | percent_seen           |            |
|            |                                                          |                                                                  | population             |            |
|            |                                                          |                                                                  | positive_corpse        |            |
|            |                                                          |                                                                  | repeat_samples         |            |
|            |                                                          |                                                                  | total_lab_samples      |            |
