In [None]:
# Enrich: prepare the data for the Machine Learning analyses

In [1]:
import os

import pandas as pd

In [2]:
# Stage folder names under data/: tables are read from INPUT_FOLDER
# and the enriched tables are written to OUTPUT_FOLDER.
INPUT_FOLDER = "preproc"
OUTPUT_FOLDER = "enrich"

In [3]:
# Create the output folder up front (no error if it already exists).
# Built from OUTPUT_FOLDER so it always matches the paths written later.
os.makedirs(f"data/{OUTPUT_FOLDER}", exist_ok=True)

In [4]:
# Map each table name to its archive file name, for every .zip in the input folder.
# os.path.splitext strips only the trailing extension, so a file name that
# happens to contain ".zip" somewhere else is left intact.
datasources = {
    os.path.splitext(source)[0]: source
    for source in os.listdir(f"data/{INPUT_FOLDER}")
    if source.endswith(".zip")
}

datasources

{'epidemiology': 'epidemiology.zip',
 'vaccinations': 'vaccinations.zip',
 'health': 'health.zip',
 'hospitalizations': 'hospitalizations.zip',
 'index': 'index.zip'}

## ENRICH

This is the most optional part of the whole process. The main goal of this stage is to enrich the data tables from the previous `preproc` stage with any variables from other tables that are needed for the aggregation process in the next stage.

In this case, we're going to do two things:
 - **Append the column `Country` to all tables** - Remember that we need to build a predictor per country, so we need that column in order to make later aggregations (in the next step) by country.
 - **Impute missing values in the demographics using the newly acquired `Country` column** - We have a lot of missing values here, so we'll take advantage of the new `Country` column to apply a smart missing value imputation strategy.

### Join: Include `country_name` from `index` table in the rest of tables

We skip `index` because it is the table from which we extract the `Country`, and we also skip the `demographics` table because we're going to apply a special treatment to it.

In [5]:
# Load the index table; it is merged into the other tables on location_key
# to supply their country_name column.
index = pd.read_csv(f"data/{INPUT_FOLDER}/index.zip")

In [6]:
# Display the location_key -> country_name lookup table.
index

Unnamed: 0,location_key,country_name
0,DE_BB_12051,Germany
1,DE_BB_12052,Germany
2,DE_BB_12053,Germany
3,DE_BB_12054,Germany
4,DE_BB_12060,Germany
...,...,...
5116,US_WY_56037,United States of America
5117,US_WY_56039,United States of America
5118,US_WY_56041,United States of America
5119,US_WY_56043,United States of America


In [None]:
# Copy every input table into the output folder, attaching country_name
# (looked up by location_key in the index table) along the way.
# "index" is the lookup itself and "demographics" does not need the country
# at this point, so both are written out unchanged.
# Note: merge defaults to an inner join, so rows whose location_key is not
# present in index are dropped.
for key, filename in datasources.items():
    # Use the file name exactly as it was listed instead of rebuilding it from the key.
    data = pd.read_csv(f"data/{INPUT_FOLDER}/{filename}")
    if key not in ["index", "demographics"]:
        data = data.merge(index, on="location_key")
        print(f"Table {key} processed!")

    data.to_csv(f"data/{OUTPUT_FOLDER}/{filename}", index=False)

Table epidemiology processed!
Table vaccinations processed!
Table health processed!
Table hospitalizations processed!


### Missing values: `demographics`

We have a lot of missing values here, so we'll take advantage of the recent `Country` column to apply a smart missing value imputation strategy.

The imputation strategy consists of:
 1. For all regions without missing values, *calculate the proportion of population for every age range, per country*.
 2. For all regions with missing values, *extract the total population and append the proportions from the previous step*
 3. Impute missing values for each age range by multiplying population in each region by the proportion from step 1.

In [10]:
# Load the demographics table and attach country_name from index by location_key,
# then print the per-column non-null counts.
demographics_path = f"data/{INPUT_FOLDER}/demographics.zip"
data = pd.read_csv(demographics_path).merge(index, on="location_key")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5097 entries, 0 to 5096
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   location_key                 5097 non-null   object 
 1   population                   5097 non-null   float64
 2   population_age_00_09         3743 non-null   float64
 3   population_age_10_19         3743 non-null   float64
 4   population_age_20_29         3743 non-null   float64
 5   population_age_30_39         3743 non-null   float64
 6   population_age_40_49         3743 non-null   float64
 7   population_age_50_59         3743 non-null   float64
 8   population_age_60_69         3743 non-null   float64
 9   population_age_70_79         3743 non-null   float64
 10  population_age_80_and_older  3743 non-null   float64
 11  country_name                 5097 non-null   object 
dtypes: float64(10), object(2)
memory usage: 478.0+ KB


**1. For non missing records: calculate the proportion of population per age range, per each country**

In [None]:
# Per-country totals over the regions that DO have an age breakdown
# (population_age_00_09 is used as the marker for "age columns present").
# The chain is wrapped in parentheses so it can span several lines; a line
# ending in a bare "." outside brackets is a SyntaxError.
# numeric_only=True keeps the sum to the numeric columns, so the location_key
# strings are not concatenated per country.
whole_population = (
    data[data["population_age_00_09"].notna()]
    .groupby("country_name")
    .sum(numeric_only=True)
)
nonmissing_population = whole_population["population"]
nonmissing_population_age = whole_population.filter(regex=r"population_age")
# Row-wise division: each country's age-range totals over that country's total population.
proportion = nonmissing_population_age.div(nonmissing_population, axis=0)

In [12]:
# Display the per-country share of population in each age range.
proportion

Unnamed: 0_level_0,population_age_00_09,population_age_10_19,population_age_20_29,population_age_30_39,population_age_40_49,population_age_50_59,population_age_60_69,population_age_70_79,population_age_80_and_older
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Germany,0.091071,0.093314,0.117336,0.126432,0.125563,0.163111,0.124957,0.092827,0.065388
Italy,0.083573,0.094359,0.101427,0.116718,0.153449,0.155305,0.122008,0.099806,0.073353
Spain,0.090795,0.107426,0.110814,0.139488,0.178141,0.158841,0.114566,0.081385,0.054615
United States of America,0.123588,0.128175,0.140244,0.133599,0.124809,0.132929,0.11251,0.065933,0.038126


**2. For missing records: get the population and append proportion of population per age range, per each country**

In [None]:
# Regions lacking an age breakdown (flagged by a null population_age_00_09),
# reduced to the columns needed for the imputation.
is_missing_age = data["population_age_00_09"].isna()
missings = data.loc[is_missing_age, ["location_key", "population", "country_name"]]

In [14]:
# Preview the regions to impute: location_key, population and country_name only.
missings.head()

Unnamed: 0,location_key,population,country_name
18,DE_BE_11001,385748.0,Germany
19,DE_BE_11002,290386.0,Germany
20,DE_BE_11003,57113.0,Germany
21,DE_BE_11004,343592.0,Germany
22,DE_BE_11005,245197.0,Germany


In [15]:
# Attach each country's age-range proportions to its regions, then index the
# result by location_key.
country_proportions = proportion.reset_index()
missings = (
    missings
    .merge(country_proportions, on="country_name")
    .set_index(["location_key"])
)

In [16]:
# Preview the regions to impute, now carrying their country's age-range proportions.
missings.head()

Unnamed: 0_level_0,population,country_name,population_age_00_09,population_age_10_19,population_age_20_29,population_age_30_39,population_age_40_49,population_age_50_59,population_age_60_69,population_age_70_79,population_age_80_and_older
location_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
DE_BE_11001,385748.0,Germany,0.091071,0.093314,0.117336,0.126432,0.125563,0.163111,0.124957,0.092827,0.065388
DE_BE_11002,290386.0,Germany,0.091071,0.093314,0.117336,0.126432,0.125563,0.163111,0.124957,0.092827,0.065388
DE_BE_11003,57113.0,Germany,0.091071,0.093314,0.117336,0.126432,0.125563,0.163111,0.124957,0.092827,0.065388
DE_BE_11004,343592.0,Germany,0.091071,0.093314,0.117336,0.126432,0.125563,0.163111,0.124957,0.092827,0.065388
DE_BE_11005,245197.0,Germany,0.091071,0.093314,0.117336,0.126432,0.125563,0.163111,0.124957,0.092827,0.065388


**3. Calculate the estimated population per region from the proportions**

In [None]:
# Estimated head count per age range: each region's population times its
# country's proportion for that range (row-wise multiplication).
missings_population_ages = missings.filter(regex=r"population_age")
missings_population = missings["population"]
result = missings_population_ages.multiply(missings_population, axis="index")

In [18]:
# Preview the estimated population per age range for the first regions.
result.head()

Unnamed: 0_level_0,population_age_00_09,population_age_10_19,population_age_20_29,population_age_30_39,population_age_40_49,population_age_50_59,population_age_60_69,population_age_70_79,population_age_80_and_older
location_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DE_BE_11001,35130.635234,35995.538041,45262.288508,48771.038075,48435.681102,62919.618874,48201.84101,35807.965827,25223.393329
DE_BE_11002,26445.878249,27096.965661,34072.853031,36714.193366,36461.741065,47365.058137,36285.709332,26955.763775,18987.837384
DE_BE_11003,5201.364544,5329.42015,6701.434832,7220.932571,7171.280356,9315.740309,7136.658507,5301.648621,3734.520109
DE_BE_11004,31291.426582,32061.809541,40315.854478,43441.154625,43142.44673,56043.525017,42934.161567,31894.735927,22466.885533
DE_BE_11005,22330.449846,22880.216984,28770.537645,31000.840505,30787.674075,39994.249585,30639.035873,22760.988513,16033.006974


Impute missing values in original dataset from the previously calculated result

In [19]:
# Index by location_key so the rows line up with `result`, which is also
# indexed by location_key.
data = data.set_index("location_key")

In [None]:
# Fill only the missing cells: fillna with a DataFrame matches replacement
# values by index label and column name, so existing values are left untouched.
data = data.fillna(result)

In [21]:
# Check the per-column non-null counts after the imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5097 entries, DE_BB_12051 to US_WY_56045
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   population                   5097 non-null   float64
 1   population_age_00_09         5097 non-null   float64
 2   population_age_10_19         5097 non-null   float64
 3   population_age_20_29         5097 non-null   float64
 4   population_age_30_39         5097 non-null   float64
 5   population_age_40_49         5097 non-null   float64
 6   population_age_50_59         5097 non-null   float64
 7   population_age_60_69         5097 non-null   float64
 8   population_age_70_79         5097 non-null   float64
 9   population_age_80_and_older  5097 non-null   float64
 10  country_name                 5097 non-null   object 
dtypes: float64(10), object(1)
memory usage: 606.9+ KB


Save resulting table

In [22]:
# Turn location_key back into a regular column and write the imputed
# demographics table to the output folder.
data.reset_index().to_csv(f"data/{OUTPUT_FOLDER}/demographics.zip", index=False)