In [22]:
import os
import pathlib
import pandas as pd

from column_bindings.infostat_columns import InfostatColumns
from translations.age_translations import INFOSTAT_DECODE_AGE_GROUPS
from translations.nuts3_translate_infostat import TRANSLATE_REGIONS

In [2]:
# Resolve the raw population CSV relative to the current working directory
# (two levels up, inside raw_data/) and load it into a DataFrame.
raw_population_file = pathlib.Path(
    f'{os.getcwd()}/../../raw_data/Population_all_ages_2017-2019.csv'
)
df = pd.read_csv(filepath_or_buffer=raw_population_file)


In [3]:
# Rows whose age value is 'Общо' (Bulgarian for "Total") are removed so that
# only the individual age rows remain.
total_row_labels = df.index[df[InfostatColumns.age] == 'Общо']

# Reshape wide -> long: every column that is not an identifier becomes a
# (sex, population) pair on its own row.
identifier_columns = [
    InfostatColumns.year,
    InfostatColumns.location,
    InfostatColumns.age,
]
df = df.drop(index=total_row_labels).melt(
    id_vars=identifier_columns,
    var_name=InfostatColumns.sex,
    value_name=InfostatColumns.population,
)

In [4]:
# Years are stored as strings so that, once pivoted into columns, they can be
# addressed by the string labels '2017', '2018', '2019' used further down.
df[InfostatColumns.year] = df[InfostatColumns.year].map(str)

# A lone '-' in the raw data stands for "no value" and is treated as zero.
# The column is normalised to text first (so cells that were already parsed as
# numbers do not break the cleaning), the placeholder is replaced by exact
# match, and the result is converted to a numeric dtype. Left as text, the
# values could not be averaged or summed reliably by the later pivot_table /
# groupby steps. pd.to_numeric raises on anything that is still not a number,
# so malformed cells surface here instead of corrupting the aggregates.
population_text = df[InfostatColumns.population].astype(str).str.strip()
df[InfostatColumns.population] = pd.to_numeric(population_text.replace('-', '0'))

In [5]:
# Reshape long -> wide: one row per (location, age, sex) and one column per
# year. 'mean' is pivot_table's default aggregation, spelled out here for the
# reader; the index is then flattened back into ordinary columns.
row_keys = [InfostatColumns.location, InfostatColumns.age, InfostatColumns.sex]
df = df.pivot_table(
    values=InfostatColumns.population,
    index=row_keys,
    columns=[InfostatColumns.year],
    aggfunc='mean',
).reset_index()

In [6]:
# Translate age-group and region labels through the lookup dictionaries.
# A label without an entry in the dictionary is kept unchanged.
# Series.map works on the single column directly, which avoids building a
# full row object per record (as DataFrame.apply(axis=1) does) and also
# behaves correctly on an empty frame.
df[InfostatColumns.age] = df[InfostatColumns.age].map(
    lambda label: INFOSTAT_DECODE_AGE_GROUPS.get(label, label)
)
df[InfostatColumns.location] = df[InfostatColumns.location].map(
    lambda label: TRANSLATE_REGIONS.get(label, label)
)

In [7]:
# Rows that share the same (location, age, sex) key after the label
# translation are collapsed by summing their yearly values.
group_keys = [InfostatColumns.location, InfostatColumns.age, InfostatColumns.sex]
df = df.groupby(group_keys, as_index=False).sum()

# Row-wise mean over the three yearly columns, rounded to two decimals.
year_columns = ['2017', '2018', '2019']
df[InfostatColumns.avg_population] = df[year_columns].mean(axis='columns').round(2)


In [None]:
# Write the cleaned table to cleaned_data/ (resolved relative to the current
# working directory), leaving the DataFrame index out of the CSV.
cleaned_avg_population_file = pathlib.Path(
    f'{os.getcwd()}/../../cleaned_data/Population_all_ages_2017-2019(Bulgaria).csv'
)
df.to_csv(path_or_buf=cleaned_avg_population_file, index=False)
