In [1]:
import os
import pathlib
import pandas as pd

from column_bindings.infostat_columns import InfostatColumns
from translations.age_translations import INFOSTAT_DECODE_AGE_GROUPS
from translations.nuts3_translate_infostat import TRANSLATE_REGIONS
from translations.sex_translations import INFOSTAT_SEX_CONVERSION

In [2]:
# Locate the raw Infostat export relative to the current working directory
# (two levels up, inside raw_data) and load it into a DataFrame.
raw_data_dir = pathlib.Path(os.getcwd(), '..', '..', 'raw_data')
raw_mortality_infostat_file = raw_data_dir / 'Municipalities - Mortality.csv'
df = pd.read_csv(raw_mortality_infostat_file)

In [3]:
# Row 0 of the raw frame carries one label per data column (every column after
# the first); each label is translated through INFOSTAT_SEX_CONVERSION.
cols = dict(df.iloc[0, 1:])

# New header = first four characters of the original header (presumably the
# year -- confirm against the raw file) + "_" + the translated label.
replace_headers = {
    header: f'{header[:4]}_{INFOSTAT_SEX_CONVERSION[label]}'
    for header, label in cols.items()
}
# The 'Unnamed: 0' column becomes the location column.
replace_headers['Unnamed: 0'] = InfostatColumns.location

In [4]:
# Apply the new column headers, then remove the rows labelled 0 and 1
# (row 0 is the one the header labels were read from).
df.rename(replace_headers, axis='columns', inplace=True)
df.drop(index=[0, 1], inplace=True)

In [5]:
# Tag every row with the region it belongs to.
#
# A row is treated as a region header when its location has a truthy entry in
# TRANSLATE_REGIONS; every other row inherits the region of the nearest region
# header above it. This is done with a vectorized mask + forward fill instead
# of a row-by-row loop that looked up `index - 1`, so it no longer depends on
# the index being made of consecutive integers.
location_values = df[InfostatColumns.location]
is_region_row = location_values.map(lambda name: bool(TRANSLATE_REGIONS.get(name)))

# Keep the location only on region-header rows, then propagate it downwards.
df[InfostatColumns.region] = location_values.where(is_region_row).ffill()

# Rows that appear before the first region header have nothing to inherit.
# Fail loudly with a descriptive message rather than carrying NaN regions into
# the later reshaping steps.
if df[InfostatColumns.region].isna().any():
    raise ValueError(
        'Rows found before the first region header; cannot infer their region.'
    )

In [6]:
# Move the last column to the front and keep all other columns in their
# existing order (at this point the last column is expected to be the region
# column that was just added).
*leading_cols, last_col = df.columns
reorder_cols = [last_col, *leading_cols]
df = df.reindex(columns=reorder_cols)

In [7]:
# Remove the region-level rows so that only municipality rows remain.

# A row is flagged when the same location name occurs again further down the
# frame (keep='last' leaves only the final occurrence unflagged).
# NOTE(review): duplicates are detected on the location name alone, so two
# municipalities sharing a name in different regions would also lose their
# first occurrence -- confirm this cannot happen in the raw data.
regions_mask = df.duplicated(subset=[InfostatColumns.location], keep='last')

# Rows whose location contains "София" are dropped explicitly.
# NOTE(review): a mask for "Столична" used to be computed here but was
# overwritten by the "София" mask before being used, so it never influenced
# the result. The dead assignment is removed; if those rows were meant to be
# dropped as well, OR a second mask into the expression below.
# na=False keeps the mask strictly boolean when a location is missing;
# regex=False because the pattern is a plain literal.
sofia_region_mask = df[InfostatColumns.location].str.contains("София", regex=False, na=False)

drop_regions = df[regions_mask | sofia_region_mask].index
df.drop(drop_regions, inplace=True)

In [8]:
# Reshape from wide to long: region and location stay as identifier columns,
# every other column header goes into a combined "<year>_<sex>" column and its
# cell value into the mortality column.
year_sex_col = f'{InfostatColumns.year}_{InfostatColumns.sex}'
id_cols = [InfostatColumns.region, InfostatColumns.location]
df = pd.melt(
    df,
    id_vars=id_cols,
    var_name=year_sex_col,
    value_name=InfostatColumns.mortality,
)

In [9]:
# Split the combined "<year>_<sex>" column on "_" into separate year and sex
# columns, then discard the combined column.
combined_col = f'{InfostatColumns.year}_{InfostatColumns.sex}'
year_sex_parts = df[combined_col].str.split('_', expand=True)
df[[InfostatColumns.year, InfostatColumns.sex]] = year_sex_parts
df.drop(columns=combined_col, inplace=True)

In [10]:
# Pivot back to one column per year, with one row per
# (region, location, sex) combination, and flatten the index into columns.
# NOTE(review): pivot_table aggregates with its default function (mean), which
# only acts as a pass-through if each (region, location, sex, year) occurs once.
row_keys = [InfostatColumns.region, InfostatColumns.location, InfostatColumns.sex]
mortality_by_year = df.pivot_table(
    index=row_keys,
    columns=InfostatColumns.year,
    values=InfostatColumns.mortality,
)
df = mortality_by_year.reset_index()

In [11]:
# Write the cleaned table next to the raw data (two levels up from the current
# working directory, inside cleaned_data), without the DataFrame index.
cleaned_data_dir = pathlib.Path(os.getcwd(), '..', '..', 'cleaned_data')
mortality_infostat_file = cleaned_data_dir / 'Municipalities - Mortality.csv'
df.to_csv(mortality_infostat_file, index=False)