In [35]:
import os
import pandas as pd
from epiweeks import Week, Year

In [30]:
# Lookup from the German location names used in the raw files to short
# location codes: 'DE' for the whole country, 'DE-XX' for each federal state.
LOCATION_CODES = {
    'Deutschland': 'DE',
    'Schleswig-Holstein': 'DE-SH',
    'Hamburg': 'DE-HH',
    'Niedersachsen': 'DE-NI',
    'Bremen': 'DE-HB',
    'Nordrhein-Westfalen': 'DE-NW',
    'Hessen': 'DE-HE',
    'Rheinland-Pfalz': 'DE-RP',
    'Baden-Württemberg': 'DE-BW',
    'Bayern': 'DE-BY',
    'Saarland': 'DE-SL',
    'Berlin': 'DE-BE',
    'Brandenburg': 'DE-BB',
    'Mecklenburg-Vorpommern': 'DE-MV',
    'Sachsen': 'DE-SN',
    'Sachsen-Anhalt': 'DE-ST',
    'Thüringen': 'DE-TH',
}

In [31]:
def ages_by_group(age_group):
    """Return a dict mapping each single-year age label to ``age_group``.

    The open-ended group '80+' maps the single label 'A80.'. Every other
    group is expected as 'LO-HI' and produces one label of the form
    'Aaa..aa' per year from LO to HI (both inclusive).
    """
    if age_group == '80+':
        return {'A80.': '80+'}

    parts = age_group.split('-')
    first, last = int(parts[0]), int(parts[1])

    labels = {}
    for age in range(first, last + 1):
        labels[f'A{age:02d}..{age:02d}'] = age_group
    return labels

# Age groups used in the output; the first entry '00+' is skipped when
# building the label lookup below.
AGE_GROUPS = ['00+', '00-04', '05-14', '15-34', '35-59', '60-79', '80+']

# Lookup from every single-year age label to the age group it belongs to.
AGE_DICT = {
    label: group
    for group_name in AGE_GROUPS[1:]
    for label, group in ages_by_group(group_name).items()
}

In [37]:
def process_state_file(df):
    """Bring raw state-level counts into the common truth-data layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw counts with at least the columns 'stratum' (state name),
        'year', 'week' and 'value'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'location', 'age_group', 'year',
        'week', 'date' and 'value', sorted by location, age group and date
        with a fresh 0..n-1 index. The frame passed in is left unchanged.
    """
    # rename() returns a new frame, so the column assignments below no longer
    # leak into the caller's frame (previously 'date' was added to the input).
    df = df.rename(columns={'stratum': 'location'})

    # add iso date (end date of the corresponding iso week); built from the two
    # columns directly so that a frame without rows is handled as well
    df['date'] = [
        Week(int(year), int(week), system='iso').enddate()
        for year, week in zip(df['year'], df['week'])
    ]

    # fix state names and replace with abbreviations:
    #   'Ã.'  -> 'ü'  ('Ã' plus the following character; presumably a
    #                  mis-encoded 'ü' in the raw export - TODO confirm)
    #   '\.'  -> '-'  (literal dots in the names become hyphens)
    # Raw strings avoid the invalid-escape-sequence warning for '\.'.
    df['location'] = (
        df['location']
        .replace({r'Ã.': 'ü', r'\.': '-'}, regex=True)
        .replace(LOCATION_CODES)
    )

    # state files carry no age information, so everything is the total group
    df['age_group'] = '00+'

    df = df[['location', 'age_group', 'year', 'week', 'date', 'value']]
    df = df.sort_values(['location', 'age_group', 'date'], ignore_index=True)

    return df

In [38]:
def process_age_file(df):
    """Bring raw age-stratified counts into the common truth-data layout.

    Rows with the age label "Unbekannt" are dropped, the remaining labels are
    mapped to age groups via ``AGE_DICT`` and summed per week, and an
    additional '00+' row per week holds the sum over all those groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw counts with at least the columns 'stratum' (age label),
        'year', 'week' and 'value'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'location' (always 'DE'), 'age_group',
        'year', 'week', 'date' and 'value', sorted by location, age group
        and date with a fresh 0..n-1 index. The frame passed in is left
        unchanged.
    """
    # rename() returns a new frame, so the column assignments below no longer
    # leak into the caller's frame (previously 'date' was added to the input).
    df = df.rename(columns={'stratum': 'age_group'})

    # add iso date (end date of the corresponding iso week); built from the two
    # columns directly so that a frame without rows is handled as well
    df['date'] = [
        Week(int(year), int(week), system='iso').enddate()
        for year, week in zip(df['year'], df['week'])
    ]

    # drop entries with unknown age group
    df = df[df['age_group'] != "Unbekannt"]

    # summarize age groups (from yearly resolution to specified groups);
    # assign() builds a new frame instead of writing into the filtered slice,
    # which avoids pandas' SettingWithCopyWarning
    df = df.assign(age_group=df['age_group'].replace(AGE_DICT))
    df = df.groupby(['date', 'year', 'week', 'age_group'], as_index=False)['value'].sum()

    # compute sum for age group 00+
    df_all = df.groupby(['date', 'year', 'week'], as_index=False)['value'].sum()
    df_all['age_group'] = '00+'

    df = pd.concat([df, df_all])

    # fill in location
    df['location'] = 'DE'

    df = df[['location', 'age_group', 'year', 'week', 'date', 'value']]
    df = df.sort_values(['location', 'age_group', 'date'], ignore_index=True)

    return df

In [34]:
AGE_GROUPS

['00+', '00-04', '05-14', '15-34', '35-59', '60-79', '80+']

In [7]:
# Root folder of the historic truth data; its entries are listed below and
# treated as one disease each.
PATH = '../data/truth/history/'

In [8]:
# Keep sub-directories only: the export loop at the end of this notebook writes
# its 'latest_truth_*.csv' files into PATH itself, so after the first run a
# plain os.listdir(PATH) would also return those CSV files as "diseases".
# Sorting makes the order (and therefore diseases[1]) reproducible, because
# os.listdir returns entries in arbitrary order.
diseases = sorted(
    entry
    for entry in os.listdir(PATH)
    if os.path.isdir(os.path.join(PATH, entry))
)

In [40]:
# File names available for the second listed disease (used for exploration).
all_files = os.listdir(os.path.join(PATH, diseases[1]))

In [15]:
# Files with age-stratified counts. Uses `all_files` from the cell above;
# the previously referenced name `files` is not defined at this point.
files_age = [f for f in all_files if 'age' in f]

In [16]:
# Files with state-level counts. Uses `all_files` from the listing cell above;
# the previously referenced name `files` is not defined at this point.
files_states = [f for f in all_files if 'states' in f]

In [17]:
files_states

['RSV_Infection-states-2015.csv',
 'RSV_Infection-states-2016.csv',
 'RSV_Infection-states-2017.csv',
 'RSV_Infection-states-2018.csv',
 'RSV_Infection-states-2019.csv',
 'RSV_Infection-states-2020.csv',
 'RSV_Infection-states-2021.csv',
 'RSV_Infection-states-2022.csv']

In [22]:
# Read every state-level file of the explored disease into its own frame.
dfs = [
    pd.read_csv(f"{PATH}/{diseases[1]}/{file_name}")
    for file_name in files_states
]

In [24]:
# Stack the per-file frames into one; the row index of each file is kept as is
# (no ignore_index), so index labels repeat across files.
df = pd.concat(dfs)

In [25]:
df

Unnamed: 0,stratum,value,week,year
0,Sachsen,1,7,2015
1,Sachsen,1,38,2015
2,Sachsen,2,41,2015
3,Sachsen,2,44,2015
4,Sachsen,3,45,2015
...,...,...,...,...
15,Sachsen,9,16,2022
16,Sachsen,7,17,2022
17,Sachsen,5,18,2022
18,Sachsen,9,19,2022


In [39]:
process_state_file(df)

Unnamed: 0,location,age_group,year,week,date,value
0,DE-SN,00+,2015,7,2015-02-15,1
1,DE-SN,00+,2015,38,2015-09-20,1
2,DE-SN,00+,2015,41,2015-10-11,2
3,DE-SN,00+,2015,44,2015-11-01,2
4,DE-SN,00+,2015,45,2015-11-08,3
...,...,...,...,...,...,...
280,DE-SN,00+,2022,16,2022-04-24,9
281,DE-SN,00+,2022,17,2022-05-01,7
282,DE-SN,00+,2022,18,2022-05-08,5
283,DE-SN,00+,2022,19,2022-05-15,9


In [45]:
# Build one combined truth file per disease from its age and state files.
for d in diseases:
    disease_dir = os.path.join(PATH, d)

    # The results are written into PATH itself, so a listing of PATH can also
    # contain previously exported CSV files; only real folders are processed.
    if not os.path.isdir(disease_dir):
        continue

    all_files = os.listdir(disease_dir)

    dfs = []
    for kind, process in (('age', process_age_file), ('states', process_state_file)):
        # NOTE(review): plain substring match on the file name, as before.
        files = sorted(f for f in all_files if kind in f)

        # pd.concat raises on an empty list, so a missing kind is reported
        # and skipped instead of aborting the whole export
        if not files:
            print(f"No '{kind}' files found for {d}, skipping this part.")
            continue

        df_temp = pd.concat(
            [pd.read_csv(os.path.join(disease_dir, f)) for f in files]
        )
        dfs.append(process(df_temp))

    if not dfs:
        continue

    df = pd.concat(dfs)
    # Same target as before ('../data/truth/history/latest_truth_<disease>.csv'),
    # now derived from PATH instead of repeating the folder literally.
    df.to_csv(os.path.join(PATH, f"latest_truth_{d.lower()}.csv"), index=False)

In [44]:
df

Unnamed: 0,location,age_group,year,week,date,value
0,DE,00+,2012,1,2012-01-08,16.0
1,DE,00+,2012,2,2012-01-15,24.0
2,DE,00+,2012,3,2012-01-22,43.0
3,DE,00+,2012,4,2012-01-29,89.0
4,DE,00+,2012,5,2012-02-05,102.0
...,...,...,...,...,...,...
8603,DE-TH,00+,2022,17,2022-05-01,39.0
8604,DE-TH,00+,2022,18,2022-05-08,55.0
8605,DE-TH,00+,2022,19,2022-05-15,52.0
8606,DE-TH,00+,2022,20,2022-05-22,55.0
