In [1]:
import numpy as np
import pandas as pd
import requests
from epiweeks import Week, Year
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
# base URL of the raw data files (main branch of the KITmetricslab/nowcasting-data repository)
PATH = 'https://raw.githubusercontent.com/KITmetricslab/nowcasting-data/main/'

# largest delay (in weeks) that gets its own value_<delay>w column;
# all later corrections are collected in the single column value_>{MAX_DELAY}w
MAX_DELAY = 10

In [3]:
# Disease to process. The name is used as folder and file-name prefix when loading the
# data and (lower-cased) in the name of the exported truth file. Alternatives:
# DISEASE = 'Seasonal_Influenza' 
# DISEASE = 'RSV_Infection'
DISEASE = 'Pneumococcal_Disease'  

In [4]:
# Lookup from the (cleaned) location names found in the state files to the short
# location codes used in the output: 'DE' for the whole country, 'DE-XX' per state.
LOCATION_CODES = {
    'Deutschland': 'DE',
    'Schleswig-Holstein': 'DE-SH',
    'Hamburg': 'DE-HH',
    'Niedersachsen': 'DE-NI',
    'Bremen': 'DE-HB',
    'Nordrhein-Westfalen': 'DE-NW',
    'Hessen': 'DE-HE',
    'Rheinland-Pfalz': 'DE-RP',
    'Baden-Württemberg': 'DE-BW',
    'Bayern': 'DE-BY',
    'Saarland': 'DE-SL',
    'Berlin': 'DE-BE',
    'Brandenburg': 'DE-BB',
    'Mecklenburg-Vorpommern': 'DE-MV',
    'Sachsen': 'DE-SN',
    'Sachsen-Anhalt': 'DE-ST',
    'Thüringen': 'DE-TH',
}

In [5]:
def ages_by_group(age_group):
    '''
    Returns a dict that maps every single-year age label belonging to `age_group`
    to the name of that group.

    '80+' is represented by the single label 'A80.'. Every other group has to be
    given as 'LL-UU' and yields one label 'AYY..YY' per year from LL to UU (inclusive).
    '''
    if age_group == '80+':
        return {'A80.': '80+'}

    bounds = age_group.split('-')
    first_age = int(bounds[0])
    last_age = int(bounds[1])

    return {f'A{age:02d}..{age:02d}': age_group
            for age in range(first_age, last_age + 1)}

# age groups of the output; the first entry ('00+') is the total over all ages
AGE_GROUPS = ['00+', '00-04', '05-14', '15-34', '35-59', '60-79', '80+']

# single-year age label -> age group, for every group except the total '00+'
AGE_DICT = {label: group
            for group in AGE_GROUPS[1:]
            for label in ages_by_group(group)}

In [6]:
def process_state_file(df):
    '''
    Brings a raw state-level file into the common long format.

    Uses the columns year, week, stratum and value of `df`. Returns a new dataframe
    with the columns date, year, week, location, age_group and value, sorted by
    location, age_group and date. The dataframe passed in is left unchanged.
    '''
    # rename() returns a new dataframe, so the column assignments below
    # do not write into the caller's dataframe
    df = df.rename(columns = {'stratum' : 'location'})

    # add iso date (end date of the corresponding iso week)
    df['date'] = df.apply(lambda x: Week(x.year, x.week, system = 'iso').enddate(), axis = 1)

    # fix state names and replace with abbreviations:
    # 'Ã' together with the character following it becomes 'ü', every literal dot becomes '-'
    # (raw string, because '\.' is an invalid escape sequence in a normal string literal)
    df['location'] = df.location.replace({'Ã.': 'ü', r'\.': '-'}, regex = True)
    df['location'] = df.location.replace(LOCATION_CODES)

    # state files are not stratified by age, so every row belongs to the total group
    df['age_group'] = '00+'

    df = df[['date', 'year', 'week', 'location', 'age_group', 'value']]
    df = df.sort_values(['location', 'age_group', 'date'], ignore_index = True)

    return df

In [7]:
def process_age_file(df):
    '''
    Brings a raw age-level file into the common long format.

    Uses the columns year, week, stratum and value of `df`. Rows with stratum
    "Unbekannt" are dropped, the remaining strata are mapped to age groups via
    AGE_DICT (labels without an entry in AGE_DICT are kept as they are) and the
    values are summed per week and age group. The total over all age groups is
    added as age group '00+'.

    Returns a new dataframe with the columns date, year, week, location, age_group
    and value, sorted by location, age_group and date. The dataframe passed in is
    left unchanged.
    '''
    # rename() returns a new dataframe, so the caller's dataframe is not modified below
    df = df.rename(columns = {'stratum' : 'age_group'})

    # add iso date (end date of the corresponding iso week)
    df['date'] = df.apply(lambda x: Week(x.year, x.week, system = 'iso').enddate(), axis = 1)

    # drop entries with unknown age group; the explicit copy makes the filtered rows an
    # independent dataframe, so the column assignment below does not act on a slice
    df = df[df.age_group != "Unbekannt"].copy()

    # summarize age groups (from yearly resolution to specified groups)
    df['age_group'] = df.age_group.replace(AGE_DICT)
    df = df.groupby(['date', 'year', 'week', 'age_group'], as_index = False)['value'].sum()

    # compute sum for age group 00+
    df_all = df.groupby(['date', 'year', 'week'], as_index = False)['value'].sum()
    df_all['age_group'] = '00+'

    df = pd.concat([df, df_all])

    # age files are not stratified by location, so every row belongs to 'DE'
    df['location'] = 'DE'

    df = df[['date', 'year', 'week', 'location', 'age_group', 'value']]
    df = df.sort_values(['location', 'age_group', 'date'], ignore_index = True)

    return df

In [8]:
def load_data(disease, date):
    '''
    Loads the state and the age file of `disease` that were published on `date`
    and returns them as one dataframe (columns date, year, week, location,
    age_group, value; sorted by location, age_group and date).

    `date` is inserted into the file names as it is (e.g. a datetime.date or a
    'YYYY-MM-DD' string). Returns None if the files cannot be loaded or processed.
    '''
    try:
        # PATH may end with a slash; strip it so that the URL contains no double slash
        base_url = f"{PATH.rstrip('/')}/{disease}/{disease}"

        df1 = pd.read_csv(f"{base_url}-states-{date}.csv")
        df2 = pd.read_csv(f"{base_url}-age-{date}.csv")

        df1 = process_state_file(df1)
        df2 = process_age_file(df2)

        df = pd.concat([df1, df2])
        df = df.sort_values(['location', 'age_group', 'date'], ignore_index = True)

        return df

    # Missing files are expected (not every week has a file), so failures are reported
    # as None. `Exception` instead of a bare except keeps KeyboardInterrupt and
    # SystemExit working, i.e. a long download loop can still be interrupted.
    except Exception:
        return None

In [9]:
def add_iso_dates(df):
    '''
    Adds iso_week, iso_year and iso_date (the iso week as Week object) to dataframe.

    The columns are derived from the column `date`. They are added to `df` itself,
    which is also returned.
    '''
    # determine the iso week of every date once and derive all three columns from it
    iso_weeks = df.date.apply(lambda x: Week.fromdate(x, system = 'iso'))

    df['iso_week'] = iso_weeks.apply(lambda w: w.week)
    df['iso_year'] = iso_weeks.apply(lambda w: w.year)
    df['iso_date'] = iso_weeks

    return df

In [10]:
# Todo: Check if there's a difference for age/state
def list_all_files(disease, stratum='state'):
    '''
    Lists the data files of `disease` in the repository, keeping only the latest
    file of every iso week.

    A file is considered if its path starts with `disease`, ends with '.csv' and
    contains `stratum`. Returns a dataframe with the columns filename, date (parsed
    from the file name) and the columns added by add_iso_dates.

    Raises requests.HTTPError if the file listing cannot be retrieved.
    '''
    # download the list of all files in the repo
    url = 'https://api.github.com/repos/KITmetricslab/nowcasting-data/git/trees/main?recursive=1'
    # the timeout keeps the notebook from hanging on a stalled connection
    r = requests.get(url, timeout = 30)
    # stop on HTTP errors; an error payload would otherwise only surface
    # as a KeyError on 'tree' below
    r.raise_for_status()
    res = r.json()

    # filter relevant files
    files = sorted([file['path'] for file in res['tree'] if (file['path'].startswith(disease) and
                                                             file['path'].endswith('.csv') and
                                                             stratum in file['path'])])

    # create dataframe so we can easily select files by date
    df_files = pd.DataFrame({'filename':files})

    # extract date from filename (the 10 characters 'YYYY-MM-DD' in front of '.csv')
    df_files['date'] = df_files.filename.str[-14:-4]
    df_files.date = pd.to_datetime(df_files.date)

    df_files = add_iso_dates(df_files)

    # only keep latest file per week
    df_files = df_files.sort_values('date').groupby(['iso_year', 'iso_week']).tail(1).reset_index(drop = True)

    return df_files

In [11]:
def get_relevant_dates(df_files):
    '''
    Derives the weeks to process from the file overview `df_files`
    (columns iso_date and date are used).

    Returns a tuple (date_dict, dates):
    - date_dict maps the iso week (iso_date) to the date of the file of that week,
    - dates is the list of all iso weeks from the first to the last week in
      `df_files` (including weeks without a file), without the current week.

    For an empty `df_files` an empty dict and an empty list are returned.
    '''
    if len(df_files) == 0:
        return dict(), []

    # map iso_date to date of the latest available file of the corresponding week
    date_dict = dict(zip(df_files.iso_date, df_files.date))

    max_date = df_files.iso_date.max().enddate()
    min_date = df_files.iso_date.min().enddate()

    dates = pd.date_range(min_date, max_date, freq = "1W")
    dates = [Week.fromdate(d, system = 'iso') for d in dates]

    # remove current week as the data might not be available/final yet
    # (filtering instead of list.remove(), which raises a ValueError if the
    # current week is not part of the list, e.g. if the newest file is older)
    current_week = Week.thisweek(system = 'iso')
    dates = [week for week in dates if week != current_week]

    return date_dict, dates

In [12]:
def make_placeholder(date,
                     states = ['DE-BB', 'DE-BE', 'DE-BW', 'DE-BY', 'DE-HB', 'DE-HE',
                               'DE-HH', 'DE-MV', 'DE-NI', 'DE-NW', 'DE-RP', 'DE-SH', 'DE-SL',
                               'DE-SN', 'DE-ST', 'DE-TH'],
                     age_groups = ['00+', '00-04', '05-14', '15-34', '35-59', '60-79', '80+']):
    '''
    Creates rows without a value (NaN) for the week `date`.

    The result has one row per age group for location 'DE', followed by one row per
    state for age group '00+'. If DISEASE is 'RSV_Infection', 'DE-SN' is the only
    state, whatever is passed as `states`.
    '''
    if DISEASE == 'RSV_Infection':
        states = ['DE-SN']

    week_end = date.enddate()

    def empty_rows(location, age_group):
        # scalar entries are repeated for every element of the list-valued entry
        return pd.DataFrame({'date'     : week_end,
                             'year'     : date.year,
                             'week'     : date.week,
                             'location' : location,
                             'age_group': age_group,
                             'value'    : np.nan})

    by_age_group = empty_rows('DE', age_groups)
    by_state = empty_rows(states, '00+')

    return pd.concat([by_age_group, by_state])

In [13]:
def load_delayed_data(date, data_version):
    '''
    Returns the rows for week `date` as reported in the file of week `data_version`.

    If there is no file for `data_version`, the file cannot be loaded or it has no
    rows for `date`, placeholder rows without a value are returned instead
    (see make_placeholder).
    '''
    df = None

    # date_dict only has entries for weeks with a file
    if data_version in date_dict.keys():
        df = load_data(disease = DISEASE, date = date_dict[data_version].date())

    # select relevant rows if a file was loaded
    if df is not None:
        week_end = date.enddate()
        df = df[(df.date == week_end)]

    # add rows (with valid year, week, date but no value) if date is missing in file or file couldn't be loaded
    if df is None or len(df) == 0:
        return make_placeholder(date)

    return df

In [14]:
# overview of the available state files of the selected disease (latest file per iso week)
df_files = list_all_files(DISEASE)

In [15]:
# preview of the file overview
df_files.head()

Unnamed: 0,filename,date,iso_week,iso_year,iso_date
0,Pneumococcal_Disease/Pneumococcal_Disease-stat...,2021-11-07,44,2021,2021W44
1,Pneumococcal_Disease/Pneumococcal_Disease-stat...,2021-11-14,45,2021,2021W45
2,Pneumococcal_Disease/Pneumococcal_Disease-stat...,2021-11-21,46,2021,2021W46
3,Pneumococcal_Disease/Pneumococcal_Disease-stat...,2021-11-28,47,2021,2021W47
4,Pneumococcal_Disease/Pneumococcal_Disease-stat...,2021-12-03,48,2021,2021W48


In [16]:
# date_dict: iso week -> date of the file of that week; dates: the iso weeks to process
date_dict, dates = get_relevant_dates(df_files)

In [17]:
# newest week that is processed; no data version beyond this week is requested
newest_week = max(dates)

for delay in tqdm(range(MAX_DELAY + 1), total = MAX_DELAY + 1):
    # rows of every week as reported `delay` weeks later
    reports = [load_delayed_data(week, week + delay)
               for week in dates
               if week + delay <= newest_week]
    df_delay = pd.concat(reports)

    # we flag missing values to fill later on (not all should be filled to preserve reporting triangle shape)
    df_delay['value'] = df_delay['value'].fillna('to_fill')

    df_delay = df_delay.rename(columns = {'value': f'value_{delay}w'})

    # the first delay defines the rows; every further delay is joined on all shared columns
    df = df_delay.copy() if delay == 0 else df.merge(df_delay, how = 'left')

  0%|          | 0/11 [00:00<?, ?it/s]

In [18]:
# use latest file to compute column for remaining correction beyond the specified largest delay
# Files are named after their own date, which is not always the end date of the week,
# so the file date is looked up in date_dict (as in load_delayed_data). The newest
# week among `dates` that has a file is used.
latest_week = max(week for week in dates if week in date_dict)
df_latest = load_data(DISEASE, date_dict[latest_week].date())

if df_latest is None:
    raise RuntimeError(f'The latest file of {DISEASE} (week {latest_week}) could not be loaded.')

df_latest.value = df_latest.value.fillna('to_fill')
df_latest = df_latest.rename(columns = {'value': f'value_>{MAX_DELAY}w'})

df = df.merge(df_latest, how = 'left')

In [19]:
# we want to keep the triangle shape and avoid filling the corresponding entries:
# cells that are still NaN at this point had no matching row in the merged data
# (missing values inside the loaded data were flagged with 'to_fill' before merging)
df = df.fillna('not_observed')

# the flagged entries become NaN again so that they can be filled below
df = df.replace({'to_fill' : np.nan})

# if initial report is missing replace with zero
df.value_0w = df.value_0w.fillna(0)

# we use forward filling to fill missing values in between (along each row, i.e. with the
# value of the preceding column); DataFrame.ffill replaces the deprecated fillna(method = "ffill")
df = df.ffill(axis = 1)

df = df.replace({'not_observed' : np.nan})

In [20]:
# compute differences
# The columns from position 5 onwards are the value columns (value_0w, value_1w, ...).
# Every value column after value_0w is replaced by its difference to the value column
# before it; value_0w itself (position 5) is not changed.
df.iloc[:, 6:] = df.iloc[:, 5:].diff(axis=1).iloc[:, 1:]

In [21]:
# all value columns are stored as nullable integers, so missing entries stay missing
value_cols = [name for name in df.columns if 'value' in name]
df = df.astype({name: 'Int64' for name in value_cols})

# final column order and row order
id_cols = ['location', 'age_group', 'year', 'week', 'date']
df = (df[id_cols + value_cols]
      .sort_values(['location', 'age_group', 'date'], ignore_index = True))

In [22]:
# show the final table (one row per location, age group and week)
df

Unnamed: 0,location,age_group,year,week,date,value_0w,value_1w,value_2w,value_3w,value_4w,value_5w,value_6w,value_7w,value_8w,value_9w,value_10w,value_>10w
0,DE,00+,2021,44,2021-11-07,51,4,0,0,0,-1,0,-1,0,0,0,3
1,DE,00+,2021,45,2021-11-14,64,1,0,0,-1,-1,0,-1,0,0,0,7
2,DE,00+,2021,46,2021-11-21,52,3,2,-2,-1,0,0,-2,0,0,0,5
3,DE,00+,2021,47,2021-11-28,51,3,-2,-6,0,0,0,0,0,0,-1,21
4,DE,00+,2021,48,2021-12-05,39,4,-1,-5,0,0,0,0,0,0,0,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576,DE-TH,00+,2022,16,2022-04-24,2,-1,0,0,0,,,,,,,
577,DE-TH,00+,2022,17,2022-05-01,1,0,0,0,,,,,,,,
578,DE-TH,00+,2022,18,2022-05-08,1,0,0,,,,,,,,,
579,DE-TH,00+,2022,19,2022-05-15,0,0,,,,,,,,,,


In [23]:
# export the table; the file name contains the lower-cased disease name
# (the target folder ../data/truth is not created here)
df.to_csv(f'../data/truth/truth_{DISEASE.lower()}.csv', index = False)