# Flu Data Symptom/Landeskreis Preprocessing

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('flu_data.csv', sep=',', encoding='latin-1')
data.head(20)

Unnamed: 0.1,Unnamed: 0,[PathogenMitKrankheit].[Meldeweg Web71].[Meldeweg ID Web71].[MEMBER_CAPTION],[PathogenMitKrankheit].[Meldeweg Web71].[Krankheit ID Web71].[MEMBER_CAPTION],[Symptome].[ID].[ID].[MEMBER_CAPTION],[ReportingDate].[YearWeek].[YearWeek].[MEMBER_CAPTION],[DeutschlandNodes].[CountyKey71].[CountyKey71].[MEMBER_CAPTION],[Geschlecht].[SortGruppe].[SortGruppe].[MEMBER_CAPTION],[AlterPerson80].[AgeGroupName6].[AgeGroupName6].[MEMBER_CAPTION],[Measures].[FallCount]
0,1,Gemäß IfSG,Influenza,akuter Krankheitsbeginn,2003-KW41,LK Dahme-Spreewald,weiblich,A25..29,1
1,2,Gemäß IfSG,Influenza,akuter Krankheitsbeginn,2004-KW14,SK Berlin Spandau,männlich,A40..44,1
2,3,Gemäß IfSG,Influenza,akuter Krankheitsbeginn,2004-KW42,LK Pfaffenhofen a.d.Ilm,männlich,A45..49,1
3,4,Gemäß IfSG,Influenza,akuter Krankheitsbeginn,2004-KW43,SK München,männlich,A20..24,1
4,5,Gemäß IfSG,Influenza,akuter Krankheitsbeginn,2004-KW43,LK Spree-Neiße,weiblich,A60..64,1
5,6,Gemäß IfSG,Influenza,akuter Krankheitsbeginn,2004-KW44,LK Elbe-Elster,männlich,A60..64,1
6,7,Gemäß IfSG,Influenza,akuter Krankheitsbeginn,2004-KW45,LK Elbe-Elster,männlich,A60..64,1
7,8,Gemäß IfSG,Influenza,akuter Krankheitsbeginn,2004-KW45,LK Spree-Neiße,männlich,A60..64,1
8,9,Gemäß IfSG,Influenza,akuter Krankheitsbeginn,2004-KW47,LK Elbe-Elster,männlich,A80+,1
9,10,Gemäß IfSG,Influenza,akuter Krankheitsbeginn,2004-KW48,LK Bergstraße,weiblich,A40..44,1


In [3]:
# Column names
print(list(data))

['Unnamed: 0', '[PathogenMitKrankheit].[Meldeweg Web71].[Meldeweg ID Web71].[MEMBER_CAPTION]', '[PathogenMitKrankheit].[Meldeweg Web71].[Krankheit ID Web71].[MEMBER_CAPTION]', '[Symptome].[ID].[ID].[MEMBER_CAPTION]', '[ReportingDate].[YearWeek].[YearWeek].[MEMBER_CAPTION]', '[DeutschlandNodes].[CountyKey71].[CountyKey71].[MEMBER_CAPTION]', '[Geschlecht].[SortGruppe].[SortGruppe].[MEMBER_CAPTION]', '[AlterPerson80].[AgeGroupName6].[AgeGroupName6].[MEMBER_CAPTION]', '[Measures].[FallCount]']


In [4]:
# Delete useless columns
data_dropped = data.drop(['Unnamed: 0', '[PathogenMitKrankheit].[Meldeweg Web71].[Meldeweg ID Web71].[MEMBER_CAPTION]', 
           '[PathogenMitKrankheit].[Meldeweg Web71].[Krankheit ID Web71].[MEMBER_CAPTION]'], axis=1)
data_dropped.head()

Unnamed: 0,[Symptome].[ID].[ID].[MEMBER_CAPTION],[ReportingDate].[YearWeek].[YearWeek].[MEMBER_CAPTION],[DeutschlandNodes].[CountyKey71].[CountyKey71].[MEMBER_CAPTION],[Geschlecht].[SortGruppe].[SortGruppe].[MEMBER_CAPTION],[AlterPerson80].[AgeGroupName6].[AgeGroupName6].[MEMBER_CAPTION],[Measures].[FallCount]
0,akuter Krankheitsbeginn,2003-KW41,LK Dahme-Spreewald,weiblich,A25..29,1
1,akuter Krankheitsbeginn,2004-KW14,SK Berlin Spandau,männlich,A40..44,1
2,akuter Krankheitsbeginn,2004-KW42,LK Pfaffenhofen a.d.Ilm,männlich,A45..49,1
3,akuter Krankheitsbeginn,2004-KW43,SK München,männlich,A20..24,1
4,akuter Krankheitsbeginn,2004-KW43,LK Spree-Neiße,weiblich,A60..64,1


In [5]:
# Sort on week
data_sorted = data_dropped.sort_values('[ReportingDate].[YearWeek].[YearWeek].[MEMBER_CAPTION]')
data_sorted.head()

Unnamed: 0,[Symptome].[ID].[ID].[MEMBER_CAPTION],[ReportingDate].[YearWeek].[YearWeek].[MEMBER_CAPTION],[DeutschlandNodes].[CountyKey71].[CountyKey71].[MEMBER_CAPTION],[Geschlecht].[SortGruppe].[SortGruppe].[MEMBER_CAPTION],[AlterPerson80].[AgeGroupName6].[AgeGroupName6].[MEMBER_CAPTION],[Measures].[FallCount]
209054,Husten,2001-KW01,LK Darmstadt-Dieburg,männlich,A40..44,1
737228,Fieber,2001-KW01,SK Hamburg,weiblich,A10..14,1
1056667,andere Symptome,2001-KW01,SK Hamburg,weiblich,A10..14,1
737226,Fieber,2001-KW01,LK Darmstadt-Dieburg,männlich,A40..44,1
209055,Husten,2001-KW01,SK Frankfurt am Main,männlich,A40..44,1


In [6]:
# Rename column names cause they are all over the place
data_renamed = data_sorted.rename(index=str, columns={"[Symptome].[ID].[ID].[MEMBER_CAPTION]": "Symptom",
                                                      "[ReportingDate].[YearWeek].[YearWeek].[MEMBER_CAPTION]": "Week",
                                                     "[DeutschlandNodes].[CountyKey71].[CountyKey71].[MEMBER_CAPTION]": "Landkreis",
                                                     "[Geschlecht].[SortGruppe].[SortGruppe].[MEMBER_CAPTION]": "Gender",
                                                     "[AlterPerson80].[AgeGroupName6].[AgeGroupName6].[MEMBER_CAPTION]": "Age Group",
                                                     "[Measures].[FallCount]": "Count"})
data_renamed.head()

Unnamed: 0,Symptom,Week,Landkreis,Gender,Age Group,Count
209054,Husten,2001-KW01,LK Darmstadt-Dieburg,männlich,A40..44,1
737228,Fieber,2001-KW01,SK Hamburg,weiblich,A10..14,1
1056667,andere Symptome,2001-KW01,SK Hamburg,weiblich,A10..14,1
737226,Fieber,2001-KW01,LK Darmstadt-Dieburg,männlich,A40..44,1
209055,Husten,2001-KW01,SK Frankfurt am Main,männlich,A40..44,1


In [7]:
# Get the unique age groups to make a proper label out of them
age_groups = data_renamed['Age Group'].unique()
print(age_groups)
print(len(age_groups))

['A40..44' 'A10..14' 'A30..34' 'A50..54' 'A05..09' 'A15..19' 'A45..49'
 'A20..24' 'A00..04' 'A35..39' 'A25..29' 'A70..74' 'A55..59' 'A60..64'
 'A80+' 'A75..79' 'Unbekannt' 'A65..69']
18


In [8]:
age_converter = {'Age Group':{'A00..04': '100000000000000000',
                             'A05..09': '010000000000000000',
                             'A10..14': '001000000000000000',
                             'A15..19': '000100000000000000',
                             'A20..24': '000010000000000000',
                             'A25..29': '000001000000000000',
                             'A30..34': '000000100000000000',
                             'A35..39': '000000010000000000',
                             'A40..44': '000000001000000000',
                             'A45..49': '000000000100000000',
                             'A50..54': '000000000010000000',
                             'A55..59': '000000000001000000',
                             'A60..64': '000000000000100000',
                             'A65..69': '000000000000010000',
                             'A70..74': '000000000000001000',
                             'A75..79': '000000000000000100',
                             'A80+':    '000000000000000010',
                             'Unbekannt':'000000000000000001'}}

In [9]:
data_one_hot = data_renamed.replace(to_replace=age_converter, value=None)
print(data_one_hot.head())

                 Symptom       Week             Landkreis    Gender  \
209054            Husten  2001-KW01  LK Darmstadt-Dieburg  männlich   
737228            Fieber  2001-KW01            SK Hamburg  weiblich   
1056667  andere Symptome  2001-KW01            SK Hamburg  weiblich   
737226            Fieber  2001-KW01  LK Darmstadt-Dieburg  männlich   
209055            Husten  2001-KW01  SK Frankfurt am Main  männlich   

                  Age Group  Count  
209054   000000001000000000      1  
737228   001000000000000000      1  
1056667  001000000000000000      1  
737226   000000001000000000      1  
209055   000000001000000000      1  


In [10]:
# Change locations to encoded numbers
data_one_hot.Landkreis = pd.Categorical(data_one_hot.Landkreis)
codes = data_one_hot["Landkreis Codes"] = data_one_hot.Landkreis.cat.codes
print(max(codes))
print(data_one_hot.head())

412
                 Symptom       Week             Landkreis    Gender  \
209054            Husten  2001-KW01  LK Darmstadt-Dieburg  männlich   
737228            Fieber  2001-KW01            SK Hamburg  weiblich   
1056667  andere Symptome  2001-KW01            SK Hamburg  weiblich   
737226            Fieber  2001-KW01  LK Darmstadt-Dieburg  männlich   
209055            Husten  2001-KW01  SK Frankfurt am Main  männlich   

                  Age Group  Count  Landkreis Codes  
209054   000000001000000000      1               45  
737228   001000000000000000      1              345  
1056667  001000000000000000      1              345  
737226   000000001000000000      1               45  
209055   000000001000000000      1              338  


In [11]:
# Resort on week, then symptom, for ease of processing
data_semi_final = data_one_hot.sort_values(['Week','Symptom'],ascending=[True,True])
print(data_semi_final.head())

       Symptom       Week             Landkreis    Gender           Age Group  \
737228  Fieber  2001-KW01            SK Hamburg  weiblich  001000000000000000   
737226  Fieber  2001-KW01  LK Darmstadt-Dieburg  männlich  000000001000000000   
737227  Fieber  2001-KW01  SK Frankfurt am Main  männlich  000000001000000000   
209054  Husten  2001-KW01  LK Darmstadt-Dieburg  männlich  000000001000000000   
209055  Husten  2001-KW01  SK Frankfurt am Main  männlich  000000001000000000   

        Count  Landkreis Codes  
737228      1              345  
737226      1               45  
737227      1              338  
209054      1               45  
209055      1              338  


In [12]:
# Find what are the Berlin related Landkreise
berlin_lk = data_semi_final['Landkreis'].unique()
for lk in berlin_lk:
    print(lk)

SK Hamburg
LK Darmstadt-Dieburg
SK Frankfurt am Main
SK Berlin Spandau
LK Wartburgkreis
LK Börde
LK Dahme-Spreewald
LK Gütersloh
LK Paderborn
LK Siegen-Wittgenstein
SK Erfurt
LK Schleswig-Flensburg
LK Stade
LK Stormarn
LK Herzogtum Lauenburg
LK Havelland
Region Hannover
LK Göttingen
LK Donau-Ries
LK LudwigslustParchim
LK Ostholstein
LK Unterallgäu
LK Uckermark
LK Segeberg
LK Rhein-Sieg-Kreis
LK Rhein-Erft-Kreis
SK Remscheid
LK Pfaffenhofen a.d.Ilm
LK Peine
LK Sömmerda
LK Erzgebirgskreis
LK Schmalkalden-Meiningen
LK Saalekreis
SK Suhl
SK Berlin Pankow
LK Unstrut-Hainich-Kreis
SK Weimar
SK Wolfsburg
SK Berlin Reinickendorf
LK Oberallgäu
SK Berlin Steglitz-Zehlendorf
SK Berlin Treptow-Köpenick
SK Bonn
SK Chemnitz
LK Altenkirchen
SK Berlin Mitte
SK Berlin Lichtenberg
SK Berlin Friedrichshain-Kreuzberg
SK Berlin Charlottenburg-Wilmersdorf
LK Ansbach
SK Ingolstadt
SK Magdeburg
LK Hildesheim
LK Meißen
LK Neustadt a.d.Waldnaab
LK Neuburg-Schrobenhausen
LK Hildburghausen
LK Ebersberg
LK Greiz


In [13]:
data_final = data_semi_final[data_semi_final['Landkreis'].str.startswith("SK Berlin")]

In [14]:
data_final.head()

Unnamed: 0,Symptom,Week,Landkreis,Gender,Age Group,Count,Landkreis Codes
737229,Fieber,2001-KW02,SK Berlin Spandau,weiblich,100000000000000,1,308
737300,Fieber,2001-KW05,SK Berlin Pankow,weiblich,1000000000,1,306
737301,Fieber,2001-KW05,SK Berlin Reinickendorf,männlich,10000000000000000,5,307
737305,Fieber,2001-KW05,SK Berlin Reinickendorf,weiblich,1000000000000000,1,307
737306,Fieber,2001-KW05,SK Berlin Reinickendorf,weiblich,1000000000,1,307


In [18]:
data_final = data_final.reset_index(drop=True)
data_final.head(20)

Unnamed: 0,Symptom,Week,Landkreis,Gender,Age Group,Count,Landkreis Codes
0,Fieber,2001-KW02,SK Berlin Spandau,weiblich,100000000000000,1,308
1,Fieber,2001-KW05,SK Berlin Pankow,weiblich,1000000000,1,306
2,Fieber,2001-KW05,SK Berlin Reinickendorf,männlich,10000000000000000,5,307
3,Fieber,2001-KW05,SK Berlin Reinickendorf,weiblich,1000000000000000,1,307
4,Fieber,2001-KW05,SK Berlin Reinickendorf,weiblich,1000000000,1,307
5,Fieber,2001-KW05,SK Berlin Reinickendorf,weiblich,100000000,1,307
6,Fieber,2001-KW05,SK Berlin Spandau,männlich,100000000000000000,2,308
7,Fieber,2001-KW05,SK Berlin Spandau,männlich,10000000000000000,3,308
8,Fieber,2001-KW05,SK Berlin Spandau,männlich,1000000000000000,3,308
9,Fieber,2001-KW05,SK Berlin Reinickendorf,weiblich,10000000000000000,1,307


In [23]:
# Generate all week indicies

week_indicies = []
year = 2001
BREAKOFF = False

for year in range(2001,2019):
    for week in range(1, 53):
        if year == 2018:
            if week == 6:
                BREAKOFF = True

        if week < 10:
            week = '0'+str(week)
        else:
            week = str(week)

        week_indicies.append(str(year)+"-KW"+week)
        
        if BREAKOFF:
            break
            
print(week_indicies)

['2001-KW01', '2001-KW02', '2001-KW03', '2001-KW04', '2001-KW05', '2001-KW06', '2001-KW07', '2001-KW08', '2001-KW09', '2001-KW10', '2001-KW11', '2001-KW12', '2001-KW13', '2001-KW14', '2001-KW15', '2001-KW16', '2001-KW17', '2001-KW18', '2001-KW19', '2001-KW20', '2001-KW21', '2001-KW22', '2001-KW23', '2001-KW24', '2001-KW25', '2001-KW26', '2001-KW27', '2001-KW28', '2001-KW29', '2001-KW30', '2001-KW31', '2001-KW32', '2001-KW33', '2001-KW34', '2001-KW35', '2001-KW36', '2001-KW37', '2001-KW38', '2001-KW39', '2001-KW40', '2001-KW41', '2001-KW42', '2001-KW43', '2001-KW44', '2001-KW45', '2001-KW46', '2001-KW47', '2001-KW48', '2001-KW49', '2001-KW50', '2001-KW51', '2001-KW52', '2002-KW01', '2002-KW02', '2002-KW03', '2002-KW04', '2002-KW05', '2002-KW06', '2002-KW07', '2002-KW08', '2002-KW09', '2002-KW10', '2002-KW11', '2002-KW12', '2002-KW13', '2002-KW14', '2002-KW15', '2002-KW16', '2002-KW17', '2002-KW18', '2002-KW19', '2002-KW20', '2002-KW21', '2002-KW22', '2002-KW23', '2002-KW24', '2002-KW25'

In [None]:
# Generate symptom counts per week
symptom_counts = data_final.groupby(['Symptom', 'Week'])['Count'].count()

In [24]:
# Generate the input vectors!

weather_data = np.load('weather_data_np.npy')



for week in week_indicies:
    # Pull out the dataframe for the week
    dataframe = data_final[data_final["Week"] == week]
    
    # Get the counts per symptom (ignores location)
    fever
    

SyntaxError: invalid syntax (<ipython-input-24-1c40919b48de>, line 5)

In [27]:
data_final[data_final["Week"] == week_indicies[5]]

Unnamed: 0,Symptom,Week,Landkreis,Gender,Age Group,Count,Landkreis Codes
103,Fieber,2001-KW06,SK Berlin Friedrichshain-Kreuzberg,männlich,100000000000000000,1,301
104,Fieber,2001-KW06,SK Berlin Friedrichshain-Kreuzberg,männlich,10000000000,1,301
105,Fieber,2001-KW06,SK Berlin Spandau,weiblich,100000000000000000,1,308
106,Fieber,2001-KW06,SK Berlin Spandau,männlich,1000000000000000,2,308
107,Fieber,2001-KW06,SK Berlin Spandau,männlich,100000000000000000,1,308
108,Fieber,2001-KW06,SK Berlin Reinickendorf,weiblich,100000000000,1,307
109,Fieber,2001-KW06,SK Berlin Reinickendorf,weiblich,10000000000000,1,307
110,Fieber,2001-KW06,SK Berlin Reinickendorf,weiblich,100000000000000,1,307
111,Fieber,2001-KW06,SK Berlin Reinickendorf,männlich,1000000000000000,1,307
112,Fieber,2001-KW06,SK Berlin Reinickendorf,männlich,10000000000000000,3,307


In [31]:
data_final.groupby(['Symptom', 'Week'])['Count'].count()

Symptom                                 Week     
Fieber                                  2001-KW02     1
                                        2001-KW05    33
                                        2001-KW06    18
                                        2001-KW07    20
                                        2001-KW08     7
                                        2001-KW09     4
                                        2001-KW10     2
                                        2001-KW12     3
                                        2001-KW13     3
                                        2001-KW14     3
                                        2001-KW16     6
                                        2001-KW17     9
                                        2001-KW18     2
                                        2001-KW19     4
                                        2002-KW03     1
                                        2002-KW04     3
                                        2002-KW05     