# Symptom Code

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import zeep
from zeep import Client
from lxml import etree
from zeep import Plugin


In [2]:
#Read in data
df = pd.read_csv('symptoms.csv', sep=',',encoding='latin-1')

In [3]:
df.reset_index(drop=True)

Unnamed: 0.1,Unnamed: 0,[PathogenMitKrankheit].[Meldeweg Web71].[Meldeweg ID Web71].[MEMBER_CAPTION],[PathogenMitKrankheit].[Meldeweg Web71].[Krankheit ID Web71].[MEMBER_CAPTION],[Symptome].[ID].[ID].[MEMBER_CAPTION],[ReportingDate].[YearWeek].[YearWeek].[MEMBER_CAPTION],[Measures].[FallCount]
0,1,Gemäß IfSG,Adenovirus-K(eratok)onjunktivitis,Karunkelschwellung,2015-KW13,1
1,2,Gemäß IfSG,Adenovirus-K(eratok)onjunktivitis,Karunkelschwellung,2015-KW33,1
2,3,Gemäß IfSG,Adenovirus-K(eratok)onjunktivitis,Karunkelschwellung,2015-KW44,1
3,4,Gemäß IfSG,Adenovirus-K(eratok)onjunktivitis,Karunkelschwellung,2015-KW53,1
4,5,Gemäß IfSG,Adenovirus-K(eratok)onjunktivitis,Karunkelschwellung,2016-KW05,1
5,6,Gemäß IfSG,Adenovirus-K(eratok)onjunktivitis,Karunkelschwellung,2016-KW06,5
6,7,Gemäß IfSG,Adenovirus-K(eratok)onjunktivitis,Karunkelschwellung,2016-KW07,3
7,8,Gemäß IfSG,Adenovirus-K(eratok)onjunktivitis,Karunkelschwellung,2016-KW08,4
8,9,Gemäß IfSG,Adenovirus-K(eratok)onjunktivitis,Karunkelschwellung,2016-KW10,7
9,10,Gemäß IfSG,Adenovirus-K(eratok)onjunktivitis,Karunkelschwellung,2016-KW11,5


In [4]:
#Drop unecessary columns
df = df.drop(columns=['Unnamed: 0','[PathogenMitKrankheit].[Meldeweg Web71].[Meldeweg ID Web71].[MEMBER_CAPTION]'])

In [5]:
#Give columns more comprehensible names
df = df.rename(columns={'[PathogenMitKrankheit].[Meldeweg Web71].[Krankheit ID Web71].[MEMBER_CAPTION]': 'Krankheit', '[Symptome].[ID].[ID].[MEMBER_CAPTION]': 'Symptom', '[ReportingDate].[YearWeek].[YearWeek].[MEMBER_CAPTION]': 'Kalenderwoche', '[Measures].[FallCount]': 'Anzahl'}) 


In [6]:
#Since we want to focus only on Norovirus, Infulenza, and Windpocken, we throw all the other data away
df_selection = df[df.isin({'Krankheit': ['Norovirus-Gastroenteritis','Influenza', 'Windpocken']})['Krankheit']]


In [7]:
#df_selection_symptoms = df_selection.groupby(['Krankheit','Symptom'])['Anzahl'].sum().reset_index()
#df_selection_symptoms

In [8]:
#Further dropping
df_selection = df_selection.drop(columns='Krankheit')


In [9]:
df_selection_pivot = df_selection.groupby(['Symptom','Kalenderwoche'])['Anzahl'].sum().reset_index()
df_selection_pivot.sort_values('Kalenderwoche')

Unnamed: 0,Symptom,Kalenderwoche,Anzahl
3118,Husten,2001-KW01,2
6105,andere Symptome,2001-KW01,1
2055,Fieber,2001-KW01,3
3904,"Muskel-, Glieder-, Rücken- oder Kopfschmerzen",2001-KW01,1
253,Ausschlag an Haut oder Schleimhaut mit gleichz...,2001-KW01,10
3905,"Muskel-, Glieder-, Rücken- oder Kopfschmerzen",2001-KW02,5
3119,Husten,2001-KW02,3
254,Ausschlag an Haut oder Schleimhaut mit gleichz...,2001-KW02,15
2056,Fieber,2001-KW02,6
3120,Husten,2001-KW03,7


In [10]:
#get a list of all relevant symptoms
df_selection_all_symptoms = df_selection.groupby(['Symptom'])['Anzahl'].sum().reset_index()['Symptom'].tolist()


In [11]:
#for each combination of symptom and week:
#if the combination is not in the dataframe, then insert a row with this combination and 'Anzahl' = 0
new_rows = []
for week in df['Kalenderwoche'].drop_duplicates().sort_values():
    for symptom in df_selection_all_symptoms:
        sample_week = df_selection_pivot[df_selection_pivot['Kalenderwoche'] == week]
        if not sample_week.isin({'Symptom': [symptom]})['Symptom'].any():
            new_row = [symptom, week, 0]
            new_rows.append(new_row)
            
            
new_values = pd.DataFrame(new_rows, columns=['Symptom','Kalenderwoche','Anzahl'])

#append these illnesses with 0 'Anzahl'
df_selection_pivot= df_selection_pivot.append(new_values)
df_weekly_symptoms = df_selection_pivot.sort_values('Kalenderwoche').reset_index(drop=True)
df_weekly_symptoms = df_weekly_symptoms.sort_values(['Kalenderwoche','Symptom'],ascending=[True,True])



In [12]:
#create a dictonary that contains a dataframe of the symptoms for each week
df_dict_weeks_symptomps = {}
weeks = list(df_weekly_symptoms["Kalenderwoche"])

dataframes = [df_weekly_symptoms[df_weekly_symptoms["Kalenderwoche"]=='{}'.format(week)] for week in weeks]
dict_symptomps = dict(zip(weeks,dataframes))

In [13]:
#and drop the uncessary columns to be left with a one-hot vector 
for key in dict_symptomps:
    dict_symptomps[key].drop(columns=['Kalenderwoche','Symptom'],inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


# Disease Code

In [15]:
# proxy for testing in rki.local
# os.environ['https_proxy'] = "http://fw-bln.rki.ivbb.bund.de:8020"

# Global
language = "German"
cube = "SurvStat"


# Logging Plugin
class LoggingPlugin(Plugin):
    # Set Printing
    printGeneratedXml = True

    def ingress(self, envelope, http_headers, operation):
        if self.printGeneratedXml:
            print(
                etree.tostring(
                    envelope, pretty_print=True, encoding=sys.stdout.encoding))
        return envelope, http_headers

    def egress(self, envelope, http_headers, operation, binding_options):
        if self.printGeneratedXml:
            print(
                etree.tostring(
                    envelope, pretty_print=True, encoding=sys.stdout.encoding))
        return envelope, http_headers


#create the client object
client = Client(
    'https://tools.rki.de/SurvStat/SurvStatWebService.svc?singleWsdl',
    plugins=[])
# create a type_factory which includes the required data types
factory = client.type_factory('ns2')


# function to get all dimensions from the web Service
# Parameter:
#   cube = "SurvStat"
#   language ="German" || "English"
def GetAllDimensions(cube, language):
    dcRequest = factory.DimensionCollectionRequest(
        Cube=cube, Language=language)
    response = client.service.GetAllDimensions(dcRequest)
    return response


# function to get all hierarchy members from the web Service
# Parameter:
#   cube = "SurvStat"
#   hierarchyId = ~
#   language ="German" || "English"
def GetAllHierarchyMembers(cube, hierarchyId, language):
    hmcRequest = factory.HierarchyMemberCollectionRequest(
        Cube=cube, HierarchyId=hierarchyId, Language=language)
    response = client.service.GetAllHierarchyMembers(hmcRequest)
    return response


# function to get the olap result data from the web Service
# Parameter:
#   ColumnHierarchy: string,
#   Cube: string,
#   HierarchyFilters: ns2:FilterCollection,
#   IncludeNullColumns: bool,
#   IncludeNullRows: bool,
#   IncludeTotalColumn: bool,
#   IncludeTotalRow: bool,
#   Language: string,
#   Measure: string,
#   RowHierarchy: string
def GetOlapResultData(colHierarchy_, cube_, hierarchyFilter_, incNullCols_,
                      incNullRows_, incTotalCol_, incTotalRow_, language_,
                      measure_, rowHierarchy_):
    ordRequest = factory.OlapResultDataRequest(
        ColumnHierarchy=colHierarchy_,
        Cube=cube_,
        HierarchyFilters=hierarchyFilter_,
        IncludeNullColumns=incNullCols_,
        IncludeNullRows=incNullRows_,
        IncludeTotalColumn=incTotalCol_,
        IncludeTotalRow=incTotalRow_,
        Language=language_,
        Measure=measure_,
        RowHierarchy=rowHierarchy_)

    response = client.service.GetOlapResultData(ordRequest)
    return response


colHierarchy = "[PathogenMitKrankheit].[Meldeweg Web71].[Krankheit ID Web71]"

incNullCols = False
incNullRows = False
incTotalCol = False
incTotalRow = False
measure = "[Measures].[FallCount_71]"
rowHierarchy = "[ReportingDate].[YearWeek].[YearWeek]"
resOlapResultData = GetOlapResultData(colHierarchy, cube, None, incNullCols,
                                      incNullRows, incTotalCol, incTotalRow,
                                      language, measure, rowHierarchy)


In [16]:
resOlapResultData.Columns
columns = []
for column_obj in resOlapResultData.Columns.QueryResultColumn:
    caption = column_obj.Caption
    if caption is not None: 
        columns.append(caption)
columns = ['Kalenderwoche'] + columns


In [17]:
rows = []
for query_row in resOlapResultData.QueryResults.QueryResultRow:
    row = []
    row.append(query_row.Caption)
    for i, val in enumerate(query_row.Values.string):
        if i == 0:
            continue
        row_val = int(val.replace('.', '')) if val is not None else 0
        row.append(row_val)
    rows.append(row)

In [20]:
weekly_diseases = pd.DataFrame.from_records(data=rows, columns=columns)
weekly_diseases = weekly_diseases.filter(['Norovirus-Gastroenteritis','Influenza', 'Windpocken','Kalenderwoche'])

weekly_diseases[weekly_diseases["Kalenderwoche"]=='2001-KW01']

Unnamed: 0,Norovirus-Gastroenteritis,Influenza,Windpocken,Kalenderwoche
0,136,7,0,2001-KW01


In [68]:
#Create a dictonary for data frames for each week of the diseases
df_dict_weeks_diseases = {}
weeks = list(weekly_diseases["Kalenderwoche"])

#since length of disease and symptom length is not equal
weeks_diseases =- weekly_diseases

dataframes_diseases = [weekly_diseases[weekly_diseases["Kalenderwoche"]=='{}'.format(week)] for week in weeks]

dict_diseases = dict(zip(weeks_diseases,dataframes_diseases))

TypeError: bad operand type for unary -: 'str'

In [22]:
#and drop the unnecessary colums to get a one-hot vector
for key in dict_diseases:
    dict_diseases[key].drop(columns=['Kalenderwoche'],inplace=True)
    dict_diseases[key] = dict_diseases[key].transpose()
    



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [24]:
#create multi layer perceptron object
from sklearn import neural_network
mlp = neural_network.MLPRegressor(activation='relu', solver='lbfgs',verbose=True,max_iter=300,alpha=0.0001)

In [80]:
#read out all the dictonaries and input them into numpy arrays
X = np.zeros([len(dict_symptomps),18])
Y = np.zeros([len(dict_diseases),3])
for i in range(len(dict_symptomps)):
    X[i] = dict_symptomps[weeks[i]].as_matrix().flatten()
    
for i in range(len(dict_symptomps)):
    Y[i] = dict_diseases[weeks[i]].as_matrix().flatten()
Y = Y[:893]

In [81]:
print(len(Y))

893


In [82]:

mlp.fit(X,Y)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=300, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [86]:
prediction= mlp.predict(dict_symptomps[weeks[5]].as_matrix().flatten())
target = dict_diseases[weeks[5]].as_matrix().flatten()
print(np.round(prediction-target))


ValueError: Expected 2D array, got 1D array instead:
array=[  0   7   0   0   0   0   0 291   0 220 230   0   0   0   0   0  44   0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [94]:
np.random.seed(707)
import tensorflow as tf
from tf.keras.models import Sequential
from tf.keras.layers import Dense, Activation

model = Sequential()
model.add(Dense(100, input_dim=18))
model.add(Activation('relu'))
model.add(Dense(3))
model.add(Activation('relu')) # may take out and test later...
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X[:-100], Y[:-100], epochs=250, batch_size=32)

pred = model.predict(X[-100:])

print(pred.shape)
print(pred[0])
print('\n', Y[-100:][0])

print('\n',np.mean(pred))
print('\n', np.mean(Y[-100:]))

ModuleNotFoundError: No module named 'tf'

In [95]:
tf.VERSION

'1.1.0'