In [1]:
##### Copyright 2021 Lukas Deis.

# This work is licensed under the
# Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License.
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://creativecommons.org/licenses/by-nc/3.0/legalcode
#
# A human readable summary of the License is available at
#
# https://creativecommons.org/licenses/by-nc/3.0/
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Install and Import necessary libraries


In [2]:
"""
In a normal environment the following will install all necessary packages:
!pip install sklearn
!pip install numpy
!pip install pandas
!pip install tensorflow #if posible use -gpu
!pip install pydot
!pip install pydotplus
!pip install graphviz
!pip install datetime
!pip install packaging
!pip install keras
"""

In [3]:
"""
In this specific environment everything was installed like this:
from local_package_installer.local_package_installer import install_local

install_local('sklearn')
install_local('pandas')
#install_local('tensorflow')
#install_local('pydot')
#install_local('pydotplus')
#install_local('graphviz')
install_local('datetime')
install_local('packaging')
install_local('keras')
install_local('numpy==18.4')
install_local('tensorflow-gpu') #tensorflow-gpu
install_local("pyreadstat==1.0.5") # this one required me to manually copy a dll to a different location

"""

In [4]:
"""
#using this function:
#Run following commands in your Python session (only once per virtual machine per Python
# version/environment):
import sys
import subprocess
import re

def installer_local(package):
    try:
        print('Installing local package installer.')
        call = [sys.executable, '-m', 'pip', 'install', '--user', '--upgrade',\
                '--trusted-host=drefilesrv01.researchenvironment.org',\
                '--index-url=http://drefilesrv01.researchenvironment.org/PythonInstaller/',\
                package]
        process = subprocess.Popen(call, stdin=subprocess.PIPE, stdout=subprocess.PIPE,\
                                   stderr=subprocess.STDOUT, universal_newlines=True)
        while True:
            output = process.stdout.readline()
            if process.poll() is not None:
                break
            if any(re.findall(r'error', output.strip(), re.IGNORECASE)):
                raise Exception
            if output:
                print(output.strip())
        print('Even if the package is installed, you possibly have to restart Python before you '\
              'can import the module.')
    except:
        print('Package could not be installed.')

installer_local('pip')
installer_local('local_package_installer')

#Run following commands in your Python session (per Python session):
from local_package_installer.local_package_installer import install_local

#Examples, remove the hashtag and run the command. Replace the package
#name (and version if applicable) for the package you want to install:
#install_local('numpy')
"""

Install for graph: https://graphviz.gitlab.io/download/
maybe follow: https://bobswift.atlassian.net/wiki/spaces/GVIZ/pages/131924165/Graphviz+installation

In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.constraints import max_norm
from tensorflow.keras import layers
import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing
from datetime import datetime
import tensorboard
import pyreadstat
from sklearn.utils import resample

In [6]:
# to see if your computer can utilize its GPU to speed everything up, let's take a look at how many GPUs are available
# if you installed tensorflow-gpu and all necessary CUDA toolkits and drivers it should be at least one
# If you feel like you should see more then you are, try looking at the console in which Jupyter is running
# it might give you an information about which cuda DLLs are missing. 
# Often they end on the version of CUDA you're missing
# Yes, sometimes them seem ancient.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


## Reading in the data

The data is read into a pandas dataframe

Again:\
As the real data is sensitive, large and expensive to use,
for now I use a dummy dataset about adoption-speed

In [7]:
data_file = 'C:/Users/Lukas.Deis/Documents/dataset/MIND_Set_Data_exported.csv'
# convert to csv
dataframe = pd.read_csv(data_file)
#dataframe = dataframe.replace(np.nan,"UNKNOWN")

print('-----------')
#for typ in dataframe.dtypes:
    #print(typ.?)
    #get column name of each non object column and convert to string?
print('-----------')

#Alternative way to read things, less intuitive, other customization-options
#dataframe, meta = pyreadstat.read_sav(data_file,user_missing=True, apply_value_formats=False)
#print(meta.missing_ranges["AQ50q1"])

#If I use the second method with apply_value_formats=False all numerical ones should be fine and I have to only interpret the two dates and strings manually

-----------
-----------


In [8]:
dataframe.head()

Unnamed: 0,record_id,Volgnummer,InformedConsent,SD_0,SD_00,SD_1,LevEduc,GbLandNL,SD_2,SD_2b,...,OQ_45,OQ_Tot,OQ_SD,OQ_IR,OQ_SR,OQ_ASD,OQ_SD_Av,OQ_IR_Av,OQ_SR_Av,OQ_ASD_Av
0,MS16P1A-0005,5,Ja,6/6/2016,Man,3/4/1993,Laag (basisond. of lager/voorbereidend beroeps...,nee,Kosovo,1999.0,...,Vaak,104.0,58.0,28.0,18.0,28.0,2.32,2.54545454545455,2.0,2.33333333333333
1,MS16P1A-0012,12,Ja,6/13/2016,Vrouw,6/29/1963,Laag (basisond. of lager/voorbereidend beroeps...,ja,Nederland,,...,Soms,77.0,41.0,19.0,17.0,23.0,1.64,1.72727272727273,1.88888888888889,1.91666666666667
2,MS16P1A-0013,13,Ja,6/9/2016,Vrouw,11/22/1997,Middel (middelbaar beroepsond./beroepsbegeleid...,ja,Nederland,,...,Zelden,61.0,34.0,12.0,15.0,13.0,1.36,1.09090909090909,1.66666666666667,1.08333333333333
3,MS16P1A-0014,14,Ja,6/22/2016,Vrouw,11/14/1979,Middel (middelbaar beroepsond./beroepsbegeleid...,ja,Nederland,,...,,,,,,,,,,
4,MS16P1A-0017,17,Ja,6/6/2016,Man,9/2/1975,Middel (middelbaar beroepsond./beroepsbegeleid...,ja,Nederland,1975.0,...,Nooit,50.0,23.0,20.0,7.0,8.0,0.92,1.81818181818182,0.777777777777778,0.666666666666667


## Creating the target variable

I have to select the variable I want to train for and drop the columns that are not important or contain that information from the normal dataset.

Valid for the example data:
The task in the Kaggle competition was to predict the speed at which a pet will be adopted (e.g., in the first week, the first month, the first three months, and so on). Let's simplify this for our purposes. It is transformed into a binary classification problem:
I simply predict whether the pet was adopted, or not.

After modifying the label column, 
0 will indicate the person does not experience suicidal ideation, 
1 will indicate it does.

In [9]:
target = "OQ_8" 
string_targets = dataframe[target]
severity_sorting = { #TODO do these values make sense?
    "Nooit": 0.0,
    "Zelden": 0.25,
    "Soms": 0.5,
    "Vaak": 0.75,
    "Bijna altijd": 1
}


target_float = string_targets.map(severity_sorting)
target_categorical = np.where(target_float > severity_sorting["Nooit"], 1, 0)
dataframe['target'] = target_categorical # TODO this is now simple classification (0 or 1) but it could be more defined, would that not be better?

# Drop un-used columns. (including our now target which can not be used for training)
unused_cols = [target]
dataframe = dataframe.drop(columns=unused_cols)

# Patients that did not answer the target question can not be evaluated and are thus removed.
dataframe = dataframe[dataframe['target'].notna()]


tf.print("targets:", dataframe['target'])
#TODO note this in the report

targets: 0      1
1      0
2      1
3      0
4      0
      ..
700    0
701    0
702    0
703    0
704    0
Name: target, Length: 705, dtype: int32


In [10]:
dataframe.head()

Unnamed: 0,record_id,Volgnummer,InformedConsent,SD_0,SD_00,SD_1,LevEduc,GbLandNL,SD_2,SD_2b,...,OQ_Tot,OQ_SD,OQ_IR,OQ_SR,OQ_ASD,OQ_SD_Av,OQ_IR_Av,OQ_SR_Av,OQ_ASD_Av,target
0,MS16P1A-0005,5,Ja,6/6/2016,Man,3/4/1993,Laag (basisond. of lager/voorbereidend beroeps...,nee,Kosovo,1999.0,...,104.0,58.0,28.0,18.0,28.0,2.32,2.54545454545455,2.0,2.33333333333333,1
1,MS16P1A-0012,12,Ja,6/13/2016,Vrouw,6/29/1963,Laag (basisond. of lager/voorbereidend beroeps...,ja,Nederland,,...,77.0,41.0,19.0,17.0,23.0,1.64,1.72727272727273,1.88888888888889,1.91666666666667,0
2,MS16P1A-0013,13,Ja,6/9/2016,Vrouw,11/22/1997,Middel (middelbaar beroepsond./beroepsbegeleid...,ja,Nederland,,...,61.0,34.0,12.0,15.0,13.0,1.36,1.09090909090909,1.66666666666667,1.08333333333333,1
3,MS16P1A-0014,14,Ja,6/22/2016,Vrouw,11/14/1979,Middel (middelbaar beroepsond./beroepsbegeleid...,ja,Nederland,,...,,,,,,,,,,0
4,MS16P1A-0017,17,Ja,6/6/2016,Man,9/2/1975,Middel (middelbaar beroepsond./beroepsbegeleid...,ja,Nederland,1975.0,...,50.0,23.0,20.0,7.0,8.0,0.92,1.81818181818182,0.777777777777778,0.666666666666667,0


In [11]:
# To preprocess the input into usable data, we need to know which column contains what kind of data.
# Data can either be:
#     - a scalar value (a fee one has to pay)
#     - a numeric value that should be interpreted as categorical (age in groups)
#     - a string that should be interpreted as categorical ( very,a bit, not really, no)
# Usually not everything is encoded that nicely, in this dataset there are some dates that can not easily be converted.
#     - columns with type date need to be converted to a value of age (in years) and then sorted into categories, before further processing as categorical, numeric values


In [12]:
# read different types for coulumns to treat them accordingly
# TODO read the headers from headers.csv here and remove the old hardcoded keys
numerical_features = []
categorical_int_features = []
categorical_cols = [] 
date_cols = []
year_cols = []
to_be_removed = []
headers_loc = 'C:/Users/Lukas.Deis/Documents/dataset/headers.csv'
headers = pd.read_csv(headers_loc) # TODO I should use a JSON file for this

headers.pop("notes")
type_sorting = { # todo, this should be stored in a JSON file instead of code
    "date": date_cols,
    "year": year_cols,
    "skalar": numerical_features,
    "categorical_int": categorical_int_features,
    "categorical_string": categorical_cols, 
    "remove": to_be_removed # cols like id's and the date of the test that don't actually carry information for the prediction
}

for row in headers.itertuples(): # TODO can I iterate through a df like that?
    heading, col_type = row.heading, row.type
    # the target variable, should not be sorted as it is removed from the dataset earlier.
    if heading == target:
        print(row)
        pass
    right_column_list = type_sorting.get(col_type, lambda: tf.print("invalid type:", col_type, " for column:", heading)) # find the type of this key #TODO fix that instead of a lambda, the error message is give, otherwise this leads to: "AttributeError: 'function' object has no attribute 'append'" later on
    try:
        right_column_list.append(heading) # append the key to the right col
    except:
        tf.print("ERROR: Something went wrong while reading the headers.csv file")
        tf.print("       Could it be that '", col_type, "' is not actually a valid type for a heading? It is used in row: ", heading)

Pandas(Index=404, heading='OQ_8', type='categorical_string')


In [13]:
#to look at a specific list of columns, the corresponding name just needs to be uncommented
tf.print(
    #target
    #numerical_features
    #categorical_int_features
    #categorical_cols
    #date_cols
    #year_cols
    #to_be_removed
)




In [14]:
# age and date columns are easiet to work with if they are in time-format. 
# Thus the columns are converted to time since [date / year] and added to the categorical_int_features as ages in years
for date_col in date_cols:
    # convert to years
    dataframe[date_col] = dataframe[date_col].replace(' ',np.nan, regex=True) #replace empty strings with parsable NaN
    dataframe[date_col] = pd.to_datetime(dataframe[date_col], format='%m/%d/%Y') # convert to date format
    dataframe[date_col] = pd.DatetimeIndex(dataframe[date_col]).year# take only year of date
    # add to year cols
    year_cols.append(date_col)
    
for year_col in year_cols:
    current_year = 2020
    dataframe[year_col] = dataframe[year_col].replace(' ',np.nan, regex=True) #replace empty strings with parsable NaN
    dataframe[year_col] = current_year - dataframe[year_col].astype(float)
    # remove NaN values as they would break further computations
    # this bears the risk of seeing factors in the wrong way, 0 can mean: always, never, don't know...
    dataframe[year_col] = dataframe[year_col].replace(np.nan,0, regex=True) 
    # add to categorical_int features because that is what ages are
    categorical_int_features.append(year_col)
    print(dataframe[year_col])
   

0      21.0
1       0.0
2       0.0
3       0.0
4      45.0
       ... 
700     0.0
701     0.0
702     0.0
703    33.0
704     0.0
Name: SD_2b, Length: 705, dtype: float64
0      4.0
1      4.0
2      4.0
3      4.0
4      4.0
      ... 
700    1.0
701    0.0
702    0.0
703    0.0
704    0.0
Name: SD_0, Length: 705, dtype: float64
0      27.0
1      57.0
2      23.0
3      41.0
4      45.0
       ... 
700    22.0
701    43.0
702    27.0
703    58.0
704    21.0
Name: SD_1, Length: 705, dtype: float64


In [15]:
# categorical_int_features are all ages
# while age in general is important, small differences in age max cause more confusion than clarity
# 20 VS 40 should be considered by the network, 20 VS 21 not so much
# age should be considered in categories of age-groups, but the network can figure all that out for itself
# for simplicity, age will considered as a normal numeric value, any complex relationship to the outcome should be learned

# It happens that all categorical_ints in this set are ages and none are left afterwards
# If that was different, one would need to do this differently
# TODO make a different tag "age" that can be used or just mark ages as skalars
numerical_features.extend(categorical_int_features)
categorical_int_features = []


In [16]:
dataframe.dtypes
# make numerical columns readable as floats
for feature in numerical_features:
    dataframe[feature] = dataframe[feature].replace(' ',0, regex=True) # used to be np.NaN, but that does not work too well
    dataframe[feature] = dataframe[feature].astype(float)

In [17]:
# remove cells that should not be considered (generalizable (long texts) or do not carry value (ID's))
dataframe = dataframe.drop(columns=to_be_removed)

## Spliting the dataframe into train, validation, and test

The loaded dataset was a single file. It has to be split into train, validation, and test sets.

In [18]:
train, test = train_test_split(dataframe, test_size=0.25)
train, val = train_test_split(train, test_size=0.1)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

475 train examples
53 validation examples
177 test examples


In [19]:
# To tackle the class implanace, the minority-class will be oversampled
# separate the two classes
# S_I = Suicidal Ideation
do_experience_S_I = train[train.target==1]
do_not_experience_S_I = train[train.target==0]

tf.print("Number of samples that DO experience suicidal ideation", len(do_experience_S_I))
tf.print("Number of samples that do NOT experience suicidal ideation", len(do_not_experience_S_I))

upsampled_do_experience_S_I = resample(
    do_experience_S_I,
    replace=True, # sampling WITH replacement
    n_samples=len(do_not_experience_S_I), #so the ammount of samples is the same for both classes
    random_state=20 # so the results are reproducable (like seed)
    )

#overwrite the old dataframe with the new, balanced one
balanced_frame = pd.concat([upsampled_do_experience_S_I, do_not_experience_S_I])
train = balanced_frame
print("after resampling, the number of targets is balanced:")
tf.print(train.target.value_counts())
#TODO note this in the report

Number of samples that DO experience suicidal ideation 181
Number of samples that do NOT experience suicidal ideation 294
after resampling, the number of targets is balanced:
1    294
0    294
Name: target, dtype: int64


## Input pipeline

The dataframe is wrapped with [tf.data](https://www.tensorflow.org/guide/datasets).
This is done to easily shuffle and batch the data. 

If the RAM is not sufficient, tf.data could be used directly to read it from disk in batches.

In [20]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=1):
    dataframe = dataframe.copy()
    labels = dataframe.pop('target')
    
    #tf.print(dataframe.dtypes) #[539 rows x 396 columns] when including dates
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    
    #ds = tf.data.Dataset.from_tensor_slices((values, labels.values))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

The general pipeline for input is finished here.
What does it look like?

In [21]:
batch_size = 1
train_ds = df_to_dataset(train, batch_size=batch_size)

In [22]:
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()), "\n")
print('A batch of PTSDFinal:', train_features['PTSDFinal'], "\n")
print('A batch of targets:', label_batch )

Every feature: ['SD_0', 'SD_00', 'SD_1', 'LevEduc', 'GbLandNL', 'SD_2', 'SD_2b', 'SD_3', 'SD_4', 'SD_3b', 'SD_3b_1', 'SD_4b', 'SD_4b_1', 'SD_6', 'SD_7', 'SD_8', 'SD_9', 'SD_10', 'SD_12', 'SD_13', 'SD_15', 'SD_15_1', 'SD_15_2', 'SD_16', 'SD_16_1', 'SD_16_2', 'SD_18', 'SD_19', 'ASI1', 'ASI2', 'ASI3', 'ASI4', 'ASI5', 'ASI6', 'ASI7', 'ASI8', 'ASI9', 'ASI10', 'ASI11', 'ASI12', 'ASI13', 'ASI14', 'ASI15', 'ASI16', 'ASI_Tot', 'ids_1', 'ids_2', 'ids_3', 'ids_4', 'ids_5', 'ids_6', 'ids_7', 'ids_8', 'ids_9', 'ids_9A', 'ids_9B', 'ids_10', 'ids_11', 'ids_12', 'ids_13', 'ids_14', 'ids_15', 'ids_16', 'ids_17', 'ids_19', 'ids_20', 'ids_21', 'ids_22', 'ids_23', 'ids_24', 'ids_25', 'ids_26', 'ids_27', 'ids_28', 'ids_29', 'ids_30', 'IDS_Tot', 'CAARSq1', 'CAARSq2', 'CAARSq3', 'CAARSq4', 'CAARSq5', 'CAARSq6', 'CAARSq7', 'CAARSq8', 'CAARSq9', 'CAARSq10', 'CAARSq11', 'CAARSq12', 'CAARSq13', 'CAARSq14', 'CAARSq15', 'CAARSq16', 'CAARSq17', 'CAARSq18', 'CAARSq19', 'CAARSq20', 'CAARSq21', 'CAARSq22', 'CAARSq23',

The dataset returns a dictionary of column names (from the dataframe) that map to column values from rows in the dataframe.

## Preprocessing layers

I will have to adapt the pipelines when I replace the dummy-code, but afterwards I will be able to input plain string data etc from new data as well.

Information about the pre-processing layers for easy access when I am there:

*   [`Normalization`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/Normalization) - Feature-wise normalization of the data.
*   [`CategoryEncoding`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/CategoryEncoding) - Category encoding layer.
*   [`StringLookup`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/StringLookup) - Maps strings from a vocabulary to integer indices.
*   [`IntegerLookup`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/IntegerLookup) - Maps integers from a vocabulary to integer indices.

A list of available preprocessing layers can be found [here](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing).

### Numeric columns
A Normalization() layer ensures that each numeric feature has a mean of 0 and a standard deviation of 1.

The `get_normalization_layer` function returns a keras layer.
It applies featurewise normalization to numerical features.

In [23]:
def get_normalization_layer(name, dataset):
    # Create a Normalization layer for our feature.
    normalizer = preprocessing.Normalization()

    # Prepare a Dataset that only yields our feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)

    return normalizer

In [24]:
""" old example
photo_count_col = train_features['PhotoAmt']
layer = get_normalization_layer('PhotoAmt', train_ds)
layer(photo_count_col)
"""

" old example\nphoto_count_col = train_features['PhotoAmt']\nlayer = get_normalization_layer('PhotoAmt', train_ds)\nlayer(photo_count_col)\n"

TODO: If I will indeed have many numeric features (hundreds, or more), it would be more efficient to concatenate them first and use a single [normalization](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/Normalization) layer.

### Categorical columns

In the dummy dataset, Type is represented as a string (e.g. 'Dog', or 'Cat'). Sadly, one can not feed strings directly to a model. The preprocessing layer takes care of representing strings as a one-hot vector.

The `get_category_encoding_layer` function returns a layer, mapping values from a vocabulary to integer indices and one-hot encodes the features.

In [25]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a StringLookup layer which will turn strings into integer indices
  if dtype == 'string':
    index = preprocessing.StringLookup(max_tokens=max_tokens)
  else:
    index = preprocessing.IntegerLookup(max_values=max_tokens)

  # Prepare a Dataset that only yields our feature
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Create a Discretization for our integer indices.
  encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

  # Prepare a Dataset that only yields our feature.
  feature_ds = feature_ds.map(index)

  # Learn the space of possible indices.
  encoder.adapt(feature_ds)

  # Apply one-hot encoding to our indices. The lambda function captures the
  # layer so we can use them, or include them in the functional model later.
  return lambda feature: encoder(index(feature))

## Choosing and preparing columns to use

While we can deal with all types of data, we have to make a list of all columns for each type.\
That way I am able to define which layer needs to be treated how\

In [26]:
batch_size = 1
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [27]:
all_inputs = []
encoded_features = []

##bookkeeping for interpretation
output_sizes = {}

# Numeric features.
for header in numerical_features:  # TODO use all headers in UMC set minus the ones I know are something else
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)
    output_sizes[header] = encoded_numeric_col.get_shape()[1]

In [28]:
# Categorical features encoded as integers.

# TODO at the UMC data, this will be more common, some tests have a categorical scale 
# However, most of them can just be interpreted as normal numerical feature, so I won't have to overdo it

# Numeric features.
for header in categorical_int_features:  
    print(header)
    num_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
    encoding_layer = get_category_encoding_layer(header, train_ds, dtype='int64',
                                                 max_tokens=5)
    encoded_col = encoding_layer(num_col)
    all_inputs.append(num_col)
    encoded_features.append(encoded_col)
    output_sizes[header] = encoded_col.get_shape()[1]

In [29]:
# Categorical features encoded as string.
#TODO include progress-bar
categorical_cols.remove(target)
for header in categorical_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
    encoding_layer = get_category_encoding_layer(header, train_ds, dtype='string',
                                               max_tokens=5) # TODO maybe, this line has to be duplicated and slightly changed to accomodate for different max_tokens
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)
    output_sizes[header] = encoded_categorical_col.get_shape()[1]

In [30]:
# Currently I do not think the UMC data needs to be balanced.
# It will be evaluated on the same dataset (though a different part of it)
# We do not have a large number of samples that are underrepresented, probably causing large inaccura

#use:
#    https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

## The model


In [31]:
# The first step towards a working model
# is our preprocessed input.
# As that is a relative complex task, that is regarded it's owy model.

preprocessed_layers = layers.Concatenate()(encoded_features) #encoded_features
preprocesessing_model = tf.keras.Model(all_inputs, preprocessed_layers)
preprocesessing_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
SD_00 (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
LevEduc (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
GbLandNL (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
SD_2 (InputLayer)               [(None, 1)]          0                                            
_______________________________________________________________________________________

string_lookup_122 (StringLookup (None, 1)            0           AQ50q26[0][0]                    
__________________________________________________________________________________________________
string_lookup_123 (StringLookup (None, 1)            0           AQ50q27[0][0]                    
__________________________________________________________________________________________________
string_lookup_124 (StringLookup (None, 1)            0           AQ50q28[0][0]                    
__________________________________________________________________________________________________
string_lookup_125 (StringLookup (None, 1)            0           AQ50q29[0][0]                    
__________________________________________________________________________________________________
string_lookup_126 (StringLookup (None, 1)            0           AQ50q30[0][0]                    
__________________________________________________________________________________________________
string_loo

__________________________________________________________________________________________________
category_encoding_183 (Category (None, 5)            0           string_lookup_183[0][0]          
__________________________________________________________________________________________________
category_encoding_184 (Category (None, 5)            0           string_lookup_184[0][0]          
__________________________________________________________________________________________________
category_encoding_185 (Category (None, 5)            0           string_lookup_185[0][0]          
__________________________________________________________________________________________________
category_encoding_186 (Category (None, 5)            0           string_lookup_186[0][0]          
__________________________________________________________________________________________________
category_encoding_187 (Category (None, 5)            0           string_lookup_187[0][0]          
__________

Within the models structure, there are repetetive patterns.

For readability those layers are combined into custom layers and models:

In [32]:
# A combination of layers, common in the parameterizer

class ParameterizerLayer(layers.Layer):
    
    def __init__(self, out_shape, dropout_rate):
        super(ParameterizerLayer, self).__init__()
        self.para_lin = layers.Dense(out_shape, activation='linear')
        self.para_drop = layers.Dropout(dropout_rate)
        self.para_relu = layers.Dense(out_shape, activation=tf.keras.layers.LeakyReLU(alpha=0.05))
        
    
    def call(self, input_tensor,  training=False):
        x = self.para_lin(input_tensor)
        if training:
            x = self.para_drop(x, training=training)
        x = self.para_relu(x)        
        return x
    
# should minimize robustness loss

In [42]:
#functional model def

#TODO no more static sizes
batch_size = 1
preprocessed_inputs_shape = 1930 #TODO why does this change???
dropout_rate=0.1
hidden_sizes = [200, 100, 50, 50]  # TODO fix this to actual size
out_shape = preprocessed_inputs_shape

input_shape = [batch_size, preprocessed_inputs_shape]
input_layer = layers.Input(batch_shape = input_shape)

x = input_layer
###Parameterizer###
x = ParameterizerLayer(hidden_sizes[0], dropout_rate)(x)
x = ParameterizerLayer(hidden_sizes[1], dropout_rate)(x)
x = ParameterizerLayer(hidden_sizes[2], dropout_rate)(x)
x = ParameterizerLayer(hidden_sizes[3], dropout_rate)(x)
x = layers.Dense(out_shape, activation='linear')(x)
relevances = layers.Dropout(rate=dropout_rate, name="relevances")(x)

###Conceptizer###
concepts = layers.Lambda(lambda t: t, name="concepts")(input_layer)

###Aggregator###

aggregated = layers.multiply([relevances, concepts])
aggregated = layers.Lambda(lambda t: tf.keras.backend.sum(t, axis=-1))(aggregated)
aggregated = layers.Lambda(lambda t: tf.keras.activations.sigmoid(t), name="output")(aggregated)

out_layer = [aggregated, concepts, relevances]

functional_model = tf.keras.Model(inputs=input_layer, outputs=out_layer)

In [43]:
#custom fun with more inputs

def get_custom_loss(some_other_argument):
    
    def custom_loss(y_true, y_pred): 
        loss = 0
        loss = loss + some_other_argument
        loss = keras.losses.binary_crossentropy(y_true, y_pred)
        return loss
    
    return custom_loss

In [44]:
def zero_loss(y_true, y_pred):
    return 0

In [45]:
loss_dict = {
    "relevances": keras.losses.mean_absolute_error, #get_custom_loss(some_other_argument=1),
    "output": keras.losses.binary_crossentropy, #tf.nn.log_poisson_loss,
    #"concepts": zero_loss
}

functional_model.compile(
    optimizer="adam", 
    loss=loss_dict,
    loss_weights=[1, 5, 0],
    metrics= ['accuracy']
)  
functional_model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(1, 1930)]          0                                            
__________________________________________________________________________________________________
parameterizer_layer_4 (Paramete (1, 200)             426400      input_2[0][0]                    
__________________________________________________________________________________________________
parameterizer_layer_5 (Paramete (1, 100)             30200       parameterizer_layer_4[0][0]      
__________________________________________________________________________________________________
parameterizer_layer_6 (Paramete (1, 50)              7600        parameterizer_layer_5[0][0]      
_______________________________________________________________________________________

In [46]:
# do pre-processing of data separately
processed_train_ds = train_ds.map(
  lambda x, y: (
      tf.cast(preprocesessing_model(x), dtype=tf.float32), # TODO this breaks if the batch-size is anything but 1 
      tf.cast(y, dtype=tf.float32)
  )
)

In [47]:
for d in processed_train_ds.enumerate():
    tf.print(d)

(0, ([[-0.23205778 -0.421609 0.254957527 ... 0 0 0]], [0]))
(1, ([[0.164311022 0.838466287 0.254957527 ... 1 0 0]], [1]))
(2, ([[-0.826611 -1.57211256 -0.468105257 ... 0 0 1]], [0]))
(3, ([[-1.32207203 -1.57211256 -1.67320991 ... 0 0 0]], [0]))
(4, ([[0.758864224 1.60546863 0.495978475 ... 0 0 0]], [1]))
(5, ([[0.164311022 -0.0381078348 0.254957527 ... 0 0 0]], [0]))
(6, ([[1.15523303 0.728894532 0.97802031 ... 1 0 0]], [0]))
(7, ([[-1.32207203 -1.57211256 -1.67320991 ... 0 0 0]], [0]))
(8, ([[1.25432527 1.55068266 0.97802031 ... 0 0 0]], [1]))
(9, ([[0.0652188286 0.181035697 0.97802031 ... 1 0 0]], [1]))
(10, ([[0.164311022 0.674108624 0.736999393 ... 0 0 1]], [1]))
(11, ([[-0.331149966 0.56453687 -0.950147152 ... 0 0 0]], [1]))
(12, ([[-0.0338733718 1.16718161 0.495978475 ... 0 0 1]], [1]))
(13, ([[0.461587638 -0.914681911 0.0139365969 ... 0 0 1]], [0]))
(14, ([[1.55160189 1.71504033 0.495978475 ... 0 1 0]], [1]))
(15, ([[-1.22297978 -0.805110157 -1.43218899 ... 0 1 0]], [0]))
(16, (

(130, ([[-0.826611 0.56453687 -0.468105257 ... 0 0 1]], [1]))
(131, ([[-0.727518797 -1.29818308 -0.950147152 ... 1 0 0]], [0]))
(132, ([[-1.22297978 -0.859896064 -1.19116807 ... 0 1 0]], [0]))
(133, ([[1.25432527 1.33153915 0.736999393 ... 0 1 0]], [1]))
(134, ([[-1.32207203 -1.57211256 -1.67320991 ... 0 0 0]], [0]))
(135, ([[0.560679853 0.838466287 0.97802031 ... 0 0 0]], [1]))
(136, ([[1.3534174 0.509750962 -1.43218899 ... 0 0 0]], [0]))
(137, ([[0.0652188286 0.619322717 0.254957527 ... 1 0 0]], [1]))
(138, ([[-0.628426552 0.181035697 0.97802031 ... 0 1 0]], [1]))
(139, ([[-0.529334366 0.34539333 -0.709126174 ... 1 0 0]], [1]))
(140, ([[0.164311022 0.674108624 0.736999393 ... 0 0 1]], [1]))
(141, ([[0.758864224 1.60546863 1.46006215 ... 0 0 0]], [1]))
(142, ([[0.758864224 -0.476394892 0.495978475 ... 1 0 0]], [0]))
(143, ([[1.15523303 1.16718161 -0.227084339 ... 0 0 0]], [1]))
(144, ([[1.74978626 1.22196746 -0.227084339 ... 0 0 0]], [1]))
(145, ([[0.0652188286 -0.147679597 0.97802031

(259, ([[0.0652188286 0.400179207 1.21904123 ... 1 0 0]], [1]))
(260, ([[-1.02479541 -0.695538402 -1.19116807 ... 0 0 1]], [0]))
(261, ([[-0.13296558 -1.4077549 -0.950147152 ... 0 0 1]], [0]))
(262, ([[-0.0338733718 0.235821575 0.0139365969 ... 0 1 0]], [0]))
(263, ([[-0.13296558 -0.585966647 -0.227084339 ... 0 0 0]], [0]))
(264, ([[-1.12388754 -1.4077549 -0.950147152 ... 0 1 0]], [0]))
(265, ([[0.362495422 -0.0928937122 0.97802031 ... 1 0 0]], [0]))
(266, ([[0.659772038 0.400179207 1.9421041 ... 0 0 0]], [0]))
(267, ([[-0.628426552 0.893252134 0.254957527 ... 0 0 0]], [0]))
(268, ([[-0.628426552 -1.35296893 -1.19116807 ... 0 0 1]], [0]))
(269, ([[0.0652188286 -1.46254075 -1.67320991 ... 0 0 1]], [0]))
(270, ([[-0.727518797 -0.969467819 -0.709126174 ... 1 0 0]], [0]))
(271, ([[-0.727518797 0.181035697 0.0139365969 ... 0 0 1]], [1]))
(272, ([[-1.02479541 0.181035697 0.495978475 ... 0 0 1]], [0]))
(273, ([[-0.628426552 0.893252134 -0.227084339 ... 1 0 0]], [1]))
(274, ([[1.25432527 1.276

(387, ([[0.362495422 -0.0381078348 -0.227084339 ... 0 0 1]], [1]))
(388, ([[-0.925703168 -1.18861139 -1.19116807 ... 0 0 1]], [0]))
(389, ([[0.560679853 0.290607452 1.21904123 ... 1 0 0]], [1]))
(390, ([[-0.331149966 0.0166780483 -0.227084339 ... 0 1 0]], [1]))
(391, ([[-0.628426552 0.181035697 0.97802031 ... 0 1 0]], [1]))
(392, ([[-0.529334366 0.126249805 1.46006215 ... 0 1 0]], [0]))
(393, ([[2.83980036 0.619322717 0.495978475 ... 0 0 1]], [1]))
(394, ([[1.25432527 0.838466287 0.495978475 ... 0 0 0]], [1]))
(395, ([[0.164311022 0.0166780483 0.97802031 ... 0 0 0]], [0]))
(396, ([[0.362495422 0.0166780483 0.97802031 ... 0 0 0]], [1]))
(397, ([[-0.727518797 -1.35296893 -1.67320991 ... 1 0 0]], [0]))
(398, ([[-0.23205778 0.893252134 0.254957527 ... 0 1 0]], [0]))
(399, ([[2.9388926 1.82461202 0.736999393 ... 1 0 0]], [1]))
(400, ([[-1.32207203 -1.57211256 -1.67320991 ... 0 0 0]], [0]))
(401, ([[0.560679853 0.509750962 0.495978475 ... 1 0 0]], [0]))
(402, ([[1.55160189 0.290607452 0.7369

(515, ([[0.164311022 0.290607452 1.70108318 ... 0 0 1]], [1]))
(516, ([[-0.925703168 0.783680379 -0.709126174 ... 0 0 0]], [1]))
(517, ([[4.82164478 2.15332723 1.9421041 ... 0 0 0]], [1]))
(518, ([[-0.628426552 0.893252134 -0.227084339 ... 1 0 0]], [1]))
(519, ([[0.0652188286 0.948038042 0.495978475 ... 0 0 0]], [1]))
(520, ([[-0.826611 0.400179207 0.736999393 ... 0 0 1]], [0]))
(521, ([[0.461587638 0.454965085 0.495978475 ... 0 0 0]], [1]))
(522, ([[2.9388926 1.82461202 0.736999393 ... 1 0 0]], [1]))
(523, ([[0.164311022 0.235821575 0.254957527 ... 0 0 1]], [1]))
(524, ([[-0.826611 -0.257251352 0.0139365969 ... 0 0 0]], [0]))
(525, ([[-1.32207203 -1.57211256 -1.67320991 ... 0 0 0]], [0]))
(526, ([[1.25432527 1.27675331 -0.227084339 ... 0 0 0]], [1]))
(527, ([[0.164311022 0.0166780483 0.254957527 ... 1 0 0]], [1]))
(528, ([[-0.529334366 -1.46254075 -0.950147152 ... 0 0 1]], [0]))
(529, ([[-0.430242181 0.290607452 0.97802031 ... 0 0 1]], [0]))
(530, ([[0.560679853 1.00282395 -0.70912617

In [48]:
# unwanted points: ['record_id', 'Volgnummer', 'InformedConsent', 'SD_0', 'SD_1', 'SD_2b', 'SD_14']
# Some of them should indeed have been removed from the model, 
# some are dates that still need to be added

In [49]:
# Define the Keras TensorBoard callback, used for the animated, interactive tensorboard visualizatioon
logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

#This should plot the exhaustive graph, but is a bit unreliable
tf.keras.utils.plot_model(functional_model, show_shapes=True, rankdir="LR")

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [50]:
#tf.print("processed_train_ds shape:", processed_train_ds.take(0))
functional_model.fit(processed_train_ds, epochs=5, callbacks=tensorboard_callback)

Epoch 1/5
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2349b63c088>

Let's visualize our connectivity graph:


### Test the model


In [51]:
# do pre-processing of data separately
processed_test_ds = test_ds.map(
  lambda x, y: (
      tf.cast(preprocesessing_model(x), dtype=tf.float32), 
      tf.cast(y, dtype=tf.float32)
  )
)

In [52]:
accuracy = functional_model.evaluate(processed_test_ds)
print("Accuracy", accuracy)

Accuracy [0.5799718499183655, 0.5799718499183655, 0.46620601415634155, 0.8135592937469482, 0.033898305147886276, 0.0]


In [53]:
#divide x and y of test set
x = []
y_true = []

for x_var, y_var in processed_test_ds:
    x.append(x_var)
    y_true.append(y_var[0])

In [54]:
##imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import (
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)

In [55]:
## basic metrics

#skip NaN values here and in analysis later? TODO

def get_true_pos(y, pred, th):
    pred_t = (pred > th)
    return np.sum((pred_t == True) & (y == 1))


def get_true_neg(y, pred, th):
    pred_t = (pred > th)
    return np.sum((pred_t == False) & (y == 0))


def get_false_neg(y, pred, th):
    pred_t = (pred > th)
    return np.sum((pred_t == False) & (y == 1))


def get_false_pos(y, pred, th):
    pred_t = (pred > th)
    return np.sum((pred_t == True) & (y == 0))

def get_acc(tp, tn, fp, fn):
    total = sum([tp, tn, fp, fn])
    correct = sum([tp, tn])
    return correct / total

def get_prevalence(tp, tn, fp, fn):
    return (tp + fn) / (tp + tn + fp + fn)

def get_specificity(tp, tn, fp, fn):
    return tn / (tn + fp)

def get_sensitivity(tp, tn, fp, fn):
    return tp / (tp + fn)

def get_PPV(tp, tn, fp, fn):
    return (tp / (tp + fp))

def get_NPV(tp, tn, fp, fn):
    return (tn / (fn + tn))

In [56]:
#### based on coursera util.py for metrics


def get_performance_metrics(y, pred, class_labels, threshold, 
                            tp=get_true_pos,
                            tn=get_true_neg, fp=get_false_pos,
                            fn=get_false_neg,
                            acc=get_acc, prevalence=get_prevalence, spec=get_specificity,
                            sens=get_sensitivity, ppv=get_PPV, npv=get_NPV, auc=None, f1=None):

    columns = ["Label", "TP", "TN", "FP", "FN", "Accuracy", "Prevalence",
               "Sensitivity",
               "Specificity", "PPV", "NPV", "AUC", "F1", "Threshold"]
    df = pd.DataFrame(columns=columns)
    for i in range(len(class_labels)): ## i is the concerning class in each iteration
        class_pred = pred[i]
        #for separate classes:
        #count_preds = len(class_pred) # the class was tried to predict as often as a prediction was made
        #class_y = np.repeat(i, count_preds) # we filter for one class only anyway -> all the same
        #for one class:
        class_y = y[i]
        ## get base metrics
        true_p = round(tp(class_y, class_pred, threshold),3) if tp != None else "Not Defined"
        true_n = round(tn(class_y, class_pred, threshold),3) if tn != None else "Not Defined"
        false_p = round(fp(class_y, class_pred, threshold),3) if fp != None else "Not Defined"
        false_n = round(fn(class_y, class_pred, threshold),3) if fn != None else "Not Defined"
        
        ## construct df for all data concerning class
        row_data = {
            "Label": class_labels[i],
            "TP": true_p,
            "TN": true_n,
            "FP": false_p,
            "FN": false_n,
            "Accuracy": round(acc(true_p, true_n, false_p, false_n), 3) if acc != None else "Not Defined",
            "Prevalence": round(prevalence(true_p, true_n, false_p, false_n), 3) if prevalence != None else "Not Defined",
            "Sensitivity": round(sens(true_p, true_n, false_p, false_n), 3) if sens != None else "Not Defined",
            "Specificity": round(spec(true_p, true_n, false_p, false_n), 3) if spec != None else "Not Defined",
            "PPV": round(ppv(true_p, true_n, false_p, false_n), 3) if ppv != None else "Not Defined",
            "NPV": round(npv(true_p, true_n, false_p, false_n), 3) if npv != None else "Not Defined",
            "AUC": round(auc(class_y, class_pred), 3) if auc != None else "Not Defined",
            "F1": round(f1(class_y, class_pred > threshold), 3) if f1 != None else "Not Defined",
            "Threshold": round(threshold, 3)
        }
        tf.print("One row of metrics:", row_data)
        df = df.append(row_data, ignore_index=True)
    return df


def print_confidence_intervals(class_labels, statistics):
    df = pd.DataFrame(columns=["Mean AUC (CI 5%-95%)"])
    for i in range(len(class_labels)):
        mean = statistics.mean(axis=1)[i]
        max_ = np.quantile(statistics, .95, axis=1)[i]
        min_ = np.quantile(statistics, .05, axis=1)[i]
        df.loc[class_labels[i]] = ["%.2f (%.2f-%.2f)" % (mean, min_, max_)]
    return df


def get_curve(gt, pred, target_names, curve='roc'):
    for i in range(len(target_names)):
        if curve == 'roc':
            curve_function = roc_curve
            auc_roc = roc_auc_score(gt[:, i], pred[:, i])
            label = target_names[i] + " AUC: %.3f " % auc_roc
            xlabel = "False positive rate"
            ylabel = "True positive rate"
            a, b, _ = curve_function(gt[:, i], pred[:, i])
            plt.figure(1, figsize=(7, 7))
            plt.plot([0, 1], [0, 1], 'k--')
            plt.plot(a, b, label=label)
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)

            plt.legend(loc='upper center', bbox_to_anchor=(1.3, 1),
                       fancybox=True, ncol=1)
        elif curve == 'prc':
            precision, recall, _ = precision_recall_curve(gt[:, i], pred[:, i])
            average_precision = average_precision_score(gt[:, i], pred[:, i])
            label = target_names[i] + " Avg.: %.3f " % average_precision
            plt.figure(1, figsize=(7, 7))
            plt.step(recall, precision, where='post', label=label)
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.ylim([0.0, 1.05])
            plt.xlim([0.0, 1.0])
            plt.legend(loc='upper center', bbox_to_anchor=(1.3, 1),
                       fancybox=True, ncol=1)


In [57]:
#make predictions
y_pred = []
stored_concepts = []
stored_relevances = []

for x_var in x:
    aggregated, concepts, relevances = functional_model.predict(x_var)
    y_pred.append(aggregated[0])
    stored_concepts.append(concepts)
    stored_relevances.append(relevances)
y_pred = np.array(y_pred)

In [58]:
y_true_array = np.array(y_true)
class_labels = ['suicidal ideation']

classification_thres = 0.5
metrics = get_performance_metrics([y_true_array], [y_pred], class_labels, classification_thres)
metrics

One row of metrics: {'AUC': 'Not Defined',
 'Accuracy': 0.814,
 'F1': 'Not Defined',
 'FN': 20,
 'FP': 13,
 'Label': 'suicidal ideation',
 'NPV': 0.818,
 'PPV': 0.806,
 'Prevalence': 0.418,
 'Sensitivity': 0.73,
 'Specificity': 0.874,
 'TN': 90,
 'TP': 54,
 'Threshold': 0.5}


Unnamed: 0,Label,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,suicidal ideation,54,90,13,20,0.814,0.418,0.73,0.874,0.806,0.818,Not Defined,Not Defined,0.5


In [59]:
functional_model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(1, 1930)]          0                                            
__________________________________________________________________________________________________
parameterizer_layer_4 (Paramete (1, 200)             426400      input_2[0][0]                    
__________________________________________________________________________________________________
parameterizer_layer_5 (Paramete (1, 100)             30200       parameterizer_layer_4[0][0]      
__________________________________________________________________________________________________
parameterizer_layer_6 (Paramete (1, 50)              7600        parameterizer_layer_5[0][0]      
_______________________________________________________________________________________

In [60]:
#visualize model in an interactive way
#sadly only works until the preprocessing layers are over
# tensorboard sometimes thinks there still is an instance running when it is not
# fix that by deleting the contents of this folder or your equivalent of it
# C:\Users\deisl\AppData\Local\Temp\.tensorboard-info


# TODO reactivate this when tensorboard has been installed:
"""
%reload_ext tensorboard
# rankdir='LR' is used to make the graph horizontal.
#tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")
%tensorboard --logdir logs
"""

'\n%reload_ext tensorboard\n# rankdir=\'LR\' is used to make the graph horizontal.\n#tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")\n%tensorboard --logdir logs\n'

## Inference on new data

As the model contains all important parts, it should be able to work on any file of the right format


The model should be saved such that it can just be reloaded later.\
I will follow the tutorial [here](https://www.tensorflow.org/tutorials/keras/save_and_load)

In [61]:
preprocesessing_model.save('preprocessing_model')
functional_model.save('suicidal_ideation_model')
reloaded_preprocessing = tf.keras.models.load_model('preprocessing_model')
reloaded_model = tf.keras.models.load_model('suicidal_ideation_model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: preprocessing_model\assets
INFO:tensorflow:Assets written to: suicidal_ideation_model\assets






































































To get a prediction for a new sample, you can simply call `model.predict()`. There are just two things you need to do:

1.   Wrap scalars into a list so as to have a batch dimension (models only process batches of data, not single samples)
2.   Call `convert_to_tensor` on each feature

In [62]:
# create a dataframe of all stored outputs from testing
stored_aggregates = y_pred 
outputs = pd.DataFrame(list(zip(stored_aggregates, stored_concepts, stored_relevances)),
                      columns =['aggregated', 'concepts', 'relevances'])

# creating a legend that contains which output belongs to which input
out_legend = []

for header in output_sizes:
    size = output_sizes[header]
    for layer in range(0, size):
        out_legend.append(header)

In [63]:
# select which sample to look at 

def investigate_sample(output):
    aggregated = output['aggregated']
    concept = output['concepts']
    relevance = output['relevances']

    relevance = np.array(relevance)
    
    # if I multiply 0 inputs with the relevances first, only relevant parts will be shown
    binary_concepts = [0 if concept==0 else 1 for concept in concepts[0]]
    binary_concepts = np.array(binary_concepts)
    polarized_relevances = np.multiply(binary_concepts, relevance[0])

    
    filtered_output = [
        (name,relevance) 
        for name, relevance in 
        zip(out_legend, polarized_relevances[0]) # TODO make this a dictionary
        if not relevance==0
    ]
    
    return aggregated, filtered_output

In [64]:
def normalize_frame(df):
    max_value = df.abs().max()
    return df / max_value

In [65]:
# if 
# TODO filtered_output was a dictionary,
# TODO and output was a param to investigate_sample()
# one could loop through all outputs and make a dataframe with all the explanations

pd.set_option('display.max_rows', 1000)

all_features = numerical_features + categorical_int_features + categorical_cols

aggregates = []
explanations = []
low_prop_explanations = []
high_prop_explanations = []

for target_index in range(0, len(outputs.index)):
    output = outputs.loc[[target_index]]
    aggregated, explanation = investigate_sample(output)
    aggregates.append(aggregated)
    dict_exp = dict(explanation)
    explanations.append(dict_exp)
    
    if aggregated[target_index] > classification_thres:
        high_prop_explanations.append(dict_exp)
    else:
        low_prop_explanations.append(dict_exp)  

#make  dataframes from records during loop
explanations_frame = pd.DataFrame.from_records(explanations)
explanations_frame.columns = all_features

low_prop_explanations_frame = pd.DataFrame.from_records(low_prop_explanations)
low_prop_explanations_frame.columns = all_features

high_prop_explanations_frame = pd.DataFrame.from_records(high_prop_explanations)
high_prop_explanations_frame.columns = all_features


#calculate average values
average_relevance = explanations_frame.mean(axis=0)

average_relevance_low = low_prop_explanations_frame.mean(axis=0)

average_relevance_high = high_prop_explanations_frame.mean(axis=0)

In [66]:
tf.print("The following frames have all been independently normalized and then sorted descendingly.")

The following frames have all been independently normalized and then sorted descendingly.


In [67]:
tf.print("normalized average_relevance\n",normalize_frame(average_relevance).sort_values(ascending=False), "\n")

normalized average_relevance
 ASI2                           0.903582
OQ_SD_Av                       0.865269
OQ_Tot                         0.859868
OQ_SD                          0.828366
ASI6                           0.656508
SF_RV_Scale                    0.653366
SF_RV                          0.647458
AQ50q38                        0.623444
SD_7                           0.616393
SF20_16                        0.615620
IDS_Tot                        0.595618
ASI1                           0.592702
CAARSq23                       0.590850
OQ_16                          0.572588
SF20_18                        0.549139
AQ50q23                        0.541503
OQ_5                           0.525171
AQ50q45                        0.522679
ids_7                          0.516708
OQ_IR                          0.513735
CAARSq17                       0.500666
SF20_12                        0.487711
ids_24                         0.482043
OQ_IR_Av                       0.454419
OQ_SR_Av  

In [68]:
normalized_average_relevance_low = normalize_frame(average_relevance_low)
tf.print("normalized low probabilities average_relevance\n",normalized_average_relevance_low.sort_values(ascending=False), "\n")

normalized low probabilities average_relevance
 ASI2                           0.900364
OQ_SD_Av                       0.843810
OQ_Tot                         0.837549
OQ_SD                          0.806857
ASI6                           0.636011
SF_RV                          0.629545
SD_7                           0.621746
SF_RV_Scale                    0.619748
AQ50q38                        0.619613
CAARSq23                       0.601211
SF20_16                        0.598967
ASI1                           0.565156
OQ_16                          0.554839
IDS_Tot                        0.553926
AQ50q23                        0.545798
SF20_18                        0.532095
OQ_5                           0.522112
CAARSq17                       0.499338
ids_7                          0.495785
OQ_IR                          0.495314
AQ50q45                        0.486580
SF20_12                        0.477323
OQ_19                          0.451433
OQ_SR_Av                       0

In [69]:
normalized_average_relevance_high = normalize_frame(average_relevance_high)
tf.print("normalized high probabilities average_relevance\n",normalized_average_relevance_high.sort_values(ascending=False))

normalized high probabilities average_relevance
 OQ_SD_Av                       0.911502
ASI2                           0.910515
OQ_Tot                         0.907956
OQ_SD                          0.874709
SF_RV_Scale                    0.725798
ASI6                           0.700670
SF_RV                          0.686050
IDS_Tot                        0.685447
ASI1                           0.652051
SF20_16                        0.651499
AQ50q38                        0.631699
OQ_16                          0.610829
SD_7                           0.604857
AQ50q45                        0.600455
SF20_18                        0.585862
CAARSq23                       0.568526
ids_7                          0.561789
ids_24                         0.558241
OQ_IR                          0.553425
AQ50q23                        0.532250
OQ_5                           0.531760
CAARSq10                       0.512957
SF20_12                        0.510094
CAARSq17                       

In [70]:
differential_frame = normalized_average_relevance_high - normalized_average_relevance_low
normalized_diff_frame = normalize_frame(differential_frame)
sorted_normalized_diff_frame = normalized_diff_frame.sort_values(ascending=False)
tf.print("normalized differences between high and low probabilities\n", sorted_normalized_diff_frame)

normalized differences between high and low probabilities
 PID_7                          1.000000
CAARSq10                       0.928497
PID_R_Dis                      0.771721
IDS_Tot                        0.726172
NEMESIS10a8                    0.721937
SD_2b                          0.690408
CAARSq13                       0.680977
SF_SF                          0.650325
OQ_15                          0.630627
MoodDisFinal                   0.630265
AQ50q45                        0.628741
ids_25                         0.618538
ids_24                         0.615985
AQ50q33                        0.602065
AQ50q20                        0.597128
NEMESIS4b                      0.593670
SF_RV_Scale                    0.585539
SF20_4                         0.575104
AQ50q37                        0.569082
ids_9A                         0.566160
ids_9                          0.549659
SD_1                           0.546759
NEMESIS7b9                     0.541908
SF20_17              

In [71]:
# so the absence of a feature is actually not really taken into account explicitly.
# however: if a certain feature is absent, that shifts the relevances of other features.
# thus one can say that in absence of a specific feature, the other, shown features become relevant

# this does make the average values less informative - the complex relationships are ignored
# a decision tree would be more suited to look at those relationships

In [72]:
# to inspect a specific sample:
target_index = 80 # just enter the index of the sample
output = outputs.loc[[target_index]]
aggregated, explanation = investigate_sample(output)
print(
    "This particular person had a %.1f percent probability "
    "of experiencing suicidal ideation." % (100 * aggregated)
)
tf.print("The explanations are: \n", explanation)

This particular person had a 97.5 percent probability of experiencing suicidal ideation.
The explanations are: 
 [('ASI_Tot', -0.03783252835273743),
 ('IDS_Tot', 0.2459980696439743),
 ('CAARS_InMe', -0.050015516579151154),
 ('CAARS_HyRe', 0.13280431926250458),
 ('CAARS_ImEm', -0.013742633163928986),
 ('CAARS_SeCo', 0.14846612513065338),
 ('CAARS_ADHD', 0.045997269451618195),
 ('CAARS_Incons', 0.04306325316429138),
 ('AQ_Total_NL', -0.01474767830222845),
 ('AQ_Socialskill', 0.07927743345499039),
 ('AQ_AttSwitch', -0.02106461673974991),
 ('AQ_Communication', -0.16229265928268433),
 ('AQ_Imagination', -0.04578518867492676),
 ('AQ_AttDetail', -0.10548532009124756),
 ('SF_LF', 0.02521200105547905),
 ('SF_RV', 0.2556603252887726),
 ('SF_SF', 0.08663461357355118),
 ('SF_PG', -0.28014135360717773),
 ('SF_EG', 0.0038379766047000885),
 ('SF_LP', 0.0032455716282129288),
 ('SF_LF_Scale', 0.08102002739906311),
 ('SF_RV_Scale', 0.2642815113067627),
 ('SF_SF_Scale', 0.007180165499448776),
 ('SF_PG_Sc

In [73]:
# storing everything for later

#location
directory = "./saves/medium_high_risk"

# models:
preprocesessing_model.save(directory+'preprocessing')
functional_model.save(directory+'my_pet_classifier')

# results:
explanations_frame.to_pickle(directory+"explanations_frame")
low_prop_explanations_frame.to_pickle(directory+"low_prop_explanations_frame")
high_prop_explanations_frame.to_pickle(directory+"high_prop_explanations_frame")
output.to_pickle(directory+"output")

# evaluation
metrics.to_pickle(directory+"metrics")

INFO:tensorflow:Assets written to: ./saves/medium_high_riskpreprocessing\assets
INFO:tensorflow:Assets written to: ./saves/medium_high_riskmy_pet_classifier\assets
