# LSTM pipeline 

## Loading data

In [41]:
import pandas as pd
import numpy as np
import preprocessing

In [42]:
# Load the preprocessed tables; every CSV is read with its first column as the index.
def _read_indexed_csv(csv_path):
    """Read ``csv_path`` with pandas, using the first column as the DataFrame index."""
    return pd.read_csv(csv_path, index_col=[0])


df_female = _read_indexed_csv('data/preprocessed_data/df_female.csv')
df_age = _read_indexed_csv('data/preprocessed_data/df_age.csv')
df_SFI = _read_indexed_csv('data/preprocessed_data/df_SFI.csv')
df_uti = _read_indexed_csv('data/preprocessed_data/df_uti.csv')
df_admissions = _read_indexed_csv('data/preprocessed_data/df_admissions.csv')
df_FIM_total = _read_indexed_csv('data/preprocessed_data/df_FIM_total.csv')
df_acute_days = _read_indexed_csv('data/preprocessed_data/df_acute_days.csv')
df_brain_injury = _read_indexed_csv('data/preprocessed_data/df_brain_injury.csv')
df_outcome = _read_indexed_csv('data/preprocessed_data/df_outcome.csv')

# Resampled tables
df_admissions_resampled = _read_indexed_csv('data/preprocessed_data/df_admissions_resampled.csv')
# NOTE(review): this is the only table read from 'test_data2/' instead of
# 'data/preprocessed_data/' - confirm that the path is intended.
df_SFI_resampled = _read_indexed_csv('test_data2/df_SFI_resampled.csv')

In [43]:
# Give the three static predictors a common value column called 'value'.
# The acute-days key is spelled with a leading space; presumably the CSV header
# contains that space - confirm against the file.
df_age = df_age.rename({'age': 'value'}, axis='columns')
df_female = df_female.rename({'sex': 'value'}, axis='columns')
df_acute_days = df_acute_days.rename({' acute_days': 'value'}, axis='columns')

# Display the renamed frame as a visual check.
df_acute_days

Unnamed: 0,ID,value
1,4361311,9.0
3,4384550,50.0
4,5008712,12.0
6,5314223,77.0
8,5771918,41.0
...,...,...
1853,4476147,14.0
1854,4732653,10.0
1855,4057433,48.0
1857,5467882,35.0


In [44]:
# Every key must name its predictor as static, dynamic or text; according to the
# original note, predictors named otherwise are skipped because the function does
# not know how to handle them.
# TODO: emit a disclaimer message when the predictor dict contains such a DataFrame.
_shared_predictors = {
    'df_age_static': df_age,
    'df_female_static': df_female,
    'df_acute_days_static': df_acute_days,
    'df_FIM_total_dynamic': df_FIM_total,
    # 'df_brain_injury_static': df_brain_injury,  # currently left out
}

# The two dicts differ only in which free-text table they carry.
predictor_dict = {'df_SFI_text': df_SFI, **_shared_predictors}

predictor_dict_resampled = {'df_SFI_text': df_SFI_resampled, **_shared_predictors}

In [45]:
# Split the daily-prediction data into train and test parts (test_size = 0.3).
# NOTE(review): no seed is passed here - confirm that data_split is deterministic
# so that re-runs reproduce the same split.
(
    predictor_dict_train,
    predictor_dict_test,
    y_train_df,
    y_test_df,
    df_admissions_train,
    df_admissions_test,
) = preprocessing.data_split(
    df_outcome=df_outcome,
    predictor_dict=predictor_dict,
    df_admissions=df_admissions,
    test_size=0.3,
)

# Same split call for the resampled predictors and admissions.
(
    predictor_dict_train_resampled,
    predictor_dict_test_resampled,
    y_train_df_resampled,
    y_test_df_resampled,
    df_admissions_train_resampled,
    df_admissions_test_resampled,
) = preprocessing.data_split(
    df_outcome=df_outcome,
    predictor_dict=predictor_dict_resampled,
    df_admissions=df_admissions_resampled,
    test_size=0.3,
)

Number of positive observations in trainingset:224
Percentage of positive class in trainingset: 31.197771587743734
Number of positive observations in testset:97
Percentage of positive class in testset: 31.3915857605178
Number of positive observations in trainingset:224
Percentage of positive class in trainingset: 31.197771587743734
Number of positive observations in testset:97
Percentage of positive class in testset: 31.3915857605178


## Creating embeddings

### Sentence embeddings

In [31]:
from sentence_embeddings import sentence_embeddings

# Embed the free-text notes of each split with both transformer models.
_notes_train = predictor_dict_train["df_SFI_text"]
_notes_test = predictor_dict_test["df_SFI_text"]

df_sentence_train = sentence_embeddings(df_free_text=_notes_train, transformer_model='encoder-large-v1')
df_sentence_test = sentence_embeddings(df_free_text=_notes_test, transformer_model='encoder-large-v1')

df_MeDa_train = sentence_embeddings(df_free_text=_notes_train, transformer_model='MeDa-Bert')
df_MeDa_test = sentence_embeddings(df_free_text=_notes_test, transformer_model='MeDa-Bert')


# RESAMPLED
_notes_train_resampled = predictor_dict_train_resampled["df_SFI_text"]
_notes_test_resampled = predictor_dict_test_resampled["df_SFI_text"]

df_sentence_train_resampled = sentence_embeddings(df_free_text=_notes_train_resampled, transformer_model='encoder-large-v1')
df_sentence_test_resampled = sentence_embeddings(df_free_text=_notes_test_resampled, transformer_model='encoder-large-v1')

df_MeDa_train_resampled = sentence_embeddings(df_free_text=_notes_train_resampled, transformer_model='MeDa-Bert')
df_MeDa_test_resampled = sentence_embeddings(df_free_text=_notes_test_resampled, transformer_model='MeDa-Bert')


No sentence-transformers model found with name encoder-large-v1. Creating a new one with MEAN pooling.
No sentence-transformers model found with name encoder-large-v1. Creating a new one with MEAN pooling.
No sentence-transformers model found with name MeDa-Bert. Creating a new one with MEAN pooling.
No sentence-transformers model found with name MeDa-Bert. Creating a new one with MEAN pooling.
No sentence-transformers model found with name encoder-large-v1. Creating a new one with MEAN pooling.
No sentence-transformers model found with name encoder-large-v1. Creating a new one with MEAN pooling.
No sentence-transformers model found with name MeDa-Bert. Creating a new one with MEAN pooling.
No sentence-transformers model found with name MeDa-Bert. Creating a new one with MEAN pooling.


### TF-IDF 

#### Hyperparameter search

In [9]:
%%capture

#Capture is important to have in this cell, otherwise there will be so much output that the kernel will crash
from tfidf_hyperparametersearch_LSTM import tfidf_hyperparametersearch

# Search TF-IDF vectoriser settings on the daily-prediction split.
# NOTE(review): the test split is passed into the search - confirm that the
# hyperparameters are not being selected on the held-out test data.
_tfidf_search_inputs = (
    predictor_dict_train,
    predictor_dict_test,
    df_admissions_train,
    df_admissions_test,
    y_train_df,
    y_test_df,
)
best_params = tfidf_hyperparametersearch(*_tfidf_search_inputs)


In [11]:
%%capture

#Capture is important to have in this cell, otherwise there will be so much output that the kernel will crash
from tfidf_hyperparametersearch_LSTM import tfidf_hyperparametersearch

# Search TF-IDF vectoriser settings on the resampled split.
# NOTE(review): the test split is passed into the search - confirm that the
# hyperparameters are not being selected on the held-out test data.
_tfidf_search_inputs_resampled = (
    predictor_dict_train_resampled,
    predictor_dict_test_resampled,
    df_admissions_train_resampled,
    df_admissions_test_resampled,
    y_train_df_resampled,
    y_test_df_resampled,
)
best_params_resampled = tfidf_hyperparametersearch(*_tfidf_search_inputs_resampled, resampled=True)


[I 2024-04-09 09:28:04,478] A new study created in memory with name: no-name-f3bbdae8-6cdc-4d4c-a1e9-95cbaa724fa0
[I 2024-04-09 09:28:30,942] Trial 0 finished with value: 0.6157119237502431 and parameters: {'min_df_trial': 80, 'max_df_trial': 0.7, 'max_features_trial': 300, 'ngram_range_upper_trial': 1}. Best is trial 0 with value: 0.6157119237502431.
[I 2024-04-09 09:28:59,119] Trial 1 finished with value: 0.6482931336315891 and parameters: {'min_df_trial': 30, 'max_df_trial': 0.6, 'max_features_trial': 400, 'ngram_range_upper_trial': 2}. Best is trial 1 with value: 0.6482931336315891.
[I 2024-04-09 09:29:26,556] Trial 2 finished with value: 0.613183232834079 and parameters: {'min_df_trial': 40, 'max_df_trial': 0.6, 'max_features_trial': 200, 'ngram_range_upper_trial': 1}. Best is trial 1 with value: 0.6482931336315891.
[I 2024-04-09 09:29:53,777] Trial 3 finished with value: 0.6154444660571873 and parameters: {'min_df_trial': 40, 'max_df_trial': 0.8, 'max_features_trial': 500, 'ngram

In [None]:
# Show the TF-IDF settings returned by the search on the daily-prediction split.
best_params


best_params:
{'min_df_trial': 0,
 'max_df_trial': 1.0,
 'max_features_trial': 200,
 'ngram_range_upper_trial': 1}

In [12]:
# Show the TF-IDF settings returned by the search on the resampled split.
best_params_resampled 


{'min_df_trial': 30,
 'max_df_trial': 0.6,
 'max_features_trial': 400,
 'ngram_range_upper_trial': 2}

best_params_resampled: 
{'min_df_trial': 30,
 'max_df_trial': 0.6,
 'max_features_trial': 400,
 'ngram_range_upper_trial': 2}

#### Vectorizer

In [32]:
from tfidf import tf_idf

# Vectorise the notes with fixed settings. The literals equal the best_params values
# recorded in this notebook (min_df 0, max_df 1.0, 200 features, unigrams).
df_tfidfvect_train, df_tfidfvect_test = tf_idf(
    predictor_dict_train["df_SFI_text"],
    predictor_dict_test["df_SFI_text"],
    min_df=0,
    max_df=1.0,
    max_features=200,
    ngram_range=(1, 1),
)

# Resampled data: the literals equal the recorded best_params_resampled values
# (min_df 30, max_df 0.6, 400 features, uni- and bigrams).
df_tfidfvect_train_resampled, df_tfidfvect_test_resampled = tf_idf(
    predictor_dict_train_resampled["df_SFI_text"],
    predictor_dict_test_resampled["df_SFI_text"],
    min_df=30,
    max_df=0.6,
    max_features=400,
    ngram_range=(1, 2),
)



Tokens ['adspurgt' 'aff' 'afføring' 'aften' 'aftenvagten' 'alm' 'av' 'bad'
 'beder' 'behandling' 'behov' 'besøg' 'ble' 'blee' 'bleen' 'bleer'
 'bleskift' 'blevet' 'blod' 'blære' 'blæren' 'blærescannet' 'brugt'
 'bukseble' 'bukser' 'bundskift' 'bundskifte' 'bundskiftet' 'bækken'
 'bækkenstol' 'ca' 'dag' 'diurese' 'dv' 'efterfølgende' 'ej' 'evt'
 'faldet' 'fik' 'fint' 'fjernet' 'forbindelse' 'forsøgt' 'fortsat'
 'fortæller' 'fungerer' 'fået' 'får' 'føler' 'gang' 'gange' 'gerne'
 'giver' 'givet' 'godt' 'grundet' 'gul' 'gået' 'går' 'haft' 'held' 'hele'
 'helt' 'hjælp' 'hjælpes' 'holde' 'hvil' 'hvilket' 'ifbm' 'ifm'
 'ildelugtende' 'inden' 'kad' 'kalder' 'kateter' 'kath' 'kl' 'klar'
 'kolbe' 'kolben' 'komme' 'kommet' 'konc' 'koncentreret' 'kort' 'lade'
 'lang' 'let' 'ligger' 'lys' 'læge' 'løbet' 'ml' 'morgen' 'mængde' 'mærke'
 'mærker' 'mørk' 'nat' 'natten' 'nedre' 'nitrit' 'normal' 'nåede' 'når'
 'obs' 'pga' 'pose' 'posen' 'positiv' 'prøve' 'pt' 'påsat' 'relevant'
 'residualurin' 'resultat

## Creating tensors

REMEMBER: It is possible to manually set look_back and look_ahead 

`look_back` must be set to the same value in both `max_notes` and `tensors` to create correct padding.

Default settings: (look_back = 4, look_ahead = 3)

## Maximum notes

NB: It doesn't matter which embeddings you use here; all of them have the same number of clinical notes.

#### A prediction every day

In [33]:
from create_tensors import max_notes, tensors
import torch

# Compute the value for each split, then keep the larger one so that both splits
# are later built with the same maximum_notes argument.
maximum_notes_train = max_notes(df_sentence_train, df_admissions_train)
print(maximum_notes_train)

maximum_notes_test = max_notes(df_sentence_test, df_admissions_test)
print(maximum_notes_test)

maximum_notes = max((maximum_notes_train, maximum_notes_test))
print(maximum_notes)

39
35
39


#### One prediction per patient

In [34]:
from create_tensors_resampled import max_notes_resampled, tensors_resampled
import torch

# What if someone in the test set has more notes than anyone in the training set?
# Both splits are therefore checked and the larger value is used.
maximum_notes_train_resampled = max_notes_resampled(df_sentence_train_resampled)
print(maximum_notes_train_resampled)

maximum_notes_test_resampled = max_notes_resampled(df_sentence_test_resampled)
print(maximum_notes_test_resampled)

maximum_notes_resampled = max((maximum_notes_train_resampled, maximum_notes_test_resampled))
print(maximum_notes_resampled)


38
37
38


### Sentence Transformers 

#### A prediction every day

In [10]:
############################# Ordinary sentence transformer ############################
# Every tensors() call also returns X_structured_* and n_notes_*; the MeDa-BERT calls
# further down rebind those names.

# Train
(
    X_sentence_train_tensor,
    y_sentence_train_tensor,
    X_structured_train_tensor,
    n_notes_train,
) = tensors(
    df_sentence_train,
    y_train_df,
    df_admissions_train,
    maximum_notes,
    predictor_dict_train,
)
torch.save(X_sentence_train_tensor, 'data/tensors/X_sentence_train_tensor_4_days.pt')
torch.save(y_sentence_train_tensor, 'data/tensors/y_sentence_train_tensor_4_days.pt')


# Test
(
    X_sentence_test_tensor,
    y_sentence_test_tensor,
    X_structured_test_tensor,
    n_notes_test,
) = tensors(
    df_sentence_test,
    y_test_df,
    df_admissions_test,
    maximum_notes,
    predictor_dict_test,
)
torch.save(X_sentence_test_tensor, 'data/tensors/X_sentence_test_tensor_4_days.pt')
torch.save(y_sentence_test_tensor, 'data/tensors/y_sentence_test_tensor_4_days.pt')




############################# MeDa-BERT sentence transformer ############################

# Train
(
    X_MeDa_train_tensor,
    y_MeDa_train_tensor,
    X_structured_train_tensor,
    n_notes_train,
) = tensors(
    df_MeDa_train,
    y_train_df,
    df_admissions_train,
    maximum_notes,
    predictor_dict_train,
)
torch.save(X_MeDa_train_tensor, 'data/tensors/X_MeDa_train_tensor_4_days.pt')
torch.save(y_MeDa_train_tensor, 'data/tensors/y_MeDa_train_tensor_4_days.pt')


# Test
(
    X_MeDa_test_tensor,
    y_MeDa_test_tensor,
    X_structured_test_tensor,
    n_notes_test,
) = tensors(
    df_MeDa_test,
    y_test_df,
    df_admissions_test,
    maximum_notes,
    predictor_dict_test,
)
torch.save(X_MeDa_test_tensor, 'data/tensors/X_MeDa_test_tensor_4_days.pt')
torch.save(y_MeDa_test_tensor, 'data/tensors/y_MeDa_test_tensor_4_days.pt')


#### One prediction per patient

In [40]:
############################# Ordinary sentence transformer ############################
# Every tensors_resampled() call also returns X_structured_*_resampled and
# n_notes_*_resampled; the MeDa-BERT calls further down rebind those names.

# Train
(
    X_sentence_train_tensor_resampled,
    y_sentence_train_tensor_resampled,
    X_structured_train_tensor_resampled,
    n_notes_train_resampled,
) = tensors_resampled(
    df_sentence_train_resampled,
    y_train_df_resampled,
    df_admissions_train_resampled,
    maximum_notes_resampled,
    predictor_dict_train_resampled,
)
torch.save(X_sentence_train_tensor_resampled, 'data/tensors/X_sentence_train_tensor_4_days_resampled.pt')
torch.save(y_sentence_train_tensor_resampled, 'data/tensors/y_sentence_train_tensor_4_days_resampled.pt')


# Test
(
    X_sentence_test_tensor_resampled,
    y_sentence_test_tensor_resampled,
    X_structured_test_tensor_resampled,
    n_notes_test_resampled,
) = tensors_resampled(
    df_sentence_test_resampled,
    y_test_df_resampled,
    df_admissions_test_resampled,
    maximum_notes_resampled,
    predictor_dict_test_resampled,
)
torch.save(X_sentence_test_tensor_resampled, 'data/tensors/X_sentence_test_tensor_4_days_resampled.pt')
torch.save(y_sentence_test_tensor_resampled, 'data/tensors/y_sentence_test_tensor_4_days_resampled.pt')




############################# MeDa-BERT sentence transformer ############################

# Train
(
    X_MeDa_train_tensor_resampled,
    y_MeDa_train_tensor_resampled,
    X_structured_train_tensor_resampled,
    n_notes_train_resampled,
) = tensors_resampled(
    df_MeDa_train_resampled,
    y_train_df_resampled,
    df_admissions_train_resampled,
    maximum_notes_resampled,
    predictor_dict_train_resampled,
)
torch.save(X_MeDa_train_tensor_resampled, 'data/tensors/X_MeDa_train_tensor_4_days_resampled.pt')
torch.save(y_MeDa_train_tensor_resampled, 'data/tensors/y_MeDa_train_tensor_4_days_resampled.pt')


# Test
(
    X_MeDa_test_tensor_resampled,
    y_MeDa_test_tensor_resampled,
    X_structured_test_tensor_resampled,
    n_notes_test_resampled,
) = tensors_resampled(
    df_MeDa_test_resampled,
    y_test_df_resampled,
    df_admissions_test_resampled,
    maximum_notes_resampled,
    predictor_dict_test_resampled,
)
torch.save(X_MeDa_test_tensor_resampled, 'data/tensors/X_MeDa_test_tensor_4_days_resampled.pt')
torch.save(y_MeDa_test_tensor_resampled, 'data/tensors/y_MeDa_test_tensor_4_days_resampled.pt')


In [38]:
# Inspect the TF-IDF feature table for the resampled training split.
df_tfidfvect_train_resampled

Unnamed: 0,aff,aften,av,bad,behov,ble,ble samt,ble skiftet,bleen,bleen våd,...,vandladninger,vandladningstrang,vl,våd,våd ble,våde,wc,ønsker,ID,date
0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0000,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,5198881,2021-11-09 08:00:00
1,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0000,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,5198881,2021-11-09 08:00:00
2,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0000,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,5198881,2021-11-09 16:00:00
3,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0000,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,5198881,2021-11-10 09:00:00
4,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0000,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,5198881,2021-11-10 13:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5708,0.753397,0.000000,0.0,0.0,0.0,0.274085,0.0,0.0,0.0000,0.0,...,0.0,0.0,0.000000,0.314208,0.329108,0.0,0.0,0.0,6657351,2020-06-06 11:00:00
5709,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0000,0.0,...,0.0,0.0,0.000000,1.000000,0.000000,0.0,0.0,0.0,6657351,2020-06-06 16:30:00
5710,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.4503,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,6657351,2020-06-07 21:00:00
5711,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0000,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,6657351,2020-06-07 21:00:00


In [37]:
# Inspect the MeDa-Bert embedding table for the resampled training split.
df_MeDa_train_resampled

Unnamed: 0,ID,date,0,1,2,3,4,5,6,7,...,758,759,760,761,762,763,764,765,766,767
0,5198881,2021-11-09 08:00:00,0.237289,0.183170,0.053262,-0.615169,0.236463,0.262590,-0.073624,2.128982,...,0.274637,-0.330232,0.614859,-0.548804,0.511345,0.183397,-0.399633,0.154639,0.353391,0.393023
1,5198881,2021-11-09 08:00:00,0.713899,0.413676,-0.652179,-1.086461,0.183000,0.358287,-0.316619,1.620838,...,-0.559281,-0.282974,0.360609,0.193561,-0.406666,1.168989,0.120684,1.168982,-0.520272,0.056682
2,5198881,2021-11-09 16:00:00,0.194175,-0.130962,1.164330,-0.218816,-0.286397,-0.396095,0.088525,1.212527,...,-0.299014,0.310303,0.490572,-0.105900,-0.145884,-0.179008,-0.435104,-0.204638,0.281645,0.447011
3,5198881,2021-11-10 09:00:00,-0.032486,-0.030799,0.944449,0.334251,-0.163785,0.501465,-0.090335,1.087806,...,-0.518898,-0.518911,-0.297891,0.486547,0.660673,1.479069,0.540789,0.365642,0.327292,0.511015
4,5198881,2021-11-10 13:00:00,0.163570,0.158571,0.919387,0.046360,-0.224025,0.362314,-0.167257,0.715174,...,-0.491126,-0.455356,0.034857,0.134077,0.546490,1.653240,0.589726,0.018277,0.381301,0.560651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5708,6657351,2020-06-06 11:00:00,0.033858,0.593699,1.073424,0.078024,-1.047423,0.371224,0.367220,1.139066,...,0.012559,0.266149,0.013498,0.432820,-0.039292,0.078743,-0.170353,-0.209077,0.316047,0.187814
5709,6657351,2020-06-06 16:30:00,-0.707499,0.722364,0.598516,-0.035516,-0.526244,0.749863,0.249340,0.727170,...,-0.700263,-0.102425,0.409774,0.428580,0.079062,1.631156,-0.113672,0.488495,0.505197,0.163490
5710,6657351,2020-06-07 21:00:00,-0.574095,0.236965,0.379526,0.566196,-0.151257,-0.045687,0.419263,0.393080,...,-0.264695,0.043642,0.007800,0.643807,-0.380336,-0.044480,-0.346146,-0.977863,-0.388439,0.189387
5711,6657351,2020-06-07 21:00:00,-1.133537,0.424599,-0.124653,0.730124,-0.662704,-0.700558,-0.886214,0.379581,...,-0.951335,0.037019,-0.490757,0.219145,-0.674459,0.965008,-0.224921,0.145112,0.583543,0.077202


In [39]:
# Inspect the TF-IDF training tensor for the resampled data.
# NOTE(review): in the cells visible here this name is first assigned further down
# (TF-IDF, "One prediction per patient"), so this cell raises NameError on a fresh
# top-to-bottom run.
X_tfidfvect_train_tensor_resampled

tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.

In [41]:
# Inspect the MeDa-BERT training tensor for the resampled data.
X_MeDa_train_tensor_resampled

tensor([[[ 2.3729e-01,  1.8317e-01,  5.3262e-02,  ...,  1.5464e-01,
           3.5339e-01,  3.9302e-01],
         [ 7.1390e-01,  4.1368e-01, -6.5218e-01,  ...,  1.1690e+00,
          -5.2027e-01,  5.6682e-02],
         [ 1.9418e-01, -1.3096e-01,  1.1643e+00,  ..., -2.0464e-01,
           2.8164e-01,  4.4701e-01],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[-3.5995e-01, -1.2188e-01,  1.3201e+00,  ..., -3.9213e-01,
           1.0357e+00,  4.2996e-01],
         [-3.5995e-01, -1.2188e-01,  1.3201e+00,  ..., -3.9213e-01,
           1.0357e+00,  4.2996e-01],
         [-9.1122e-01,  8.4839e-01,  9.7467e-01,  ..., -1.0496e+00,
           1.0775e+00,  7.5305e-01],
         ...,
         [ 0.0000e+00,  0

### TF-IDF

#### A prediction every day

In [12]:
# Train
(
    X_tfidfvect_train_tensor,
    y_tfidfvect_train_tensor,
    X_structured_train_tensor,
    n_notes_train,
) = tensors(
    df_tfidfvect_train,
    y_train_df,
    df_admissions_train,
    maximum_notes,
    predictor_dict_train,
)
torch.save(X_tfidfvect_train_tensor, 'data/tensors/X_tfidfvect_train_tensor_4_days.pt')
torch.save(y_tfidfvect_train_tensor, 'data/tensors/y_tfidfvect_train_tensor_4_days.pt')



# Test
(
    X_tfidfvect_test_tensor,
    y_tfidfvect_test_tensor,
    X_structured_test_tensor,
    n_notes_test,
) = tensors(
    df_tfidfvect_test,
    y_test_df,
    df_admissions_test,
    maximum_notes,
    predictor_dict_test,
)
torch.save(X_tfidfvect_test_tensor, 'data/tensors/X_tfidfvect_test_tensor_4_days.pt')
torch.save(y_tfidfvect_test_tensor, 'data/tensors/y_tfidfvect_test_tensor_4_days.pt')



#### One prediction per patient

In [13]:
# Build and save the TF-IDF tensors for the resampled data.

# Train
(
    X_tfidfvect_train_tensor_resampled,
    y_tfidfvect_train_tensor_resampled,
    X_structured_train_tensor_resampled,
    n_notes_train_resampled,
) = tensors_resampled(
    df_tfidfvect_train_resampled,
    y_train_df_resampled,
    df_admissions_train_resampled,
    maximum_notes_resampled,
    predictor_dict_train_resampled,
)
torch.save(X_tfidfvect_train_tensor_resampled, 'data/tensors/X_tfidfvect_train_tensor_4_days_resampled.pt')
torch.save(y_tfidfvect_train_tensor_resampled, 'data/tensors/y_tfidfvect_train_tensor_4_days_resampled.pt')


# Test
# The labels are y_test_df_resampled (this call previously passed y_test_df), so they
# come from the same data_split call as df_admissions_test_resampled and
# predictor_dict_test_resampled, matching the sentence-transformer and MeDa-BERT
# resampled test calls.
(
    X_tfidfvect_test_tensor_resampled,
    y_tfidfvect_test_tensor_resampled,
    X_structured_test_tensor_resampled,
    n_notes_test_resampled,
) = tensors_resampled(
    df_tfidfvect_test_resampled,
    y_test_df_resampled,
    df_admissions_test_resampled,
    maximum_notes_resampled,
    predictor_dict_test_resampled,
)
torch.save(X_tfidfvect_test_tensor_resampled, 'data/tensors/X_tfidfvect_test_tensor_4_days_resampled.pt')
torch.save(y_tfidfvect_test_tensor_resampled, 'data/tensors/y_tfidfvect_test_tensor_4_days_resampled.pt')



NB: `n_notes` and `X_structured` are the same for all embedding types, because the data has the same dimensions.

Therefore, they are only saved once.

In [14]:
############################ Saving n_notes tensors   ################################
# Each (tensor, path) pair is written with torch.save, in the order listed.
for _tensor, _path in (
    # A prediction every day: train, test
    (n_notes_train, "data/tensors/n_notes_train_4_days.pt"),
    (n_notes_test, "data/tensors/n_notes_test_4_days.pt"),
    # One prediction per patient: train, test
    (n_notes_train_resampled, "data/tensors/n_notes_train_resampled.pt"),
    (n_notes_test_resampled, "data/tensors/n_notes_test_resampled.pt"),
):
    torch.save(_tensor, _path)


############################ Saving X_structured tensors   ################################
for _tensor, _path in (
    # A prediction every day: train, test
    (X_structured_train_tensor, 'data/tensors/X_structured_train_tensor_4_days.pt'),
    (X_structured_test_tensor, 'data/tensors/X_structured_test_tensor_4_days.pt'),
    # One prediction per patient: train, test
    (X_structured_train_tensor_resampled, 'data/tensors/X_structured_train_tensor_4_days_resampled.pt'),
    (X_structured_test_tensor_resampled, 'data/tensors/X_structured_test_tensor_4_days_resampled.pt'),
):
    torch.save(_tensor, _path)

In [17]:
# Print the sizes of the daily-prediction tensors as a sanity check.
for _checked_tensor in (
    X_structured_train_tensor,
    X_tfidfvect_train_tensor,
    n_notes_train,
    X_tfidfvect_test_tensor,
    n_notes_test,
):
    print(_checked_tensor.shape)


# First two rows of the structured training tensor.
print(X_structured_train_tensor[0])
print(X_structured_train_tensor[1])

torch.Size([23964, 4])
torch.Size([23964, 39, 200])
torch.Size([23964])
torch.Size([9532, 39, 200])
torch.Size([9532])
tensor([53.,  0., 48., 33.])
tensor([53.,  0., 48., 33.])


In [16]:
######################## RUN UP TO HERE! ###############################

## Loading tensors

In [1]:
import torch 
import numpy as np

### Sentence transformers

#### A prediction every day

In [2]:
# Every tensor in this cell is mapped onto the CUDA device while loading.
_cuda = torch.device('cuda')

############################# Ordinary sentence transformer ############################

# Train
X_sentence_train_tensor = torch.load("data/tensors/X_sentence_train_tensor_4_days.pt", map_location=_cuda)
y_sentence_train_tensor = torch.load("data/tensors/y_sentence_train_tensor_4_days.pt", map_location=_cuda)

# Test
X_sentence_test_tensor = torch.load("data/tensors/X_sentence_test_tensor_4_days.pt", map_location=_cuda)
y_sentence_test_tensor = torch.load("data/tensors/y_sentence_test_tensor_4_days.pt", map_location=_cuda)



############################# MeDa-BERT sentence transformer ############################

# Train
X_MeDa_train_tensor = torch.load("data/tensors/X_MeDa_train_tensor_4_days.pt", map_location=_cuda)
y_MeDa_train_tensor = torch.load("data/tensors/y_MeDa_train_tensor_4_days.pt", map_location=_cuda)

# Test
X_MeDa_test_tensor = torch.load("data/tensors/X_MeDa_test_tensor_4_days.pt", map_location=_cuda)
y_MeDa_test_tensor = torch.load("data/tensors/y_MeDa_test_tensor_4_days.pt", map_location=_cuda)

#### One prediction per patient

In [3]:
# Every tensor in this cell is mapped onto the CUDA device while loading.
_cuda = torch.device('cuda')

############################# Ordinary sentence transformer ############################

# Train
X_sentence_train_tensor_resampled = torch.load("data/tensors/X_sentence_train_tensor_4_days_resampled.pt", map_location=_cuda)
y_sentence_train_tensor_resampled = torch.load("data/tensors/y_sentence_train_tensor_4_days_resampled.pt", map_location=_cuda)

# Test
X_sentence_test_tensor_resampled = torch.load("data/tensors/X_sentence_test_tensor_4_days_resampled.pt", map_location=_cuda)
y_sentence_test_tensor_resampled = torch.load("data/tensors/y_sentence_test_tensor_4_days_resampled.pt", map_location=_cuda)



############################# MeDa-BERT sentence transformer ############################

# Train
X_MeDa_train_tensor_resampled = torch.load("data/tensors/X_MeDa_train_tensor_4_days_resampled.pt", map_location=_cuda)
y_MeDa_train_tensor_resampled = torch.load("data/tensors/y_MeDa_train_tensor_4_days_resampled.pt", map_location=_cuda)

# Test
X_MeDa_test_tensor_resampled = torch.load("data/tensors/X_MeDa_test_tensor_4_days_resampled.pt", map_location=_cuda)
y_MeDa_test_tensor_resampled = torch.load("data/tensors/y_MeDa_test_tensor_4_days_resampled.pt", map_location=_cuda)

### TF-IDF

#### A prediction every day

In [4]:
# TF-IDF tensors for the daily-prediction data, mapped onto the CUDA device while loading.
_cuda = torch.device('cuda')

# Train
X_tfidfvect_train_tensor = torch.load("data/tensors/X_tfidfvect_train_tensor_4_days.pt", map_location=_cuda)
y_tfidfvect_train_tensor = torch.load("data/tensors/y_tfidfvect_train_tensor_4_days.pt", map_location=_cuda)

# Test
X_tfidfvect_test_tensor = torch.load("data/tensors/X_tfidfvect_test_tensor_4_days.pt", map_location=_cuda)
y_tfidfvect_test_tensor = torch.load("data/tensors/y_tfidfvect_test_tensor_4_days.pt", map_location=_cuda)

#### One prediction per patient

In [5]:
# TF-IDF tensors for the resampled data, mapped onto the CUDA device while loading.
_cuda = torch.device('cuda')

# Train
X_tfidfvect_train_tensor_resampled = torch.load("data/tensors/X_tfidfvect_train_tensor_4_days_resampled.pt", map_location=_cuda)
y_tfidfvect_train_tensor_resampled = torch.load("data/tensors/y_tfidfvect_train_tensor_4_days_resampled.pt", map_location=_cuda)

# Test
X_tfidfvect_test_tensor_resampled = torch.load("data/tensors/X_tfidfvect_test_tensor_4_days_resampled.pt", map_location=_cuda)
y_tfidfvect_test_tensor_resampled = torch.load("data/tensors/y_tfidfvect_test_tensor_4_days_resampled.pt", map_location=_cuda)

In [6]:
############################ Loading n_notes tensors   ################################
# The n_notes tensors are mapped onto the CUDA device while loading.
_cuda = torch.device('cuda')

########## A prediction every day
# Train
n_notes_train = torch.load("data/tensors/n_notes_train_4_days.pt", map_location=_cuda)

# Test
n_notes_test = torch.load("data/tensors/n_notes_test_4_days.pt", map_location=_cuda)


########## One prediction per patient
# Train
n_notes_train_resampled = torch.load("data/tensors/n_notes_train_resampled.pt", map_location=_cuda)

# Test
n_notes_test_resampled = torch.load("data/tensors/n_notes_test_resampled.pt", map_location=_cuda)


# ############################ Loading X_structured tensors   ################################
# Disabled: the structured tensors are currently not loaded. The resampled train
# filename below is spelled as it is written by the saving cell ("..._4_days_resampled.pt").

# ########## A prediction every day
# # Train
# X_structured_train_tensor = torch.load("data/tensors/X_structured_train_tensor_4_days.pt", map_location=torch.device('cuda'))

# # Test
# X_structured_test_tensor = torch.load("data/tensors/X_structured_test_tensor_4_days.pt", map_location=torch.device('cuda'))


# ########## One prediction per patient
# # Train
# X_structured_train_tensor_resampled = torch.load("data/tensors/X_structured_train_tensor_4_days_resampled.pt", map_location=torch.device('cuda'))

# # Test
# X_structured_test_tensor_resampled = torch.load("data/tensors/X_structured_test_tensor_4_days_resampled.pt", map_location=torch.device('cuda'))


### Unsqueezing y-vectors

In [7]:
# Insert a new axis at position 1 in every label tensor and place it on the GPU.
# NOTE: each line rebinds the name it reads, so re-running this cell adds another
# axis every time; reload the tensors before running it again.

############################## A prediction every day ################################

####### Ordinary sentence transformer
y_sentence_train_tensor = y_sentence_train_tensor.unsqueeze(1).cuda()
y_sentence_test_tensor = y_sentence_test_tensor.unsqueeze(1).cuda()

######## MeDa-BERT sentence transformer
y_MeDa_train_tensor = y_MeDa_train_tensor.unsqueeze(1).cuda()
y_MeDa_test_tensor = y_MeDa_test_tensor.unsqueeze(1).cuda()

######## TF-IDF
y_tfidfvect_train_tensor = y_tfidfvect_train_tensor.unsqueeze(1).cuda()
y_tfidfvect_test_tensor = y_tfidfvect_test_tensor.unsqueeze(1).cuda()


############################## One prediction per patient ################################

####### Ordinary sentence transformer
y_sentence_train_tensor_resampled = y_sentence_train_tensor_resampled.unsqueeze(1).cuda()
y_sentence_test_tensor_resampled = y_sentence_test_tensor_resampled.unsqueeze(1).cuda()

######## MeDa-BERT sentence transformer
y_MeDa_train_tensor_resampled = y_MeDa_train_tensor_resampled.unsqueeze(1).cuda()
y_MeDa_test_tensor_resampled = y_MeDa_test_tensor_resampled.unsqueeze(1).cuda()

######## TF-IDF
y_tfidfvect_train_tensor_resampled = y_tfidfvect_train_tensor_resampled.unsqueeze(1).cuda()
y_tfidfvect_test_tensor_resampled = y_tfidfvect_test_tensor_resampled.unsqueeze(1).cuda()

## LSTM 

### Hyperparameter search

In [8]:
from LSTM_hyperparametersearch import LSTM_hyperparametersearch

#### Sentence transformers

##### A prediction every day

In [9]:
%%capture

# Hyperparameter search for the LSTM on the ordinary sentence-transformer features,
# a prediction every day (resampled=False).
# NOTE(review): the test tensors are handed to the search; if the objective is scored on
# them, the test metrics reported later are optimistically biased - confirm.
best_params_sentence = LSTM_hyperparametersearch(
    X_sentence_train_tensor, y_sentence_train_tensor, n_notes_train,
    X_sentence_test_tensor, y_sentence_test_tensor, n_notes_test,
    resampled=False,
)

[I 2024-05-07 09:53:40,548] A new study created in memory with name: no-name-a349c7a7-4806-4eb3-a123-3fab244614c1
[I 2024-05-07 09:54:08,692] Trial 0 finished with value: 0.5 and parameters: {'learning_rate': 0.1, 'batch_size': 32, 'hidden_size': 96}. Best is trial 0 with value: 0.5.
[I 2024-05-07 09:54:39,463] Trial 1 finished with value: 0.5 and parameters: {'learning_rate': 0.01, 'batch_size': 32, 'hidden_size': 48}. Best is trial 0 with value: 0.5.
[I 2024-05-07 09:55:04,545] Trial 2 finished with value: 0.5035301366872793 and parameters: {'learning_rate': 0.0001, 'batch_size': 32, 'hidden_size': 96}. Best is trial 2 with value: 0.5035301366872793.
[I 2024-05-07 09:55:29,402] Trial 3 finished with value: 0.5 and parameters: {'learning_rate': 0.1, 'batch_size': 32, 'hidden_size': 48}. Best is trial 2 with value: 0.5035301366872793.
[I 2024-05-07 09:55:55,296] Trial 4 finished with value: 0.5 and parameters: {'learning_rate': 0.0001, 'batch_size': 64, 'hidden_size': 32}. Best is tria

## TODO: if the hyperparameter search above works, remember to implement early stopping in the rest of the LSTM functions.

In [10]:
# Display the dict returned by LSTM_hyperparametersearch for the sentence-transformer
# features (a prediction every day).
best_params_sentence

{'learning_rate': 0.0001, 'batch_size': 16, 'hidden_size': 128}

In [11]:
%%capture

# Hyperparameter search for the LSTM on the MeDa-BERT features,
# a prediction every day (resampled=False).
# NOTE(review): the test tensors are handed to the search; if the objective is scored on
# them, the test metrics reported later are optimistically biased - confirm.
best_params_MeDa = LSTM_hyperparametersearch(
    X_MeDa_train_tensor, y_MeDa_train_tensor, n_notes_train,
    X_MeDa_test_tensor, y_MeDa_test_tensor, n_notes_test,
    resampled=False,
)

[I 2024-05-07 10:19:47,214] A new study created in memory with name: no-name-ddf0e6a5-a2d2-4105-af77-8b6ca8baf89f
[I 2024-05-07 10:20:12,530] Trial 0 finished with value: 0.5 and parameters: {'learning_rate': 0.1, 'batch_size': 32, 'hidden_size': 96}. Best is trial 0 with value: 0.5.
[I 2024-05-07 10:20:37,904] Trial 1 finished with value: 0.5 and parameters: {'learning_rate': 0.01, 'batch_size': 48, 'hidden_size': 128}. Best is trial 0 with value: 0.5.
[I 2024-05-07 10:21:40,245] Trial 2 finished with value: 0.5007574937777297 and parameters: {'learning_rate': 0.1, 'batch_size': 64, 'hidden_size': 96}. Best is trial 2 with value: 0.5007574937777297.
[I 2024-05-07 10:22:39,787] Trial 3 finished with value: 0.5804611229426904 and parameters: {'learning_rate': 0.001, 'batch_size': 48, 'hidden_size': 48}. Best is trial 3 with value: 0.5804611229426904.
[I 2024-05-07 10:23:11,362] Trial 4 finished with value: 0.5 and parameters: {'learning_rate': 0.01, 'batch_size': 64, 'hidden_size': 48}.

In [12]:
# Display the dict returned by LSTM_hyperparametersearch for the MeDa-BERT
# features (a prediction every day).
best_params_MeDa

{'learning_rate': 0.001, 'batch_size': 32, 'hidden_size': 80}

##### One prediction per patient

In [13]:
%%capture

# Hyperparameter search for the LSTM on the resampled sentence-transformer features,
# one prediction per patient (resampled=True).
# NOTE(review): the test tensors are handed to the search; if the objective is scored on
# them, the test metrics reported later are optimistically biased - confirm.
best_params_sentence_resampled = LSTM_hyperparametersearch(
    X_sentence_train_tensor_resampled, y_sentence_train_tensor_resampled, n_notes_train_resampled,
    X_sentence_test_tensor_resampled, y_sentence_test_tensor_resampled, n_notes_test_resampled,
    resampled=True,
)

[I 2024-05-07 11:03:37,659] A new study created in memory with name: no-name-b889e567-e793-40b0-8a1a-7e81931a5387
[I 2024-05-07 11:03:39,591] Trial 0 finished with value: 0.538246450106983 and parameters: {'learning_rate': 0.0001, 'batch_size': 32, 'hidden_size': 80}. Best is trial 0 with value: 0.538246450106983.
[I 2024-05-07 11:03:41,580] Trial 1 finished with value: 0.5546586267263178 and parameters: {'learning_rate': 0.001, 'batch_size': 64, 'hidden_size': 32}. Best is trial 1 with value: 0.5546586267263178.
[I 2024-05-07 11:03:42,776] Trial 2 finished with value: 0.5 and parameters: {'learning_rate': 0.1, 'batch_size': 64, 'hidden_size': 96}. Best is trial 1 with value: 0.5546586267263178.
[I 2024-05-07 11:03:43,571] Trial 3 finished with value: 0.5 and parameters: {'learning_rate': 0.01, 'batch_size': 32, 'hidden_size': 64}. Best is trial 1 with value: 0.5546586267263178.
[I 2024-05-07 11:03:45,516] Trial 4 finished with value: 0.5023584905660378 and parameters: {'learning_rate'

In [14]:
# Display the dict returned by LSTM_hyperparametersearch for the resampled
# sentence-transformer features (one prediction per patient).
best_params_sentence_resampled
# Results noted down from earlier runs, kept for reference. They contain 'num_epochs',
# which is not a key of the dict currently returned - presumably they stem from an
# earlier version of the search space.
#old {'num_epochs': 30, 'learning_rate': 0.001, 'batch_size': 32}
#new {'num_epochs': 10, 'learning_rate': 0.001, 'batch_size': 16}

{'learning_rate': 0.001, 'batch_size': 32, 'hidden_size': 80}

In [15]:
# Quick inspection of the MeDa-BERT training tensors.
# Only the last expression of a cell is rendered, so a bare `.shape[2]` on the first line
# would be evaluated and silently discarded - print it explicitly instead.
# Size of axis 2 (presumably the feature/embedding dimension - confirm).
print(X_MeDa_train_tensor.shape[2])
X_MeDa_train_tensor_resampled

#X_tfidfvect_train_tensor_resampled.shape[2]

tensor([[[ 2.3729e-01,  1.8317e-01,  5.3262e-02,  ...,  1.5464e-01,
           3.5339e-01,  3.9302e-01],
         [ 7.1390e-01,  4.1368e-01, -6.5218e-01,  ...,  1.1690e+00,
          -5.2027e-01,  5.6682e-02],
         [ 1.9418e-01, -1.3096e-01,  1.1643e+00,  ..., -2.0464e-01,
           2.8164e-01,  4.4701e-01],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[-3.5995e-01, -1.2188e-01,  1.3201e+00,  ..., -3.9213e-01,
           1.0357e+00,  4.2996e-01],
         [-3.5995e-01, -1.2188e-01,  1.3201e+00,  ..., -3.9213e-01,
           1.0357e+00,  4.2996e-01],
         [-9.1122e-01,  8.4839e-01,  9.7467e-01,  ..., -1.0496e+00,
           1.0775e+00,  7.5305e-01],
         ...,
         [ 0.0000e+00,  0

In [16]:
%%capture

# Hyperparameter search for the LSTM on the resampled MeDa-BERT features,
# one prediction per patient (resampled=True).
# NOTE(review): the test tensors are handed to the search; if the objective is scored on
# them, the test metrics reported later are optimistically biased - confirm.
best_params_MeDa_resampled = LSTM_hyperparametersearch(
    X_MeDa_train_tensor_resampled, y_MeDa_train_tensor_resampled, n_notes_train_resampled,
    X_MeDa_test_tensor_resampled, y_MeDa_test_tensor_resampled, n_notes_test_resampled,
    resampled=True,
)

[I 2024-05-07 11:05:07,923] A new study created in memory with name: no-name-81cdb9f9-24ce-4b8c-881b-311a9270a251
[I 2024-05-07 11:05:09,081] Trial 0 finished with value: 0.5 and parameters: {'learning_rate': 0.1, 'batch_size': 16, 'hidden_size': 48}. Best is trial 0 with value: 0.5.
[I 2024-05-07 11:05:10,475] Trial 1 finished with value: 0.5 and parameters: {'learning_rate': 0.1, 'batch_size': 64, 'hidden_size': 112}. Best is trial 0 with value: 0.5.
[I 2024-05-07 11:05:12,381] Trial 2 finished with value: 0.564603190040848 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'hidden_size': 64}. Best is trial 2 with value: 0.564603190040848.
[I 2024-05-07 11:05:14,731] Trial 3 finished with value: 0.6130859754911496 and parameters: {'learning_rate': 0.0001, 'batch_size': 16, 'hidden_size': 128}. Best is trial 3 with value: 0.6130859754911496.
[I 2024-05-07 11:05:16,613] Trial 4 finished with value: 0.501045516436491 and parameters: {'learning_rate': 0.0001, 'batch_size': 48, 'h

In [17]:
# Display the dict returned by LSTM_hyperparametersearch for the resampled
# MeDa-BERT features (one prediction per patient).
best_params_MeDa_resampled

{'learning_rate': 0.0001, 'batch_size': 16, 'hidden_size': 128}

#### TF-IDF

##### A prediction every day

In [18]:
%%capture

# Hyperparameter search for the LSTM on the TF-IDF features,
# a prediction every day (resampled=False).
# NOTE(review): the test tensors are handed to the search; if the objective is scored on
# them, the test metrics reported later are optimistically biased - confirm.
best_params_tfidf = LSTM_hyperparametersearch(
    X_tfidfvect_train_tensor, y_tfidfvect_train_tensor, n_notes_train,
    X_tfidfvect_test_tensor, y_tfidfvect_test_tensor, n_notes_test,
    resampled=False,
)

[I 2024-05-07 11:06:47,459] A new study created in memory with name: no-name-3f6fd750-54b0-4957-8e3e-4feb959adabc
[I 2024-05-07 11:07:13,214] Trial 0 finished with value: 0.5 and parameters: {'learning_rate': 0.0001, 'batch_size': 64, 'hidden_size': 96}. Best is trial 0 with value: 0.5.
[I 2024-05-07 11:08:12,214] Trial 1 finished with value: 0.5442427312020128 and parameters: {'learning_rate': 0.01, 'batch_size': 48, 'hidden_size': 48}. Best is trial 1 with value: 0.5442427312020128.
[I 2024-05-07 11:09:07,611] Trial 2 finished with value: 0.5652595578274171 and parameters: {'learning_rate': 0.01, 'batch_size': 64, 'hidden_size': 96}. Best is trial 2 with value: 0.5652595578274171.
[I 2024-05-07 11:10:09,231] Trial 3 finished with value: 0.5740291194441624 and parameters: {'learning_rate': 0.001, 'batch_size': 32, 'hidden_size': 128}. Best is trial 3 with value: 0.5740291194441624.
[I 2024-05-07 11:11:09,152] Trial 4 finished with value: 0.5860170069810657 and parameters: {'learning_r

In [19]:
# Display the dict returned by LSTM_hyperparametersearch for the TF-IDF
# features (a prediction every day).
best_params_tfidf

{'learning_rate': 0.001, 'batch_size': 64, 'hidden_size': 80}

##### One prediction per patient

In [20]:
%%capture

# Hyperparameter search for the LSTM on the resampled TF-IDF features,
# one prediction per patient (resampled=True).
# NOTE(review): the test tensors are handed to the search; if the objective is scored on
# them, the test metrics reported later are optimistically biased - confirm.
best_params_tfidf_resampled = LSTM_hyperparametersearch(
    X_tfidfvect_train_tensor_resampled, y_tfidfvect_train_tensor_resampled, n_notes_train_resampled,
    X_tfidfvect_test_tensor_resampled, y_tfidfvect_test_tensor_resampled, n_notes_test_resampled,
    resampled=True,
)

[I 2024-05-07 11:50:28,994] A new study created in memory with name: no-name-313fa1cd-0606-4f04-80cc-4f4387a22235
[I 2024-05-07 11:50:29,968] Trial 0 finished with value: 0.5 and parameters: {'learning_rate': 0.001, 'batch_size': 48, 'hidden_size': 80}. Best is trial 0 with value: 0.5.
[I 2024-05-07 11:50:30,794] Trial 1 finished with value: 0.4941402450885042 and parameters: {'learning_rate': 0.1, 'batch_size': 32, 'hidden_size': 128}. Best is trial 0 with value: 0.5.
[I 2024-05-07 11:50:32,019] Trial 2 finished with value: 0.5543911690332619 and parameters: {'learning_rate': 0.01, 'batch_size': 16, 'hidden_size': 128}. Best is trial 2 with value: 0.5543911690332619.
[I 2024-05-07 11:50:33,888] Trial 3 finished with value: 0.5682746547364326 and parameters: {'learning_rate': 0.01, 'batch_size': 48, 'hidden_size': 32}. Best is trial 3 with value: 0.5682746547364326.
[I 2024-05-07 11:50:35,808] Trial 4 finished with value: 0.556482201906244 and parameters: {'learning_rate': 0.01, 'batch

In [21]:
# Display the dict returned by LSTM_hyperparametersearch for the resampled
# TF-IDF features (one prediction per patient).
best_params_tfidf_resampled

{'learning_rate': 0.01, 'batch_size': 64, 'hidden_size': 112}

In [27]:
# Summary of all search results, one dict per line: first the three
# "a prediction every day" searches, then the three resampled ones.
print(
    best_params_sentence,
    best_params_MeDa,
    best_params_tfidf,
    best_params_sentence_resampled,
    best_params_MeDa_resampled,
    best_params_tfidf_resampled,
    sep="\n",
)

{'learning_rate': 0.0001, 'batch_size': 16, 'hidden_size': 128}
{'learning_rate': 0.001, 'batch_size': 32, 'hidden_size': 80}
{'learning_rate': 0.001, 'batch_size': 64, 'hidden_size': 80}
{'learning_rate': 0.001, 'batch_size': 32, 'hidden_size': 80}
{'learning_rate': 0.0001, 'batch_size': 16, 'hidden_size': 128}
{'learning_rate': 0.01, 'batch_size': 64, 'hidden_size': 112}


## Model run 

In [34]:
from LSTM_pipeline import LSTM_evaluation

# Train and evaluate the LSTM on the sentence-transformer features (a prediction every day).
# The hyperparameters are the values printed for best_params_sentence.
learning_rate, batch_size, hidden_size = 0.0001, 16, 128

predictions = LSTM_evaluation(
    X_sentence_train_tensor, y_sentence_train_tensor, n_notes_train,
    X_sentence_test_tensor, y_sentence_test_tensor, n_notes_test,
    learning_rate, batch_size, hidden_size,
    resampled=False,
)


cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
Epoch: 0, loss: 0.02229
Epoch: 2, loss: 0.02331
Early stopping
Average Test Loss: 0.0025
1
              precision    recall  f1-score   support

         0.0       0.98      0.41      0.58      9241
         1.0       0.04      0.74      0.07       291

    accuracy                           0.42      9532
   macro avg       0.51      0.57      0.32      9532
weighted avg       0.95      0.42      0.56      9532

True Positives: 216. False Positives: 5477. True negatives: 3764. False negatives: 75.
Accuracy: 41.75%
Precision = 0.036030871103855555
Recall = 0.7422680412371134
F1 score = 0.07219251336898395
ROC-AUC score = 0.5747916334310229


In [29]:
# Train and evaluate the LSTM on the MeDa-BERT features (a prediction every day).
# The hyperparameters are the values printed for best_params_MeDa.
learning_rate, batch_size, hidden_size = 0.001, 32, 80

predictions = LSTM_evaluation(
    X_MeDa_train_tensor, y_MeDa_train_tensor, n_notes_train,
    X_MeDa_test_tensor, y_MeDa_test_tensor, n_notes_test,
    learning_rate, batch_size, hidden_size,
    resampled=False,
)


cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
Epoch: 0, loss: 0.02259
Epoch: 2, loss: 0.02399
Epoch: 4, loss: 0.01789
Epoch: 6, loss: 0.00915
Epoch: 8, loss: 0.00995
Early stopping
Average Test Loss: 0.0014
1
              precision    recall  f1-score   support

         0.0       0.98      0.61      0.75      9241
         1.0       0.04      0.51      0.07       291

    accuracy                           0.61      9532
   macro avg       0.51      0.56      0.41      9532
weighted avg       0.95      0.61      0.73      9532

True Positives: 148. False Positives: 3572. True negatives: 5669. False negatives: 143.
Accuracy: 61.03%
Precision = 0.03523636638459221
Recall = 0.5085910652920962
F1 score = 0.0737970580902518
ROC-AUC score = 0.5610264059281604


In [35]:
# Train and evaluate the LSTM on the TF-IDF features (a prediction every day).
# The hyperparameters are the values printed for best_params_tfidf.
learning_rate, batch_size, hidden_size = 0.001, 64, 80

predictions = LSTM_evaluation(
    X_tfidfvect_train_tensor, y_tfidfvect_train_tensor, n_notes_train,
    X_tfidfvect_test_tensor, y_tfidfvect_test_tensor, n_notes_test,
    learning_rate, batch_size, hidden_size,
    resampled=False,
)

cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
Epoch: 0, loss: 0.02032
Epoch: 2, loss: 0.01688
Epoch: 4, loss: 0.01429
Epoch: 6, loss: 0.01192
Epoch: 8, loss: 0.01240
Epoch: 10, loss: 0.01210
Early stopping
Average Test Loss: 0.0010
1
              precision    recall  f1-score   support

         0.0       0.98      0.65      0.78      9241
         1.0       0.04      0.53      0.08       291

    accuracy                           0.64      9532
   macro avg       0.51      0.59      0.43      9532
weighted avg       0.95      0.64      0.76      9532

True Positives: 153. False Positives: 3260. True negatives: 5981. False negatives: 138.
Accuracy: 64.35%
Precision = 0.038047223778463375
Recall = 0.5257731958762887
F1 score = 0.0826133909287257
ROC-AUC score = 0.5864987611239467


In [31]:
# Train and evaluate the LSTM on the resampled sentence-transformer features
# (one prediction per patient).
# The hyperparameters are the values printed for best_params_sentence_resampled.
learning_rate, batch_size, hidden_size = 0.001, 32, 80

predictions = LSTM_evaluation(
    X_sentence_train_tensor_resampled, y_sentence_train_tensor_resampled, n_notes_train_resampled,
    X_sentence_test_tensor_resampled, y_sentence_test_tensor_resampled, n_notes_test_resampled,
    learning_rate, batch_size, hidden_size,
    resampled=True,
)


cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
Epoch: 0, loss: 0.32304
Epoch: 2, loss: 0.26972
Epoch: 4, loss: 0.20981
Epoch: 6, loss: 0.16020
Epoch: 8, loss: 0.13980
Epoch: 10, loss: 0.14769
Early stopping
Average Test Loss: 0.0107
1
              precision    recall  f1-score   support

         0.0       0.76      0.47      0.58       212
         1.0       0.37      0.68      0.48        97

    accuracy                           0.53       309
   macro avg       0.57      0.57      0.53       309
weighted avg       0.64      0.53      0.55       309

True Positives: 66. False Positives: 113. True negatives: 99. False negatives: 31.
Accuracy: 53.40%
Precision = 0.3512019290359461
Recall = 0.6804123711340206
F1 score = 0.4782608695652174
ROC-AUC score = 0.5736967516047462


In [32]:
# Train and evaluate the LSTM on the resampled MeDa-BERT features
# (one prediction per patient).
# The hyperparameters are the values printed for best_params_MeDa_resampled.
learning_rate, batch_size, hidden_size = 0.0001, 16, 128

predictions = LSTM_evaluation(
    X_MeDa_train_tensor_resampled, y_MeDa_train_tensor_resampled, n_notes_train_resampled,
    X_MeDa_test_tensor_resampled, y_MeDa_test_tensor_resampled, n_notes_test_resampled,
    learning_rate, batch_size, hidden_size,
    resampled=True,
)


cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
Epoch: 0, loss: 0.32134
Epoch: 2, loss: 0.30117
Epoch: 4, loss: 0.26097
Epoch: 6, loss: 0.22556
Epoch: 8, loss: 0.20144
Epoch: 10, loss: 0.16869
Epoch: 12, loss: 0.13017
Epoch: 14, loss: 0.12647
Epoch: 16, loss: 0.11871
Epoch: 18, loss: 0.11095
Epoch: 20, loss: 0.10067
Epoch: 22, loss: 0.09892
Epoch: 24, loss: 0.09471
Early stopping
Average Test Loss: 0.0264
1
              precision    recall  f1-score   support

         0.0       0.75      0.70      0.72       212
         1.0       0.43      0.49      0.46        97

    accuracy                           0.63       309
   macro avg       0.59      0.60      0.59       309
weighted avg       0.65      0.63      0.64       309

True Positives: 48. False Positives: 64. True negatives: 148. False negatives: 49.
Accuracy: 63.43%
Precision = 0.3706526349905391
Recall = 0.4948453608247423
F1 score = 0.4593301435406698
ROC-AUC score = 0.5964792841859561


In [33]:
# Train and evaluate the LSTM on the resampled TF-IDF features
# (one prediction per patient).
# The hyperparameters are the values printed for best_params_tfidf_resampled.
learning_rate, batch_size, hidden_size = 0.01, 64, 112

predictions = LSTM_evaluation(
    X_tfidfvect_train_tensor_resampled, y_tfidfvect_train_tensor_resampled, n_notes_train_resampled,
    X_tfidfvect_test_tensor_resampled, y_tfidfvect_test_tensor_resampled, n_notes_test_resampled,
    learning_rate, batch_size, hidden_size,
    resampled=True,
)

cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
Epoch: 0, loss: 0.32680
Epoch: 2, loss: 0.27442
Epoch: 4, loss: 0.15883
Epoch: 6, loss: 0.14540
Epoch: 8, loss: 0.07781
Epoch: 10, loss: 0.04450
Epoch: 12, loss: 0.19380
Epoch: 14, loss: 0.00689
Epoch: 16, loss: 0.00610
Epoch: 18, loss: 0.00465
Early stopping
Average Test Loss: 0.0193
1
              precision    recall  f1-score   support

         0.0       0.67      0.67      0.67       212
         1.0       0.28      0.28      0.28        97

    accuracy                           0.55       309
   macro avg       0.47      0.47      0.47       309
weighted avg       0.55      0.55      0.55       309

True Positives: 27. False Positives: 70. True negatives: 142. False negatives: 70.
Accuracy: 54.69%
Precision = 0.3040162262875076
Recall = 0.27835051546391754
F1 score = 0.27835051546391754
ROC-AUC score = 0.47408091810931724


#### Evaluating the resampled model on all data

In [65]:
# Train on the resampled sentence-transformer tensors, evaluate on the
# non-resampled test tensors.
# The hyperparameters are the values printed for best_params_sentence_resampled.
# NOTE(review): resampled=True is passed although the test tensors are the
# non-resampled ones - confirm this is the flag value LSTM_evaluation expects here.
learning_rate, batch_size, hidden_size = 0.001, 32, 80

predictions = LSTM_evaluation(
    X_sentence_train_tensor_resampled, y_sentence_train_tensor_resampled, n_notes_train_resampled,
    X_sentence_test_tensor, y_sentence_test_tensor, n_notes_test,
    learning_rate, batch_size, hidden_size,
    resampled=True,
)


cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
Epoch: 0, loss: 0.32506
Epoch: 2, loss: 0.27171
Epoch: 4, loss: 0.22728
Epoch: 6, loss: 0.18920
Epoch: 8, loss: 0.14021
Epoch: 10, loss: 0.11673
Epoch: 12, loss: 0.13688
Epoch: 14, loss: 0.11609
Early stopping
Average Test Loss: 0.0088
1
              precision    recall  f1-score   support

         0.0       0.95      0.07      0.13      9241
         1.0       0.03      0.89      0.06       291

    accuracy                           0.09      9532
   macro avg       0.49      0.48      0.09      9532
weighted avg       0.92      0.09      0.13      9532

True Positives: 259. False Positives: 8598. True negatives: 643. False negatives: 32.
Accuracy: 9.46%
Precision = 0.029383860127319764
Recall = 0.8900343642611683
F1 score = 0.05662439877568867
ROC-AUC score = 0.47980778920774025


In [37]:
# Train on the resampled MeDa-BERT tensors, evaluate on the non-resampled test tensors.
# The hyperparameters are the values printed for best_params_MeDa_resampled.
# NOTE(review): resampled=True is passed although the test tensors are the
# non-resampled ones - confirm this is the flag value LSTM_evaluation expects here.
learning_rate, batch_size, hidden_size = 0.0001, 16, 128

predictions = LSTM_evaluation(
    X_MeDa_train_tensor_resampled, y_MeDa_train_tensor_resampled, n_notes_train_resampled,
    X_MeDa_test_tensor, y_MeDa_test_tensor, n_notes_test,
    learning_rate, batch_size, hidden_size,
    resampled=True,
)

cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
Epoch: 0, loss: 0.32145
Epoch: 2, loss: 0.29499
Epoch: 4, loss: 0.24915
Epoch: 6, loss: 0.21006
Epoch: 8, loss: 0.18495
Epoch: 10, loss: 0.16306
Epoch: 12, loss: 0.12244
Epoch: 14, loss: 0.09162
Epoch: 16, loss: 0.07028
Epoch: 18, loss: 0.03817
Epoch: 20, loss: 0.01946
Epoch: 22, loss: 0.01659
Epoch: 24, loss: 0.01539
Epoch: 26, loss: 0.06759
Early stopping
Average Test Loss: 0.0100
1
              precision    recall  f1-score   support

         0.0       0.97      0.93      0.95      9241
         1.0       0.06      0.14      0.08       291

    accuracy                           0.91      9532
   macro avg       0.52      0.53      0.52      9532
weighted avg       0.94      0.91      0.92      9532

True Positives: 40. False Positives: 638. True negatives: 8603. False negatives: 251.
Accuracy: 90.67%
Precision = 0.034441914333140096
Recall = 0.13745704467353953
F1 score = 0.08255933952528381
ROC-AUC score = 0.534208448751659

In [58]:
from LSTM_pipeline import LSTM_evaluation

from tfidf import tf_idf

# Build TF-IDF features from the "df_SFI_text" entries of the resampled training
# predictors and of the test predictors, using the vectorizer settings given below.
df_tfidfvect_train_resampled_new, df_tfidfvect_test_new = tf_idf(
    predictor_dict_train_resampled["df_SFI_text"],
    predictor_dict_test["df_SFI_text"],
    min_df=30,
    max_df=0.6,
    max_features=400,
    ngram_range=(1, 2),
)




  X_train_SFI['note'] = X_train_SFI['note'].str.replace('\d+', '')
  X_test_SFI['note'] = X_test_SFI['note'].str.replace('\d+', '')


Tokens ['aff' 'aften' 'av' 'bad' 'behov' 'ble' 'ble samt' 'ble skiftet' 'bleen'
 'bleen våd' 'bleskift' 'blære' 'blæren' 'blærescannet' 'bukser'
 'bundskiftet' 'bækken' 'bækkenstol' 'ca' 'ca ml' 'dag' 'dv'
 'efterfølgende' 'evt' 'faldet' 'fik' 'fint' 'forbindelse' 'forsøgt'
 'fortæller' 'fået' 'får' 'føler' 'gang' 'gange' 'gerne' 'givet' 'godt'
 'gået' 'går' 'haft' 'hjælp' 'inden' 'kad' 'kalder' 'kateter' 'kl' 'klar'
 'klar urin' 'kolbe' 'kolbe ml' 'kolben' 'komme' 'kommet' 'konc'
 'koncentreret' 'lade' 'lade vandet' 'let' 'løbet' 'ml' 'ml klar'
 'ml kolbe' 'ml urin' 'morgen' 'mærke' 'nat' 'natten' 'når' 'obs' 'pga'
 'pose' 'pose tømt' 'posen' 'pt' 'påsat' 'påsat uridom' 'resultat'
 'ringer' 'samt' 'sat' 'scannet' 'se' 'seng' 'sengen' 'sidste' 'siger'
 'sik' 'skift' 'skift våd' 'skiftet' 'skiftet våd' 'spontan' 'stikbækken'
 'stix' 'stor' 'stor vandl' 'stor vandladning' 'str' 'stuen' 'tages'
 'taget' 'time' 'tisse' 'tisser' 'tisset' 'toilet' 'toiletbesøg'
 'toilettet' 'trang' 'tung' 't

In [61]:
from create_tensors import tensors
import torch

# Value forwarded to tensors() as its `maximum_notes` argument.
maximum_notes = 38

# Test set: build the tensors from the new TF-IDF test features.
(
    X_tfidfvect_test_tensor_new,
    y_tfidfvect_test_tensor_new,
    X_structured_test_tensor_new,
    n_notes_test_new,
) = tensors(df_tfidfvect_test_new, y_test_df, df_admissions_test, maximum_notes, predictor_dict_test)

# Only the X and y tensors are written to disk; X_structured_test_tensor_new and
# n_notes_test_new exist solely in the running kernel.
torch.save(X_tfidfvect_test_tensor_new, 'data/tensors/X_tfidfvect_test_tensor_4_days_new.pt')
torch.save(y_tfidfvect_test_tensor_new, 'data/tensors/y_tfidfvect_test_tensor_4_days_new.pt')


  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, index=subset.columns), ignore_index=True)
  subset = subset.append(pd.Series(0, in

In [63]:

# Test set: reload the saved TF-IDF tensors, mapping their storage onto the CUDA device.
_cuda_device = torch.device('cuda')
X_tfidfvect_test_tensor_new = torch.load("data/tensors/X_tfidfvect_test_tensor_4_days_new.pt", map_location=_cuda_device)
y_tfidfvect_test_tensor_new = torch.load("data/tensors/y_tfidfvect_test_tensor_4_days_new.pt", map_location=_cuda_device)

In [64]:
# Train on the resampled TF-IDF tensors, evaluate on the newly built TF-IDF test tensors.
# The hyperparameters are the values printed for best_params_tfidf_resampled.
# n_notes_test_new comes from the tensors(...) call in an earlier cell and is not
# reloaded from disk with the X/y tensors, so that cell must have run in this kernel.
learning_rate, batch_size, hidden_size = 0.01, 64, 112

predictions = LSTM_evaluation(
    X_tfidfvect_train_tensor_resampled, y_tfidfvect_train_tensor_resampled, n_notes_train_resampled,
    X_tfidfvect_test_tensor_new, y_tfidfvect_test_tensor_new, n_notes_test_new,
    learning_rate, batch_size, hidden_size,
    resampled=True,
)

cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
Epoch: 0, loss: 0.32619
Epoch: 2, loss: 0.21568
Epoch: 4, loss: 0.22051
Epoch: 6, loss: 0.16137
Epoch: 8, loss: 0.08381
Epoch: 10, loss: 0.04513
Epoch: 12, loss: 0.01914
Epoch: 14, loss: 0.00656
Epoch: 16, loss: 0.00508
Epoch: 18, loss: 0.00275
Epoch: 20, loss: 0.00457
Early stopping
Average Test Loss: 0.0062
1
              precision    recall  f1-score   support

         0.0       0.97      0.77      0.86      9241
         1.0       0.04      0.29      0.07       291

    accuracy                           0.75      9532
   macro avg       0.50      0.53      0.46      9532
weighted avg       0.94      0.75      0.83      9532

True Positives: 83. False Positives: 2142. True negatives: 7099. False negatives: 208.
Accuracy: 75.35%
Precision = 0.032461026781186876
Recall = 0.2852233676975945
F1 score = 0.06597774244833068
ROC-AUC score = 0.5267151358561557
