# Netherlands Neurogenetics Database
Author: Nienke Mekkes <br>
Date: 9-Nov-2022. <br>
Correspondence: n.j.mekkes@umcg.nl <br>

## Script: clinical history predictions
Steps: <br>
- (if the model is not yet trained: load the cleaned training data)
- (if the model is not yet trained: train the model on the cleaned training data using the optimized hyperparameters)

- load the trained model (request it from the authors or train your own)
- load full corpus of sentences (request from authors)
- predict full corpus of sentences with loaded pretrained model
- save predictions for further processing




#### Paths

In [1]:
## Fill in all four paths before running the rest of the notebook.
## Excel file with the cleaned, labelled training sentences (read with pd.read_excel below)
cleaned_training_data = ""
## Directory in which the predictions Excel file is written (created below if missing)
predictions_output_path = ""
## Excel file with the full corpus of sentences to predict (read with pd.read_excel below)
full_corpus = ""
## Directory of the trained model: used as the model output_dir and as the location to load it from
location_of_best_model = ""

#### Minimal requirements

In [2]:
print('LOADING PACKAGES...')
import seaborn as sns
import pickle
import logging, sys
import matplotlib.pyplot as plt
%matplotlib inline
import csv#,random
import optuna
import pandas as pd
import os, re#, string
import numpy as np

from adjustText import adjust_text

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,precision_score,recall_score,classification_report 

import torch
from collections import Counter

from openpyxl import load_workbook, Workbook
import xlsxwriter

import joblib
from datetime import date
# import kaleido
# import plotly

from optuna.visualization import plot_contour,plot_edf,plot_intermediate_values,plot_optimization_history
from optuna.visualization import plot_parallel_coordinate,plot_param_importances,plot_slice

LOADING PACKAGES...


In [3]:
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs

#### load data

In [4]:
## Make sure both working directories exist; announce each one that has to be created
for directory, notice in (
    (predictions_output_path, "creating predictions directory"),
    (location_of_best_model, "creating model directory"),
):
    if os.path.exists(directory):
        continue
    os.makedirs(directory)
    print(notice)

## Cleaned training sentences; the first sheet column becomes the index
df_train = pd.read_excel(cleaned_training_data, index_col=[0], engine='openpyxl')

In [5]:
## Every training column that is not sentence metadata is an attribute (label) name;
## these names make the predictions dataframe human readable
non_attribute_columns = ['NBB_nr','Year_Sentence_nr','Sentence']
attributes = []
for column_name in df_train.columns:
    if column_name in non_attribute_columns:
        continue
    attributes.append(column_name)
print(attributes)

['Muscular_Weakness', 'Spasticity', 'Hyperreflexia_and_oth_reflexes', 'Frontal_release_signs', 'Fasciculations', 'Positive_sensory_symptoms', 'Negative_sensory_symptoms', 'Parkinsonism', 'Facial_masking', 'Tremor', 'Bradykinesia', 'Rigidity', 'Vertigo', 'Nystagmus', 'Ataxia', 'Loss_of_coordination', 'Balance_problems', 'Frequent_falls', 'Decreased_motor_skills', 'Unspecified_disturbed_gait_patt', 'Mobility_problems', 'Dementia', 'Cognitive_decline', 'Bradyphrenia', 'Lack_of_insight', 'Facade_behavior', 'Head_turning_sign', 'Memory_impairment', 'Amnesia', 'Forgetfulness', 'Imprinting_disturbances', 'Impaired_recognition', 'Confabulations', 'Disorientation', 'Wandering', 'Confusion', 'Aphasia', 'Word_finding_problems', 'Language_impairment', 'Impaired_comprehension', 'Communication_problems', 'Dysarthria', 'Apraxia', 'Executive_function_disorder', 'Lack_of_planning_organis_overv', 'Concentration_problems', 'Disinhibition', 'Loss_of_decorum', 'Apathy_inertia', 'Lack_of_initiative', 'Loss_

In [6]:
## Optional: only needed when the model still has to be trained.
## Collapse the attribute columns into a single 'labels' column holding one
## array per sentence, then keep only the sentence text and its labels.
label_matrix = df_train[attributes].to_numpy()
df_train['labels'] = list(label_matrix)
df_train = df_train.loc[:, ['Sentence', 'labels']]
display(df_train)

Unnamed: 0,Sentence,labels
0,Past: The patient was known to have atrial fib...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,The patient was known to have hypertension and...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1979: She got a total hip,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,At age 76 the first demential symptomes appeared,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,After the death of her husband homesituation w...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
19049,The patient himself did not recognize himself ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
19050,In July and August he suffered from deliria po...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
19051,In August the GP reported that it was impossib...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
19052,This was a reason why a hospice turned down an...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
## Arguments handed to the simpletransformers model via `args=`
model_args_bert = dict(
    do_lower_case=True,
    fp16=True,
    learning_rate=5.12e-05,
    manual_seed=2,
    max_seq_length=300,
    num_train_epochs=33,
    output_dir=location_of_best_model,  ## the model directory configured in the paths cell
    overwrite_output_dir=True,
    reprocess_input_data=True,
    save_eval_checkpoints=False,
    save_model_every_epoch=False,
    save_optimizer_and_scheduler=False,
    save_steps=-1,
    silent=False,
    train_batch_size=16,
    use_multiprocessing=True,
    wandb_project='predict',
    threshold=0.6,  ## presumably the probability cut-off for assigning a label -- confirm in the simpletransformers docs
)

In [8]:
# ## ONLY RUN IF YOUR MODEL IS NOT YET TRAINED!
# model = MultiLabelClassificationModel('bert', ## "bert" or "t5"
#                                       "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", ## "modelname from huggingface"
#                                       args=model_args_bert,
#                                       use_cuda=True,
#                                       num_labels=90)

# model.train_model(df_train[['Sentence','labels']])

In [8]:
## Load the already trained model from the local model directory
model = MultiLabelClassificationModel(
    'bert',                  ## model type
    location_of_best_model,  ## local directory of the trained model (not a huggingface model name)
    num_labels=90,
    use_cuda=False,          ## set to True to run on a GPU
    args=model_args_bert,
)

##### load sentences to predict

In [9]:
## Read the full corpus of sentences from the Excel file at `full_corpus`;
## index_col=None means no sheet column is used as the dataframe index
full_corpus_df = pd.read_excel(full_corpus, engine='openpyxl', index_col=None)

In [10]:
## Work on a copy so that full_corpus_df itself stays untouched
df_predict = full_corpus_df.copy()
## Bare expression: displays the dataframe as the cell output
df_predict

Unnamed: 0,DonorID,Year,Sentence_number,Sentence
0,NBB 1997-127,past:,1,Past:
1,NBB 1997-127,past:,2,Appendectomy
2,NBB 1997-127,1996,3,1996:
3,NBB 1997-127,1996,4,"Diagnosed with a metastasized adenocarcinoma, ..."
4,NBB 1997-127,1997,5,1997:
...,...,...,...,...
213526,NBB 2006-047,2001,13,Sensibility was intact.
213527,NBB 2006-047,2001,12,There were scattered fasciculations.
213528,NBB 2006-047,2001,11,There was muscle atrophy distally more than pr...
213529,NBB 2006-047,2001,10,"Patient was, by lessened strength in his right..."


##### predict

In [12]:
## Sentences to predict. Some cells are read from Excel as double/int,
## so every value is converted to str before it is handed to the model.
ps = list(map(str, df_predict['Sentence'].values))

In [None]:
## Run the loaded model over every sentence in `ps`.
## NOTE(review): `p` presumably holds the thresholded per-label predictions and
## `raw_outputs` the raw model outputs -- confirm against the simpletransformers predict docs
p, raw_outputs = model.predict(list(ps))

  0%|          | 0/200188 [00:00<?, ?it/s]

##### save

In [None]:
## `df_final_predictions` was never defined in this notebook, so it is built here:
## one row per corpus sentence, with the model predictions added as one column per
## attribute name so that the saved file is human readable.
## NOTE(review): assumes each entry of `p` has one value per entry in `attributes` -- confirm
df_label_predictions = pd.DataFrame(p, columns=attributes, index=df_predict.index)
df_final_predictions = pd.concat([df_predict, df_label_predictions], axis=1)

## Write the result to a dated Excel file inside the predictions directory
df_final_predictions.to_excel(f"{predictions_output_path}/predictions_{date.today()}.xlsx")
