# Netherlands Neurogenetics Database
Author: Nienke Mekkes <br>
Date: 9-Nov-2022. <br>
Correspondence: n.j.mekkes@umcg.nl <br>

## Script: clinical history predictions
Steps: <br>
- (when model not yet trained: load cleaned training data)
- (when model not yet trained: train model on cleaned training data using optimized hyperparameters)

- load trained model
- load full corpus of sentences
- predict full corpus of sentences with loaded pretrained model
- save predictions for further processing




#### Paths

In [None]:
# Excel workbook with the cleaned, labelled training sentences (read into df_train).
cleaned_training_data = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/training_data/cleaned_training_data.xlsx"
# Directory that receives the dated predictions workbook; created further down if it does not exist.
predictions_output_path = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/final_predictions"
# Excel workbook with the full corpus of sentences to predict.
# NOTE(review): unlike the other paths this one is not under ext_n_mekkes_gmail_com/ — confirm this is intentional.
full_corpus = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/input_data/Clinical_history_15-12-2022.xlsx"
# Model directory: passed as "output_dir" in the model args and as the location the trained model is loaded from.
location_of_best_model = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/nlp_models/final_trained_best_model"

In [None]:
# %pip install optuna

#### Minimal requirements

In [None]:
print('LOADING PACKAGES...')
import seaborn as sns
import pickle
import logging, sys
import matplotlib.pyplot as plt
%matplotlib inline
import csv#,random
import optuna
import pandas as pd
import os, re#, string
import numpy as np

from adjustText import adjust_text

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,precision_score,recall_score,classification_report 

import torch
from collections import Counter

from openpyxl import load_workbook, Workbook
import xlsxwriter

import joblib
from datetime import date
# import kaleido
# import plotly

from optuna.visualization import plot_contour,plot_edf,plot_intermediate_values,plot_optimization_history
from optuna.visualization import plot_parallel_coordinate,plot_param_importances,plot_slice

In [None]:
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs

#### load data

In [None]:
# Make sure both output locations exist; report only the ones that had to be created.
for folder, notice in (
    (predictions_output_path, "creating predictions directory"),
    (location_of_best_model, "creating model directory"),
):
    if os.path.exists(folder):
        continue
    os.makedirs(folder)
    print(notice)

# Labelled training sentences; the first spreadsheet column becomes the index.
df_train = pd.read_excel(cleaned_training_data, index_col=[0], engine='openpyxl')

In [None]:
## Label names: every training column that is neither an identifier nor the sentence text.
## They are used to give the prediction columns human-readable headers.
non_attribute_columns = ['NBB_nr','Year_Sentence_nr','Sentence']
attributes = [
    column_name
    for column_name in df_train.columns
    if column_name not in non_attribute_columns
]
print(attributes)

In [None]:
## Optional: only needed when the model still has to be trained.
# Collapse the per-attribute columns into one 'labels' cell per row (one array per sentence),
# then keep only the text and that label vector.
label_matrix = df_train[attributes].to_numpy()
df_train['labels'] = list(label_matrix)
df_train = df_train[['Sentence','labels']]
display(df_train)

In [None]:
## Model configuration passed to MultiLabelClassificationModel via `args=`
model_args_bert = {
    "do_lower_case": True,  # for uncased models
    "fp16": True,  # speeds up, but risks under/overflow
    "learning_rate": 5.123640376667562e-05,  # candidate for optimisation
    "manual_seed": 2,
    # Chosen such that most samples are not truncated. Increasing the sequence
    # length significantly affects the memory consumption of the model, so it is
    # usually best to keep it as short as possible (ideally without truncating
    # the input sequences).
    "max_seq_length": 300,
    "num_train_epochs": 33,  # option for optimisation
    # "optimizer": "Adafactor",  # option for optimisation
    "output_dir": location_of_best_model,
    "overwrite_output_dir": True,
    # Default True: input data is reprocessed even if a cached file of it exists.
    "reprocess_input_data": True,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "save_optimizer_and_scheduler": False,
    "save_steps": -1,
    "silent": False,
    # "scheduler": "linear_schedule_with_warmup",  # option for optimisation
    # "sliding_window": True,  # not supported, but advised? option for optimisation
    "train_batch_size": 16,
    "use_multiprocessing": True,  # speeds up, may be unstable, has some issues reported with t5
    "wandb_project": 'predict',
    # "wandb_kwargs": {"mode": "disabled"},
    "threshold": 0.6,
}

In [None]:
# ## ONLY RUN IF YOUR MODEL IS NOT YET TRAINED!
# model = MultiLabelClassificationModel('bert', ## "bert" or "t5"
#                                       "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", ## "modelname from huggingface"
#                                       args=model_args_bert,
#                                       use_cuda=True,
#                                       num_labels=90)

# model.train_model(df_train[['Sentence','labels']])

In [None]:
## LOAD ALREADY TRAINED MODEL
# The model is loaded from the local directory location_of_best_model (the same
# directory that model_args_bert uses as "output_dir"), not from a huggingface model name.
# NOTE(review): num_labels=90 is hard-coded; presumably it should equal len(attributes) — confirm.
model = MultiLabelClassificationModel('bert', ## "bert" or "t5"
                                      location_of_best_model, ## local directory holding the trained model
                                      args=model_args_bert,
                                      use_cuda=False,#True,
                                      num_labels=90) 

##### load sentences to predict

In [None]:
import pandas as pd

# Full corpus of sentences to predict (default integer index, no index column).
full_corpus_df = pd.read_excel(full_corpus, index_col=None, engine='openpyxl')

# General-information workbook; only the sheet named "Sheet1" is read.
general_information = "/home/jupyter-n.mekkes@gmail.com-f6d87/clinical_history/input_data/General_information_20-07-2023.xlsx"
general_information_df = pd.read_excel(
    general_information,
    sheet_name="Sheet1",
    engine='openpyxl',
)

In [None]:
# Work on a copy so the corpus as loaded from disk stays untouched.
df_predict = full_corpus_df.copy()

# Unique donor IDs in the sentence corpus and in the general-information sheet.
full_corpus_donors = list(df_predict['DonorID'].unique())
gi_donors = list(general_information_df['DonorID'].unique())
print(len(full_corpus_donors))
print(len(gi_donors))

# Sanity check: report donors that occur in only one of the two sources.
# Membership is tested against sets (constant-time lookups) while the
# comprehensions keep the original order of first appearance.
gi_donor_lookup = set(gi_donors)
full_corpus_donors_not_in_gi_donors = [
    item for item in full_corpus_donors if item not in gi_donor_lookup
]
print(full_corpus_donors_not_in_gi_donors)

full_corpus_donor_lookup = set(full_corpus_donors)
gi_donors_not_in_full_corpus_donors = [
    item for item in gi_donors if item not in full_corpus_donor_lookup
]
print(gi_donors_not_in_full_corpus_donors)

In [None]:
# sentences_to_remove = ['Past:','Last two years:','Last 2 years:','Last two months:','Last 2 months:','In the past:','0','M.',
#                        'January:','February:','March:','April:','May:','June:','July:','August:','September:','October:','November:','December:',
#                        'January','February','March','April','May','June','July','August','September','October','November','December'
#                       ]
# year = "^[12][0-9]{3}:$"
# year2 = "^[12][0-9]{3}$"
# year3 = "^\([12][0-9]{3}\)$"
# # years = predictions_df['Sentence'].str.contains(patternDel)
# m = df_predict['Sentence'].str.contains(year)
# m2 = df_predict['Sentence'].str.contains(year2)
# m3 = df_predict['Sentence'].str.contains(year3)
# df_predict = df_predict[~m]
# df_predict =df_predict[~m2]
# df_predict = df_predict[~m3]
# df_predict = df_predict[~df_predict['Sentence'].isin(sentences_to_remove)]

# short_symptoms = ['tia','uti','copd','gout','coma','pick','cva']
# df_predict = df_predict.loc[(df_predict['Sentence'].str.len() > 4) | \
#                       df_predict['Sentence'].str.contains('|'.join(short_symptoms),case=False)]
# print(f"there are {df_predict.shape[0]} sentences and {len(attributes)} columns")
# print(f"there are {len(list(df_predict['DonorID'].unique()))} unique donor IDs")

##### predict

In [None]:
# Some cells are read from Excel as double/int rather than text, so every
# sentence is converted to str before it is handed to the model.
ps = list(map(str, df_predict['Sentence'].values))
# print(ps[1])
# nr = 0
# p, raw_outputs = model.predict(list(ps[1]))
# print(p)
# # pred = np.array(p)
# # print(pred)
# # print(pred.shape)
# # preds_in_df = pd.DataFrame(pred)
# # preds_in_df.columns = attributes
# # display(preds_in_df)
# # display(preds_in_df)
# # for i in ps[0:10]:
# #     print(nr, i)
# #     nr = nr + 1
# #     p, raw_outputs = model.predict(list(ps[nr]))
# #     pred = np.array(p)
# #     # print(pred)
# #     # print(pred.shape)
# #     preds_in_df = pd.DataFrame(pred)
# #     preds_in_df.columns = attributes
# #     # display(preds_in_df)
# #     df_final_predictions = pd.concat([df_predict, preds_in_df], axis=1)
# # display(df_final_predictions.head(10))


In [None]:
# Run the loaded model over every sentence of the corpus in a single call.
# `p` receives the predictions and `raw_outputs` the second return value of predict
# (presumably the un-thresholded model outputs — confirm against the simpletransformers docs).
p, raw_outputs = model.predict(list(ps))

##### save

In [None]:
# Assemble the output table before saving: df_final_predictions was never built
# in the live code, so writing it raised a NameError.
# Each prediction vector in `p` becomes one row with a column per attribute; the
# rows share df_predict's index so they line up with the sentences they belong to.
preds_in_df = pd.DataFrame(np.array(p), columns=attributes, index=df_predict.index)
df_final_predictions = pd.concat([df_predict, preds_in_df], axis=1)

# One workbook per run date, written into the predictions directory.
df_final_predictions.to_excel(f"{predictions_output_path}/predictions_{date.today()}.xlsx")
