# Netherlands Neurogenetics Database
Author: Nienke Mekkes <br>
Date: 21-Sep-2022. <br>
Correspond: n.j.mekkes@umcg.nl <br>

## Script: clinical history NLP models: PubMedBERT
Objectives: optimization of PubMedBERT based model <br>
Based on: 


### Input files:
- File (excel or pickle) containing the train&val data
- File (excel or pickle) containing the clean training data (before split_data.ipynb), to extract attribute names
- File containing the attribute metadata, to plot official attribute names

### Output:
- Folder containing: <br>
    - optuna study in the form of a .db and a .pkl file
    - csv file with performance metrics per attribute for all trials
    - csv file with performance of the best trial only
    - figure folder with analyses



#### Minimal requirements

In [7]:
## GPU, 3.82 werkt (3.7.4 niet)
# %pip install optuna
# %pip install nltk
# %pip install scikit-learn
# %pip install plotly
# %pip install kaleido
# %pip install adjustText
# %pip install simpletransformers
# %pip install torch
# %pip install tqdm

#### Paths (user input required)

In [1]:
# path_to_trainval_xlsx =  "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/training_data/trainval_data.xlsx"
path_to_trainval_pkl = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/training_data/trainval_data.pkl"
# path_to_test_xlsx =  "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/training_data/test_data.xlsx"
path_to_test_pkl = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/training_data/test_data.pkl"
# path_to_cleaned_training_data_xlsx = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/training_data/cleaned_training_data.xlsx"
path_to_cleaned_training_data_pkl = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/training_data/cleaned_training_data.pkl"
path_to_attribute_grouping = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/input_data/Clinical History - attributes grouping in categories - metadata_oct.xlsx"

path_to_additional_functions = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/scripts"
study_name = 'test'
used_model = 'PubMedBERT'
model_save_location = "/home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/nlp_models"


#### Imports

In [2]:
import pickle
import pandas as pd
import numpy as np
import re, os
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import optuna
import logging,sys
from sklearn.metrics import classification_report 
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import csv

import plotly
import kaleido
from optuna.visualization import plot_param_importances, plot_parallel_coordinate, plot_optimization_history
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date

import warnings
from sklearn.exceptions import ConvergenceWarning
import sys
sys.path.insert(1, path_to_additional_functions)
from helper_functions import  scatter_plot,plot_trials,analysis_performance


In [3]:
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs

In [7]:
# print(MultiLabelClassificationModel.__version__)
import simpletransformers

%pip freeze

absl-py==0.12.0
adjustText==0.7.3
aiohttp==3.8.3
aiosignal==1.2.0
alembic==1.8.1
algorithmx==2.0.3
altair==4.2.0
argon2-cffi @ file:///tmp/build/80754af9/argon2-cffi_1596828452693/work
asn1crypto==0.24.0
astunparse==1.6.3
async-generator==1.10
async-timeout==4.0.2
asynctest==0.13.0
attrs @ file:///tmp/build/80754af9/attrs_1598374659300/work
autopage==0.5.1
backcall==0.2.0
backports.zoneinfo==0.2.1
bleach==3.1.5
blinker==1.5
bokeh @ file:///tmp/build/80754af9/bokeh_1598903502831/work
botocore==1.23.10
bz2file==0.98
cachetools==4.2.1
certifi==2022.9.24
certipy==0.1.3
cffi @ file:///home/conda/feedstock_root/build_artifacts/cffi_1613413867554/work
chardet==3.0.4
charset-normalizer==2.1.1
click==8.1.3
cliff==3.10.1
cmaes==0.8.2
cmd2==2.4.2
colorlog==6.7.0
commonmark==0.9.1
conda==4.13.0
conda-package-handling==1.3.11
cryptography==2.7
cutadapt==1.18
cycler==0.10.0
Cython==0.29.28
datasets==2.6.1
decorator==4.4.2
defusedxml==0.6.0
dill==0.3.5.1
docker-pycreds==0.4.0
entrypoints==0.3
et-xmlf

In [38]:
pd.set_option('display.max_rows', 100)
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

#### Load data 
Either from pickle or from excel

In [39]:
model_save_path = f"{model_save_location}/{used_model}/{study_name}"
storage_name = "sqlite:///{}/{}.db".format(model_save_path,study_name)
if not os.path.exists(model_save_path):
    print('Creating model folder ',model_save_path)
    os.makedirs(model_save_path)
    
    
save_path_figures = '{}/figures'.format(model_save_path)
if not os.path.exists(save_path_figures):
    print('Creating model figure folder ',save_path_figures)
    os.makedirs(save_path_figures)



# trainval = pd.read_excel(path_to_trainval_xlsx, engine='openpyxl', index_col=[0])
with open(path_to_trainval_pkl,"rb") as file:
    trainval = pickle.load(file)
# display(trainval)
# trainval["labels"] = [eval(row["labels"]) for index, row in trainval.iterrows()]

Creating model folder  /home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/nlp_models/PubMedBERT/test
Creating model figure folder  /home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/nlp_models/PubMedBERT/test/figures


#### Load cleaned data file to get column names

In [40]:
# cleaned_train = pd.read_excel(path_to_cleaned_training_data_xlsx, engine='openpyxl', index_col=[0])
with open(path_to_cleaned_training_data_pkl,"rb") as file:
    cleaned_train = pickle.load(file)
display(cleaned_train)

non_attribute_columns = ['NBB_nr','Year_Sentence_nr','Sentence']
attributes = [col for col in cleaned_train.columns if col not in non_attribute_columns]

Unnamed: 0,NBB_nr,Year_Sentence_nr,Sentence,Muscular_Weakness,Spasticity,Hyperreflexia_and_oth_reflexes,Frontal_release_signs,Fasciculations,Positive_sensory_symptoms,Negative_sensory_symptoms,...,Orthostatic_hypotension,Headache_migraine,Fatique,Declined_deteriorated_health,Cachexia,Weight_loss,Reduces_oral_intake,Help_in_ADL,Day_care,Admission_to_nursing_home
0,NBB 1990-048,Past_sentence_0,Past: The patient was known to have atrial fib...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,NBB 1990-048,Past_sentence_1,The patient was known to have hypertension and...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,NBB 1990-048,1979_sentence_0,1979: She got a total hip,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NBB 1990-048,1979_sentence_1,At age 76 the first demential symptomes appeared,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NBB 1990-048,1979_sentence_2,After the death of her husband homesituation w...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19049,NBB 2018-114,2018_sentence_25,The patient himself did not recognize himself ...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19050,NBB 2018-114,2018_sentence_26,In July and August he suffered from deliria po...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19051,NBB 2018-114,2018_sentence_27,In August the GP reported that it was impossib...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19052,NBB 2018-114,2018_sentence_28,This was a reason why a hospice turned down an...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Functions

Optuna has an objective function which returns a value. Optuna then tries to maximize that value, where it tries new parameter combinations each trial. In our case the value we want to optimize is performance of all 90 attributes, averaged over the 5 folds. This is the average 5 fold micro F1 score. Creating the model and optimization itself is quite straightforward. For analysis purposes however, we also want to save other performance metrics for all trials into a csv file. We do not save the intermediate models.

In [41]:
def objective(trial):
    
    counter = 1
    dataframe_list = []
    
    ## set up split
    X = trainval[['text']].to_numpy() 
    Y = pd.DataFrame(trainval['labels'])
    Y = pd.DataFrame(Y['labels'].to_list())
    Y = Y.to_numpy() 
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True,random_state=0)
    
    ## Split the train&val into multiple train and val sets.
    for train_index, val_index in mskf.split(X, Y):
        x_train, x_val = X[train_index], X[val_index] 
        y_train, y_val = Y[train_index], Y[val_index]
        train = pd.DataFrame(x_train,columns=['text']) 
        val = pd.DataFrame(x_val,columns=['text'])
        train['labels'] = y_train.tolist()
        val['labels'] = y_val.tolist()
        
        print("Trial nr:",trial.number,"Fold number:",counter,"\nTRAIN row numbers:", train_index, "\nVAL row numbers:", val_index)
        
        print(f"nr of sentences in train: {len(x_train)}, or {len(x_train)/(len(x_train)+len(x_val)):.2f}%\n" \
              f"nr of sentences in val: {len(x_val)}, or {len(x_val)/(len(x_train)+len(x_val)):.2f}% \n")
        
        ## to optimize
        lr = trial.suggest_float("lr", 1e-5, 1e-4)
#         th = trial.suggest_float("th", 0.4, 0.7)
        ep = trial.suggest_int("ep", 1, 2)#20,35
        model_args_bert = { "do_lower_case": True, # for uncased models
               "fp16": True,#speeds up, but risk under/overflow
               "learning_rate": lr, # candidate for optimalisation
               "manual_seed": 2,
               "max_seq_length": 300, #Chosen such that most samples are not truncated. Increasing the sequence length significantly affects the memory consumption of the model, so it s usually best to keep it as short as possible (ideally without truncating the input sequences).
               "num_train_epochs": ep, # option for optimalisation
              #"optimizer": "Adafactor", # option for optimalisation
               "output_dir": model_save_path + '/Optuna',
               "overwrite_output_dir": True,
               "reprocess_input_data" : True, #default true, input data will be reprocessed even if a cached file of the input data exists.
               "save_eval_checkpoints":False,
               "save_model_every_epoch":False,
               "save_optimizer_and_scheduler":False,
               "save_steps": -1,
               "silent":False,
              #"scheduler": "linear_schedule_with_warmup",  # option for optimalisation
              #"sliding_window": True # not supported, but advised? # option for optimalisation
               "train_batch_size": 16,  
               "use_multiprocessing": True, #speeds up,may be unstable, has some issues reported with t5
               "wandb_project": None,#"pubmed_wandnm",
                "wandb_kwargs": {"mode":"disabled"},
               "threshold":0.6}
        model = MultiLabelClassificationModel('bert', ## "bert" or "t5"
                                              "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", ## "modelname from huggingface"
                                              args=model_args_bert,
                                              use_cuda=False,#True,
                                              num_labels=90)  
        model.train_model(train)
        
        ## y_val: the true labels in numpy array form
        y_val = val["labels"]
        y_val = pd.DataFrame(y_val) 
        y_val = pd.DataFrame(y_val['labels'].to_list())
        y_val = y_val.to_numpy()
        
        ## the predicted labels in numpy array form
        sentences = val["text"].values
        sentences = [str(i) for i in list(sentences)]
        predictions, raw_outputs = model.predict(list(sentences))
        y_val_predicted_labels_pubmedbert = np.array(predictions)
         
        all_metrics = classification_report(y_val, y_val_predicted_labels_pubmedbert,
                                            target_names=attributes,digits=3,output_dict=True)   
        all_metrics_df = pd.DataFrame(all_metrics).transpose()
        all_metrics_df['Trial'] = trial.number
        all_metrics_df['Fold'] = counter
        dataframe_list.append(all_metrics_df)

        counter += 1

    performance_all_folds = pd.concat(dataframe_list)
    performance_all_folds = performance_all_folds.reset_index()
    performance_all_folds = performance_all_folds.rename(columns={"index": "Attribute"})
    average_performance = performance_all_folds.drop('Fold',axis=1)
    average_performance = average_performance.groupby('Attribute').mean()
    display(performance_all_folds)
    micro_f1_score = average_performance.loc['micro avg']['f1-score'] 
    
    ## add to csv file containing all trials for all folds
    with open('{}/{}.csv'.format(model_save_path,study_name),'a') as f:
        performance_all_folds.to_csv(f, header = False,index=False)
    print(micro_f1_score)
    print('-----------------------')
    
    
    return micro_f1_score



#### Set up Optuna

In [42]:
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study = optuna.create_study(study_name=study_name,direction='maximize',load_if_exists=True, storage=storage_name)
f = model_save_path + '/' + study_name + '.csv'
file_exists = os.path.isfile(f)
if not file_exists:
    print('creating csv file {} to save model performance'.format(f))
    with open(f, 'w') as f:
        writer = csv.writer(f,delimiter=',', lineterminator='\n')
        writer.writerow(['Attribute','Precision','Recall','F1','Support','Trial','Fold'])
        f.close()
        
study = optuna.load_study(study_name=study_name, storage=storage_name)#,"maximize"])

[32m[I 2022-10-27 12:47:14,478][0m A new study created in RDB with name: test[0m


A new study created in RDB with name: test
A new study created in RDB with name: test
A new study created in RDB with name: test
A new study created in RDB with name: test
creating csv file /home/jupyter-n.mekkes@gmail.com-f6d87/ext_n_mekkes_gmail_com/clinical_history/nlp_models/PubMedBERT/test/test.csv to save model performance


#### Optimize Optuna
IMPORTANT: if you have already finished an optuna trial and are pleased with the results, and do not want additional trials, do not run this block. <br>
Typically I run for 30 Trials, we do not see improvement after 30 trials

In [43]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    study.optimize(objective, n_trials=2,n_jobs=2)

## additional, store using joblib:
savename = '{}/{}.pkl'.format(model_save_path,study_name)
joblib.dump(study, savename)

print('finished optimization')

Trial nr: 0 Fold number: 1 
TRAIN row numbers: [    0     2     3 ... 15127 15128 15131] 
VAL row numbers: [    1    12    20 ... 15129 15130 15132]
nr of sentences in train: 12106, or 0.80%
nr of sentences in val: 3027, or 0.20% 

Trial nr: 1 Fold number: 1 
TRAIN row numbers: [    0     2     3 ... 15127 15128 15131] 
VAL row numbers: [    1    12    20 ... 15129 15130 15132]
nr of sentences in train: 12106, or 0.80%
nr of sentences in val: 3027, or 0.20% 



Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForMultiLabelSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

Downloading:   0%|          | 0.00/225k [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForMultiLabelSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

  0%|          | 0/12106 [00:00<?, ?it/s]

  0%|          | 0/12106 [00:00<?, ?it/s]

KeyboardInterrupt: 

#### Quick overview of Optuna results

In [None]:
# Getting the best trial:
print(f"The best trial is : \n{study.best_trial} \n")

# Getting the best score:
print(f"The highest f1 value is : \n{study.best_value}\n")

# Getting the best parameters:
print(f"The best parameters are : \n{study.best_params}\n")


### Figures to illustrate optimization

#### Figure 1: Hyperparameter importance
Which hyperparameter has the most influence on model performance?

In [None]:
pai = plot_param_importances(study)
pai.update_layout(
    title='{} Hyperparameter importance'.format(used_model),
    xaxis_title='Importance for F1-score',
    font=dict(
        family="Arial, monospace",
        size=18,
        color="Black"
    )
)

pai.update_xaxes(range=[0,1])
pai.write_image(save_path_figures + "/{}_{}_parameter_importance.png".format(used_model,study_name))
pai.write_image(save_path_figures + "/{}_{}_parameter_importance.pdf".format(used_model,study_name))
pai.show()


#### Figure 2: optuna visualisation plot optimization
Improvement over time

In [None]:
oph = plot_optimization_history(study,target_name='F1-score')
oph.update_layout(
    title='{} Optimization history'.format(used_model),
    xaxis_title='Number of trials',
    font=dict(
        family="Arial, monospace",
        size=18,
        color="Black"
    )
)

oph.update_yaxes(range=[0,1])
oph.write_image(save_path_figures + "/{}_{}_optimization_history.png".format(used_model,study_name))
oph.write_image(save_path_figures + "/{}_{}_optimization_history.pdf".format(used_model,study_name))
oph.show()

#### Figure 3: optuna visualisation parallel coordinate

In [None]:
pac = plot_parallel_coordinate(study,target_name='F1-score')

pac.update_layout(
    title='{} Parallel Coordinate Plot'.format(used_model),
    yaxis_title="F1-score",
    height=400,
    margin=dict(
        pad=40
    ),
    autosize=False,
    font=dict(
        family="Arial, monospace",
        size=16,
        color="Black"
    )
)

pac.write_image(save_path_figures + "/{}_{}_parallel_coordinate.png".format(used_model,study_name))
pac.write_image(save_path_figures + "/{}_{}_parallel_coordinate.pdf".format(used_model,study_name))
pac.show()

#### Figure 4: All trials; plot improvement over trials (f1 and precision)

In [None]:
best_trial_number,best_trial,f1_precision_long,micro_F1,micro_Precision = analysis_performance(model_save_path,study_name,used_model)



In [None]:
plot_trials(f1_precision_long,
            save_path_figures,
            used_model,
            study_name,
            best_trial_number,
            metric='F1',
            pal='Blues')

In [None]:
plot_trials(f1_precision_long,
            save_path_figures,
            used_model,
            study_name,
            best_trial_number,
            metric='Precision',
            pal='Oranges')

## Attribute scatter
We have selected the best trial. How do the different attributes perform under this trial? Some might be bad. We plot the precision against the F1 score for all attributes

In [None]:
attribute_grouping = pd.read_excel(path_to_attribute_grouping, engine='openpyxl', index_col=[0], sheet_name='90 parameters')
display(attribute_grouping)



In [None]:
correct_names = {}
for attr, real_name in zip(attribute_grouping.index, attribute_grouping["Attribute"]):
    if not isinstance(real_name, float):
        correct_names[real_name] = attr

In [None]:
best_trial = best_trial.rename(index=correct_names)

We select some attributes that we want to plot with their text label

In [None]:
bad_attributes = ['Unspecified_disturbed_gait_patt',
'Loss_of_sympathy_empathy',
'Fasciculations',
# 'Psychiatric_admissions',
'Changed_moods_emotions',
'Bradyphrenia',
'Head_turning_sign',
# 'Communication_problems',
# 'Decreased_motor_skills',
'Language_impairment',
# 'Positive_sensory_symptoms',
'Facade_behavior',
# 'Impaired_comprehension',
'Changed_behavior_personality',
'Frontal_release_signs',
'Vivid_dreaming',
'Loss_of_sympathy_empathy'
]

fancy_bad_attributes = [correct_names.get(item,item)  for item in bad_attributes]
print(fancy_bad_attributes)

good_attributes = [
# 'Parkinsonism',
'Memory_impairment',
'Mobility_problems',
'Fatigue',
'Depressed_mood'
# 'Psychosis',
'Agitation',
# 'Fatigue',
]

fancy_good_attributes = [correct_names.get(item,item)  for item in good_attributes]
print(fancy_good_attributes)

#### plot for optuna best trial

In [None]:
best_trial = best_trial.drop(["micro avg", "macro avg", "weighted avg","samples avg"],errors='ignore')
scatter_plot(best_trial, 
             used_model,
             study_name,
             fancy_good_attributes,
             fancy_bad_attributes,
             metrics='Precision_F1',
             printf1=round(micro_F1,5),
             printprec=round(micro_Precision,5),
             trialname=best_trial_number,
             wheretosave=save_path_figures,
             val_or_test='Validation')

### Retrain on train&val data with best parameters, evaluate on hold-out test data

We train on the combined training and validation data. we test on the kept apart testing data.

In [None]:
with open(path_to_test_pkl,"rb") as file:
    test = pickle.load(file)

# test["labels"] = [eval(row["labels"]) for index, row in test.iterrows()]

In [None]:
model_args_PubMedBERT = { "do_lower_case": True, # for uncased models
       "fp16": True,#speeds up, but risk under/overflow
       "learning_rate": study.trials[best_trial_number].params['lr'], # candidate for optimalisation
       "manual_seed": 2,
       "max_seq_length": 300, #Chosen such that most samples are not truncated. Increasing the sequence length significantly affects the memory consumption of the model, so it s usually best to keep it as short as possible (ideally without truncating the input sequences).
       "num_train_epochs": study.trials[best_trial_number].params['ep'], # option for optimalisation
      #"optimizer": "Adafactor", # option for optimalisation
       "output_dir": path_final,
       "overwrite_output_dir": True,
       "reprocess_input_data" : True, #default true, input data will be reprocessed even if a cached file of the input data exists.
       "save_eval_checkpoints":False,
       "save_model_every_epoch":False,
       "save_optimizer_and_scheduler":False,
       "save_steps": -1,
       "silent":False,
      #"scheduler": "linear_schedule_with_warmup",  # option for optimalisation
      #"sliding_window": True # not supported, but advised? # option for optimalisation
       "train_batch_size": 16,  
       "use_multiprocessing": True, #speeds up,may be unstable, has some issues reported with t5
       "wandb_project": None,
        "wandb_kwargs": {"mode":"disabled"},
       "threshold":0.6
 }

## when training for the first time
model = MultiLabelClassificationModel('bert', ## "bert" or "t5"
                                      "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", ## "modelname from huggingface"
                                      args=model_args_PubMedBERT,
                                      use_cuda=True,
                                      num_labels=90)

model.train_model(trainval)

# ## when loading already trained model
# model = MultiLabelClassificationModel('bert', ## "bert" or "t5"
#                                       path_final, ## "modelname from huggingface"
#                                       args=model_args_PubMedBERT,
#                                       use_cuda=True,#True,
#                                       num_labels=90) 

#### Test performance metrics

In [None]:
# Select the truths
y_test = test['labels']
y_test = pd.DataFrame(y_test) 
y_test = pd.DataFrame(y_test['labels'].to_list())
y_test = y_test.to_numpy()


# Create a list with the text to predict (parsed sentences)
sentences = test['text'].values#val["text"].values
sentences = [str(i) for i in list(sentences)]
predictions, raw_outputs = model.predict(list(sentences))
predictions = np.array(predictions)

test_report = classification_report(y_test, predictions, target_names=attributes,
                                       digits=3,output_dict=True)
test_report_df = pd.DataFrame(test_report).transpose()
test_report_df.columns=['Precision','Recall','F1','Support']
F1 = test_report_df.loc['micro avg']['Precision'] #!4
Precision = test_report_df.loc['micro avg']['F1']
print('F1-score micro: ', F1)
print('Precision micro: ', Precision)

display(test_report_df)

## save the performance
test_report_df.to_csv(model_save_path +'/{}_{}_test_performance.csv'.format(used_model,study_name))

In [None]:
predictions

In [None]:
## Save the sentences with their truth label and prediction label, for investigation
truths = pd.DataFrame(y_test,columns=attributes).add_suffix('_Truth')
predictions_df = pd.DataFrame(predictions,columns=attributes).add_suffix('_Prediction')
sentences = pd.concat([truths,predictions_df], axis=1)
sentences = sentences.reindex(sorted(sentences.columns), axis=1)
sentences.insert(loc=0, column='Sentence', value=test['text'])

display(sentences)
# sentences.to_csv(model_save_path +'/{}_{}_test_sentences.csv'.format(used_model,study_name),index=False)

In [None]:
test_report_df['pass_fail'] = np.where(((test_report_df.F1 >= 0.8) | (test_report_df.Precision >= 0.8) ),'pass', 'fail')
test_report_df = test_report_df.drop(["micro avg", "macro avg", "weighted avg","samples avg"],errors='ignore')
display(test_report_df)
scatter_plot(test_report_df, 
             used_model,
             study_name,
             fancy_good_attributes,
             fancy_bad_attributes,
             metrics='Precision_F1',
             printf1=round(F1,5),
             printprec=round(Precision,5),
             trialname='',
             wheretosave=save_path_figures,
             val_or_test='Test')