In [2]:
import sys
import os
from pathlib import Path
import numpy as np
import pandas as pd
import datasets
import evaluate
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification
import torch

COLAB = 'google.colab' in sys.modules
if COLAB:
    !nvidia-smi
COLAB

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512,garbage_collection_threshold:0.9"

ImportError: cannot import name 'Dataset' from 'datasets' (c:\Users\Gbadamosi\Documents\Nerd Corner\Master in ds and AI\MSC project\workspace\workspace\ner\notebooks\tests codes\datasets.py)

In [211]:

tru_label = ['O',
              'B-cond', 
              'I-cond',
              'B-des',
              'I-des',
              'B-subj',
              'I-subj',
              "B-group_A", 
              "I-group_A", 
              "B-group_B", 
              "I-group_B", 
              "B-group_C",
              "I-group_C",
              'B-group_D',
              'I-group_D', 
              'B-proc',
              'I-proc', 
              'B-N_A',
              'I-N_A',
              'B-N_B',
              'I-N_B',
              'B-ther',
              'I-ther']

id2label = {i:label for i, label in enumerate(tru_label) }
label2id = {label:i for i, label in enumerate(tru_label) }

#### Reading the datasest

In [8]:
if COLAB:
    DATA_PATH = Path("/content/drive/MyDrive/ner/data/swt")# path for processed data from data preprocessing.ipynb
    MODELS_BASE_PATH = Path("/content/drive/MyDrive/ner/models")# path to save model
else:
    folder_path = Path("C:/Users/Gbadamosi/Documents/Nerd Corner/Master in ds and AI/MSC project/workspace/workspace/ner/ner/data/swt/processed")

In [4]:
import json

def read_file(folder_path, doc): 
    file_path = doc + '.json'
    file_name = os.path.join(folder_path, file_path )

    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)

In [10]:
file_names = ["base_abstract_with_punc_base_2023_12_03", "base_abstract_meth_with_punc_2023_12_03", "base_50_abstract_meth_with_punc_2023_12_03", "base_512_abstract_meth_with_punc_2023_12_03"]
abstract= read_file(DATA_PATH, file_names[0])
abstract_methods = read_file(DATA_PATH, file_names[1])
splitted_50_abstract_methods_context = read_file(DATA_PATH, file_names[2])
splitted_512_records_method_context =read_file(DATA_PATH, file_names[3])


In [55]:
abstract_methods[0]

{'pmid': '16960863',
 'paper': 'Burn2006_PMID16960863_1.pdf',
 'text': "Effects of rivastigmine in patients with and without visual hallucinations in dementia associated with Parkinson’s disease.  We aimed to determine prospectively whether rivastigmine, an inhibitor of acetylcholinesterase and butyrylcholinesterase, provided benefits in patients with and without visual hallucinations in a population with dementia associated with Parkinson's disease (PDD). This was a 24-week double-blind placebo-controlled study. Primary efficacy measures were the Alzheimer's Disease Assessment Scale cognitive subscale (ADAS-cog) and Alzheimer's Disease Cooperative Study-Clinician's Global Impression of Change (ADCS-CGIC). Secondary efficacy measures included activities of daily living, behavioral symptoms, and executive and attentional functions. Patients were stratified according to the presence of visual hallucinations at baseline. The study included 188 visual hallucinators (118 on rivastigmine, 70

#### FUNCTIONS

In [208]:


def tokenize_align_labels(example, label_all_tokens = True):
    # tokenized_input = tokenizer(example['tokens'], truncation = True, is_split_into_words=True, max_length = 450)
    tokenized_input = tokenizer(example['tokens'], truncation = True, is_split_into_words=True)

    labels = []
    for i , label in enumerate(example['ner_tags']):
        word_ids = tokenized_input.word_ids(batch_index=i)
        previous_word_idx = None

        label_ids = []
        for word_idx in word_ids:
            if word_idx is None: 
                label_ids.append(-100)
            elif word_ids!= previous_word_idx:
                label_ids.append(label[word_idx])
            else: 
                label_ids.append(label[word_idx] if label_all_tokens else -100) 
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_input['labels'] = labels
    return tokenized_input



#### hyperparameters

In [36]:
model_checkpoint = "kamalkraj/BioELECTRA-PICO"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length = 512)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, ignore_mismatched_sizes=True, id2label = id2label, label2id = label2id)

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at kamalkraj/BioELECTRA-PICO and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([15, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([15]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = evaluate.load('seqeval')

In [18]:
from typing import Tuple
from sklearn.metrics import precision_score, recall_score
from functools import partial 

from preprocessing import *

def token_f1_cond(true, pred, labels):
    class_scores = zip(labels, precision_score(true, pred, labels=labels, average=None, zero_division=True),
                       recall_score(true, pred, labels=labels, average=None))
    result = {label: {"f1": get_f1(prec, rec), "p": prec, "r": rec} for label, prec, rec in class_scores}
    return result
def get_f1(prec, rec):
    return 2 * prec * rec / (prec + rec)

def compute_metrics_by_token_swt(p: Tuple[list, list], label_list):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [label_list[p] for prediction, label in zip(predictions, labels) for (p, l) in
                        zip(prediction, label) if l != -100]
    true_labels = [label_list[l] for prediction, label in zip(predictions, labels) for (p, l) in zip(prediction, label)
                   if l != -100]

    metrics_report = token_f1_cond(true_labels, true_predictions, label_list)
    metrics_report_f1 = {"f1_" + k: v["f1"] for k, v in metrics_report.items() if k != "O"}
    return metrics_report_f1
compute_metrics_by_token_swt = partial(compute_metrics_by_token_swt, label_list=tru_label) 


In [19]:
from transformers import TrainingArguments, Trainer

model_max_length = 512

if COLAB:
    batch_size = 16 # for p-100 16 is ok. For T4: 12
    if model_max_length <= 384:
      # at least 15 GB gpu
        batch_size = 16
    gradient_accumulation_steps = 1
    num_train_epochs = 8
else:
    batch_size = 4
    if model_max_length==256:
        batch_size = 9

    gradient_accumulation_steps = 4
    num_train_epochs = 5

learning_rate = 5e-5
weight_decay = 0.003 

print(f"bs: {batch_size}, model_max_length: {model_max_length}, gradient_acc_steps: {gradient_accumulation_steps}, \
n_epochs: {num_train_epochs}, lr: {learning_rate}")

bs: 4, model_max_length: 512, gradient_acc_steps: 4, n_epochs: 5, lr: 5e-05


### ABSTRACT ONLY

#### Creating Huggingface Dataset

In [85]:

hf_ds = create_datasets(abstract, tru_label)

Casting the dataset: 100%|██████████| 1520/1520 [00:00<00:00, 25838.72 examples/s]


In [28]:
hf_ds.features

{'pmid': Value(dtype='string', id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-cond', 'I-cond', 'B-des', 'I-des', 'B-subj', 'I-subj', 'B-group_A', 'I-group_A', 'B-group_B', 'I-group_B', 'B-group_C', 'I-group_C', 'B-group_D', 'I-group_D'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [29]:
hf_ds[0]


{'pmid': '16960863',
 'ner_tags': [0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  3,
  4,
  4,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  11,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  9,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  11,
  0,


In [30]:
part_train_test = train_test_split(hf_ds, train_test_size=0.2, validation_size=0.5) 


In [31]:
part_train_test

DatasetDict({
    train: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 1394
    })
    test: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 78
    })
    valid: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 77
    })
})

In [32]:
tags = part_train_test['train'].features['ner_tags'].feature.names
tags

['O',
 'B-cond',
 'I-cond',
 'B-des',
 'I-des',
 'B-subj',
 'I-subj',
 'B-group_A',
 'I-group_A',
 'B-group_B',
 'I-group_B',
 'B-group_C',
 'I-group_C',
 'B-group_D',
 'I-group_D']

#### Tokenizer Example and Testing

In [33]:
example_text = part_train_test['train'][0]
example_text['tokens']

['efficacy',
 'of',
 'metformin',
 'versus',
 'insulin',
 'in',
 'the',
 'management',
 'of',
 'pregnancy',
 'with',
 'diabetes',
 'objective',
 'to',
 'compare',
 'the',
 'efficacy',
 'of',
 'metformin',
 'with',
 'insulin',
 'in',
 'the',
 'management',
 'of',
 'pregnancy',
 'with',
 'diabetes',
 'study',
 'design',
 'randomized',
 'clinical',
 'trial',
 'place',
 'and',
 'duration',
 'of',
 'study',
 'department',
 'of',
 'obstetrics',
 'and',
 'gynaecology',
 'maternal',
 'and',
 'child',
 'health',
 'centre',
 'pakistan',
 'institute',
 'of',
 'medical',
 'sciences',
 'islamabad',
 'from',
 'may',
 '2010',
 'to',
 'january',
 '2011',
 'methodology',
 'a',
 'total',
 'of',
 '68',
 'pregnant',
 'patients',
 'with',
 'diabetes',
 'were',
 'included',
 'in',
 'this',
 'study',
 'patients',
 'were',
 'randomly',
 'divided',
 'in',
 'to',
 'two',
 'groups',
 'of',
 'each',
 '34',
 'patients',
 'based',
 'on',
 'table',
 'of',
 'random',
 'numbers',
 'one',
 'was',
 'labelled',
 'as',
 '

In [34]:

tokenized_input = tokenizer(example_text['tokens'], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])

words_ids = tokenized_input.word_ids()
words_ids

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 53,
 53,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 96,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 102,
 102,
 103,
 103,
 103,
 104,
 105,
 106,
 107,
 107,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 133,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 148,
 149,
 149,
 149,
 150,
 151,
 152,
 152,
 152,
 153,
 154,
 155,
 156,
 156,
 157,
 157,
 157,
 158,
 159,
 159,
 159,
 160,

In [35]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
tokens

['[CLS]',
 'efficacy',
 'of',
 'metformin',
 'versus',
 'insulin',
 'in',
 'the',
 'management',
 'of',
 'pregnancy',
 'with',
 'diabetes',
 'objective',
 'to',
 'compare',
 'the',
 'efficacy',
 'of',
 'metformin',
 'with',
 'insulin',
 'in',
 'the',
 'management',
 'of',
 'pregnancy',
 'with',
 'diabetes',
 'study',
 'design',
 'randomized',
 'clinical',
 'trial',
 'place',
 'and',
 'duration',
 'of',
 'study',
 'department',
 'of',
 'obstetrics',
 'and',
 'gynaec',
 '##ology',
 'maternal',
 'and',
 'child',
 'health',
 'centre',
 'pakistan',
 'institute',
 'of',
 'medical',
 'sciences',
 'isl',
 '##ama',
 '##ba',
 '##d',
 'from',
 'may',
 '2010',
 'to',
 'january',
 '2011',
 'methodology',
 'a',
 'total',
 'of',
 '68',
 'pregnant',
 'patients',
 'with',
 'diabetes',
 'were',
 'included',
 'in',
 'this',
 'study',
 'patients',
 'were',
 'randomly',
 'divided',
 'in',
 'to',
 'two',
 'groups',
 'of',
 'each',
 '34',
 'patients',
 'based',
 'on',
 'table',
 'of',
 'random',
 'numbers',


In [36]:
len(example_text['ner_tags']), len(tokenized_input['input_ids'])

(293, 364)

#### testing tokenizer

In [37]:
q = tokenize_align_labels(part_train_test['train'][0:1])
print(q)

{'input_ids': [[2, 3540, 1685, 12968, 3803, 3433, 1682, 1680, 3190, 1685, 4097, 1715, 3507, 2833, 1701, 4461, 1680, 3540, 1685, 12968, 1715, 3433, 1682, 1680, 3190, 1685, 4097, 1715, 3507, 1901, 2693, 4384, 2121, 4033, 3831, 1690, 3864, 1685, 1901, 6138, 1685, 19921, 1690, 19417, 2317, 4796, 1690, 2355, 2161, 8622, 22195, 9218, 1685, 3045, 13222, 28142, 14707, 7915, 1022, 1814, 2056, 6759, 1701, 6125, 7336, 6604, 42, 2453, 1685, 4980, 6157, 1808, 1715, 3507, 1748, 2990, 1682, 1805, 1901, 1808, 1748, 5585, 5130, 1682, 1701, 2033, 2345, 1685, 2562, 3930, 1808, 2234, 1755, 12685, 1685, 3077, 5352, 2155, 1734, 9802, 1732, 1922, 16, 42, 1690, 2187, 1734, 9802, 1732, 1922, 16, 43, 1922, 16, 42, 3484, 3433, 1690, 1922, 16, 43, 3484, 12968, 1725, 1680, 3190, 1685, 3507, 1890, 1680, 2433, 2293, 1734, 3718, 5283, 16, 23, 4853, 1690, 3718, 3460, 16, 23, 5375, 2281, 1682, 2345, 16, 42, 1690, 43, 2427, 7721, 2368, 8921, 1999, 1970, 20, 2444, 1734, 3843, 1682, 3184, 11, 4701, 26, 8, 12, 1808, 1682, 

In [38]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q['input_ids'][0]), q['labels'][0]):
    print(f'{token:_<40} {label}')


[CLS]___________________________________ -100
efficacy________________________________ 0
of______________________________________ 0
metformin_______________________________ 9
versus__________________________________ 0
insulin_________________________________ 7
in______________________________________ 0
the_____________________________________ 0
management______________________________ 0
of______________________________________ 0
pregnancy_______________________________ 0
with____________________________________ 0
diabetes________________________________ 1
objective_______________________________ 0
to______________________________________ 0
compare_________________________________ 0
the_____________________________________ 0
efficacy________________________________ 0
of______________________________________ 0
metformin_______________________________ 9
with____________________________________ 0
insulin_________________________________ 7
in______________________________________ 0
the_____

#### tokenizing dataset

In [86]:
tokenized_dataset = part_train_test.map(tokenize_align_labels, batched = True, remove_columns=part_train_test['train'].column_names)

Map: 100%|██████████| 1394/1394 [00:05<00:00, 262.67 examples/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 243.58 examples/s]
Map: 100%|██████████| 77/77 [00:00<00:00, 262.47 examples/s]


#### Trainer

In [87]:
output_dir = r'C:\Users\Gbadamosi\Documents\Nerd Corner\Master in ds and AI\MSC project\workspace\workspace\ner\models\BioElectra\thesis\abstract'
run_name = "Bioelectra_Abstract"
from transformers import Trainer
args = TrainingArguments(
    output_dir= output_dir,
    evaluation_strategy='epoch',
    eval_steps=1,
    save_strategy = 'epoch',
    learning_rate = learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_strategy="epoch",
    save_total_limit=2,  
    run_name =run_name,
    load_best_model_at_end=True
)


trainer = Trainer(
    model, 
    args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['valid'],
    data_collator = data_collator,
    tokenizer = tokenizer,
    

    compute_metrics = compute_metrics_by_token_swt
    
)

In [88]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1394
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 78
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})

In [89]:
trainer.train()

model.save_pretrained('SWT_bioelectra_model_15-11-2023')
tokenizer.save_pretrained('SWT_bioelectra_tokenizer_15-11-2023')

 20%|██        | 87/435 [21:19<1:17:44, 13.40s/it]

{'loss': 0.1622, 'learning_rate': 4e-05, 'epoch': 1.0}


  _warn_prf(average, modifier, msg_start, len(result))

 20%|██        | 87/435 [21:33<1:17:44, 13.40s/it]

{'eval_loss': 0.23381659388542175, 'eval_f1_B-cond': 0.47750865051903113, 'eval_f1_I-cond': 0.5030674846625767, 'eval_f1_B-des': 0.6125, 'eval_f1_I-des': 0.7277486910994765, 'eval_f1_B-subj': 0.5771428571428572, 'eval_f1_I-subj': 0.5140845070422535, 'eval_f1_B-group_A': 0.464746772591857, 'eval_f1_I-group_A': 0.2026431718061674, 'eval_f1_B-group_B': 0.373806275579809, 'eval_f1_I-group_B': 0.2097902097902098, 'eval_f1_B-group_C': 0.0, 'eval_f1_I-group_C': 0.0, 'eval_f1_B-group_D': 0.0, 'eval_f1_I-group_D': 0.0, 'eval_runtime': 14.6469, 'eval_samples_per_second': 5.257, 'eval_steps_per_second': 1.365, 'epoch': 1.0}


 40%|████      | 174/435 [42:02<1:04:34, 14.85s/it]

{'loss': 0.1261, 'learning_rate': 3e-05, 'epoch': 1.99}


  _warn_prf(average, modifier, msg_start, len(result))

 40%|████      | 174/435 [42:19<1:04:34, 14.85s/it]

{'eval_loss': 0.2520667612552643, 'eval_f1_B-cond': 0.5101449275362319, 'eval_f1_I-cond': 0.5156537753222836, 'eval_f1_B-des': 0.6338797814207651, 'eval_f1_I-des': 0.7174887892376682, 'eval_f1_B-subj': 0.5867970660146699, 'eval_f1_I-subj': 0.5705128205128205, 'eval_f1_B-group_A': 0.536007292616226, 'eval_f1_I-group_A': 0.4431486880466472, 'eval_f1_B-group_B': 0.38014527845036317, 'eval_f1_I-group_B': 0.2180094786729858, 'eval_f1_B-group_C': 0.0, 'eval_f1_I-group_C': 0.0, 'eval_f1_B-group_D': 0.0, 'eval_f1_I-group_D': 0.0, 'eval_runtime': 16.4972, 'eval_samples_per_second': 4.667, 'eval_steps_per_second': 1.212, 'epoch': 1.99}


 60%|██████    | 261/435 [1:03:09<39:23, 13.58s/it]

{'loss': 0.1034, 'learning_rate': 2e-05, 'epoch': 2.99}


  _warn_prf(average, modifier, msg_start, len(result))
  return 2 * prec * rec / (prec + rec)

 60%|██████    | 261/435 [1:03:30<39:23, 13.58s/it]

{'eval_loss': 0.25815680623054504, 'eval_f1_B-cond': 0.43621399176954734, 'eval_f1_I-cond': 0.4694835680751174, 'eval_f1_B-des': 0.5241379310344828, 'eval_f1_I-des': 0.6542553191489362, 'eval_f1_B-subj': 0.5558912386706948, 'eval_f1_I-subj': 0.5499999999999999, 'eval_f1_B-group_A': 0.46490428441203285, 'eval_f1_I-group_A': 0.42436974789915966, 'eval_f1_B-group_B': 0.38081395348837216, 'eval_f1_I-group_B': 0.2292490118577075, 'eval_f1_B-group_C': 0.13740458015267176, 'eval_f1_I-group_C': nan, 'eval_f1_B-group_D': 0.0, 'eval_f1_I-group_D': 0.0, 'eval_runtime': 20.8217, 'eval_samples_per_second': 3.698, 'eval_steps_per_second': 0.961, 'epoch': 2.99}


 80%|████████  | 349/435 [1:24:33<19:59, 13.95s/it]

{'loss': 0.0867, 'learning_rate': 9.885057471264368e-06, 'epoch': 4.0}


  _warn_prf(average, modifier, msg_start, len(result))
  return 2 * prec * rec / (prec + rec)

 80%|████████  | 349/435 [1:24:46<19:59, 13.95s/it]

{'eval_loss': 0.2720515727996826, 'eval_f1_B-cond': 0.51875, 'eval_f1_I-cond': 0.50561797752809, 'eval_f1_B-des': 0.5925925925925926, 'eval_f1_I-des': 0.6490765171503958, 'eval_f1_B-subj': 0.5272727272727272, 'eval_f1_I-subj': 0.5968253968253967, 'eval_f1_B-group_A': 0.4908424908424908, 'eval_f1_I-group_A': 0.4313725490196078, 'eval_f1_B-group_B': 0.4050632911392405, 'eval_f1_I-group_B': 0.22656249999999997, 'eval_f1_B-group_C': 0.16370106761565836, 'eval_f1_I-group_C': nan, 'eval_f1_B-group_D': 0.0, 'eval_f1_I-group_D': 0.0, 'eval_runtime': 12.3934, 'eval_samples_per_second': 6.213, 'eval_steps_per_second': 1.614, 'epoch': 4.0}


100%|██████████| 435/435 [1:45:42<00:00, 14.41s/it]

{'loss': 0.0811, 'learning_rate': 0.0, 'epoch': 4.99}


  _warn_prf(average, modifier, msg_start, len(result))
  return 2 * prec * rec / (prec + rec)

100%|██████████| 435/435 [1:45:54<00:00, 14.41s/it]

{'eval_loss': 0.2779647409915924, 'eval_f1_B-cond': 0.5177993527508091, 'eval_f1_I-cond': 0.5190839694656489, 'eval_f1_B-des': 0.6309523809523809, 'eval_f1_I-des': 0.6735751295336787, 'eval_f1_B-subj': 0.5714285714285714, 'eval_f1_I-subj': 0.6178343949044586, 'eval_f1_B-group_A': 0.49539594843462237, 'eval_f1_I-group_A': 0.4413145539906103, 'eval_f1_B-group_B': 0.40392706872370265, 'eval_f1_I-group_B': 0.2113207547169811, 'eval_f1_B-group_C': 0.23776223776223776, 'eval_f1_I-group_C': nan, 'eval_f1_B-group_D': 0.0, 'eval_f1_I-group_D': 0.0, 'eval_runtime': 12.452, 'eval_samples_per_second': 6.184, 'eval_steps_per_second': 1.606, 'epoch': 4.99}


100%|██████████| 435/435 [1:46:01<00:00, 14.62s/it]


{'train_runtime': 6361.4068, 'train_samples_per_second': 1.096, 'train_steps_per_second': 0.068, 'train_loss': 0.11193458842135023, 'epoch': 4.99}


('SWT_bioelectra_tokenizer_15-11-2023\\tokenizer_config.json',
 'SWT_bioelectra_tokenizer_15-11-2023\\special_tokens_map.json',
 'SWT_bioelectra_tokenizer_15-11-2023\\vocab.txt',
 'SWT_bioelectra_tokenizer_15-11-2023\\added_tokens.json',
 'SWT_bioelectra_tokenizer_15-11-2023\\tokenizer.json')

#### Testing the model

In [43]:
trainer.evaluate(eval_dataset= tokenized_dataset['test'] )

100%|██████████| 20/20 [00:16<00:00,  1.23it/s]


{'eval_loss': 0.20650240778923035,
 'eval_f1_B-cond': 0.5723905723905723,
 'eval_f1_I-cond': 0.6702127659574468,
 'eval_f1_B-des': 0.5694444444444443,
 'eval_f1_I-des': 0.7025089605734767,
 'eval_f1_B-subj': 0.49382716049382713,
 'eval_f1_I-subj': 0.5943396226415095,
 'eval_f1_B-group_A': 0.5092322643343052,
 'eval_f1_I-group_A': 0.16101694915254236,
 'eval_f1_B-group_B': 0.5364583333333334,
 'eval_f1_I-group_B': 0.4358974358974359,
 'eval_f1_B-group_C': 0.0,
 'eval_f1_I-group_C': 0.0,
 'eval_f1_B-group_D': 0.0,
 'eval_f1_I-group_D': 0.0,
 'eval_runtime': 17.4262,
 'eval_samples_per_second': 4.476,
 'eval_steps_per_second': 1.148,
 'epoch': 4.99}

In [44]:
sentence = "Semaglutide, a glucagon-like peptide-1 receptor agonist, has been shown to reduce the risk of adverse cardiovascular events in patients with diabetes. Whether semaglutide can reduce cardiovascular risk associated with overweight and obesity in the absence of diabetes is unknown"
sentence

'Semaglutide, a glucagon-like peptide-1 receptor agonist, has been shown to reduce the risk of adverse cardiovascular events in patients with diabetes. Whether semaglutide can reduce cardiovascular risk associated with overweight and obesity in the absence of diabetes is unknown'

In [45]:
model_checkpoint = trainer.state.best_model_checkpoint # or save model in disk and load it later
print(f"using checkpoint {model_checkpoint}")

from transformers import pipeline

token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="first")
# token_classifier.tokenizer.model_max_length = model_max_length

using checkpoint C:\Users\Gbadamosi\Documents\Nerd Corner\Master in ds and AI\MSC project\workspace\workspace\ner\models\BioElectra\thesis\abstract\checkpoint-349


In [46]:
res = token_classifier(sentence, aggregation_strategy="first")
print(res)

[{'entity_group': 'group_A', 'score': 0.64658564, 'word': 'semaglutide', 'start': 0, 'end': 11}, {'entity_group': 'group_A', 'score': 0.68428516, 'word': 'semaglutide', 'start': 159, 'end': 170}]


###  ABSTRACT AND METHODS

#### Creating Huggingface Dataset

In [97]:
hf_ds = create_datasets(abstract_methods, tru_label)

Casting the dataset: 100%|██████████| 1542/1542 [00:00<00:00, 24672.00 examples/s]


In [98]:
hf_ds.features

{'pmid': Value(dtype='string', id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-cond', 'I-cond', 'B-des', 'I-des', 'B-subj', 'I-subj', 'B-group_A', 'I-group_A', 'B-group_B', 'I-group_B', 'B-group_C', 'I-group_C', 'B-group_D', 'I-group_D'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [99]:
hf_ds[0]


{'pmid': '16960863',
 'ner_tags': [0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  3,
  4,
  4,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  11,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  9,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  11,
  0,


In [100]:
part_train_test = train_test_split(hf_ds, train_test_size=0.1, validation_size=0.5) 


In [101]:
part_train_test

DatasetDict({
    train: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 1387
    })
    test: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 78
    })
    valid: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 77
    })
})

In [102]:
tags = part_train_test['train'].features['ner_tags'].feature.names
tags

['O',
 'B-cond',
 'I-cond',
 'B-des',
 'I-des',
 'B-subj',
 'I-subj',
 'B-group_A',
 'I-group_A',
 'B-group_B',
 'I-group_B',
 'B-group_C',
 'I-group_C',
 'B-group_D',
 'I-group_D']

#### tokenizing dataset

In [103]:
tokenized_dataset = part_train_test.map(tokenize_align_labels, batched = True, remove_columns=part_train_test['train'].column_names)

Map: 100%|██████████| 1387/1387 [00:18<00:00, 76.40 examples/s]
Map: 100%|██████████| 78/78 [00:01<00:00, 75.81 examples/s]
Map: 100%|██████████| 77/77 [00:00<00:00, 78.70 examples/s]


#### Trainer

In [104]:
output_dir = r'C:\Users\Gbadamosi\Documents\Nerd Corner\Master in ds and AI\MSC project\workspace\workspace\ner\models\BioElectra\thesis\abstract_methods'
run_name = "Bioelectra_Abstract_Method"
from transformers import Trainer
args = TrainingArguments(
    output_dir= output_dir,
    evaluation_strategy='epoch',
    eval_steps=1,
    save_strategy = 'epoch',
    learning_rate = learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_strategy="epoch",
    save_total_limit=2,  
    run_name =run_name,
    load_best_model_at_end=True
)


trainer = Trainer(
    model, 
    args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['valid'],
    data_collator = data_collator,
    tokenizer = tokenizer,
    

    compute_metrics = compute_metrics_by_token_swt
    
)

In [105]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1387
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 78
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})

In [106]:
trainer.train()

model.save_pretrained('Abstract_method_SWT_bioelectra_model_18-11-2023')
tokenizer.save_pretrained('Abstract_method_SWT_bioelectra_tokenizer_18-11-2023')

 15%|█▌        | 66/430 [18:01<1:43:16, 17.02s/it]

KeyboardInterrupt: 

#### Testing the model

In [None]:
trainer.evaluate(eval_dataset= tokenized_dataset['test'])

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 20/20 [00:40<00:00,  2.01s/it]


{'eval_loss': 0.2409309297800064,
 'eval_precision': 0.436607892527288,
 'eval_recall': 0.3123123123123123,
 'eval_f1': 0.36414565826330536,
 'eval_accuracy': 0.9334202591475838,
 'eval_runtime': 42.275,
 'eval_samples_per_second': 1.845,
 'eval_steps_per_second': 0.473,
 'epoch': 2.97}

In [None]:
sentence = "Semaglutide, a glucagon-like peptide-1 receptor agonist, has been shown to reduce the risk of adverse cardiovascular events in patients with diabetes. Whether semaglutide can reduce cardiovascular risk associated with overweight and obesity in the absence of diabetes is unknown"
sentence

"We aimed to determine prospectively whether rivastigmine, an inhibitor of acetylcholinesterase and butyrylcholinesterase, provided benefits in patients with and without visual hallucinations in a population with dementia associated with Parkinson's disease (PDD)"

In [None]:
model_checkpoint = trainer.state.best_model_checkpoint # or save model in disk and load it later
print(f"using checkpoint {model_checkpoint}")

from transformers import pipeline

token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="first")
# token_classifier.tokenizer.model_max_length = model_max_length

using checkpoint C:\Users\Gbadamosi\Documents\Nerd Corner\Master in ds and AI\MSC project\workspace\workspace\ner\models\BioElectra\thesis\checkpoint-258


In [None]:
res = token_classifier(sentence, aggregation_strategy="first")
print(res)

[]


###  ABSTRACT AND METHODS SPLITTED INTO CHUNKS OF 50

#### Creating Huggingface Dataset

In [None]:
hf_ds = create_datasets(splitted_50_abstract_methods_context, tru_label)

Casting the dataset: 100%|██████████| 13880/13880 [00:00<00:00, 91142.69 examples/s]


In [None]:
hf_ds.features

{'pmid': Value(dtype='string', id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-cond', 'I-cond', 'B-des', 'I-des', 'B-subj', 'I-subj', 'B-group_A', 'I-group_A', 'B-group_B', 'I-group_B', 'B-group_C', 'I-group_C', 'B-group_D', 'I-group_D'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [None]:
hf_ds[0]


{'pmid': '16960863',
 'ner_tags': [0,
  0,
  7,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  0,
  1,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  2,
  0,
  0],
 'tokens': ['effects',
  'of',
  'rivastigmine',
  'in',
  'patients',
  'with',
  'and',
  'without',
  'visual',
  'hallucinations',
  'in',
  'dementia',
  'associated',
  'with',
  'parkinsons',
  'disease',
  'we',
  'aimed',
  'to',
  'determine',
  'prospectively',
  'whether',
  'rivastigmine',
  'an',
  'inhibitor',
  'of',
  'acetylcholinesterase',
  'and',
  'butyrylcholinesterase',
  'provided',
  'benefits',
  'in',
  'patients',
  'with',
  'and',
  'without',
  'visual',
  'hallucinations',
  'in',
  'a',
  'population',
  'with',
  'dementia',
  'associated',
  'with',
  'parkinsons',
  'disease',
  'this',
  'was']}

In [None]:
part_train_test = train_test_split(hf_ds, train_test_size=0.1, validation_size=0.5) 


In [None]:
part_train_test

DatasetDict({
    train: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 12492
    })
    test: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 694
    })
    valid: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 694
    })
})

In [None]:
tags = part_train_test['train'].features['ner_tags'].feature.names
tags

['O',
 'B-cond',
 'I-cond',
 'B-des',
 'I-des',
 'B-subj',
 'I-subj',
 'B-group_A',
 'I-group_A',
 'B-group_B',
 'I-group_B',
 'B-group_C',
 'I-group_C',
 'B-group_D',
 'I-group_D']

#### tokenizing dataset

In [None]:
tokenized_dataset = part_train_test.map(tokenize_align_labels, batched = True, remove_columns=part_train_test['train'].column_names)

Map: 100%|██████████| 12492/12492 [00:09<00:00, 1333.02 examples/s]
Map: 100%|██████████| 694/694 [00:04<00:00, 158.05 examples/s]
Map: 100%|██████████| 694/694 [00:00<00:00, 1444.20 examples/s]


#### Trainer

In [None]:
output_dir = r'C:\Users\Gbadamosi\Documents\Nerd Corner\Master in ds and AI\MSC project\workspace\workspace\ner\models\BioElectra\thesis\fifty_abstract_methods'
run_name = 'bio_electra_50_method_abstract'
from transformers import Trainer
args = TrainingArguments(
    output_dir= output_dir,
    evaluation_strategy='epoch',
    eval_steps=1,
    save_strategy = 'epoch',
    learning_rate = learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_strategy="epoch",
    save_total_limit=2,  
    run_name =run_name,
    load_best_model_at_end=True
)


trainer = Trainer(
    model, 
    args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['valid'],
    data_collator = data_collator,
    tokenizer = tokenizer,
    

    compute_metrics = compute_metrics_by_token_swt)


In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 12492
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 694
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 694
    })
})

In [None]:
trainer.train()

model.save_pretrained('Abstract_method_50_SWT_bioelectra_model_15-11-2023')
tokenizer.save_pretrained('Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023')

  0%|          | 2/2340 [00:23<7:30:32, 11.56s/it]

[A                                               

{'loss': 0.1714, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
                                                  

{'eval_loss': 0.19292329251766205, 'eval_precision': 0.4903912148249828, 'eval_recall': 0.6125160737248179, 'eval_f1': 0.5446922050695636, 'eval_accuracy': 0.9295361966697635, 'eval_runtime': 32.4974, 'eval_samples_per_second': 21.356, 'eval_steps_per_second': 5.354, 'epoch': 1.0}



[A                                                

{'loss': 0.1422, 'learning_rate': 6.6581196581196584e-06, 'epoch': 2.0}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
                                                   

{'eval_loss': 0.18915516138076782, 'eval_precision': 0.5213554535827745, 'eval_recall': 0.6330904414916416, 'eval_f1': 0.5718157181571816, 'eval_accuracy': 0.9328235546344601, 'eval_runtime': 33.1299, 'eval_samples_per_second': 20.948, 'eval_steps_per_second': 5.252, 'epoch': 2.0}



[A                                                  

{'loss': 0.1225, 'learning_rate': 0.0, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
                                                     

{'eval_loss': 0.19120648503303528, 'eval_precision': 0.5202821869488536, 'eval_recall': 0.632233176168024, 'eval_f1': 0.5708204334365325, 'eval_accuracy': 0.933395269063103, 'eval_runtime': 28.4008, 'eval_samples_per_second': 24.436, 'eval_steps_per_second': 6.127, 'epoch': 3.0}



100%|██████████| 2340/2340 [1:24:20<00:00,  2.16s/it]


{'train_runtime': 5060.7878, 'train_samples_per_second': 7.405, 'train_steps_per_second': 0.462, 'train_loss': 0.14538126562395667, 'epoch': 3.0}


('Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023\\tokenizer_config.json',
 'Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023\\special_tokens_map.json',
 'Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023\\vocab.txt',
 'Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023\\added_tokens.json',
 'Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023\\tokenizer.json')

#### Testing the model

In [None]:
trainer.evaluate(eval_dataset= tokenized_dataset['test'])

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 174/174 [00:28<00:00,  6.12it/s]


{'eval_loss': 0.17110401391983032,
 'eval_precision': 0.5381932333449599,
 'eval_recall': 0.6560374149659864,
 'eval_f1': 0.5913010155202146,
 'eval_accuracy': 0.937537081424876,
 'eval_runtime': 28.6072,
 'eval_samples_per_second': 24.26,
 'eval_steps_per_second': 6.082,
 'epoch': 3.0}

In [None]:
sentence = "Semaglutide, a glucagon-like peptide-1 receptor agonist, has been shown to reduce the risk of adverse cardiovascular events in patients with diabetes. Whether semaglutide can reduce cardiovascular risk associated with overweight and obesity in the absence of diabetes is unknown"
sentence

"We aimed to determine prospectively whether rivastigmine, an inhibitor of acetylcholinesterase and butyrylcholinesterase, provided benefits in patients with and without visual hallucinations in a population with dementia associated with Parkinson's disease (PDD)"

In [None]:
model_checkpoint = trainer.state.best_model_checkpoint # or save model in disk and load it later
print(f"using checkpoint {model_checkpoint}")

from transformers import pipeline

token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="first")
# token_classifier.tokenizer.model_max_length = model_max_length

using checkpoint C:\Users\Gbadamosi\Documents\Nerd Corner\Master in ds and AI\MSC project\workspace\workspace\ner\models\BioElectra\thesis\fifty_abstract_methods\checkpoint-1561


In [None]:
res = token_classifier(sentence, aggregation_strategy="first")
print(res)

[{'entity_group': 'group_A', 'score': 0.8458064, 'word': 'rivastigmine', 'start': 44, 'end': 56}, {'entity_group': 'subj', 'score': 0.8219936, 'word': 'patients', 'start': 143, 'end': 151}, {'entity_group': 'cond', 'score': 0.524707, 'word': 'visual hallucinations', 'start': 169, 'end': 190}, {'entity_group': 'cond', 'score': 0.70852387, 'word': "dementia associated with parkinson ' s disease ( pdd )", 'start': 212, 'end': 262}]


###  ABSTRACT AND METHODS SPLITTED INTO CHUNKS OF 500

#### Creating Huggingface Dataset

In [None]:
hf_ds = create_datasets(splitted_512_records_method_context, tru_label)

Casting the dataset: 100%|██████████| 13880/13880 [00:00<00:00, 91142.69 examples/s]


In [None]:
hf_ds.features

{'pmid': Value(dtype='string', id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-cond', 'I-cond', 'B-des', 'I-des', 'B-subj', 'I-subj', 'B-group_A', 'I-group_A', 'B-group_B', 'I-group_B', 'B-group_C', 'I-group_C', 'B-group_D', 'I-group_D'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [None]:
hf_ds[0]


{'pmid': '16960863',
 'ner_tags': [0,
  0,
  7,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  0,
  1,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  2,
  0,
  0],
 'tokens': ['effects',
  'of',
  'rivastigmine',
  'in',
  'patients',
  'with',
  'and',
  'without',
  'visual',
  'hallucinations',
  'in',
  'dementia',
  'associated',
  'with',
  'parkinsons',
  'disease',
  'we',
  'aimed',
  'to',
  'determine',
  'prospectively',
  'whether',
  'rivastigmine',
  'an',
  'inhibitor',
  'of',
  'acetylcholinesterase',
  'and',
  'butyrylcholinesterase',
  'provided',
  'benefits',
  'in',
  'patients',
  'with',
  'and',
  'without',
  'visual',
  'hallucinations',
  'in',
  'a',
  'population',
  'with',
  'dementia',
  'associated',
  'with',
  'parkinsons',
  'disease',
  'this',
  'was']}

In [None]:
part_train_test = train_test_split(hf_ds, train_test_size=0.1, validation_size=0.5) 


In [None]:
part_train_test

DatasetDict({
    train: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 12492
    })
    test: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 694
    })
    valid: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 694
    })
})

In [None]:
tags = part_train_test['train'].features['ner_tags'].feature.names
tags

['O',
 'B-cond',
 'I-cond',
 'B-des',
 'I-des',
 'B-subj',
 'I-subj',
 'B-group_A',
 'I-group_A',
 'B-group_B',
 'I-group_B',
 'B-group_C',
 'I-group_C',
 'B-group_D',
 'I-group_D']

#### tokenizing dataset

In [None]:
tokenized_dataset = part_train_test.map(tokenize_align_labels, batched = True, remove_columns=part_train_test['train'].column_names)

Map: 100%|██████████| 12492/12492 [00:09<00:00, 1333.02 examples/s]
Map: 100%|██████████| 694/694 [00:04<00:00, 158.05 examples/s]
Map: 100%|██████████| 694/694 [00:00<00:00, 1444.20 examples/s]


#### Trainer

In [None]:
output_dir = r'C:\Users\Gbadamosi\Documents\Nerd Corner\Master in ds and AI\MSC project\workspace\workspace\ner\models\BioElectra\thesis\five_hundred_abstract_methods'
run_name = 'bio_electra_500_method_abstract'
from transformers import Trainer
args = TrainingArguments(
    output_dir= output_dir,
    evaluation_strategy='epoch',
    eval_steps=1,
    save_strategy = 'epoch',
    learning_rate = learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_strategy="epoch",
    save_total_limit=2,  
    run_name =run_name,
    load_best_model_at_end=True
)


trainer = Trainer(
    model, 
    args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['valid'],
    data_collator = data_collator,
    tokenizer = tokenizer,
    

    compute_metrics = compute_metrics_by_token_swt)


In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 12492
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 694
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 694
    })
})

In [None]:
trainer.train()

model.save_pretrained('Abstract_method_500_SWT_bioelectra_model_18-11-2023')
tokenizer.save_pretrained('Abstract_method_500_SWT_bioelectra_tokenizer_18-11-2023')

  0%|          | 2/2340 [00:23<7:30:32, 11.56s/it]

[A                                               

{'loss': 0.1714, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
                                                  

{'eval_loss': 0.19292329251766205, 'eval_precision': 0.4903912148249828, 'eval_recall': 0.6125160737248179, 'eval_f1': 0.5446922050695636, 'eval_accuracy': 0.9295361966697635, 'eval_runtime': 32.4974, 'eval_samples_per_second': 21.356, 'eval_steps_per_second': 5.354, 'epoch': 1.0}



[A                                                

{'loss': 0.1422, 'learning_rate': 6.6581196581196584e-06, 'epoch': 2.0}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
                                                   

{'eval_loss': 0.18915516138076782, 'eval_precision': 0.5213554535827745, 'eval_recall': 0.6330904414916416, 'eval_f1': 0.5718157181571816, 'eval_accuracy': 0.9328235546344601, 'eval_runtime': 33.1299, 'eval_samples_per_second': 20.948, 'eval_steps_per_second': 5.252, 'epoch': 2.0}



[A                                                  

{'loss': 0.1225, 'learning_rate': 0.0, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
                                                     

{'eval_loss': 0.19120648503303528, 'eval_precision': 0.5202821869488536, 'eval_recall': 0.632233176168024, 'eval_f1': 0.5708204334365325, 'eval_accuracy': 0.933395269063103, 'eval_runtime': 28.4008, 'eval_samples_per_second': 24.436, 'eval_steps_per_second': 6.127, 'epoch': 3.0}



100%|██████████| 2340/2340 [1:24:20<00:00,  2.16s/it]


{'train_runtime': 5060.7878, 'train_samples_per_second': 7.405, 'train_steps_per_second': 0.462, 'train_loss': 0.14538126562395667, 'epoch': 3.0}


('Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023\\tokenizer_config.json',
 'Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023\\special_tokens_map.json',
 'Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023\\vocab.txt',
 'Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023\\added_tokens.json',
 'Abstract_method_50_SWT_bioelectra_tokenizer_15-11-2023\\tokenizer.json')

#### Testing the model

In [None]:
trainer.evaluate(eval_dataset= tokenized_dataset['test'])

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 174/174 [00:28<00:00,  6.12it/s]


{'eval_loss': 0.17110401391983032,
 'eval_precision': 0.5381932333449599,
 'eval_recall': 0.6560374149659864,
 'eval_f1': 0.5913010155202146,
 'eval_accuracy': 0.937537081424876,
 'eval_runtime': 28.6072,
 'eval_samples_per_second': 24.26,
 'eval_steps_per_second': 6.082,
 'epoch': 3.0}

In [None]:
sentence = "Semaglutide, a glucagon-like peptide-1 receptor agonist, has been shown to reduce the risk of adverse cardiovascular events in patients with diabetes. Whether semaglutide can reduce cardiovascular risk associated with overweight and obesity in the absence of diabetes is unknown"
sentence

"We aimed to determine prospectively whether rivastigmine, an inhibitor of acetylcholinesterase and butyrylcholinesterase, provided benefits in patients with and without visual hallucinations in a population with dementia associated with Parkinson's disease (PDD)"

In [None]:
model_checkpoint = trainer.state.best_model_checkpoint # or save model in disk and load it later
print(f"using checkpoint {model_checkpoint}")

from transformers import pipeline

token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="first")
# token_classifier.tokenizer.model_max_length = model_max_length

using checkpoint C:\Users\Gbadamosi\Documents\Nerd Corner\Master in ds and AI\MSC project\workspace\workspace\ner\models\BioElectra\thesis\fifty_abstract_methods\checkpoint-1561


In [None]:
res = token_classifier(sentence, aggregation_strategy="first")
print(res)

[{'entity_group': 'group_A', 'score': 0.8458064, 'word': 'rivastigmine', 'start': 44, 'end': 56}, {'entity_group': 'subj', 'score': 0.8219936, 'word': 'patients', 'start': 143, 'end': 151}, {'entity_group': 'cond', 'score': 0.524707, 'word': 'visual hallucinations', 'start': 169, 'end': 190}, {'entity_group': 'cond', 'score': 0.70852387, 'word': "dementia associated with parkinson ' s disease ( pdd )", 'start': 212, 'end': 262}]
