#### Setup

In [1]:
!pip install datasets evaluate transformers seqeval pandas accelerate numba



In [2]:
import sys
import os
from pathlib import Path
import numpy as np
import pandas as pd
import evaluate
import datasets
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, \
                         TrainerCallback, TrainingArguments, TrainerState, TrainerControl
import torch
import gc
from numba import cuda

COLAB = 'google.colab' in sys.modules
if COLAB:
    torch.cuda.is_available
COLAB

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512,garbage_collection_threshold:0.9"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
if not COLAB:
    from IPython.core.display import display, HTML
    display(HTML("<style>.container { width:80% !important; }</style>"))
    %load_ext autoreload
    %autoreload 2

    sys.path.append("../../")
else:
    if not os.path.exists("ner"):
        from google.colab import drive
        drive.mount("/content/drive")

        !cp -r /content/drive/MyDrive/colab_folder/notebooks .

         # with current datasets package (2.4) create_train_test_split_hf_dataset does not work

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
if COLAB:
    %load_ext tensorboard

In [6]:

tru_label = ['O',
              'B-cond',
              'I-cond',
              'B-des',
              'I-des',
              'B-subj',
              'I-subj',
              "B-group_A",
              "I-group_A",
              "B-group_B",
              "I-group_B",
              "B-group_C",
              "I-group_C",
              'B-group_D',
              'I-group_D']

id2label = {i:label for i, label in enumerate(tru_label) }
label2id = {label:i for i, label in enumerate(tru_label) }

#### Reading the datasest

In [7]:

DATA_PATH = Path("/content/drive/MyDrive/colab_folder/datasets")
MODELS_BASE_PATH = Path("/content/drive/MyDrive/colab_folder/models")


In [8]:
import json
dataset_date = "2023-11-22"

def read_file(DATA_PATH, doc):
    file_path = doc + '.json'
    file_name = os.path.join(DATA_PATH, file_path)

    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)

In [9]:


abstract_methods = read_file(DATA_PATH, "final_labelled_abstract_method_22_11_2023_2.0")


In [10]:
json.dumps(abstract_methods[0])

'{"pmid": "16960863", "ner_tags": [0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 7, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 9, 0, 0, 7, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 9, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

#### FUNCTIONS

In [11]:
label_dict = {'O': 0, 'B-condition': 1, 'I-condition': 2,
              'B-design': 3, 'I-design': 4,
              'B-subjects': 5,'I-subjects': 6,
              "B-group_A": 7, "I-group_A": 8,
              "B-group_B": 9, "I-group_B": 10,
              "B-group_C": 11,"I-group_C": 12,
              'B-group_D': 13,'I-group_D': 14
              }
def tokenize_align_labels(example, label_all_tokens = True):
    # tokenized_input = tokenizer(example['tokens'], truncation = True, is_split_into_words=True, max_length = 512)
    tokenized_input = tokenizer(example['tokens'], truncation = True, is_split_into_words=True)

    labels = []
    for i , label in enumerate(example['ner_tags']):
        word_ids = tokenized_input.word_ids(batch_index=i)
        previous_word_idx = None

        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_ids!= previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_input['labels'] = labels
    return tokenized_input

from collections import Counter

def get_key(val):
    for key, value in label_dict.items():
        if val == value:
            return key
    return "key doesn't exist"




def count_values(lst):
    # Use Counter to count occurrences of each value in the list
    value_counts = Counter(lst)

    # Convert the Counter to a dictionary for a more readable output
    count_dict = dict(value_counts)

    return count_dict

def explore_annotations(dataset):
    all_annotations = []

    for records in dataset:
        temp_tag = records['ner_tags']
        all_annotations.extend(temp_tag)
    print(f'There are {len(all_annotations)} annotations in the dataset')

    return [get_key(x) for x in all_annotations]

def normalize_tag_name(tag):
    if tag in [ 'condition', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'condition_6', 'condition_7']:
        tag= 'condition'
    elif tag in ['design', 'design_1', 'design_2', 'design_3', 'design_4', 'design_5']:
        tag = 'design'
    elif tag in ['subject_1', 'subject_2', 'subjects', 'subjects_1', 'subjects_2', 'subjects_3', 'subjects_4', 'subjects_5', 'subjekts', 'sujects']:
        tag = 'subjects'
    elif tag in [ 'group_A', 'group_A_1', 'group_A_2', 'group_A_3', 'group_A_4']:
        tag = 'group_A'
    elif tag in ['groub_B_1', 'group_B', 'group_B _2', 'group_B_1', 'group_B_2', 'group_B_3', 'group_B_4' ]:
        tag = 'group_B'
    elif tag in [ 'group_C', 'group_C_1', 'group_C_2', 'group_C_3' ]:
        tag = 'group_C'
    elif tag in [ 'group_D', 'group_D_1', 'group_D_2', 'group_D_3']:
        tag = 'group_D'
    return tag
def create_datasets(doc_dict, labels):
    temp = Dataset.from_pandas(pd.DataFrame(doc_dict))
    temp = temp.cast_column("ner_tags", datasets.Sequence(datasets.ClassLabel(names=labels)))
    return temp

def train_test_split(data_set, train_test_size, validation_size):
    from datasets import DatasetDict
    train_testvalid = data_set.train_test_split(test_size=train_test_size)

    # Split the test to half test, half valid
    test_valid = train_testvalid['test'].train_test_split(test_size=validation_size)

    #Gather everything into one dataset dictionary
    train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

    return train_test_valid_dataset



#### hyperparameters

In [12]:
from transformers import TrainingArguments, Trainer

model_max_length = 4096

if COLAB:
    batch_size = 4 # for p-100 16 is ok. For T4: 12
    if model_max_length <= 384:
      # at least 15 GB gpu
        batch_size = 16
    gradient_accumulation_steps = 3
    num_train_epochs = 8
else:
    batch_size = 4
    if model_max_length==256:
        batch_size = 9

    gradient_accumulation_steps = 4
    num_train_epochs = 5

learning_rate = 5e-5
weight_decay = 0.003

print(f"bs: {batch_size}, model_max_length: {model_max_length}, gradient_acc_steps: {gradient_accumulation_steps}, \
n_epochs: {num_train_epochs}, lr: {learning_rate}")

bs: 4, model_max_length: 4096, gradient_acc_steps: 3, n_epochs: 8, lr: 5e-05


In [13]:
model_checkpoint = "yikuan8/Clinical-Longformer"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length = model_max_length, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, ignore_mismatched_sizes=True, id2label = id2label, label2id = label2id)

Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at yikuan8/Clinical-Longformer and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = evaluate.load('seqeval')

In [15]:
from typing import Tuple
from sklearn.metrics import precision_score, recall_score
from functools import partial

def token_f1_cond(true, pred, labels):
    class_scores = zip(labels, precision_score(true, pred, labels=labels, average=None, zero_division=True),
                       recall_score(true, pred, labels=labels, average=None))
    result = {label: {"f1": get_f1(prec, rec), "p": prec, "r": rec} for label, prec, rec in class_scores}
    return result
def get_f1(prec, rec):
    return 2 * prec * rec / (prec + rec)

def compute_metrics_by_token_swt(eval_preds, label_list):
    labels = eval_preds.label_ids
    preds = eval_preds.predictions[0]

    predictions = preds

    true_predictions = [label_list[p] for prediction, label in zip(predictions, labels) for (p, l) in
                        zip(prediction, label) if l != -100]
    true_labels = [label_list[l] for prediction, label in zip(predictions, labels) for (p, l) in zip(prediction, label)
                   if l != -100]


    metrics_report = token_f1_cond(true_labels, true_predictions, label_list)
    metrics_report_f1 = {"f1_" + k: v["f1"] for k, v in metrics_report.items() if k != "O"}
    return metrics_report_f1

compute_metrics_by_token_swt = partial(compute_metrics_by_token_swt, label_list=tru_label)




###  ABSTRACT AND METHODS

In [16]:
abstract_methods[0]

{'pmid': '16960863',
 'ner_tags': [0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  4,
  4,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  9,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  9,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

##### count the annotations

In [17]:
for token, label in zip(abstract_methods[13]['tokens'], abstract_methods[13]['ner_tags']):
    print(f"{token:_<40} {label}")

alpha-dihydroergocryptine_______________ 7
in______________________________________ 0
the_____________________________________ 0
treatment_______________________________ 0
of______________________________________ 0
de______________________________________ 0
novo____________________________________ 0
parkinsonian____________________________ 0
patients________________________________ 5
:_______________________________________ 0
results_________________________________ 0
of______________________________________ 0
a_______________________________________ 0
multicentre_____________________________ 0
,_______________________________________ 0
randomized______________________________ 0
,_______________________________________ 0
double-blind____________________________ 0
,_______________________________________ 0
placebo-controlled______________________ 0
study___________________________________ 0
._______________________________________ 0
introduction____________________________ 0
:__________

In [18]:
tags = explore_annotations(abstract_methods)
import pandas as pd
result = count_values(tags)
temp_df = pd.DataFrame(result.items(), columns=['label_id', 'count'])
temp_df.sort_values(by = ['count'], ascending=False)

There are 2158415 annotations in the dataset


Unnamed: 0,label_id,count
0,O,2102999
1,B-group_A,7941
2,B-subjects,7582
10,I-condition,7058
7,B-group_B,6515
5,I-design,6379
3,I-subjects,4317
6,I-group_A,3799
9,B-condition,3119
11,I-group_B,2982


##### Saving the data as a json

In [19]:
len(abstract_methods)

1549

#### Creating Huggingface Dataset

In [20]:
abstract_methods_hf_ds = create_datasets(abstract_methods, tru_label)

Casting the dataset:   0%|          | 0/1549 [00:00<?, ? examples/s]

In [21]:
abstract_methods_hf_ds.features

{'pmid': Value(dtype='string', id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-cond', 'I-cond', 'B-des', 'I-des', 'B-subj', 'I-subj', 'B-group_A', 'I-group_A', 'B-group_B', 'I-group_B', 'B-group_C', 'I-group_C', 'B-group_D', 'I-group_D'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [22]:
abstract_methods_hf_ds[0]


{'pmid': '16960863',
 'ner_tags': [0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  4,
  4,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  9,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  9,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [23]:
_abstract_methods_part_train_test = train_test_split(abstract_methods_hf_ds, train_test_size=0.2, validation_size=0.5)


In [24]:
_abstract_methods_part_train_test

DatasetDict({
    train: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 1239
    })
    test: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 155
    })
    valid: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 155
    })
})

In [25]:
tags = _abstract_methods_part_train_test['train'].features['ner_tags'].feature.names
tags

['O',
 'B-cond',
 'I-cond',
 'B-des',
 'I-des',
 'B-subj',
 'I-subj',
 'B-group_A',
 'I-group_A',
 'B-group_B',
 'I-group_B',
 'B-group_C',
 'I-group_C',
 'B-group_D',
 'I-group_D']

#### tokenizing dataset

In [26]:
tokenized_dataset = _abstract_methods_part_train_test.map(tokenize_align_labels, batched = True, remove_columns=_abstract_methods_part_train_test['train'].column_names)

Map:   0%|          | 0/1239 [00:00<?, ? examples/s]

Map:   0%|          | 0/155 [00:00<?, ? examples/s]

Map:   0%|          | 0/155 [00:00<?, ? examples/s]

In [27]:
#del tokenizer
#del model
#del args
#del trainer

#gc.collect()
#cuda.current_context().memory_manager.deallocations.clear()
#torch.cuda.empty_cache()


#### Trainer

In [28]:
output_dir = r'/content/drive/MyDrive/colab_folder/models/clinical_longformer/Abstract_Method'
run_name = "clinical_longformer_Abstract_Method"

eval_strategy = 'epoch'
# eval_strategy = 'steps'
from transformers import Trainer
args = TrainingArguments(
    output_dir= output_dir,
    evaluation_strategy=eval_strategy,
    eval_steps=1,
    save_strategy = eval_strategy,
    learning_rate = learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_strategy=eval_strategy,
    save_total_limit=2,
    gradient_checkpointing = True,
    run_name =run_name,
    load_best_model_at_end=True
)

class DefaultFlowCallback(TrainerCallback):
    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        cuda.current_context().memory_manager.deallocations.clear()
        torch.cuda.empty_cache()
        gc.enable()
        gc.collect()

def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits.argmax(dim=2), labels
    # return logits.argmax(axis=2), labels

trainer = Trainer(
    model,
    args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['valid'],
    data_collator = data_collator,
    tokenizer = tokenizer,


    compute_metrics = compute_metrics_by_token_swt,
    callbacks=[DefaultFlowCallback],
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

In [29]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1239
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 155
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 155
    })
})

In [30]:
trainer.train()

#import transformers as tf
#tf.saved_model.save(model, '/content/drive/MyDrive/colab_folder/models/clinical_longformer/Abstract_method_SWT_clinical_longformer_model_18-11-2023' )
model.save_pretrained('/content/drive/MyDrive/colab_folder/models/clinical_longformer/Abstract_method_SWT_clinical_longformer_model_18-11-2023')
tokenizer.save_pretrained('/content/drive/MyDrive/colab_folder/models/clinical_longformer/tokenizer/Abstract_method_SWT_clinical_longformer_tokenizer_18-11-2023')

You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1 B-cond,F1 I-cond,F1 B-des,F1 I-des,F1 B-subj,F1 I-subj,F1 B-group A,F1 I-group A,F1 B-group B,F1 I-group B,F1 B-group C,F1 I-group C,F1 B-group D,F1 I-group D
0,0.2446,0.133806,0.105023,0.149512,0.0,0.569176,0.33796,0.378723,0.136129,0.0,0.208208,0.0,0.0,0.0,0.0,0.0
1,0.1163,0.121841,0.428035,0.445083,0.414698,0.620612,0.228514,0.448895,0.402505,0.0,0.369135,0.0,0.010568,0.0,0.0,0.0
3,0.0818,0.110423,0.4575,0.454545,0.47606,0.692921,0.387807,0.457091,0.474687,0.308004,0.372093,0.230146,0.126866,,0.0,0.0
4,0.0719,0.116784,0.444755,0.473633,0.469565,0.693264,0.442511,0.49754,0.527676,0.291636,0.454638,0.294606,0.334056,,0.0,0.0
6,0.0529,0.128767,0.45645,0.464726,0.473837,0.669039,0.423715,0.496183,0.487296,0.305057,0.449102,0.346639,0.515152,0.228261,0.0,0.0
7,0.0491,0.128734,0.454128,0.469408,0.470588,0.669375,0.450032,0.502439,0.496874,0.305138,0.46732,0.34004,0.52439,0.175824,0.0,0.0


  return 2 * prec * rec / (prec + rec)
  return 2 * prec * rec / (prec + rec)
  return 2 * prec * rec / (prec + rec)


('/content/drive/MyDrive/colab_folder/models/clinical_longformer/tokenizer/Abstract_method_SWT_clinical_longformer_tokenizer_18-11-2023/tokenizer_config.json',
 '/content/drive/MyDrive/colab_folder/models/clinical_longformer/tokenizer/Abstract_method_SWT_clinical_longformer_tokenizer_18-11-2023/special_tokens_map.json',
 '/content/drive/MyDrive/colab_folder/models/clinical_longformer/tokenizer/Abstract_method_SWT_clinical_longformer_tokenizer_18-11-2023/vocab.json',
 '/content/drive/MyDrive/colab_folder/models/clinical_longformer/tokenizer/Abstract_method_SWT_clinical_longformer_tokenizer_18-11-2023/merges.txt',
 '/content/drive/MyDrive/colab_folder/models/clinical_longformer/tokenizer/Abstract_method_SWT_clinical_longformer_tokenizer_18-11-2023/added_tokens.json',
 '/content/drive/MyDrive/colab_folder/models/clinical_longformer/tokenizer/Abstract_method_SWT_clinical_longformer_tokenizer_18-11-2023/tokenizer.json')

#### Testing the model

In [31]:
trainer.evaluate(eval_dataset= tokenized_dataset['test'])

  return 2 * prec * rec / (prec + rec)


{'eval_loss': 0.10777873545885086,
 'eval_f1_B-cond': 0.4101633393829401,
 'eval_f1_I-cond': 0.3610698365527489,
 'eval_f1_B-des': 0.5577264653641208,
 'eval_f1_I-des': 0.6727989487516425,
 'eval_f1_B-subj': 0.36183206106870236,
 'eval_f1_I-subj': 0.548885077186964,
 'eval_f1_B-group_A': 0.5947242206235013,
 'eval_f1_I-group_A': 0.25575447570332477,
 'eval_f1_B-group_B': 0.40985130111524165,
 'eval_f1_I-group_B': 0.13531353135313529,
 'eval_f1_B-group_C': 0.0,
 'eval_f1_I-group_C': nan,
 'eval_f1_B-group_D': 0.0,
 'eval_f1_I-group_D': 0.0,
 'eval_runtime': 14.8531,
 'eval_samples_per_second': 10.436,
 'eval_steps_per_second': 2.626,
 'epoch': 7.97}

In [32]:
sentence = "Semaglutide, a glucagon-like peptide-1 receptor agonist, has been shown to reduce the risk of adverse cardiovascular events in patients with diabetes. Whether semaglutide can reduce cardiovascular risk associated with overweight and obesity in the absence of diabetes is unknown"
sentence

'Semaglutide, a glucagon-like peptide-1 receptor agonist, has been shown to reduce the risk of adverse cardiovascular events in patients with diabetes. Whether semaglutide can reduce cardiovascular risk associated with overweight and obesity in the absence of diabetes is unknown'

In [33]:
model_checkpoint = trainer.state.best_model_checkpoint # or save model in disk and load it later
print(f"using checkpoint {model_checkpoint}")

from transformers import pipeline

token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="first")
# token_classifier.tokenizer.model_max_length = model_max_length

using checkpoint /content/drive/MyDrive/colab_folder/models/clinical_longformer/Abstract_Method/checkpoint-413


In [34]:
res = token_classifier(sentence, aggregation_strategy="first")
print(res)

[{'entity_group': 'subj', 'score': 0.62568545, 'word': ' patients', 'start': 127, 'end': 135}]


