# **BioBERT ITA Fine-Tuning Experiment For <u>Named Entity Recognition</u>**

*Tommaso Buonocore, University of Pavia, 2022*

*Last edited: 16/11/2022*

#Initialization

Short string describing the current run

In [None]:
# Label for this run; it is stored under session_info['experiment_name'] in the session-info cell.
experiment_name = "Species800-NER"

##Imports

In [None]:
%%capture
# If running on colab, install first
!pip3 install datasets transformers seqeval

# Google Colab only
from IPython.display import display, HTML
from google.colab import files

# General
import random
import pandas as pd
import numpy as np
from torch import cuda
import json
import os
from io import StringIO
import time

# HuggingFace Transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback, set_seed
from datasets import load_dataset, load_metric, ClassLabel, Sequence, DatasetDict, Features, Value, Sequence, ClassLabel, Dataset

# Set device to GPU Cuda if available 
device = 'cuda' if cuda.is_available() else 'cpu'

##Session info

In [None]:
# Collect provenance for this run: whatever ipinfo.io reports about the host,
# the GPUs visible to nvidia-smi, the start time and the experiment name.
# The command used to be "curl curl ipinfo.io", which made curl try to fetch a
# host literally called "curl" before the real URL. `-s` silences the progress meter.
session_info = json.loads(os.popen("curl -s ipinfo.io").read())
if device == 'cuda':
  # nvidia-smi prints one CSV row per GPU; header=0 replaces its header line with our column names.
  gpu_csv = os.popen("nvidia-smi --query-gpu=gpu_name,memory.total --format=csv").read()
  gpu_info = pd.read_csv(StringIO(gpu_csv), names=["name", "memory"], header=0)
  session_info['gpus'] = [{'name': row["name"], 'memory': row["memory"]} for _, row in gpu_info.iterrows()]
else:
  session_info['gpus'] = []
session_info['time_start'] = time.strftime("%H:%M:%S", time.localtime())
session_info['experiment_name'] = experiment_name
# Last expression of the cell: the notebook displays the collected dictionary.
session_info

# Data Preprocessing

At this point, we expect to have six files already loaded in the current session (by drag and drop):

*   "/content/*_corpus.txt" : document id and document string, separated by "|"
*   "/content/*_annotations.txt": details about the annotation (e.g., start index, end index, value, tag label, etc.) separated by "\t" (tab)

(* = "train", "test", "dev")



In [None]:
from nltk.tokenize import WordPunctTokenizer

def prepare_dataset(ds_corpus, ds_tags):
  """Build a token-classification Dataset from a corpus table and its annotation table.

  Each document is split with NLTK's WordPunctTokenizer and every token receives a tag:
  "O" by default, "B-<type>" on the first token of an annotated mention and "I-<type>"
  on the remaining tokens of a multi-token mention.

  Args:
    ds_corpus: DataFrame with an "id" column and a "tokens" column holding the raw text
      of each document. It is left unmodified; the function works on a copy.
    ds_tags: DataFrame with (at least) the columns "id", "value" (annotated surface
      string) and "ner_tags" (entity type).

  Returns:
    A datasets.Dataset with features `id` (string), `tokens` (sequence of strings) and
    `ner_tags` (sequence of ClassLabel). Class names are "O", then the "B-" tags, then
    the "I-" tags, with entity types in sorted order so that label ids are the same in
    every session and for every split that contains the same entity types.

  Notes:
    Mentions are located by surface form, not by the start/end offsets: only the first
    occurrence of a mention inside its document is tagged. Mentions that cannot be found
    are counted and reported as "% dropped".
  """
  tokenizer = WordPunctTokenizer()
  ds_corpus = ds_corpus.copy()  # do not write the new columns into the caller's frame
  # Tokenize every document once; the result is used both for matching and as the output column.
  tokenized_docs = [tokenizer.tokenize(text) for text in ds_corpus["tokens"]]

  dropped = 0
  taglist = []
  for doc_id, tokens in zip(ds_corpus["id"], tokenized_docs):
    tags = ["O"] * len(tokens)
    doc_annotations = ds_tags[ds_tags["id"] == doc_id]
    for word, entity in zip(doc_annotations["value"], doc_annotations["ner_tags"]):
      if word in tokens:
        # Single-token mention: tag its first occurrence.
        tags[tokens.index(word)] = "B-" + entity
        continue
      # Probably a multi-word mention: tokenize it and look for the same token sequence.
      word_tokens = tokenizer.tokenize(word)
      span = len(word_tokens)
      # `span` must be non-zero: an empty token list would match the empty slice at
      # position 0 and wrongly tag the first token of the document.
      start = next(
          (k for k in range(len(tokens)) if tokens[k:k + span] == word_tokens),
          None,
      ) if span else None
      if start is None:
        dropped += 1
        continue
      tags[start] = "B-" + entity
      for k in range(start + 1, start + span):
        tags[k] = "I-" + entity
    taglist.append(tags)

  # Guard the percentage against an empty annotation table.
  dropped_pct = round(dropped * 100 / len(ds_tags), 2) if len(ds_tags) else 0.0
  print(f"{dropped_pct}% dropped")

  ds_corpus["ner_tags"] = taglist
  ds_corpus["tokens"] = tokenized_docs

  dataset = Dataset.from_pandas(ds_corpus)

  # sorted(): set iteration order for strings changes between Python sessions, which
  # would silently change the label <-> id mapping from one run to the next.
  entity_types = sorted(set(ds_tags["ner_tags"]))
  class_names = ['O',
                 *['B-' + tag for tag in entity_types],
                 *['I-' + tag for tag in entity_types]]
  dataset_features = Features({'id': Value('string'), 'tokens': Sequence(feature=Value('string')), 'ner_tags': Sequence(feature=ClassLabel(names=class_names))})
  dataset = dataset.map(dataset_features.encode_example, features=dataset_features)
  return dataset

In [None]:
# pandas treats the literal string "NA" as a missing value by default, but here "NA" can be
# a real annotation value (sodium), so it is taken out of the default NA markers.
# NOTE: this edits pandas' own (private) module-level set in place, which is what the
# read_csv calls below appear to rely on; verify if the pandas version changes.
# discard() instead of remove(): re-running the cell must not raise KeyError, and the set
# itself is kept in default_missing rather than the None that remove()/discard() return.
default_missing = pd._libs.parsers.STR_NA_VALUES
default_missing.discard('NA') #sodium (NA)

In [None]:
#colnames = ["id","type","start","end","value","ner_tags"]
colnames = ["id","start","end","value","ner_tags"]

# Load each split once (corpus + annotations) and convert it with prepare_dataset.
# na_values is the default_missing object prepared in the previous cell (the one that
# handles the literal "NA" string).
prepared_splits = {}
for split in ("train", "test", "dev"):
  ds_corpus = pd.read_csv(f'/content/{split}_corpus.txt', sep ='|', header=None, names = ["id", "tokens"],index_col=False)
  ds_tags = pd.read_csv(f'/content/{split}_annotations.txt', sep = '\t', header=None, names = colnames, na_values=default_missing,index_col=False)
  if split == "train":
    # Show which entity types the training annotations contain.
    print(f'tags: {set(ds_tags["ner_tags"])}')
  prepared_splits[split] = prepare_dataset(ds_corpus, ds_tags)

dtrain = prepared_splits["train"]
dtest = prepared_splits["test"]
ddev = prepared_splits["dev"]

# Split the train + valid in 80% train, 20% valid
# train_valid = dtrain.train_test_split(test_size=0.20, seed=seed)

# gather everyone if you want to have a single DatasetDict
datasets = DatasetDict({
    'train': dtrain,
    'test': dtest,
    'valid': ddev})
# Check the levels defined for the NER tagging task in this dataset.
label_list = datasets["train"].features["ner_tags"].feature.names

# label_list (taken from the training split) is used to decode predictions for every
# split, so all splits must encode their tags with exactly the same class names.
for split_name, split_dataset in datasets.items():
  split_labels = split_dataset.features["ner_tags"].feature.names
  if split_labels != label_list:
    raise ValueError(
        f"Label set of split '{split_name}' ({split_labels}) differs from the training "
        f"label set ({label_list}); label ids would not be comparable across splits."
    )

In [None]:
# Sanity check: show the first training example (id, tokens and encoded ner_tags).
print(datasets["train"][0])

Function that will be called in the training loop

In [None]:
def tokenize_and_align_labels(examples, tokenizer, label_all_tokens = True):
    """Tokenize a batch of pre-split sentences and project word-level tags onto tokens.

    Args:
        examples: batch with a "tokens" entry (list of word lists) and a "ner_tags"
            entry (list of per-word label lists).
        tokenizer: callable invoked as tokenizer(words, truncation=True,
            is_split_into_words=True); its result must provide
            word_ids(batch_index=...) and accept item assignment.
        label_all_tokens: when truthy, every token of a word carries the word's label;
            otherwise only the first token does and the rest are set to -100.

    Returns:
        The tokenizer output with an added "labels" entry. Tokens whose word id is None
        (special tokens) always get -100, the index the loss function ignores.
    """
    ignore_index = -100
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(batch_index, word_labels):
        # Walk over the tokens of one sentence, remembering the word the previous token came from.
        aligned = []
        last_word = None
        for current_word in tokenized_inputs.word_ids(batch_index=batch_index):
            if current_word is None:
                aligned.append(ignore_index)
            elif current_word != last_word or label_all_tokens:
                # First token of a word, or a continuation token that should be labelled too.
                aligned.append(word_labels[current_word])
            else:
                aligned.append(ignore_index)
            last_word = current_word
        return aligned

    tokenized_inputs["labels"] = [
        align(batch_index, word_labels)
        for batch_index, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

# Training

In [None]:
# Mount Google Drive at /content/gdrive so the checkpoints stored under MyDrive can be loaded
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Checkpoints to fine-tune: every uncommented entry is trained once per seed in the training loop.
model_checkpoints = [
    #"dbmdz/bert-base-italian-xxl-cased", # Baseline
    #"/content/gdrive/MyDrive/Colab Environments/biobert_models/bio-full", # BioBERT
    "/content/gdrive/MyDrive/Colab Environments/biobert_models/med-reg-v3", # Best model w/o corpus augmentation
    #"/content/gdrive/MyDrive/Colab Environments/biobert_models/med-reg-v12", # Best ER model w/ corpus augmentation
    #"/content/gdrive/MyDrive/Colab Environments/biobert_models/med-reg-v3-enriched" # Best MIXOUT model w/ corpus augmentation
    ]
# Random seeds: each one gives an independent fine-tuning run for every checkpoint.
seeds = [
    3407, 
    6, 
    11, 
    61, 
    39
    ]

# These can be changed according to the downstream dataset. The only important thing is that they remain consistent for *ALL* the models
batch_size = 10  # used as both the per-device train and eval batch size
learning_rate = 3e-5
epochs=15  # upper bound: the early-stopping callback (patience 4) may end a run sooner
weight_decay=0.01

## Define metrics

In [None]:
def compute_metrics(p):
    """Compute overall seqeval scores for one evaluation pass.

    `p` unpacks into (predictions, labels). The predicted class of each token is the
    argmax over axis 2 of `predictions`; positions whose gold label is -100 are left
    out before scoring. Uses the module-level `label_list` (id -> tag name) and
    `metric` objects.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Decode ids to tag names, keeping only the positions that carry a real label.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        predicted_names = []
        gold_names = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            predicted_names.append(label_list[predicted_id])
            gold_names.append(label_list[gold_id])
        true_predictions.append(predicted_names)
        true_labels.append(gold_names)

    results = metric.compute(predictions=true_predictions, references=true_labels, scheme="IOB2", zero_division=1)

    overall = {}
    overall["precision"] = results["overall_precision"]
    overall["recall"] = results["overall_recall"]
    overall["f1"] = results["overall_f1"]
    overall["accuracy"] = results["overall_accuracy"]
    return overall

##Training Loop

In [None]:
# Fine-tune every checkpoint once per seed, score each run on the test split and
# export one results CSV per checkpoint.
for model_checkpoint in model_checkpoints:
  # Result rows are gathered in a plain list and turned into a DataFrame once per
  # checkpoint: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
  result_rows = []
  for seed in seeds:
    # Seed must be set before creating the model, otherwise the random head will be initialized in a different way every time and the results will not be replicable
    # From now on, the seed is set for *all* the random processes, including numpy, sklearn, etc...not only for transformers!
    set_seed(seed)

    # Load tokenizer and initialize the TokenClassification transformer with checkpoint weights
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    tokenized_datasets = datasets.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

    # Define the training details
    training_args = TrainingArguments(
        output_dir = f"/content/{os.path.basename(model_checkpoint)}_ft_NER/{seed}",
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        save_total_limit = 3,
        load_best_model_at_end = True,
        metric_for_best_model = "f1",
        learning_rate = learning_rate,
        per_device_train_batch_size = batch_size,
        per_device_eval_batch_size = batch_size,
        num_train_epochs = epochs,
        weight_decay = weight_decay,
    )
    data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id = -100, return_tensors ="pt")
    # `metric` is deliberately a module-level name: compute_metrics reads it as a global.
    metric = load_metric("seqeval")

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["valid"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks = [EarlyStoppingCallback(early_stopping_patience = 4)]
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    trainer.evaluate()

    # Collect results on test set
    predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, scheme="IOB2", zero_division=1)
    print(results)

    # One row per entity type (every key that is not an "overall_*" aggregate) ...
    for key, value in results.items():
      if 'overall' not in key:
        result_rows.append({'type': key, 'number': value['number'], 'f1': value['f1'], 'precision': value['precision'], 'recall': value['recall'], 'seed': seed})
    # ... plus one row with the overall scores of this seed.
    result_rows.append({'type': 'overall','number': 0, 'f1': results['overall_f1'], 'precision': results['overall_precision'], 'recall': results['overall_recall'], 'seed':seed})

  # Passing `columns` keeps the column order fixed, also when no run was executed.
  df_results = pd.DataFrame(result_rows, columns= ['type', 'number', 'f1', 'precision','recall','seed'])
  display(df_results)
  df_results.to_csv(f'/content/results_{os.path.basename(model_checkpoint)}.csv')
  files.download(f'/content/results_{os.path.basename(model_checkpoint)}.csv')

Finalize session info and download

In [None]:
# Complete the session record with the run configuration and the end time, then export it.
session_info['checkpoints'] = [os.path.basename(c) for c in model_checkpoints]
session_info['seeds'] = seeds
# A TrainingArguments object is not JSON serializable, so json.dumps would raise;
# to_dict() yields a plain dict. Note that training_args is the object left over from
# the last iteration of the training loop (last checkpoint, last seed).
session_info['training_arguments'] = training_args.to_dict()
session_info['time_end'] = time.strftime("%H:%M:%S", time.localtime())

# Serialize before opening the file so a serialization error cannot leave an empty file behind.
session_json = json.dumps(session_info, indent=4)
with open('/content/session_info.json', "w") as outfile:
    outfile.write(session_json)
files.download('/content/session_info.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#uncomment to download the model for later use

#!zip -r checkpoint.zip model_checkpoint-finetuned-NER/checkpoint-100
#files.download('checkpoint.zip') #otherwise, right click on the zip file in the file system

In [None]:
# ###QUALITATIVE EVALUATION
# from collections import Counter
# def pack_tokens_and_predictions(word_tokens, predictions, labels):
#   tok_input = tokenizer(word_tokens, is_split_into_words=True)
#   word_ids = tok_input.word_ids()
#   c = Counter(word_ids)
#   token_index = 1
#   word_index = 0
#   words = []
#   preds = []
#   while token_index<511:
#     w_tokens = c.get(word_index)
#     if w_tokens==None:
#       break
#     start = token_index
#     end = token_index+w_tokens
#     w_labels = labels[start:end]
#     w_pred = predictions[start:end]
#     maj_key_pred, maj_value_pred = Counter(w_pred).most_common()[0]
#     coverage_pred = maj_value_pred/w_tokens
#     coverage_string = ""
#     if coverage_pred<1:
#       coverage_string = " ("+str([tag_labels[w] for w in w_pred])+")"
#     wrong_string = ""
#     if maj_key_pred!=w_labels[0]:
#       wrong_string = " , expected "+tag_labels[w_labels[0]]
#     arrow = "   "
#     if w_labels[0] != 0 or maj_key_pred != 0:
#       arrow = "-->"
#     word = tokenizer.decode(tok_input.input_ids[start:end])
#     print(arrow+"'"+word+"': "+tag_labels[maj_key_pred]+coverage_string+wrong_string)
#     token_index = end
#     word_index += 1
#     words.append(word)
#     preds.append(tag_labels[maj_key_pred])
#   return words, preds


# tag_labels = datasets["test"].features["ner_tags"].feature.names

# id_example_test = 4
# words, preds = pack_tokens_and_predictions(tokenized_datasets["test"][id_example_test]["tokens"], predictions[id_example_test], labels[id_example_test])

# import IPython
# color_palette=["#2fbbab","#fd9720","#a6e22d","#ef60b4","#F7DEA7","#B0B4D1","#ABC5F5","#D9E5AE","#F9C0C0"]
# colors = dict(zip(set(ds_tags["ner_tags"]), color_palette[0:len(set(ds_tags["ner_tags"]))]))
# for i in range(len(words)):
#   if(preds[i]!="O"):
#     color = colors[preds[i][2:]]
#     tag = preds[i][2:5]
#     js_code = f'''var container = document.querySelector("#output-area");
#               var span = document.createElement("span");
#               span.style.backgroundColor = "{color}"; 
#               span.style.borderRadius = "0.25em";
#               span.style.lineHeight = "2";
#               span.style.margin = "0.1em 0.1em";
#               span.style.padding = "0.3em 0.2em";
#               span.appendChild(document.createTextNode("{words[i]}"));
#               var tag = document.createElement("span");
#               tag.style.backgroundColor = "rgba(255, 255, 255, 0.8)";
#               tag.style.borderRadius = "0.25em";
#               tag.style.lineHeight = "1";
#               tag.style.fontSize="xx-small";
#               tag.style.margin = "0.1em 0.1em";
#               tag.style.padding = "0.05em 0.1em";
#               tag.appendChild(document.createTextNode("{tag}"));
#               span.appendChild(tag);
#               container.appendChild(span);'''
#   else:
#     text = words[i]+" "
#     js_code = f'''document.querySelector("#output-area").appendChild(document.createTextNode("{text}"));'''
#   display(IPython.display.Javascript(js_code))