**BioNER using BERT (Transfer learning model)**

In [None]:
import nltk
nltk.download('all')

In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
pip install transformers==4.28.0

In [None]:
pip install datasets

In [5]:
from datasets import load_dataset

dataset = load_dataset('ncbi_disease')



  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 5433
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 924
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 941
    })
})

In [7]:
dataset["train"][0]

{'id': '0',
 'tokens': ['Identification',
  'of',
  'APC2',
  ',',
  'a',
  'homologue',
  'of',
  'the',
  'adenomatous',
  'polyposis',
  'coli',
  'tumour',
  'suppressor',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]}

In [8]:
from transformers import AutoTokenizer

MODEL = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [9]:
train_texts = [item["tokens"] for item in dataset["train"]]
dev_texts = [item["tokens"] for item in dataset["validation"]]
test_texts = [item["tokens"] for item in dataset["test"]]

train_texts_encoded = tokenizer(train_texts, padding=True, truncation=True, max_length=256, is_split_into_words=True)
dev_texts_encoded = tokenizer(dev_texts, padding=True, truncation=True, max_length=256, is_split_into_words=True)
test_texts_encoded = tokenizer(test_texts, padding=True, truncation=True, max_length=256, is_split_into_words=True)


In [10]:
import re
def pre_processing(doc):

    for i in doc:
    
      #lower case
      lower=i.lower()
      
      #tokenize
      tokenize=lower.split( )
      
      #stopwwords
      sw=stopwords.words('english')
      d=[i for i in tokenize if i not in sw ]
      
      #remove punctutation and numbers
      d=[re.sub(r'[^\w]|[0-9]', '', i) for i in d]
      
      #remove empty space
      d=[i for i in d if re.search('\w',i)]
      return d

#for i in train_texts:
m=pre_processing(train_texts[0])

In [11]:
x = tokenizer(m, padding=True, truncation=True, max_length=256, is_split_into_words=True)
x

{'input_ids': [2, 4824, 3], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [12]:
train_texts_encoded[0]

Encoding(num_tokens=138, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [13]:
train_texts_encoded[0].tokens[:20]

['[CLS]',
 'identification',
 'of',
 'apc',
 '##2',
 ',',
 'a',
 'homologue',
 'of',
 'the',
 'adenomatous',
 'polyposis',
 'coli',
 'tumour',
 'suppressor',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [14]:
train_texts_encoded[0].offsets[:20]

[(0, 0),
 (0, 14),
 (0, 2),
 (0, 3),
 (3, 4),
 (0, 1),
 (0, 1),
 (0, 9),
 (0, 2),
 (0, 3),
 (0, 11),
 (0, 9),
 (0, 4),
 (0, 6),
 (0, 10),
 (0, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0)]

In [15]:
all_labels = list(set([label for item in dataset["train"] for label in item["ner_tags"]]))
all_labels

[0, 1, 2]

In [16]:
import numpy as np

def map_entities_to_tokens(items, encodings):
    
    labels = [item["ner_tags"] for item in items]
    offsets = [encoding.offsets for encoding in encodings]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, offsets):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

train_labels = map_entities_to_tokens(dataset["train"], train_texts_encoded.encodings)
dev_labels = map_entities_to_tokens(dataset["validation"], dev_texts_encoded.encodings)
test_labels = map_entities_to_tokens(dataset["test"], test_texts_encoded.encodings)


In [17]:
list(zip(train_texts_encoded[0].tokens[:20], train_labels[0][:20]))

[('[CLS]', -100),
 ('identification', 0),
 ('of', 0),
 ('apc', 0),
 ('##2', -100),
 (',', 0),
 ('a', 0),
 ('homologue', 0),
 ('of', 0),
 ('the', 0),
 ('adenomatous', 1),
 ('polyposis', 2),
 ('coli', 2),
 ('tumour', 2),
 ('suppressor', 0),
 ('.', 0),
 ('[SEP]', -100),
 ('[PAD]', -100),
 ('[PAD]', -100),
 ('[PAD]', -100)]

In [18]:
import torch

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    

train_dataset = NERDataset(train_texts_encoded, train_labels)
dev_dataset = NERDataset(dev_texts_encoded, dev_labels)
test_dataset = NERDataset(test_texts_encoded, test_labels)

print(f"Train items: {len(train_dataset)}")
print(f"Dev items: {len(dev_dataset)}")
print(f"Test items: {len(test_dataset)}")

Train items: 5433
Dev items: 924
Test items: 941


In [19]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    flat_labels, flat_preds = [], []
    flat_ent_labels, flat_ent_preds = [], []
    for label_row, pred_row in zip(labels, preds):
        for label, pred_label in zip(label_row, pred_row):
            if label != -100:
                flat_labels.append(label)
                flat_preds.append(pred_label)
                if label != 0 or pred_label != 0:
                    flat_ent_labels.append(label)
                    flat_ent_preds.append(pred_label)
                    
        
    precision, recall, f1, _ = precision_recall_fscore_support(flat_ent_labels, flat_ent_preds, average='micro')
    acc = accuracy_score(flat_labels, flat_preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [20]:
from transformers import Trainer, TrainingArguments, AutoModelForTokenClassification, BertForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(MODEL, num_labels=len(all_labels))


#device = torch.device("cpu")
#model.to(device)

training_args = TrainingArguments(
    output_dir='./results_1',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=int(len(train_dataset)/8),  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,              
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=5,
    load_best_model_at_end=True,
    no_cuda=False
)

trainer = Trainer(
    model=model,                         
    args=training_args,                  
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,         
    eval_dataset=dev_dataset,            
)

trainer.train()

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForToken

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
200,No log,0.089966,0.975301,0.699797,0.699797,0.699797
400,No log,0.055895,0.983896,0.805247,0.805247,0.805247
600,0.197000,0.053588,0.984855,0.823872,0.823872,0.823872
800,0.197000,0.043774,0.987484,0.849322,0.849322,0.849322
1000,0.046600,0.04431,0.986524,0.840651,0.840651,0.840651
1200,0.046600,0.060826,0.984438,0.810371,0.810371,0.810371
1400,0.046600,0.05467,0.986274,0.83308,0.83308,0.83308
1600,0.024200,0.045866,0.988944,0.867101,0.867101,0.867101
1800,0.024200,0.051987,0.987484,0.850374,0.850374,0.850374
2000,0.011800,0.052524,0.987359,0.847815,0.847815,0.847815


TrainOutput(global_step=2040, training_loss=0.06871181568678687, metrics={'train_runtime': 556.1537, 'train_samples_per_second': 29.307, 'train_steps_per_second': 3.668, 'total_flos': 1147910506316604.0, 'train_loss': 0.06871181568678687, 'epoch': 3.0})

In [21]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.04873446747660637,
 'eval_accuracy': 0.9851002163530228,
 'eval_f1': 0.8359550561797754,
 'eval_precision': 0.8359550561797753,
 'eval_recall': 0.8359550561797753,
 'eval_runtime': 7.0451,
 'eval_samples_per_second': 133.568,
 'eval_steps_per_second': 16.749,
 'epoch': 3.0}

In [22]:
from transformers import pipeline

model = AutoModelForTokenClassification.from_pretrained("/content/results_1/checkpoint-2000")
nlp = pipeline("ner", tokenizer=tokenizer, model=model)

In [23]:
print(dataset["test"][2])

nlp(dataset["test"][2]["tokens"])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'id': '2', 'tokens': ['The', 'risk', 'of', 'cancer', ',', 'especially', 'lymphoid', 'neoplasias', ',', 'is', 'substantially', 'elevated', 'in', 'A', '-', 'T', 'patients', 'and', 'has', 'long', 'been', 'associated', 'with', 'chromosomal', 'instability', '.'], 'ner_tags': [0, 0, 0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


[[{'entity': 'LABEL_0',
   'score': 0.9999144,
   'index': 1,
   'word': 'the',
   'start': 0,
   'end': 3}],
 [{'entity': 'LABEL_0',
   'score': 0.99970055,
   'index': 1,
   'word': 'risk',
   'start': 0,
   'end': 4}],
 [{'entity': 'LABEL_0',
   'score': 0.99991167,
   'index': 1,
   'word': 'of',
   'start': 0,
   'end': 2}],
 [{'entity': 'LABEL_0',
   'score': 0.99284416,
   'index': 1,
   'word': 'cancer',
   'start': 0,
   'end': 6}],
 [{'entity': 'LABEL_0',
   'score': 0.9999199,
   'index': 1,
   'word': ',',
   'start': 0,
   'end': 1}],
 [{'entity': 'LABEL_0',
   'score': 0.99988854,
   'index': 1,
   'word': 'especially',
   'start': 0,
   'end': 10}],
 [{'entity': 'LABEL_0',
   'score': 0.9997329,
   'index': 1,
   'word': 'lymphoid',
   'start': 0,
   'end': 8}],
 [{'entity': 'LABEL_1',
   'score': 0.927492,
   'index': 1,
   'word': 'neoplasia',
   'start': 0,
   'end': 9},
  {'entity': 'LABEL_0',
   'score': 0.70131797,
   'index': 2,
   'word': '##s',
   'start': 9,
  