In [66]:
#
# Named-entity recognition using BERT
# Dataset: https://www.kaggle.com/datasets/alaakhaled/conll003-englishversion
#

# dependencies
import torch
import torch.optim as optim 
from torchtext.vocab import build_vocab_from_iterator
from transformers import BertForTokenClassification, BertTokenizerFast
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
import tqdm
import tqdm.notebook  # explicit: `import tqdm` alone does not load the notebook submodule
tqdmn = tqdm.notebook.tqdm

In [67]:
# hyper-parameters
EPOCHS = 3        # number of full passes over the training set
BATCH_SIZE = 8    # sentences per batch for every DataLoader
LR = 1e-5         # learning rate handed to the optimizer

# Seed torch's global random number generator so that a fresh
# "Restart & Run All" is repeatable (weight initialisation and shuffling
# presumably draw from it; GPU kernels may still add some non-determinism).
SEED = 42
torch.manual_seed(SEED)

In [68]:
# Directory that holds the data files. File names are appended to this string
# directly (base_path + 'train.txt'), so the trailing slash is required.
base_path = '/kaggle/input/conll003-englishversion/'

# Use the GPU if available, otherwise fall back to the CPU. The model and
# every batch tensor are moved to this device before use.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [69]:
# read the data files
def load_sentences(filepath):
    """Read a CoNLL-2003 style file and return its sentences.

    Each non-blank line is expected to hold four space-separated columns:
    token, POS tag, chunk tag and NER tag. Sentences are separated by blank
    lines; ``-DOCSTART-`` marker lines are treated as separators as well.

    Args:
        filepath: path of the text file to read.

    Returns:
        A list of dicts, one per sentence, with the keys ``'tokens'``,
        ``'pos_tags'``, ``'chunk_tags'`` and ``'ner_tags'``; each value is a
        list of strings with one entry per token.

    Raises:
        IndexError: if a data line has fewer than four columns.
    """

    def new_sentence():
        # Fresh, independent lists for every sentence.
        return {'tokens': [], 'pos_tags': [], 'chunk_tags': [], 'ner_tags': []}

    sentences = []
    current = new_sentence()

    with open(filepath, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of materialising the file with readlines().
        for line in f:
            line = line.rstrip('\n')

            # A blank (or whitespace-only) line or a document marker closes
            # the sentence collected so far.
            if not line.strip() or line.startswith('-DOCSTART-'):
                if current['tokens']:
                    sentences.append(current)
                    current = new_sentence()
                continue

            fields = line.split(' ')
            current['tokens'].append(fields[0])
            current['pos_tags'].append(fields[1])
            current['chunk_tags'].append(fields[2])
            current['ner_tags'].append(fields[3])

    # Keep the final sentence when the file does not end with a blank line;
    # previously it was silently dropped.
    if current['tokens']:
        sentences.append(current)

    return sentences

In [70]:
print('loading data')
# Parse the three splits in the same order as before: train, test, valid.
train_sentences, test_sentences, valid_sentences = (
    load_sentences(base_path + split_file)
    for split_file in ('train.txt', 'test.txt', 'valid.txt')
)

loading data


In [71]:
# Gather the NER tag sequence of every training sentence, then derive both
# the tag vocabulary (tag -> id) and the set of distinct tags from it.
tags = [train_sentence['ner_tags'] for train_sentence in train_sentences]
tagmap = build_vocab_from_iterator(tags)
tagset = {tag for tag_sequence in tags for tag in tag_sequence}
print('Tagset size:', len(tagset))

Tagset size: 9


In [72]:
# Load the BERT tokenizer. The same checkpoint name is reused further down
# when the model is created, so tokenizer and model stay in sync.
# NOTE(review): this is an uncased checkpoint, so casing is presumably
# discarded before tokenization; a cased checkpoint may suit NER better - confirm.
bert_version = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(bert_version)

In [73]:
# map tokens and tags to token ids and label ids
def align_label(tokens, labels):
    """Produce one label id per sub-word token of an encoded sentence.

    Only the first sub-word of each word receives the word's tag id; special
    or padding positions (``word_ids()`` entry ``None``) and continuation
    sub-words receive -100 so they can be filtered out later.

    Args:
        tokens: encoding object exposing ``word_ids()``, which yields for each
            sub-word position the index of its source word or ``None``.
        labels: sequence of tag strings, one per original word.

    Returns:
        A list of ints with the same length as ``tokens.word_ids()``.
    """
    label_ids = []
    previous_word_idx = None
    for word_idx in tokens.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # Special/padding position or continuation of the previous word.
            label_ids.append(-100)
        else:
            try:
                label_ids.append(tagmap[labels[word_idx]])
            except (LookupError, RuntimeError):
                # Best effort, as before: a word without a label or a tag
                # missing from the vocabulary is ignored. Unlike a bare
                # `except:`, this no longer swallows KeyboardInterrupt or
                # SystemExit.
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

In [74]:
def encode(sentence):
    """Turn one sentence dict into the tensors fed to the model.

    The pre-split words in ``sentence['tokens']`` are tokenized (truncated and
    padded to the maximum length) and the tags in ``sentence['ner_tags']`` are
    aligned to the resulting sub-word positions via ``align_label``.

    Returns:
        A dict with LongTensors under ``'input_ids'``, ``'attention_mask'``
        and ``'labels'``.
    """
    encoded = tokenizer(
        sentence['tokens'],
        truncation=True,
        padding='max_length',
        is_split_into_words=True,
    )
    label_ids = align_label(encoded, sentence['ner_tags'])
    return {
        'input_ids': torch.LongTensor(encoded.input_ids),
        'attention_mask': torch.LongTensor(encoded.attention_mask),
        'labels': torch.LongTensor(label_ids),
    }

In [75]:
print('encoding data')
# Encode every sentence of each split into model-ready tensors.
train_dataset = list(map(encode, train_sentences))
valid_dataset = list(map(encode, valid_sentences))
test_dataset = list(map(encode, test_sentences))

encoding data


In [76]:
# Initialize the model including a classification layer with num_labels
# classes (one per distinct NER tag), move it to the selected device and
# create the optimizer over all of its parameters.
print('initializing the model')
model = BertForTokenClassification.from_pretrained(bert_version, num_labels=len(tagset))
model.to(device)
# The optimizer is built after the model has been moved to the device.
optimizer = optim.AdamW(params=model.parameters(), lr=LR)

initializing the model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

In [77]:
# Prepare batches of data. Only the training loader shuffles; validation and
# test loaders keep the original sentence order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [78]:
# evaluate the performance of the model
def EvaluateModel(model, data_loader, question_2 = False):
    """Run the model over a data loader and collect true and predicted labels.

    Positions whose true label is -100 are dropped from both the true and the
    predicted sequences. The model is switched to eval mode and left in it.

    Args:
        model: callable taking the batch dict as keyword arguments and
            returning an output whose item 1 holds the logits.
        data_loader: iterable of dicts of tensors containing at least
            ``'labels'``; every tensor is moved to ``device``.
        question_2: when falsy (default) the results are flattened over all
            sentences; when truthy one array per sentence is returned.

    Returns:
        ``(actual, predicted)``: two numpy arrays when ``question_2`` is
        falsy (both empty if the loader yields nothing), otherwise two lists
        of numpy arrays, one entry per sentence.
    """
    model.eval()
    Y_actual, Y_preds = [], []
    with torch.no_grad():
        for batch in tqdmn(data_loader):
            # move the batch tensors to the same device as the model
            batch = { k: v.to(device) for k, v in batch.items() }
            # send 'input_ids', 'attention_mask' and 'labels' to the model
            outputs = model(**batch)
            # Predicted class per position, computed once per batch rather
            # than once per example. outputs[1] is the logits entry because
            # labels are passed and the loss therefore comes first.
            batch_preds = torch.argmax(outputs[1], dim=2)
            for true_values_all, pred_values_all in zip(batch['labels'], batch_preds):
                # keep only the positions that carry a real label
                keep = true_values_all != -100
                Y_actual.append(true_values_all[keep])
                Y_preds.append(pred_values_all[keep])

    if question_2:
        # Return actual and predicted labels per sentence
        return ([y.detach().cpu().numpy() for y in Y_actual],
                [y.detach().cpu().numpy() for y in Y_preds])

    if not Y_actual:
        # torch.cat rejects an empty list; report "no labels" explicitly.
        return (torch.empty(0, dtype=torch.long).numpy(),
                torch.empty(0, dtype=torch.long).numpy())

    # Return flat arrays of actual labels, predicted labels
    return (torch.cat(Y_actual).detach().cpu().numpy(),
            torch.cat(Y_preds).detach().cpu().numpy())

        

In [79]:
# Train the model for EPOCHS passes over the training data and report
# validation metrics after every epoch.
print('training the model')
for epoch in tqdmn(range(EPOCHS)):
    # Re-enable training mode each epoch: EvaluateModel (called at the end of
    # the epoch) switches the model to eval mode.
    model.train()
    print('epoch',epoch+1)
    # iterate through each batch of the train data
    for i, batch in enumerate(tqdmn(train_loader)):
        # move the batch tensors to the same device as the model
        batch = { k: v.to(device) for k, v in batch.items() }
        # send 'input_ids', 'attention_mask' and 'labels' to the model
        outputs = model(**batch)
        # the first output item is used as the training loss
        loss = outputs[0]
        # set the gradients to zero
        optimizer.zero_grad()
        # propagate the loss backwards
        loss.backward()
        # update the model weights
        optimizer.step()
    # calculate performance on the validation set
    Y_actual, Y_preds = EvaluateModel(model,valid_loader)
    print("\nValidation Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
    # "Macro-Accuracy" here is sklearn's balanced_accuracy_score
    print("\nValidation Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))

training the model


  0%|          | 0/3 [00:00<?, ?it/s]

epoch 1


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.985

Validation Macro-Accuracy : 0.915
epoch 2


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.987

Validation Macro-Accuracy : 0.917
epoch 3


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.988

Validation Macro-Accuracy : 0.929


In [80]:
print('applying the model to the test set')
# Apply the trained model to the test set. Y_actual / Y_preds are rebound
# here from the validation results to the flattened test-set labels and
# predictions.
Y_actual, Y_preds = EvaluateModel(model,test_loader)

applying the model to the test set


  0%|          | 0/432 [00:00<?, ?it/s]

In [81]:
# Report test-set metrics from the flat Y_actual / Y_preds arrays.
# "Macro-Accuracy" is sklearn's balanced_accuracy_score.
print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
print("\nTest Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
# Rows are labelled with the tag names from tagmap.get_itos(); `labels` is
# presumably the matching list of ids obtained by looking those names up again.
print(classification_report(Y_actual, Y_preds,labels = tagmap(tagmap.get_itos()), target_names = tagmap.get_itos(), zero_division = 0))


Test Accuracy : 0.981

Test Macro-Accuracy : 0.907

Classification Report : 
              precision    recall  f1-score   support

           O       1.00      0.99      0.99     38323
       B-LOC       0.94      0.93      0.93      1668
       B-PER       0.98      0.97      0.97      1617
       B-ORG       0.89      0.91      0.90      1661
       I-PER       0.99      0.99      0.99      1156
       I-ORG       0.86      0.91      0.88       835
      B-MISC       0.81      0.84      0.83       702
       I-LOC       0.81      0.89      0.85       257
      I-MISC       0.69      0.74      0.71       216

    accuracy                           0.98     46435
   macro avg       0.88      0.91      0.90     46435
weighted avg       0.98      0.98      0.98     46435



## Question 2

In [89]:
import numpy as np

def find_sentence_with_at_least_one_wrong_label(Y_actual, Y_preds, sentences=None, min_length=10):
    """Return the first sufficiently long sentence with a wrong prediction.

    Args:
        Y_actual: per-sentence sequences of true label ids.
        Y_preds: per-sentence sequences of predicted label ids, aligned with
            ``Y_actual``.
        sentences: sentence dicts (with a ``"tokens"`` entry) in the same
            order as ``Y_actual``; defaults to the global ``test_sentences``.
        min_length: minimum number of labels a sentence must have.

    Returns:
        ``(tokens, predictions, labels)`` for the first qualifying sentence,
        where predictions and labels are lists of tag strings.

    Raises:
        ValueError: if no sentence qualifies (previously this surfaced as an
            UnboundLocalError).
    """
    if sentences is None:
        sentences = test_sentences

    # Build the id -> tag table once instead of once per label.
    itos = tagmap.get_itos()

    for i, (preds, labels) in enumerate(zip(Y_preds, Y_actual)):
        preds = np.asarray(preds)
        labels = np.asarray(labels)
        if len(labels) < min_length or not np.any(preds != labels):
            continue
        # Drop any ignored positions before translating ids to tag names.
        keep = labels != -100
        tokens = sentences[i]["tokens"]
        predictions = [itos[p] for p in preds[keep]]
        label_names = [itos[l] for l in labels[keep]]
        return tokens, predictions, label_names

    raise ValueError(
        "no sentence with at least {} labels and a wrong prediction was found".format(min_length)
    )

# Per-sentence results get their own names so the flat Y_actual / Y_preds
# arrays used by the test-metrics cell are not overwritten; that cell then
# still works when re-run after this one.
Y_actual_by_sentence, Y_preds_by_sentence = EvaluateModel(model, test_loader, question_2 = True)
tokens, predictions, labels = find_sentence_with_at_least_one_wrong_label(Y_actual_by_sentence, Y_preds_by_sentence)

print('\nThe actual and predicted tags for the tokens of this sentence were:')
print("\n{:<10} {:<10} {:<10}".format('Token', 'Actual Tag', 'Predicted Tag'))

# one row per token: token, gold tag, predicted tag
for i in range(len(tokens)):
    print("{:<10} {:<10} {:<10}".format(tokens[i], labels[i], predictions[i]))

  0%|          | 0/432 [00:00<?, ?it/s]


The actual and predicted tags for the tokens of this sentence were:

Token      Actual Tag Predicted Tag
SOCCER     O          O         
-          O          O         
JAPAN      B-LOC      B-LOC     
GET        O          O         
LUCKY      O          O         
WIN        O          O         
,          O          O         
CHINA      B-PER      B-LOC     
IN         O          O         
SURPRISE   O          O         
DEFEAT     O          O         
.          O          O         


In [87]:
# A single hand-written sentence in the same dict layout that load_sentences
# produces: parallel lists of tokens, POS tags, chunk tags and NER tags.
test_sentence_dict = {'tokens': ['Zurich', 'and', 'some', 'other', 'insurers', 'with', 'big', 'fleets', 'of', 'engineers', 'are', 'advising', 'companies', 'on', 'how', 'to', 'fortify', 'their', 'properties', '.'],
                      'pos_tags': ['NNP', 'CC', 'DT', 'JJ', 'NNS', 'IN', 'JJ', 'NNS', 'IN', 'NNS', 'VBP', 'VBG', 'NNS', 'IN', 'WRB', 'TO', 'VB', 'PRP$', 'NNS', '.'],
                      'chunk_tags': ['B-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'B-VP', 'B-VP', 'B-NP', 'B-PP', 'B-ADVP', 'B-VP', 'B-VP', 'B-NP', 'I-NP', 'O'],
                      'ner_tags': ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}

# Encode the single sentence, wrap it in a one-item loader and evaluate it.
new_test_dataset = [encode(test_sentence_dict)]
new_test_loader = torch.utils.data.DataLoader(new_test_dataset, batch_size=1)
new_Y_actual, new_Y_preds = EvaluateModel(model, new_test_loader)

print('\nThe actual and predicted tags for the tokens of this sentence were:')
print("\n{:<10} {:<10} {:<10}".format('Token', 'Actual Tag', 'Predicted Tag'))

# One row per word: the word, its gold tag and the predicted tag.
for i, token in enumerate(test_sentence_dict['tokens']):
    actual_tag = tagmap.get_itos()[new_Y_actual[i].item()]
    predicted_tag = tagmap.get_itos()[new_Y_preds[i].item()]
    print("{:<10} {:<10} {:<10}".format(token, actual_tag, predicted_tag))

  0%|          | 0/1 [00:00<?, ?it/s]


The actual and predicted tags for the tokens of this sentence were:

Token      Actual Tag Predicted Tag
Zurich     B-ORG      B-ORG     
and        O          O         
some       O          O         
other      O          O         
insurers   O          O         
with       O          O         
big        O          O         
fleets     O          O         
of         O          O         
engineers  O          O         
are        O          O         
advising   O          O         
companies  O          O         
on         O          O         
how        O          O         
to         O          O         
fortify    O          O         
their      O          O         
properties O          O         
.          O          O         
