In [108]:
#
# Chunk tagging using BERT (adapted from a named-entity recognition example)
# Dataset: https://www.kaggle.com/datasets/alaakhaled/conll003-englishversion
#

# dependencies
import torch
import torch.optim as optim 
from torchtext.vocab import build_vocab_from_iterator
from transformers import BertForTokenClassification, BertTokenizerFast
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
import tqdm
# `import tqdm` alone does not load the `tqdm.notebook` submodule; import it
# explicitly so the attribute access below does not depend on another library
# having imported it first.
import tqdm.notebook
tqdmn = tqdm.notebook.tqdm

In [109]:
# hyper-parameters
LR = 1e-5         # learning rate handed to the AdamW optimizer
BATCH_SIZE = 8    # number of encoded sentences per DataLoader batch
EPOCHS = 3        # number of passes over the training loader

In [110]:
# directory that holds train.txt / valid.txt / test.txt
base_path = '/kaggle/input/conll003-englishversion/'

# run on the GPU when one is visible to torch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [111]:
# read the data files
def load_sentences(filepath):
    """Parse a space-separated, one-token-per-line file into sentence dicts.

    Each non-boundary line must contain at least four space-separated fields:
    token, POS tag, chunk tag and NER tag. A blank line or a
    ``-DOCSTART- -X- -X- O`` line ends the current sentence.

    Args:
        filepath: path of the text file to read.

    Returns:
        list[dict]: one dict per sentence with the keys ``'tokens'``,
        ``'pos_tags'``, ``'chunk_tags'`` and ``'ner_tags'``, each mapping to a
        list of strings of equal length.

    Raises:
        IndexError: if a non-boundary line has fewer than four fields.
    """
    field_names = ('tokens', 'pos_tags', 'chunk_tags', 'ner_tags')
    sentences = []
    current = {name: [] for name in field_names}

    with open(filepath, 'r') as f:
        # iterate lazily instead of materialising every line with readlines()
        for raw_line in f:
            # drop the line terminator up front so that a final line without a
            # trailing newline is handled exactly like every other line
            line = raw_line.rstrip('\n')

            if line == '-DOCSTART- -X- -X- O' or line == '':
                # sentence boundary: store what was collected so far (if anything)
                if current['tokens']:
                    sentences.append(current)
                    current = {name: [] for name in field_names}
            else:
                fields = line.split(' ')
                current['tokens'].append(fields[0])
                current['pos_tags'].append(fields[1])
                current['chunk_tags'].append(fields[2])
                current['ner_tags'].append(fields[3])

    # a file that does not end with a blank line still has a pending sentence
    if current['tokens']:
        sentences.append(current)

    return sentences

In [112]:
print('loading data')
# the three split files are read in the order train, test, valid
train_sentences, test_sentences, valid_sentences = (
    load_sentences(base_path + split_file)
    for split_file in ('train.txt', 'test.txt', 'valid.txt')
)

loading data


To train on chunk tags instead of named-entity tags, the code block below builds the tag set from each sentence's 'chunk_tags' field rather than from 'ner_tags'.

In [113]:
# collect the chunk tags of every training sentence, then derive
# (a) a vocabulary that maps each tag to an integer id and
# (b) the set of distinct tags, whose size is used as the number of classes
tags = [sentence['chunk_tags'] for sentence in train_sentences]
tagmap = build_vocab_from_iterator(tags)
tagset = {tag for sentence_tags in tags for tag in sentence_tags}
print('Tagset size:',len(tagset))

Tagset size: 20


In [114]:
# load BERT tokenizer
# checkpoint name; the same string is passed to from_pretrained when the model is created
bert_version = 'bert-base-uncased'
# the "fast" tokenizer class is used: align_label calls word_ids() on its output
tokenizer = BertTokenizerFast.from_pretrained(bert_version)

In [115]:
# map tokens and tags to token ids and label ids
def align_label(tokens, labels):
    """Produce one label id per sub-word position of a tokenized sentence.

    Args:
        tokens: tokenizer output exposing ``word_ids()``, which gives, for each
            position, the index of the word it came from or ``None``.
        labels: sequence of tag strings indexed by word position.

    Returns:
        list[int]: ``tagmap[tag]`` at the first position of every word and
        ``-100`` everywhere else: positions with no word, later positions of
        the same word, and tags whose lookup fails. ``-100`` is the value the
        evaluation code filters out and the default ``ignore_index`` of
        PyTorch's cross-entropy loss.
    """
    label_ids = []
    previous_word_idx = None
    for word_idx in tokens.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # no source word, or a continuation piece of the previous word
            label_ids.append(-100)
        else:
            try:
                label_ids.append(tagmap[labels[word_idx]])
            except Exception:
                # best effort, as before: an unknown tag or out-of-range word index
                # is masked out. Unlike a bare `except:`, this no longer swallows
                # KeyboardInterrupt / SystemExit.
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

In the next block, 'ner_tags' is likewise replaced with 'chunk_tags', so that the labels aligned with the tokens are the chunk tags.

In [116]:
def encode(sentence):
    """Turn one sentence dict into the tensors fed to the model.

    The pre-split words in ``sentence['tokens']`` are tokenized (truncated and
    padded to the tokenizer's maximum length) and the tags in
    ``sentence['chunk_tags']`` are aligned to the resulting positions.

    Returns:
        dict with the LongTensors ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
    """
    encoded = tokenizer(
        sentence['tokens'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
    )
    label_ids = align_label(encoded, sentence['chunk_tags'])

    features = {}
    features['input_ids'] = torch.LongTensor(encoded.input_ids)
    features['attention_mask'] = torch.LongTensor(encoded.attention_mask)
    features['labels'] = torch.LongTensor(label_ids)
    return features

In [117]:
print('encoding data')
# every sentence of each split becomes a dict of tensors (see encode)
train_dataset = list(map(encode, train_sentences))
valid_dataset = list(map(encode, valid_sentences))
test_dataset = list(map(encode, test_sentences))

encoding data


In [118]:
# create the token-classification model with one output class per distinct tag,
# place it on the selected device, and set up the optimizer over all its parameters
print('initializing the model')
model = BertForTokenClassification.from_pretrained(
    bert_version,
    num_labels=len(tagset),
).to(device)
optimizer = optim.AdamW(model.parameters(), lr=LR)

initializing the model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

In [119]:
# wrap each encoded split in a DataLoader; only the training split is shuffled
DataLoader = torch.utils.data.DataLoader
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [120]:
# evaluate the performance of the model
def EvaluateModel(model, data_loader, question_2 = False):
    """Run the model over a data loader and collect true and predicted label ids.

    Positions whose true label is ``-100`` are dropped from both the true and
    the predicted sequences.

    Args:
        model: model called as ``model(**batch)``; ``outputs[1]`` is treated as
            the per-position class scores and arg-maxed over its last (third)
            dimension.
        data_loader: iterable of batches, each a dict of tensors that contains
            at least the key ``'labels'``.
        question_2: when falsy (default) the results are concatenated over all
            examples; when truthy they are returned per example.

    Returns:
        ``(actual, predicted)``. With ``question_2`` falsy: two 1-D numpy
        arrays (both empty if the loader yields no examples). Otherwise: two
        lists with one numpy array per example.
    """
    model.eval()
    Y_actual, Y_preds = [], []
    with torch.no_grad():
        for batch in tqdmn(data_loader):
            # move the batch tensors to the same device as the model
            batch = { k: v.to(device) for k, v in batch.items() }
            # send 'input_ids', 'attention_mask' and 'labels' to the model
            outputs = model(**batch)
            # predicted class per position, computed once for the whole batch
            # (previously this argmax was recomputed for every example)
            batch_preds = torch.argmax(outputs[1], dim=2)
            for true_values_all, pred_values_all in zip(batch['labels'], batch_preds):
                # keep only the positions that carry a real label
                keep = true_values_all != -100
                Y_actual.append(true_values_all[keep])
                Y_preds.append(pred_values_all[keep])

    if not question_2:
        if not Y_actual:
            # torch.cat rejects an empty list; return empty results instead of crashing
            empty = torch.empty(0, dtype=torch.long).numpy()
            return empty, empty.copy()
        # Return list of actual labels, predicted labels
        return (torch.cat(Y_actual).detach().cpu().numpy(),
                torch.cat(Y_preds).detach().cpu().numpy())

    # Return actual and predicted labels per sentence
    return ([y.detach().cpu().numpy() for y in Y_actual],
            [y.detach().cpu().numpy() for y in Y_preds])

In [121]:
# train the model
print('training the model')
for epoch in tqdmn(range(EPOCHS)):
    model.train()
    print('epoch',epoch+1)
    # one optimisation step per batch of the training data
    for i, batch in enumerate(tqdmn(train_loader)):
        # clear the gradients left over from the previous step
        optimizer.zero_grad()
        # put every tensor of the batch on the model's device
        batch = { name: tensor.to(device) for name, tensor in batch.items() }
        # forward pass with 'input_ids', 'attention_mask' and 'labels';
        # the first output is used as the loss
        outputs = model(**batch)
        loss = outputs[0]
        # back-propagate and update the model weights
        loss.backward()
        optimizer.step()
    # calculate performance on the validation set after every epoch
    Y_actual, Y_preds = EvaluateModel(model,valid_loader)
    validation_report = (
        ("\nValidation Accuracy : {:.3f}", accuracy_score),
        ("\nValidation Macro-Accuracy : {:.3f}", balanced_accuracy_score),
    )
    for template, metric in validation_report:
        print(template.format(metric(Y_actual, Y_preds)))

training the model


  0%|          | 0/3 [00:00<?, ?it/s]

epoch 1


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.949

Validation Macro-Accuracy : 0.504
epoch 2


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.954

Validation Macro-Accuracy : 0.585
epoch 3


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.956

Validation Macro-Accuracy : 0.626


In [122]:
print('applying the model to the test set')
# apply the trained model to the test set
# with the default question_2=False, EvaluateModel returns two flat arrays
# (true label ids, predicted label ids) concatenated over all test sentences
# NOTE: this rebinds Y_actual / Y_preds, which held the validation results until now
Y_actual, Y_preds = EvaluateModel(model,test_loader)

applying the model to the test set


  0%|          | 0/432 [00:00<?, ?it/s]

In [123]:
# overall test metrics, followed by a per-tag breakdown
print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
print("\nTest Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
# tag strings in vocabulary order, and the ids the vocabulary assigns to them
tag_names = tagmap.get_itos()
tag_ids = tagmap(tag_names)
print(classification_report(Y_actual, Y_preds, labels=tag_ids, target_names=tag_names, zero_division=0))


Test Accuracy : 0.951

Test Macro-Accuracy : 0.611

Classification Report : 
              precision    recall  f1-score   support

        I-NP       0.96      0.96      0.96     16177
        B-NP       0.95      0.95      0.95     12985
           O       0.98      0.98      0.98      6210
        B-PP       0.96      0.99      0.97      3979
        B-VP       0.94      0.92      0.93      3767
        I-VP       0.91      0.94      0.93      1913
      B-ADVP       0.80      0.73      0.76       559
      B-SBAR       0.88      0.83      0.86       296
      B-ADJP       0.72      0.63      0.67       276
       B-PRT       0.67      0.77      0.72       110
      I-ADJP       0.56      0.67      0.61        55
      I-ADVP       0.56      0.30      0.39        33
        I-PP       0.88      0.47      0.61        15
      B-INTJ       0.00      0.00      0.00        13
     I-CONJP       0.29      0.29      0.29         7
       B-LST       1.00      0.21      0.34        29
   

In [124]:
import numpy as np

def find_sentence_with_at_least_one_wrong_label(Y_actual, Y_preds, min_length=10):
    """Return the first sufficiently long test sentence with a wrong prediction.

    Sentences are scanned in order; position ``i`` of ``Y_actual`` / ``Y_preds``
    is matched with ``test_sentences[i]``.

    Args:
        Y_actual: per-sentence sequences of true label ids.
        Y_preds: per-sentence sequences of predicted label ids, aligned with
            ``Y_actual``.
        min_length: minimum number of labels a sentence needs to be considered
            (defaults to 10, the previously hard-coded value).

    Returns:
        tuple ``(tokens, predictions, labels)``: the sentence's tokens, the
        predicted tag strings and the true tag strings.

    Raises:
        ValueError: if no sentence has at least ``min_length`` labels and at
            least one wrong prediction. Previously this case failed with an
            ``UnboundLocalError``.
    """
    for i, (preds, labels) in enumerate(zip(Y_preds, Y_actual)):
        preds, labels = np.asarray(preds), np.asarray(labels)
        if len(labels) < min_length or not np.any(preds != labels):
            continue

        # drop any position still carrying the -100 "no label" marker
        keep = labels != -100
        id_to_tag = tagmap.get_itos()
        tokens = test_sentences[i]["tokens"]
        predictions = [id_to_tag[p] for p in preds[keep]]
        labels = [id_to_tag[l] for l in labels[keep]]
        return tokens, predictions, labels

    raise ValueError(
        "no sentence with at least {} labels and at least one wrong prediction was found".format(min_length)
    )

# re-evaluate the test set, this time keeping the results separated per sentence,
# and pick the first sentence that contains a tagging error
Y_actual, Y_preds = EvaluateModel(model, test_loader, question_2 = True)
tokens, predictions, labels = find_sentence_with_at_least_one_wrong_label(Y_actual, Y_preds)

print('\nThe actual and predicted tags for the tokens of this sentence were:')
print("\n{:<10} {:<10} {:<10}".format('Token', 'Actual Tag', 'Predicted Tag'))

# one row per token: token, true tag, predicted tag
row_format = "{:<10} {:<10} {:<10}"
for i, token in enumerate(tokens):
    print(row_format.format(token, labels[i], predictions[i]))

  0%|          | 0/432 [00:00<?, ?it/s]


The actual and predicted tags for the tokens of this sentence were:

Token      Actual Tag Predicted Tag
SOCCER     B-NP       B-NP      
-          O          O         
JAPAN      B-NP       B-NP      
GET        B-VP       I-NP      
LUCKY      B-NP       I-NP      
WIN        I-NP       I-NP      
,          O          O         
CHINA      B-NP       B-NP      
IN         B-PP       B-PP      
SURPRISE   B-NP       B-NP      
DEFEAT     I-NP       I-NP      
.          O          O         


In [125]:
# a single hand-written sentence in the same dict layout that load_sentences produces
test_sentence_dict = {'tokens': ['Zurich', 'and', 'some', 'other', 'insurers', 'with', 'big', 'fleets', 'of', 'engineers', 'are', 'advising', 'companies', 'on', 'how', 'to', 'fortify', 'their', 'properties', '.'],
                      'pos_tags': ['NNP', 'CC', 'DT', 'JJ', 'NNS', 'IN', 'JJ', 'NNS', 'IN', 'NNS', 'VBP', 'VBG', 'NNS', 'IN', 'WRB', 'TO', 'VB', 'PRP$', 'NNS', '.'],
                      'chunk_tags': ['B-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'B-VP', 'B-VP', 'B-NP', 'B-PP', 'B-ADVP', 'B-VP', 'B-VP', 'B-NP', 'I-NP', 'O'],
                      'ner_tags': ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}

# encode it, wrap it in a one-example loader and run the trained model on it
new_test_dataset = [encode(test_sentence_dict)]
new_test_loader = torch.utils.data.DataLoader(new_test_dataset, batch_size=1)
new_Y_actual, new_Y_preds = EvaluateModel(model,new_test_loader )

print('\nThe actual and predicted tags for the tokens of this sentence were:')
print("\n{:<10} {:<10} {:<10}".format('Token', 'Actual Tag', 'Predicted Tag'))

# translate the label ids back to tag strings, one row per token
id_to_tag = tagmap.get_itos()
for i, token in enumerate(test_sentence_dict['tokens']):
    actual_tag = id_to_tag[new_Y_actual[i].item()]
    predicted_tag = id_to_tag[new_Y_preds[i].item()]
    print("{:<10} {:<10} {:<10}".format(token, actual_tag, predicted_tag))

  0%|          | 0/1 [00:00<?, ?it/s]


The actual and predicted tags for the tokens of this sentence were:

Token      Actual Tag Predicted Tag
Zurich     B-NP       B-NP      
and        O          O         
some       B-NP       B-NP      
other      I-NP       I-NP      
insurers   B-NP       I-NP      
with       B-PP       B-PP      
big        B-NP       B-NP      
fleets     I-NP       I-NP      
of         B-PP       B-PP      
engineers  B-NP       B-NP      
are        B-VP       B-VP      
advising   B-VP       I-VP      
companies  B-NP       B-NP      
on         B-PP       B-PP      
how        B-ADVP     B-ADVP    
to         B-VP       B-VP      
fortify    B-VP       I-VP      
their      B-NP       B-NP      
properties I-NP       I-NP      
.          O          O         
