In order to change the code to perform chunk tagging (predicting the `chunk_tags` column instead of `ner_tags`), the following changes were made:

1) From:
```
# build tagset and tag ids
tags = [sentence['ner_tags'] for sentence in train_sentences]

```
To:



```
# build tagset and tag ids
tags = [sentence['chunk_tags'] for sentence in train_sentences]
```

2) In the "encode" function, from:
```
labels = align_label(encodings, sentence['ner_tags'])
```
To:


```
labels = align_label(encodings, sentence['chunk_tags'])
```







In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m97.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
# dependencies
import torch
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator
from transformers import BertForTokenClassification, BertTokenizerFast
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
import tqdm
# `import tqdm` alone does not load the `tqdm.notebook` submodule; import it
# explicitly so the attribute access below does not rely on some other package
# having imported it first.
import tqdm.notebook
tqdmn = tqdm.notebook.tqdm

# hyper-parameters
EPOCHS = 3        # number of passes over the training set
BATCH_SIZE = 8    # examples per DataLoader batch
LR = 1e-5         # learning rate passed to AdamW

# the path of the data files
base_path = '/content/drive/MyDrive/nlpdataset/'

# use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# read the data files
def load_sentences(filepath):
    """Parse a space-separated, one-token-per-line tagging file into sentences.

    Each non-separator line is split on single spaces and its first four
    fields are read as token, POS tag, chunk tag and NER tag. A blank line or
    the exact line ``-DOCSTART- -X- -X- O`` ends the current sentence.

    Args:
        filepath: path of the text file to read.

    Returns:
        A list of dicts with the keys ``'tokens'``, ``'pos_tags'``,
        ``'chunk_tags'`` and ``'ner_tags'``, each holding a list with one
        entry per token of the sentence.

    Raises:
        IndexError: if a non-separator line has fewer than four fields.
    """
    sentences = []
    tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []

    with open(filepath, 'r') as f:
        # iterate the file lazily instead of materialising it with readlines()
        for line in f:
            if line == '-DOCSTART- -X- -X- O\n' or line == '\n':
                # separator: close the sentence collected so far, if any
                if tokens:
                    sentences.append({'tokens': tokens, 'pos_tags': pos_tags, 'chunk_tags': chunk_tags, 'ner_tags': ner_tags})
                    tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
            else:
                fields = line.split(' ')
                tokens.append(fields[0])
                pos_tags.append(fields[1])
                chunk_tags.append(fields[2])
                ner_tags.append(fields[3].strip('\n'))

    # A file that does not end with a blank line would otherwise lose its
    # final sentence, because no separator ever closes it.
    if tokens:
        sentences.append({'tokens': tokens, 'pos_tags': pos_tags, 'chunk_tags': chunk_tags, 'ner_tags': ner_tags})

    return sentences

print('loading data')
# read the three splits in the same order as before: train, test, valid
train_sentences, test_sentences, valid_sentences = (
    load_sentences(base_path + file_name)
    for file_name in ('train.txt', 'test.txt', 'valid.txt')
)

# one chunk-tag sequence per training sentence; the vocabulary built from
# them maps each tag string to an integer id
tags = [sent['chunk_tags'] for sent in train_sentences]
tagmap = build_vocab_from_iterator(tags)
tagset = {tag for tag_sequence in tags for tag in tag_sequence}
print('Tagset size:', len(tagset))

# tokenizer matching the pretrained BERT checkpoint used below
bert_version = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(bert_version)

# map tokens and tags to token ids and label ids
def align_label(tokens, labels):
    """Expand word-level tags to one label id per tokenizer position.

    Args:
        tokens: tokenizer output exposing ``word_ids()``, which gives for every
            position the index of the source word (``None`` where the position
            belongs to no word).
        labels: list of tag strings, one per word of the sentence.

    Returns:
        A list with one integer per position. The first position of each word
        gets ``tagmap[tag]``; every other position (no word, or a later piece
        of the same word) gets -100, the value the evaluation code filters out.
    """
    label_ids = []
    previous_word_idx = None
    for word_idx in tokens.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # position without a word, or continuation of the previous word
            label_ids.append(-100)
        else:
            try:
                label_ids.append(tagmap[labels[word_idx]])
            except (KeyError, IndexError, RuntimeError):
                # Unknown tag or missing label: mask the position. Only lookup
                # failures are caught; the original bare `except:` would also
                # have hidden KeyboardInterrupt and programming errors.
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

def encode(sentence):
    """Turn one sentence dict into the tensors fed to the model.

    The pre-split tokens are tokenized (truncated and padded to the
    tokenizer's maximum length) and the sentence's chunk tags are aligned to
    the resulting positions with ``align_label``.

    Returns:
        A dict with the LongTensors ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``.
    """
    encodings = tokenizer(
        sentence['tokens'],
        truncation=True,
        padding='max_length',
        is_split_into_words=True,
    )
    label_ids = align_label(encodings, sentence['chunk_tags'])
    return {
        'input_ids': torch.LongTensor(encodings.input_ids),
        'attention_mask': torch.LongTensor(encodings.attention_mask),
        'labels': torch.LongTensor(label_ids),
    }

print('encoding data')
# every split becomes a list of tensor dicts produced by encode()
train_dataset = list(map(encode, train_sentences))
valid_dataset = list(map(encode, valid_sentences))
test_dataset = list(map(encode, test_sentences))

# pretrained BERT with a token-classification head sized to the tagset,
# moved to the selected device before the optimizer is created
print('initializing the model')
model = BertForTokenClassification.from_pretrained(
    bert_version,
    num_labels=len(tagset),
).to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=LR)

# batch the three splits; only the training data is shuffled
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True
)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=BATCH_SIZE
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=BATCH_SIZE
)

# evaluate the performance of the model
def EvaluateModel(model, data_loader):
    """Run the model over a data loader and collect labels and predictions.

    The model is switched to eval mode and left in it; gradients are disabled
    while predicting.

    Args:
        model: model called as ``model(**batch)`` whose second output holds
            the per-position class scores.
        data_loader: iterable of batches, each a dict of tensors containing at
            least ``'labels'``.

    Returns:
        Two flat NumPy arrays ``(actual, predicted)`` covering every position
        whose label is not -100, in batch order and, within a batch, example
        by example.
    """
    model.eval()
    Y_actual, Y_preds = [], []
    with torch.no_grad():
        for batch in tqdmn(data_loader):
            # move the batch tensors to the same device as the model
            batch = {k: v.to(device) for k, v in batch.items()}
            # send 'input_ids', 'attention_mask' and 'labels' to the model
            outputs = model(**batch)
            labels = batch['labels']
            # labels were passed, so outputs[0] is the loss and outputs[1] the
            # scores; take the argmax once per batch instead of once per example
            predictions = torch.argmax(outputs[1], dim=2)
            # Boolean-mask indexing walks the batch row by row, so the selected
            # values come out in the same order as per-example filtering.
            keep = labels != -100
            Y_actual.append(labels[keep])
            Y_preds.append(predictions[keep])
    Y_actual = torch.cat(Y_actual)
    Y_preds = torch.cat(Y_preds)
    # Return list of actual labels, predicted labels
    return Y_actual.detach().cpu().numpy(), Y_preds.detach().cpu().numpy()

# fine-tune for EPOCHS passes, validating after each one
print('training the model')
for epoch in tqdmn(range(EPOCHS)):
    # EvaluateModel leaves the model in eval mode, so re-enable training mode
    model.train()
    print('epoch', epoch + 1)
    for batch in tqdmn(train_loader):
        # tensors must live on the same device as the model
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # 'labels' is part of the batch, so the first output is the loss
        outputs = model(**batch)
        loss = outputs[0]
        # clear old gradients, back-propagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # performance on the validation set after this epoch
    Y_actual, Y_preds = EvaluateModel(model, valid_loader)
    print("\nValidation Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
    print("\nValidation Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))

# final evaluation of the trained model on the held-out test set
print('applying the model to the test set')
Y_actual, Y_preds = EvaluateModel(model, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
print("\nTest Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
# report one row per tag: the ids come from looking the tag names up in the
# vocabulary, the row names are the tag strings themselves
print(
    classification_report(
        Y_actual,
        Y_preds,
        labels = tagmap(tagmap.get_itos()),
        target_names = tagmap.get_itos(),
        zero_division = 0,
    )
)



loading data
Tagset size: 20


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

encoding data
initializing the model


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

training the model


  0%|          | 0/3 [00:00<?, ?it/s]

epoch 1


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.950

Validation Macro-Accuracy : 0.515
epoch 2


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.952

Validation Macro-Accuracy : 0.583
epoch 3


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.957

Validation Macro-Accuracy : 0.665
applying the model to the test set


  0%|          | 0/432 [00:00<?, ?it/s]


Test Accuracy : 0.951

Test Macro-Accuracy : 0.603

Classification Report : 
              precision    recall  f1-score   support

        I-NP       0.95      0.97      0.96     16177
        B-NP       0.96      0.95      0.95     12985
           O       0.98      0.98      0.98      6210
        B-PP       0.97      0.98      0.97      3979
        B-VP       0.95      0.91      0.93      3767
        I-VP       0.93      0.94      0.93      1913
      B-ADVP       0.76      0.76      0.76       559
      B-SBAR       0.83      0.84      0.83       296
      B-ADJP       0.77      0.65      0.71       276
       B-PRT       0.75      0.72      0.73       110
      I-ADJP       0.62      0.56      0.59        55
      I-ADVP       0.63      0.36      0.46        33
        I-PP       0.73      0.53      0.62        15
      B-INTJ       0.00      0.00      0.00        13
     I-CONJP       0.25      0.14      0.18         7
       B-LST       1.00      0.21      0.34        29
   

Overall, the model achieves high accuracy (0.951) as well as high precision, recall, and F1-scores for the most frequent chunk labels, such as I-NP, B-NP, O, and B-PP. This indicates that the model is able to accurately identify and classify these chunks in the text.

However, the performance drops for less frequent labels. For instance, precision, recall, and F1-scores are slightly lower for B-VP and I-VP (F1 0.93) and markedly lower for B-ADJP (F1 0.71), suggesting that these chunks are more challenging for the model to identify correctly.

Additionally, some rare or specialized chunk labels like B-INTJ, I-INTJ, B-LST, I-LST, B-CONJP, I-CONJP, I-ADJP, B-PRT, I-ADVP, I-PP, B-SBAR, and I-SBAR have relatively lower scores, indicating that the model struggles to classify these chunks accurately, potentially due to limited training examples or their infrequent occurrence in the data.

This is also reflected in the macro accuracy (0.603), i.e. the recall averaged equally over all labels: while the model achieves high accuracy on the frequent chunk labels, it struggles with the rarer or more specialized ones.

In [None]:
import random

def find_failed_sentence(test_sentences, test_dataset):
    """Return a random sentence of at least 10 tokens that the model mislabels.

    Sentences are visited in a random order (``random.shuffle``). Each one
    with 10 or more tokens is run through the global ``model``, and the first
    whose predictions differ from its labels at any position with a label
    other than -100 is returned.

    Args:
        test_sentences: list of sentence dicts with a ``'tokens'`` entry.
        test_dataset: encoded examples (dicts of tensors including
            ``'labels'``), index-aligned with ``test_sentences``.

    Returns:
        ``(sentence, predicted_ids, true_ids)`` with the two id sequences as
        NumPy arrays, or ``(None, None, None)`` if no such sentence exists.
    """
    indices = list(range(len(test_sentences)))
    random.shuffle(indices)

    # Predict in eval mode so the result does not depend on the mode the
    # global model happened to be left in; the previous mode is restored.
    was_training = model.training
    model.eval()
    try:
        for idx in indices:
            sentence = test_sentences[idx]
            if len(sentence['tokens']) < 10:
                continue

            encoded = test_dataset[idx]
            # positions that carry a real label
            keep = encoded['labels'] != -100
            # add a batch dimension of 1 and move to the model's device
            batch = {k: v.unsqueeze(0).to(device) for k, v in encoded.items()}
            with torch.no_grad():
                outputs = model(**batch)
            # labels are passed, so outputs[1] holds the class scores
            pred_values = torch.argmax(outputs[1], dim=2)[0]
            pred_values = pred_values[keep].detach().cpu().numpy()
            true_values = encoded['labels'][keep].detach().cpu().numpy()

            if not (pred_values == true_values).all():
                return sentence, pred_values, true_values
    finally:
        model.train(was_training)

    return None, None, None

# look for one mislabelled test sentence and show where the tags differ
failed_sentence, preds, actual = find_failed_sentence(test_sentences, test_dataset)

if not failed_sentence:
    print("No failed sentence with at least 10 tokens found.")
else:
    # index -> tag string lookup table of the vocabulary
    itos = tagmap.get_itos()
    print("Failed Sentence Tokens:", failed_sentence['tokens'])
    print("\nActual Tags:", list(map(itos.__getitem__, actual)))
    print("\nPredicted Tags:", list(map(itos.__getitem__, preds)))
    print("\nToken-wise comparison:")
    # walk tokens, gold ids and predicted ids in lockstep
    triples = zip(failed_sentence['tokens'], actual, preds)
    for token, actual_tag, pred_tag in triples:
        print(f"Token: {token}, Actual: {itos[actual_tag]}, Predicted: {itos[pred_tag]}")

Failed Sentence Tokens: ['on', 'Friday', 'for', 'their', 'friendly', 'against', 'Scotland', 'at', 'Murrayfield', 'more', 'than', 'a', 'year', 'after', 'the', '30-year-old', 'wing', 'announced', 'he', 'was', 'retiring', 'following', 'differences', 'over', 'selection', '.']

Actual Tags: ['B-PP', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'B-PP', 'B-NP', 'B-ADJP', 'B-PP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'B-VP', 'I-VP', 'B-PP', 'B-NP', 'B-PP', 'B-NP', 'O']

Predicted Tags: ['B-PP', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'B-PP', 'B-NP', 'B-NP', 'I-NP', 'B-NP', 'I-NP', 'B-SBAR', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'B-VP', 'I-VP', 'B-PP', 'B-NP', 'B-PP', 'B-NP', 'O']

Token-wise comparison:
Token: on, Actual: B-PP, Predicted: B-PP
Token: Friday, Actual: B-NP, Predicted: B-NP
Token: for, Actual: B-PP, Predicted: B-PP
Token: their, Actual: B-NP, Predicted: B-NP
Token: friendly, Actual: I-NP, Predicted: I-NP
Token: against, Actual: B-PP, Predict

Misclassified tokens:

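1) Token: more - Actual: B-ADJP, Predicted: B-NP
2) Token: than - Actual: B-PP, Predicted: I-NP
3) Token: after - Actual: B-PP, Predicted: B-SBAR

In the given sequence, there are three misclassifications:

1) "more" was predicted as the beginning of a noun phrase (B-NP) instead of an adjective phrase (B-ADJP).

2) "than" was predicted as inside a noun phrase (I-NP) instead of the beginning of a prepositional phrase (B-PP).

3) "after" was predicted as the beginning of a subordinate clause (B-SBAR) instead of a prepositional phrase (B-PP).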

These errors might have occurred because the model struggled to understand the grammatical structure and meaning of these words in the context. It's a reminder that language understanding can be challenging, even for sophisticated models like this one, and that further improvements and fine-tuning are necessary to enhance its accuracy. Overall though, the accuracy of the model is high in this phrase.

In [None]:
# Tag one raw sentence and compare the predictions with hand-written tags.
sentence = "Mount Everest, part of the Himalayas, is the Earth's highest mountain above sea level."
tokens = tokenizer(sentence, truncation=True, is_split_into_words=False, return_offsets_mapping=True)

# add a batch dimension of 1 and move the inputs to the model's device
input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0).to(device)
attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0).to(device)

model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    # no labels are passed here, so the first output holds the class scores
    predicted_ids = torch.argmax(outputs[0], dim=2)[0].tolist()

itos = tagmap.get_itos()

# Keep one prediction per word, taken from the word's first word-piece. This
# mirrors align_label, which labels only that piece during training, and keeps
# the rows aligned with the word-level `correct_labels` even when a word is
# split into several pieces. Positions with word id None are skipped.
words = []
predicted_labels = []
previous_word_idx = None
for position, word_idx in enumerate(tokens.word_ids()):
    if word_idx is not None and word_idx != previous_word_idx:
        span = tokens.word_to_chars(word_idx)
        words.append(sentence[span.start:span.end])
        predicted_labels.append(itos[predicted_ids[position]])
    previous_word_idx = word_idx

correct_labels =   ["B-NP", "I-NP", "O", "B-NP", "B-PP", "B-NP", "I-NP", "O", "B-VP", "B-NP", "I-NP", "O", "B-VP", "I-NP", "I-NP", "B-PP", "B-NP", "I-NP", "O"]

# number the rows from 1; zip stops at the shortest of the three lists
for indexnum, (word, correct_label, predicted_label) in enumerate(zip(words, correct_labels, predicted_labels), start=1):
    print(f"{indexnum}: {word} - Correct: {correct_label}, Predicted: {predicted_label}")

1: Mount - Correct: B-NP, Predicted: B-NP
2: Everest - Correct: I-NP, Predicted: I-NP
3: , - Correct: O, Predicted: O
4: part - Correct: B-NP, Predicted: B-NP
5: of - Correct: B-PP, Predicted: B-PP
6: the - Correct: B-NP, Predicted: B-NP
7: Himalayas - Correct: I-NP, Predicted: I-NP
8: , - Correct: O, Predicted: O
9: is - Correct: B-VP, Predicted: B-VP
10: the - Correct: B-NP, Predicted: B-NP
11: Earth - Correct: I-NP, Predicted: I-NP
12: ' - Correct: O, Predicted: B-NP
13: s - Correct: B-VP, Predicted: B-NP
14: highest - Correct: I-NP, Predicted: I-NP
15: mountain - Correct: I-NP, Predicted: I-NP
16: above - Correct: B-PP, Predicted: B-PP
17: sea - Correct: B-NP, Predicted: B-NP
18: level - Correct: I-NP, Predicted: I-NP
19: . - Correct: O, Predicted: O


12: " ' " - This token is essentially part of "Earth's" and arguably should not be tagged as the start of an independent noun phrase ("B-NP"). If it is considered separately from "Earth", however, "B-NP" could be the correct tag; different chunking conventions handle this differently. The prediction "B-NP" is incorrect only if we consider " ' " to be part of "Earth's".

13: "s" - This is the second part of  " 's". If we treat "'s" as separate tokens, this should be "I-NP", continuing the noun phrase started by the " ' ".

Again, the handling of "'s" can be variable across different systems. Some might treat "Earth's" as a single token and some might separate it. Different chunking could handle this differently.