In order to change the code to perform POS tagging, the following changes were made:

1) From:
```
# build tagset and tag ids
tags = [sentence['ner_tags'] for sentence in train_sentences]

```
To:



```
# build tagset and tag ids
tags = [sentence['pos_tags'] for sentence in train_sentences]
```

2) In the "encode" function, from:
```
labels = align_label(encodings, sentence['ner_tags'])
```
To:


```
labels = align_label(encodings, sentence['pos_tags'])
```







In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m111.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [5]:
# dependencies
import torch
import torch.optim as optim 
from torchtext.vocab import build_vocab_from_iterator
from transformers import BertForTokenClassification, BertTokenizerFast
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
# Import the notebook submodule explicitly: a bare `import tqdm` only exposes
# `tqdm.notebook` as an attribute if some other import happened to load it
# first, so relying on it can fail with AttributeError.
import tqdm
import tqdm.notebook

# notebook-friendly progress bar used by the training and evaluation loops
tqdmn = tqdm.notebook.tqdm

# training configuration
EPOCHS = 3        # number of passes over the training loader
BATCH_SIZE = 8    # sentences per batch for all three data loaders
LR = 1e-5         # learning rate handed to the AdamW optimizer

# directory that holds train.txt, valid.txt and test.txt
base_path = '/content/drive/MyDrive/nlpdataset/'

# run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# read the data files
def load_sentences(filepath):
    """Read a column-formatted tagging file into a list of sentences.

    Every data line must hold at least four space-separated columns, read as
    token, POS tag, chunk tag and NER tag (the layout looks like CoNLL-2003;
    the code relies only on the column positions).  Blank or whitespace-only
    lines and the ``-DOCSTART- -X- -X- O`` marker line end the current
    sentence.

    Args:
        filepath: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of dicts with the keys ``'tokens'``, ``'pos_tags'``,
        ``'chunk_tags'`` and ``'ner_tags'``.  Each value is a list with one
        entry per token of that sentence.

    Raises:
        IndexError: If a data line has fewer than four columns.
    """

    def new_sentence():
        return {'tokens': [], 'pos_tags': [], 'chunk_tags': [], 'ner_tags': []}

    sentences = []
    current = new_sentence()

    with open(filepath, 'r', encoding='utf-8') as f:
        # iterate lazily instead of materialising the whole file
        for raw_line in f:
            line = raw_line.rstrip('\n')

            if line == '-DOCSTART- -X- -X- O' or not line.strip():
                # separator: close the sentence collected so far, if any
                if current['tokens']:
                    sentences.append(current)
                    current = new_sentence()
                continue

            columns = line.split(' ')
            current['tokens'].append(columns[0])
            current['pos_tags'].append(columns[1])
            current['chunk_tags'].append(columns[2])
            current['ner_tags'].append(columns[3])

    # A file that does not end with a blank line still has a pending sentence
    # here; without this flush the last sentence would be silently dropped.
    if current['tokens']:
        sentences.append(current)

    return sentences

print('loading data')
# read the three splits in the order train, test, valid
train_sentences, test_sentences, valid_sentences = (
    load_sentences(base_path + split_file)
    for split_file in ('train.txt', 'test.txt', 'valid.txt')
)

# the POS tag sequences of the training split define the label vocabulary
tags = [train_sentence['pos_tags'] for train_sentence in train_sentences]
tagmap = build_vocab_from_iterator(tags)
tagset = {tag for sentence_tags in tags for tag in sentence_tags}
print('Tagset size:', len(tagset))

# tokenizer matching the pretrained BERT checkpoint used for the model
bert_version = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(bert_version)

# map tokens and tags to token ids and label ids
def align_label(tokens, labels):
    """Spread word-level labels over the sub-word tokens of one sentence.

    Only the first sub-word token of each word receives the id of that word's
    label (looked up in the module-level ``tagmap``).  Every other position
    gets ``-100``: special/padding tokens (word id ``None``), continuation
    pieces of a word, and words whose label cannot be resolved.  The
    evaluation code in this file filters out positions equal to ``-100``.

    Args:
        tokens: Tokenizer output exposing ``word_ids()``, which yields one
            word index (or ``None``) per sub-word token.
        labels: Sequence of label strings, one per word of the sentence.

    Returns:
        A list of integer label ids with one entry per sub-word token.
    """
    label_ids = []
    previous_word_idx = None
    for word_idx in tokens.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # special/padding token, or a continuation piece of the same word
            label_ids.append(-100)
        else:
            try:
                label_ids.append(tagmap[labels[word_idx]])
            except (IndexError, KeyError, RuntimeError):
                # Missing or unknown label: keep the original best-effort
                # behaviour and mask the position instead of failing.
                # NOTE(review): RuntimeError is what torchtext's Vocab is
                # expected to raise for an unknown token without a default
                # index - confirm against the installed version.
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

def encode(sentence):
    """Turn one sentence dict into the tensors fed to the model.

    The pre-split tokens are run through the module-level ``tokenizer``
    (truncated and padded to the maximum length) and the POS tags are aligned
    to the resulting sub-word tokens with ``align_label``.

    Args:
        sentence: Dict with at least the keys ``'tokens'`` and ``'pos_tags'``.

    Returns:
        Dict with the LongTensors ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``.
    """
    tokenized = tokenizer(
        sentence['tokens'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
    )
    aligned_labels = align_label(tokenized, sentence['pos_tags'])
    return {
        'input_ids': torch.LongTensor(tokenized.input_ids),
        'attention_mask': torch.LongTensor(tokenized.attention_mask),
        'labels': torch.LongTensor(aligned_labels),
    }

print('encoding data')
# encode the splits in the order train, valid, test
train_dataset, valid_dataset, test_dataset = (
    [encode(example) for example in split]
    for split in (train_sentences, valid_sentences, test_sentences)
)

# pretrained BERT with a token-classification head sized to the tag set
print('initializing the model')
num_classes = len(tagset)
model = BertForTokenClassification.from_pretrained(bert_version, num_labels=num_classes)
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=LR)

# batch the encoded examples; only the training data is shuffled
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
)

# evaluate the performance of the model
def EvaluateModel(model, data_loader):
    """Run the model over a data loader and collect labelled predictions.

    The model is switched to eval mode and called without gradient tracking.
    Positions whose label is ``-100`` are excluded from both returned arrays.

    Args:
        model: Callable invoked as ``model(**batch)``.  Element ``1`` of its
            output is read as the per-token logits (element ``0`` is the loss
            when ``'labels'`` is part of the batch, as in the training loop).
        data_loader: Iterable of batches; each batch is a dict of tensors
            that includes a ``'labels'`` entry.

    Returns:
        Tuple ``(Y_actual, Y_preds)`` of 1-D numpy arrays holding the true
        and the predicted label ids of all labelled positions, in iteration
        order.  Both arrays are empty when the loader yields no batches.
    """
    model.eval()
    Y_actual, Y_preds = [], []
    with torch.no_grad():
        for batch in tqdmn(data_loader):
            # move the batch tensors to the same device as the model
            batch = {k: v.to(device) for k, v in batch.items()}
            # send 'input_ids', 'attention_mask' and 'labels' to the model
            outputs = model(**batch)

            labels = batch['labels']
            labelled = labels != -100
            # one argmax for the whole batch instead of one per example
            predictions = torch.argmax(outputs[1], dim=2)

            # Boolean-mask indexing flattens row by row, which yields the
            # same order as concatenating the per-example selections.
            Y_actual.append(labels[labelled])
            Y_preds.append(predictions[labelled])

    if not Y_actual:
        # torch.cat rejects an empty list; report "no labelled positions"
        return (torch.empty(0, dtype=torch.long).numpy(),
                torch.empty(0, dtype=torch.long).numpy())

    Y_actual = torch.cat(Y_actual)
    Y_preds = torch.cat(Y_preds)
    # Return list of actual labels, predicted labels
    return Y_actual.detach().cpu().numpy(), Y_preds.detach().cpu().numpy()

# fine-tune the model, validating after every epoch
print('training the model')
for epoch in tqdmn(range(EPOCHS)):
    model.train()
    print('epoch', epoch + 1)

    # one optimisation step per batch of training data
    for batch in tqdmn(train_loader):
        # the batch tensors have to live on the same device as the model
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # forward pass with 'input_ids', 'attention_mask' and 'labels';
        # the first element of the output is the loss
        outputs = model(**batch)
        loss = outputs[0]

        # clear old gradients, back-propagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # performance on the validation set after this epoch
    Y_actual, Y_preds = EvaluateModel(model, valid_loader)
    valid_accuracy = accuracy_score(Y_actual, Y_preds)
    print("\nValidation Accuracy : {:.3f}".format(valid_accuracy))
    valid_macro_accuracy = balanced_accuracy_score(Y_actual, Y_preds)
    print("\nValidation Macro-Accuracy : {:.3f}".format(valid_macro_accuracy))

print('applying the model to the test set')
# score the trained model on the held-out test split
Y_actual, Y_preds = EvaluateModel(model, test_loader)

test_accuracy = accuracy_score(Y_actual, Y_preds)
print("\nTest Accuracy : {:.3f}".format(test_accuracy))
test_macro_accuracy = balanced_accuracy_score(Y_actual, Y_preds)
print("\nTest Macro-Accuracy : {:.3f}".format(test_macro_accuracy))

# per-tag precision / recall / f1, with rows named after the tag strings
print("\nClassification Report : ")
tag_names = tagmap.get_itos()
report = classification_report(
    Y_actual,
    Y_preds,
    labels=tagmap(tag_names),
    target_names=tag_names,
    zero_division=0,
)
print(report)



loading data
Tagset size: 45


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

encoding data
initializing the model


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

training the model


  0%|          | 0/3 [00:00<?, ?it/s]

epoch 1


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.937

Validation Macro-Accuracy : 0.742
epoch 2


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.942

Validation Macro-Accuracy : 0.811
epoch 3


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.946

Validation Macro-Accuracy : 0.836
applying the model to the test set


  0%|          | 0/432 [00:00<?, ?it/s]


Test Accuracy : 0.941

Test Macro-Accuracy : 0.860

Classification Report : 
              precision    recall  f1-score   support

         NNP       0.91      0.92      0.92      8595
          NN       0.90      0.89      0.89      4931
          CD       0.97      1.00      0.98      5962
          IN       0.98      0.99      0.99      4018
          DT       0.99      0.99      0.99      2799
          JJ       0.87      0.81      0.84      2393
         NNS       0.91      0.94      0.93      2174
         VBD       0.93      0.96      0.94      1699
           .       1.00      1.00      1.00      1630
           ,       1.00      1.00      1.00      1637
          VB       0.91      0.90      0.90       933
         VBN       0.89      0.85      0.87       866
          RB       0.91      0.85      0.88       888
          CC       1.00      0.99      1.00       765
          TO       1.00      1.00      1.00       818
         PRP       1.00      0.97      0.98       605
   

The results of the POS (Part of Speech) tagging model on the test set are quite impressive, with a high overall accuracy of 0.941 and a macro-accuracy of 0.860. This high overall accuracy indicates that the model is adept at recognizing and classifying various parts of speech in the text. Furthermore, the macro-accuracy (balanced accuracy), which gives equal weight to each class irrespective of its frequency in the dataset, also signifies commendable performance, although the gap to the overall accuracy (0.860 vs. 0.941) shows that the model performs noticeably worse on some of the less frequent classes.

Examining the detailed classification report, we see that the model demonstrates excellent precision, recall, and f1-scores for most classes. Many classes score close to or at a perfect 1.00, indicating that the model is not only correctly identifying the parts of speech but also minimizing the number of false positives and false negatives. However, there are a few areas of concern. The model seems to struggle with certain classes such as 'NNPS', 'FW', and 'LS', as is evident from their relatively lower f1-scores. These lower scores could be attributed to these classes being underrepresented in the training data or the inherent difficulty in accurately identifying these specific parts of speech. Further investigation and possibly retraining the model with a more balanced dataset might be necessary to improve the model's performance for these classes. Despite these few shortcomings, the overall performance of the POS tagging model is strong, demonstrating high proficiency in identifying and classifying most parts of speech in the test dataset.

In [6]:
import random

def find_failed_sentence(test_sentences, test_dataset):
    """Pick a random sentence of 10+ tokens that the model tags imperfectly.

    Sentences are visited in random order.  Each candidate is run through the
    module-level ``model`` in eval mode without gradient tracking, and the
    first one with at least one wrong prediction on a labelled position
    (label other than ``-100``) is returned.  The model's previous train/eval
    mode is restored before returning.

    Args:
        test_sentences: List of sentence dicts; only ``'tokens'`` is read.
        test_dataset: Encoded examples parallel to ``test_sentences``; each is
            a dict of 1-D tensors including ``'labels'``.

    Returns:
        ``(sentence, pred_values, true_values)`` where the last two are numpy
        arrays of label ids for the labelled positions, or
        ``(None, None, None)`` when no such sentence exists.
    """
    indices = list(range(len(test_sentences)))
    random.shuffle(indices)

    # Force eval mode so dropout cannot turn a correct prediction into a
    # spurious "failure"; remember the old mode to restore it afterwards.
    was_training = model.training
    model.eval()
    try:
        for idx in indices:
            sentence = test_sentences[idx]
            if len(sentence['tokens']) < 10:
                continue

            encoded = test_dataset[idx]
            # add a batch dimension of 1 and move to the model's device
            batch = {k: v.unsqueeze(0).to(device) for k, v in encoded.items()}
            with torch.no_grad():
                outputs = model(**batch)

            # compare on the CPU so mask and values share one device
            labels = encoded['labels'].detach().cpu()
            labelled = labels != -100
            predictions = torch.argmax(outputs[1], dim=2)[0].detach().cpu()

            pred_values = predictions[labelled].numpy()
            true_values = labels[labelled].numpy()

            if not (pred_values == true_values).all():
                return sentence, pred_values, true_values
    finally:
        model.train(was_training)

    return None, None, None

failed_sentence, preds, actual = find_failed_sentence(test_sentences, test_dataset)

if not failed_sentence:
    print("No failed sentence with at least 10 tokens found.")
else:
    # map label ids back to their tag strings for display
    itos = tagmap.get_itos()
    actual_names = [itos[tag] for tag in actual]
    predicted_names = [itos[tag] for tag in preds]

    print("Failed Sentence Tokens:", failed_sentence['tokens'])
    print("\nActual Tags:", actual_names)
    print("\nPredicted Tags:", predicted_names)

    # one line per token: gold tag next to predicted tag
    print("\nToken-wise comparison:")
    aligned = zip(failed_sentence['tokens'], actual, preds)
    for token, actual_tag, pred_tag in aligned:
        print(f"Token: {token}, Actual: {itos[actual_tag]}, Predicted: {itos[pred_tag]}")

Failed Sentence Tokens: ['A', 'meeting', 'with', 'the', 'Pope', 'is', 'also', 'planned', ',', '"', 'Dariusz', 'Rosati', 'told', 'a', 'news', 'conference', '.']

Actual Tags: ['DT', 'NN', 'IN', 'DT', 'NNP', 'VBZ', 'RB', 'VBN', ',', '"', 'NNP', 'NNP', 'VBD', 'DT', 'NN', 'NN', '.']

Predicted Tags: ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ', 'RB', 'VBN', ',', '"', 'NNP', 'NNP', 'VBD', 'DT', 'NN', 'NN', '.']

Token-wise comparison:
Token: A, Actual: DT, Predicted: DT
Token: meeting, Actual: NN, Predicted: NN
Token: with, Actual: IN, Predicted: IN
Token: the, Actual: DT, Predicted: DT
Token: Pope, Actual: NNP, Predicted: NN
Token: is, Actual: VBZ, Predicted: VBZ
Token: also, Actual: RB, Predicted: RB
Token: planned, Actual: VBN, Predicted: VBN
Token: ,, Actual: ,, Predicted: ,
Token: ", Actual: ", Predicted: "
Token: Dariusz, Actual: NNP, Predicted: NNP
Token: Rosati, Actual: NNP, Predicted: NNP
Token: told, Actual: VBD, Predicted: VBD
Token: a, Actual: DT, Predicted: DT
Token: news, Actual: NN, 

In the provided sentence, there's only one token that was tagged incorrectly: "Pope". The correct tag for "Pope" is 'NNP', indicating a proper noun, but the model mistakenly tagged it as 'NN', suggesting a common noun. This misclassification could be attributed to the model not having encountered sufficient examples of "Pope" being used as a proper noun during training.

However, it's important to note that all other tokens in the sentence were correctly classified by the model, which demonstrates its overall effectiveness in POS tagging. Despite the misclassification of "Pope", the model has correctly identified and tagged other proper nouns such as "Dariusz" and "Rosati", along with various other parts of speech. This indicates the model's strength in generalizing from its training data, although it does underscore some areas for potential improvement.

In [14]:
# Tag a hand-written sentence and compare against manually assigned tags.
sentence = "Mount Everest, part of the Himalayas, is the Earth's highest mountain above sea level."
tokens = tokenizer(sentence, truncation=True, is_split_into_words=False, return_offsets_mapping=True)

input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0).to(device)
attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0).to(device)

model.eval()
with torch.no_grad():
    # no labels are passed here, so element 0 of the output is read as logits
    outputs = model(input_ids, attention_mask=attention_mask)
    # index the single batch entry explicitly; squeeze() would also collapse
    # a length-1 sequence dimension
    predicted_ids = torch.argmax(outputs[0], dim=2)[0].tolist()

# look the id -> tag table up once instead of once per token
itos = tagmap.get_itos()
special_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id}

# Build the surface pieces and their predicted tags in a single pass so the
# two lists are filtered by the same rule and cannot drift out of alignment.
# NOTE(review): each entry is a sub-word piece, not a whole word; align_label
# only assigns training labels to the first piece of a word, so predictions
# for continuation pieces should be read with care.
words = []
predicted_labels = []
for token_id, (start, end), label_id in zip(tokens['input_ids'], tokens['offset_mapping'], predicted_ids):
    if token_id in special_ids or start == end:
        continue
    words.append(sentence[start:end])
    predicted_labels.append(itos[label_id])

correct_labels = ["NNP", "NNP", ",", "NN", "IN", "DT", "NNPS", ",", "VBZ", "DT", "NNP", "POS", "POS", "JJS", "NN", "IN", "NN", "NN", "."]
for indexnum, (word, correct_label, predicted_label) in enumerate(zip(words, correct_labels, predicted_labels), start=1):
    print(f"{indexnum}: {word} - Correct: {correct_label}, Predicted: {predicted_label}")

1: Mount - Correct: NNP, Predicted: NNP
2: Everest - Correct: NNP, Predicted: NNP
3: , - Correct: ,, Predicted: ,
4: part - Correct: NN, Predicted: NN
5: of - Correct: IN, Predicted: IN
6: the - Correct: DT, Predicted: DT
7: Himalayas - Correct: NNPS, Predicted: NNPS
8: , - Correct: ,, Predicted: ,
9: is - Correct: VBZ, Predicted: VBZ
10: the - Correct: DT, Predicted: DT
11: Earth - Correct: NNP, Predicted: NN
12: ' - Correct: POS, Predicted: POS
13: s - Correct: POS, Predicted: POS
14: highest - Correct: JJS, Predicted: JJS
15: mountain - Correct: NN, Predicted: NN
16: above - Correct: IN, Predicted: IN
17: sea - Correct: NN, Predicted: NN
18: level - Correct: NN, Predicted: NN
19: . - Correct: ., Predicted: .


In the given sequence, there is one misclassification:

"Earth" was incorrectly predicted as a common noun (NN) instead of a proper noun (NNP). This misclassification might be due to the word "Earth" having multiple meanings, including referring to the ground. 


Overall, while the model performed well in identifying the majority of the entities and maintaining the correct punctuation, it made an error in classifying "Earth".