In [8]:
!pip install evaluate



--------------------
# **Library**

In [9]:
from typing import List
import numpy as np
import torch
import torch.nn as nn
import evaluate
from sklearn.model_selection import train_test_split
import nltk

from transformers import AutoTokenizer
from torch.utils.data import Dataset

-------------------------------
# **Data Sample**

In [10]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to /usr/share/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [11]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print(f"Length of Corpus: {len(tagged_sentences)}")
print(f"Type of Samples: {type(tagged_sentences)}")
print(f"Demo Sample: {tagged_sentences}")

# Take 1st Sample
sent1 = tagged_sentences[0]
print("-"*59)
sent, tag = zip(*sent1)
print(f"This is the sentence (Already splitted): {sent}")
print(f"This is the Tags/Labels of each words in sentence: {tag}")

Length of Corpus: 3914
Type of Samples: <class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>
Demo Sample: [[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]
-----------------------------------------------------------
This is the sentence (Already splitted): ('Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.')
This is the Tags/Labels of each words in sentence: ('NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT',

In [12]:
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append([word.lower() for word in sentence])
    sentence_tags.append([tag for tag in tags])

--------------------------------
# **Train/Test Split**

In [13]:
# Train: 70% | Test: 30%
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences, sentence_tags, test_size = .3
)

# Test: 30% => Test: 15% | Eval: 15% 
eval_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    test_sentences,
    test_tags,
    test_size = 0.5
)

-------------------------
# **DataLoader**

In [14]:
model_name = 'QCRI_bert-base-multilingual-cased-pos-english'
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast = True
)
max_len = 256
class PosTagging_Dataset(Dataset):
    """
    Input:
        sentences: List of Sentences -> List
        tags: List of Labels for words in sentences -> List
        tokenizer: tokenizer
        label2idx: Dictionary to encode Label -> dict
        max_len: Max Length of a sentence -> int
    """
    def __init__(self,
                sentences,
                tags,
                tokenizer,
                label2idx,
                max_len):
        
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.label2id = label2id

    def __len__(self):
        return len(self.sentences)

    def __get__item(self, idx):
        input_token = self.sentences[idx]
        label_token = self.tags[idx]

        # It does EXACTLY what the method called
        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        # Initialize Attention Mask
        """
           [1, 1, 1, 1,..., 1 ] -> Length = Input Length
        """
        attention_mask = [1]*len(input_token)

        # Encode Labels
        labels = [self.label2id[token] for token in label_token]

        return{
            "input_ids": self.pad_and_truncate(input_token,
                                             pad_id = self.tokenizer.pad_token_id),
            "labels": self.pad_and_truncate(labels, pad_id = label2id['0']),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id = 0)
        }
    # Padding/Truncate to sentences that don't have the same length as max_len
    def pad_and_truncate(self, inputs, pad_id):
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id]*(self.max_len - len(inputs))
        else:
            padded_inputs = inputs[:self.max_len] # Take til max_len reached
        return torch.as_tensor(padded_inputs)

    def label2id(self, tags):
        return[self.label2id[tag] for tag in tags]

OSError: QCRI_bert-base-multilingual-cased-pos-english is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
train_set = PosTagging_Dataset(train_sentences, train_tags, tokenizer, label2id)
test_set = PosTagging_Dataset(test_sentences, test_tags, tokenizer, label2id)
eval_set = PosTagging_Dataset(eval_sentences, eval_tags, tokenizer, label2id)

----------------------------------
# **Model**

In [None]:
model_name = 'QCRI_bert-base-multilingual-cased-pos-english'
model = AutoModelForTokenClassification.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda_is_available() else 'cpu')

----------------------------
# **Compute Metric**

In [None]:
accuracy = evaluate.load('accuracy')

ignore_label = len(label2id)

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    mask = labels != ignore_label
    predictions = np.argmax(predictions, axis = -1)
    return accuracy.compute(predictions = predictions[mask], references = labels[mask])

------------------------------
# **Train**

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = 'out_dir',
    learning_rate = 1e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 10,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_set,
    eval_dataset = val_set,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

trainer.train()

-----------------------
# **Infer**

In [None]:
test_sentence = "James Harden we are exploring pos tagging octopus penguin"
inpt = torch.as_tensor([tokenizer.convert_tokens_to_ids
                         (test_sentence.splt())])
inpt = inpt.to(device)

outputs = model(inpt)
_, preds = torch.max(outputs.logits, -1)
preds = preds[0].cpu().numpy()

# Decode
pred_tags = ""
for pred in preds:
    pred_tags += id2label[pred] + " "
    
print(pred_tags)