In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


--------------------
# **Library**

In [2]:
from typing import List
import numpy as np
import torch
import torch.nn as nn
import evaluate
from sklearn.model_selection import train_test_split
import nltk

from transformers import AutoTokenizer
from torch.utils.data import Dataset

-------------------------------
# **Data Sample**

In [3]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to /usr/share/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [4]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print(f"Length of Corpus: {len(tagged_sentences)}")
print(f"Type of Samples: {type(tagged_sentences)}")
print(f"Demo Sample: {tagged_sentences}")

# Take 1st Sample
sent1 = tagged_sentences[0]
print("-"*59)
sent, tag = zip(*sent1)
print(f"This is the sentence (Already splitted): {sent}")
print(f"This is the Tags/Labels of each words in sentence: {tag}")

Length of Corpus: 3914
Type of Samples: <class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>
Demo Sample: [[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]
-----------------------------------------------------------
This is the sentence (Already splitted): ('Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.')
This is the Tags/Labels of each words in sentence: ('NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT',

In [5]:
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append([word.lower() for word in sentence])
    sentence_tags.append([tag for tag in tags])

--------------------------------
# **Train/Test Split**

In [6]:
# Train: 70% | Test: 30%
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences, sentence_tags, test_size = .3
)

# Test: 30% => Test: 15% | Eval: 15% 
eval_sentences, test_sentences, eval_tags, test_tags = train_test_split(
    test_sentences,
    test_tags,
    test_size = 0.5
)

-------------------------
# **DataLoader**

In [7]:
model_name = "QCRI/bert-base-multilingual-cased-pos-english"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast = True
)
max_len = 256
class PosTagging_Dataset(Dataset):
    """
    Input:
        sentences: List of Sentences -> List
        tags: List of Labels for words in sentences -> List
        tokenizer: tokenizer
        label2idx: Dictionary to encode Label -> dict
        max_len: Max Length of a sentence -> int
    """
    def __init__(self,
                sentences,
                tags,
                tokenizer,
                label2id,
                max_len = max_len):
        
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.label2id = label2id

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        input_token = self.sentences[idx]
        label_token = self.tags[idx]

        # It does EXACTLY what the method called
        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        # Initialize Attention Mask
        """
           [1, 1, 1, 1,..., 1 ] -> Length = Input Length
        """
        attention_mask = [1]*len(input_token)

        # Encode Labels
        labels = [self.label2id[token] for token in label_token]

        return{
            "input_ids": self.pad_and_truncate(input_token,
                                             pad_id = self.tokenizer.pad_token_id),
            "labels": self.pad_and_truncate(labels, pad_id = 0),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id = 0)
        }
    # Padding/Truncate to sentences that don't have the same length as max_len
    def pad_and_truncate(self, inputs, pad_id):
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id]*(self.max_len - len(inputs))
        else:
            padded_inputs = inputs[:self.max_len] # Take til max_len reached
        return torch.as_tensor(padded_inputs)

    def label2id(self, tags):
        return[self.label2id[tag] for tag in tags]

type(sentence_tags)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

list

In [8]:
Labels = []
for i in range(len(sentence_tags)):
    for j in sentence_tags[i]:
        if j not in Labels:
            Labels.append(j)

label2id = {v:k for k, v in enumerate(Labels)}
id2label = {k:v for k, v in enumerate(Labels)}

In [9]:
id2label

{0: 'NNP',
 1: ',',
 2: 'CD',
 3: 'NNS',
 4: 'JJ',
 5: 'MD',
 6: 'VB',
 7: 'DT',
 8: 'NN',
 9: 'IN',
 10: '.',
 11: 'VBZ',
 12: 'VBG',
 13: 'CC',
 14: 'VBD',
 15: 'VBN',
 16: '-NONE-',
 17: 'RB',
 18: 'TO',
 19: 'PRP',
 20: 'RBR',
 21: 'WDT',
 22: 'VBP',
 23: 'RP',
 24: 'PRP$',
 25: 'JJS',
 26: 'POS',
 27: '``',
 28: 'EX',
 29: "''",
 30: 'WP',
 31: ':',
 32: 'JJR',
 33: 'WRB',
 34: '$',
 35: 'NNPS',
 36: 'WP$',
 37: '-LRB-',
 38: '-RRB-',
 39: 'PDT',
 40: 'RBS',
 41: 'FW',
 42: 'UH',
 43: 'SYM',
 44: 'LS',
 45: '#'}

In [10]:
train_set = PosTagging_Dataset(train_sentences, train_tags, tokenizer, label2id)
test_set = PosTagging_Dataset(test_sentences, test_tags, tokenizer, label2id)
eval_set = PosTagging_Dataset(eval_sentences, eval_tags, tokenizer, label2id)

----------------------------------
# **Model**

In [11]:
from huggingface_hub import login

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HUGGINGFACE_TOKEN = user_secrets.get_secret("HF_TOKEN")

# Login to Hugging Face
login(HUGGINGFACE_TOKEN)

In [12]:
from transformers import AutoModelForTokenClassification

model_name = "QCRI/bert-base-multilingual-cased-pos-english"
model = AutoModelForTokenClassification.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


----------------------------
# **Compute Metric**

In [13]:
accuracy = evaluate.load('accuracy')

ignore_label = len(label2id)

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    mask = labels != ignore_label
    predictions = np.argmax(preds, axis = -1)
    return accuracy.compute(predictions = predictions[mask], references = labels[mask])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

------------------------------
# **Train**

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = 'out_dir',
    learning_rate = 1e-5,
    logging_steps = 1,
    logging_dir = './logs',
    logging_strategy = 'epoch',
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    optim = 'adamw_torch',
    report_to = 'none'
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_set,
    eval_dataset = eval_set,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8732,0.194499,0.950783
2,0.1299,0.074879,0.980495
3,0.0783,0.060995,0.983124
4,0.0664,0.054195,0.984927
5,0.0616,0.052243,0.985526




TrainOutput(global_step=430, training_loss=0.24187670086705407, metrics={'train_runtime': 442.934, 'train_samples_per_second': 30.919, 'train_steps_per_second': 0.971, 'total_flos': 1789941299604480.0, 'train_loss': 0.24187670086705407, 'epoch': 5.0})

-----------------------
# **Infer**

In [15]:
test_sentence = "James Harden we are exploring pos tagging octopus penguin. We have a dog that is a funny # UH, angry, sad, happy. Lebron James."
inpt = torch.as_tensor([tokenizer.convert_tokens_to_ids
                         (test_sentence.split())])
inpt = inpt.to(device)

outputs = model(inpt)
_, preds = torch.max(outputs.logits, -1)
preds = preds[0].cpu().numpy()

# Decode
pred_tags = ""
for pred in preds:
    pred_tags += id2label[pred] + " "
    
print(pred_tags)

NNP -NONE- PRP VBP RB VBG JJ NNP -NONE- PRP VBP DT NN WDT VBZ DT JJ -NONE- JJ NNP NNP NNP NNP NNP 
