# Environment Setup

In [None]:
!pip install --quiet evaluate transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ibis-framework 7.1.0 requires pyarrow<15,>=2, but you have pyarrow 15.0.0 which is incompatible.[0m[31m
[0m

In [None]:
!pip install -U accelerate

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m204.8/280.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2


In [None]:
# Native env
from typing import List

# Torch env
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Huggingface env
import evaluate
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer


# Others
import nltk
import numpy as np
from sklearn.model_selection import train_test_split


In [None]:
# Download NLTK's 'treebank' corpus package (read via nltk.corpus.treebank below).
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

# Experiment

In [None]:
# Identifier of the pretrained checkpoint; used to load both the tokenizer and the model.
model_id = "QCRI/bert-base-multilingual-cased-pos-english"

In [None]:
# Read the tagged sentences of the treebank corpus: each sentence is a sequence
# of (word, tag) pairs. Then report the corpus size and show the first sentence.
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print(
    f'Number of samples: {len(tagged_sentences)}',
    f'A sample: {tagged_sentences[0]}',
    sep='\n',
)

Number of samples: 3914
A sample: [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


## Preprocessing

In [None]:
# Split every tagged sentence into two parallel lists:
#   sentences[i]     -> the lower-cased words of sentence i
#   sentence_tags[i] -> the tag of each of those words
# NOTE(review): model_id names a "cased" checkpoint, yet the words are
# lower-cased here — confirm this is intentional.
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    words, word_tags = zip(*tagged_sentence)
    sentences.append(list(map(str.lower, words)))
    sentence_tags.append(list(word_tags))

In [None]:
# Sanity check: first sentence next to its tags.
print(
    f'Sentence: {sentences[0]}',
    f'Sentence tags: {sentence_tags[0]}',
    sep='\n',
)

Sentence: ['pierre', 'vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.']
Sentence tags: ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']


In [None]:
# Build the tag vocabulary and the tag <-> id lookup tables.
#
# '0' is a synthetic tag (not produced by the corpus loop below); the dataset
# class uses label2id["0"] as the padding value for label sequences.
#
# The tags are sorted before ids are assigned. Iterating a bare set of strings
# yields an order that depends on string hashing, which can change from one
# kernel start to the next; that would silently change every label id between
# runs and make saved checkpoints incompatible with a freshly built mapping.
tags = sorted({'0'} | {tag for sentence in sentence_tags for tag in sentence})

label2id = {tag: idx for idx, tag in enumerate(tags)}
id2label = {idx: tag for tag, idx in label2id.items()}

In [None]:
# Display the tag -> id mapping built above.
label2id

{'VBP': 0,
 'VBG': 1,
 'NNS': 2,
 'TO': 3,
 'WRB': 4,
 'NNPS': 5,
 'PRP': 6,
 'VBN': 7,
 'VBZ': 8,
 ':': 9,
 'RBR': 10,
 'POS': 11,
 'LS': 12,
 'VB': 13,
 ',': 14,
 'PDT': 15,
 'RB': 16,
 'WP': 17,
 "''": 18,
 'NNP': 19,
 'JJR': 20,
 'RBS': 21,
 'CC': 22,
 'PRP$': 23,
 '$': 24,
 'RP': 25,
 'MD': 26,
 '``': 27,
 '-NONE-': 28,
 'WDT': 29,
 'UH': 30,
 'NN': 31,
 'SYM': 32,
 'FW': 33,
 'DT': 34,
 '#': 35,
 '0': 36,
 'WP$': 37,
 'VBD': 38,
 'JJ': 39,
 '-LRB-': 40,
 'EX': 41,
 '-RRB-': 42,
 'CD': 43,
 '.': 44,
 'IN': 45,
 'JJS': 46}

In [None]:
# 70 % train / 15 % validation / 15 % test.
# The data is split in two steps: first 30 % is held out, then that hold-out
# is halved into validation and test.
#
# A fixed random_state makes both splits reproducible across kernel restarts;
# without it every run trains and evaluates on different sentences.
SPLIT_SEED = 42

train_sentences, holdout_sentences, train_tags, holdout_tags = train_test_split(
    sentences,
    sentence_tags,
    test_size=0.3,
    random_state=SPLIT_SEED,
)
valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    holdout_sentences,
    holdout_tags,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [None]:
# Show the first training sentence and, on the next line, its tags.
print(train_sentences[0], train_tags[0], sep='\n')

['the', 'loan', 'may', 'be', 'extended', '*-1', 'by', 'the', 'mcalpine', 'group', 'for', 'an', 'additional', 'year', 'with', 'an', 'increase', 'in', 'the', 'conversion', 'price', 'to', '$', '2.50', '*u*', 'a', 'share', '.']
['DT', 'NN', 'MD', 'VB', 'VBN', '-NONE-', 'IN', 'DT', 'NNP', 'NN', 'IN', 'DT', 'JJ', 'NN', 'IN', 'DT', 'NN', 'IN', 'DT', 'NN', 'NN', 'TO', '$', 'CD', '-NONE-', 'DT', 'NN', '.']


## Dataloader

In [None]:
# Load the tokenizer that belongs to the checkpoint named by model_id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator; passed as collate_fn to the DataLoaders and as data_collator
# to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
MAX_LEN = 256  # default fixed length every example is padded / truncated to


class DS_PoSTagging(Dataset):
    """Map-style dataset that turns tagged sentences into fixed-length tensors.

    One item corresponds to one sentence. Its words are converted to ids with
    ``tokenizer.convert_tokens_to_ids``, its tags are mapped through
    ``label2id``, and ids, labels and attention mask are each padded or
    truncated to ``max_len``.

    NOTE(review): the words are handed to ``convert_tokens_to_ids`` as they
    are, i.e. treated as ready-made vocabulary tokens; no special tokens are
    added here. Words missing from the vocabulary presumably collapse to the
    tokenizer's unknown-token id — confirm this is intended.

    Args:
        sentences: Sequence of sentences, each a list of word strings.
        tags: Sequence of tag lists, parallel to ``sentences``.
        tokenizer: Object providing ``convert_tokens_to_ids`` and
            ``pad_token_id``.
        label2id: Mapping from tag string to integer id. Must contain the key
            ``"0"``, whose id is used to pad the label sequence.
        max_len: Length of every returned tensor.
    """

    def __init__(self, sentences, tags, tokenizer, label2id, max_len=MAX_LEN):
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``labels`` and ``attention_mask`` tensors for sentence ``idx``."""
        words = self.sentences[idx]
        word_tags = self.tags[idx]

        input_ids = self.tokenizer.convert_tokens_to_ids(words)
        attention_mask = [1] * len(input_ids)
        labels = [self.label2id[tag] for tag in word_tags]

        return {
            "input_ids": self.pad_and_truncate(input_ids, pad_id=self.tokenizer.pad_token_id),
            # Pad with the mapping owned by this instance. Reading a
            # module-level `label2id` instead would raise NameError outside
            # the notebook and could disagree with the mapping passed in.
            "labels": self.pad_and_truncate(labels, pad_id=self.label2id["0"]),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id=0),
        }

    def pad_and_truncate(self, inputs: List[int], pad_id: int):
        """Return ``inputs`` as a tensor of exactly ``self.max_len`` elements.

        Shorter sequences are right-padded with ``pad_id``; longer ones are cut
        off after ``self.max_len`` elements.
        """
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id] * (self.max_len - len(inputs))
        else:
            padded_inputs = inputs[: self.max_len]
        return torch.as_tensor(padded_inputs)

In [None]:
# Wrap each split in a dataset; all three share the tokenizer and label mapping.
train_ds, val_ds, test_ds = (
    DS_PoSTagging(split_sentences, split_tags, tokenizer, label2id)
    for split_sentences, split_tags in (
        (train_sentences, train_tags),
        (valid_sentences, valid_tags),
        (test_sentences, test_tags),
    )
)

In [None]:

# Plain PyTorch loaders over the three splits; only the training loader shuffles.
# (The Trainer further down is given the datasets directly, not these loaders.)
BATCH_SIZE = 8

train_dl = DataLoader(
    train_ds,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
)
val_dl = DataLoader(val_ds, collate_fn=data_collator, batch_size=BATCH_SIZE)
test_dl = DataLoader(test_ds, collate_fn=data_collator, batch_size=BATCH_SIZE)

# Backward-compatible alias: this loader was originally named `test_df`,
# inconsistent with `train_dl` / `val_dl`.
test_df = test_dl

In [None]:
# Peek at the first training example (input_ids, labels, attention_mask).
sample = train_ds[0]
sample

28


{'input_ids': tensor([10105, 33390, 11387, 10347, 21777,   100, 10155, 10105,   100, 11795,
         10142, 10151, 18568, 10924, 10169, 10151, 20299, 10106, 10105, 45109,
         30839, 10114,   109,   100,   100,   169, 23867,   119,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

## Model Implementation

In [None]:
# Load the pretrained checkpoint as a token classifier over our own tag set.
# ignore_mismatched_sizes=True lets loading proceed when the checkpoint's
# classifier shape differs from the one implied by num_labels; the load log
# reports such weights as newly initialised.
model = AutoModelForTokenClassification.from_pretrained(
    model_id,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([46, 768]) in the checkpoint and torch.Size([47, 768]) in the model i

In [None]:
# Display the module structure of the loaded model.
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

## Metrics

In [None]:
accuracy = evaluate.load('accuracy')

# Label id that marks padded positions: the dataset pads every label sequence
# with label2id["0"]. (The previous value, len(label2id), is one past the
# largest real id and therefore never occurs in the labels, so nothing was
# masked and the padded positions inflated the reported accuracy.)
ignore_label = label2id["0"]


def compute_metrics(eval_pred):
    """Token-level accuracy over real (non-padding) positions only.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores with the class axis last; ``labels`` holds the
            integer label ids.

    Returns:
        The result of ``accuracy.compute`` on the unmasked positions.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    # Keep only positions that carry a real tag. -100 is excluded as well: it
    # is the conventional "ignore" label id, in case labels are padded with it.
    mask = (labels != ignore_label) & (labels != -100)
    return accuracy.compute(predictions=predictions[mask],
                            references=labels[mask])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

## Training

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch, and the best checkpoint is reloaded when training finishes.
# NOTE(review): output_dir is an absolute Colab-style path — adjust when
# running elsewhere.
training_args = TrainingArguments(
    output_dir="/content/log",
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# The Trainer receives the datasets (not the DataLoaders) plus the collator
# and the accuracy metric defined earlier.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


13
32
20
32
25
27
18
22


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1742,0.048164,0.987097
2,0.0504,0.041292,0.988521
3,0.0353,0.037758,0.989346
4,0.0307,0.036898,0.989705


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
24
60
2
38
30
42
19
27
13
34
20
18
32
37
32
21
19
20
45
37
30
25
33
36
51
40
25
4
31
39
23
36
15
20
2
22
35
7
35
36
15
38
30
16
19
44
5
39
35
23
54
30
29
26
21
36
28
24
22
15
31
51
46
12
19
17
14
20
30
24
27
41
33
34
7
24
39
25
30
37
22
13
35
28
40
33
38
38
25
16
24
40
21
11
12
25
6
24
22
19
33
15
24
34
31
41
17
25
58
33
16
42
25
22
37
31
33
81
31
7
39
24
24
71
14
25
16
34
16
27
52
39
24
37
33
7
17
33
23
6
22
29
27
21
24
54
15
27
8
27
12
48
45
37
25
23
37
22
5
17
23
23
9
24
44
19
25
32
28
8
47
33
18
23
8
14
32
29
28
31
8
8
24
15
23
24
36
14
35
9
18
6
23
18
46
41
20
2
27
19
44
34
29
28
22
10
12
31
7
28
24
43
20
19
18
58
8
32
30
26
13
42
32
26
12
35
29
33
7
34
28
40
19
19
34
33
8
21
27
27
7
35
11
39
16
27
6
25
26
17
22
14
20
22
26
31
21
12
25
22
43
30
21
4
32
25
19
18
24
12
29
23
27
72
7
31
32
16
32
35
26
55
46
40
33
28
67
18
27
54
13
23
31
35
34
44
16
16
27
23
21
29
14
30
19
13
24
12
6
25
44
11
8
18
17
45
28
14
26
12
23
37

TrainOutput(global_step=2740, training_loss=0.06316913827492372, metrics={'train_runtime': 776.2107, 'train_samples_per_second': 14.115, 'train_steps_per_second': 3.53, 'total_flos': 1431965980735488.0, 'train_loss': 0.06316913827492372, 'epoch': 4.0})

# Inference

In [None]:
# tokenization
test_sentence = "Part-of-speech (POS) tagging is a crucial task in Natural Language Processing (NLP) that assigns grammatical categories to each word in a text."
# Mirror the training preprocessing: whitespace-separated words, lower-cased,
# one vocabulary id per word.
words = [word.lower() for word in test_sentence.split()]
input_ids = torch.as_tensor([tokenizer.convert_tokens_to_ids(words)])
# Follow the model's device; the previous hard-coded " cuda " (with spaces)
# is not a valid device string and also fails on CPU-only machines.
input_ids = input_ids.to(model.device)

# prediction
model.eval()  # inference mode for the module (e.g. disables dropout)
with torch.no_grad():  # no gradients needed for prediction
    outputs = model(input_ids)
preds = outputs.logits.argmax(dim=-1)[0].cpu().numpy()

# decode: one tag per input word, separated by spaces
pred_tags = " ".join(id2label[int(pred)] for pred in preds)
pred_tags