[BERT](https://arxiv.org/abs/1810.04805) is known to be good at Sequence tagging tasks like Named Entity Recognition. Let's see if it's true for POS-tagging.

In [6]:
__author__ = "kyubyong"
__address__ = "https://github.com/kyubyong/nlp_made_easy"
__email__ = "kbpark.linguist@gmail.com"

In [7]:
import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim

In [8]:
torch.__version__

'2.0.0+cu117'

# Data preparation

Thanks to the great NLTK, we don't have to worry about datasets. Some of Penn Tree Banks are included in it. I believe they serves for the purpose.

In [9]:
import nltk
tagged_sents = nltk.corpus.treebank.tagged_sents()
len(tagged_sents)

3914

In [10]:
tagged_sents[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [11]:
tags = list(set(word_pos[1] for sent in tagged_sents for word_pos in sent))

In [12]:
",".join(tags)

"#,-LRB-,NNS,PDT,-RRB-,UH,VBG,JJ,DT,EX,TO,WP$,SYM,RP,NN,WP,NNPS,FW,RBR,.,JJR,PRP$,NNP,MD,RB,PRP,``,WDT,'',:,JJS,CC,VBN,WRB,,,RBS,CD,VBZ,POS,VB,LS,-NONE-,VBP,IN,$,VBD"

In [13]:
# By convention, the 0'th slot is reserved for padding.
tags = ["<pad>"] + tags

In [14]:
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

In [15]:
# Let's split the data into train and test (or eval)
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(tagged_sents, test_size=.1)
len(train_data), len(test_data)

(3522, 392)

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Data loader


In [17]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
import torch

model_name = "QCRI/bert-base-multilingual-cased-pos-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [18]:
class PosDataset(data.Dataset):
    def __init__(self, tagged_sents):
        sents, tags_li = [], [] # list of lists
        for sent in tagged_sents:
            words = [word_pos[0] for word_pos in sent]
            tags = [word_pos[1] for word_pos in sent]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


In [19]:
def pad(batch):
    '''Pads to the longest sample'''
    f = lambda x: [sample[x] for sample in batch]
    words = f(0)
    is_heads = f(2)
    tags = f(3)
    seqlens = f(-1)
    maxlen = np.array(seqlens).max()

    f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch] # 0: <pad>
    x = f(1, maxlen)
    y = f(-2, maxlen)


    f = torch.LongTensor

    return words, f(x), is_heads, tags, f(y), seqlens

# Model

In [20]:
class Net(nn.Module):
    def __init__(self, vocab_size=None):
        super().__init__()
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.bert = self.model.bert
        self.masking_layer = torch.ones(768).to("cuda")

        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64
        '''
        x = x.to(device)
        y = y.to(device)
        
        if self.training:
            self.bert.train()
            encoded_layers = self.bert(x)
            enc = encoded_layers[-1]
        else:
            self.bert.eval()
            with torch.no_grad():
                encoded_layers = self.bert(x)
                enc = encoded_layers[-1]
        # enc = nn.ReLU(enc)
        enc = enc * self.masking_layer
        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        confidence = logits.softmax(-1).max(-1).values
        return enc, logits, y, y_hat, confidence

# Train an evaluate

In [21]:
def train(model, iterator, optimizer, criterion):
    model.train()
    for i, batch in enumerate(iterator):
        words, x, is_heads, tags, y, seqlens = batch
        _y = y # for monitoring
        optimizer.zero_grad()
        enc, logits, y, _, _ = model(x, y) # logits: (N, T, VOCAB), y: (N, T)

        logits = logits.view(-1, logits.shape[-1]) # (N*T, VOCAB)
        y = y.view(-1)  # (N*T,)

        loss = criterion(logits, y)
        loss.backward()

        optimizer.step()

        if i%10==0: # monitoring
            print("step: {}, loss: {}".format(i, loss.item()))

## Load model and train

In [22]:
model = Net(vocab_size=len(tag2idx))
model.to(device)
# model = nn.DataParallel(model)

  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Net(
  (model): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
 

In [23]:
train_dataset = PosDataset(train_data)
eval_dataset = PosDataset(test_data)

train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=8,
                             collate_fn=pad)
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=1,
                             shuffle=False,
                             num_workers=8,
                             collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr = 0.0001)

criterion = nn.CrossEntropyLoss(ignore_index=0)

In [24]:
def eval(model, iterator):
    model.eval()

    Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
    encodings_by_tag = {}  # Dictionary to store encodings by tag
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            words, x, is_heads, tags, y, seqlens = batch

            enc, _, _, y_hat, conf = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y.extend(y.numpy().tolist())
            Y_hat.extend(y_hat.cpu().numpy().tolist())
            for t, encoding, h in zip(y[0].numpy(), enc[0], is_heads[0]):
                if(h):
                    if t not in encodings_by_tag:
                        encodings_by_tag[t] = []
                
                    encodings_by_tag[t].append(encoding.cpu().numpy())

    ## gets results and save
    with open("result", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
            fout.write("\n")
            
    ## calc metric
    y_true =  np.array([tag2idx[line.split()[1]] for line in open('result', 'r').read().splitlines() if len(line) > 0])
    y_pred =  np.array([tag2idx[line.split()[2]] for line in open('result', 'r').read().splitlines() if len(line) > 0])

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.3f"%acc)
    return encodings_by_tag


In [25]:
for i in range(1):
    train(model, train_iter, optimizer, criterion)
# enc_dict = eval(model, test_iter)


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


step: 0, loss: 4.003780364990234
step: 10, loss: 0.4466284513473511
step: 20, loss: 0.15584906935691833
step: 30, loss: 0.14181546866893768
step: 40, loss: 0.15746544301509857
step: 50, loss: 0.1511995792388916
step: 60, loss: 0.16946524381637573
step: 70, loss: 0.07899316400289536
step: 80, loss: 0.18350891768932343
step: 90, loss: 0.15849536657333374
step: 100, loss: 0.07864541560411453
step: 110, loss: 0.1466234028339386
step: 120, loss: 0.1417870968580246
step: 130, loss: 0.07233582437038422
step: 140, loss: 0.09654643386602402
step: 150, loss: 0.0435744933784008
step: 160, loss: 0.09424896538257599
step: 170, loss: 0.0653313398361206
step: 180, loss: 0.07478292286396027
step: 190, loss: 0.10434367507696152
step: 200, loss: 0.16346926987171173
step: 210, loss: 0.07044389843940735
step: 220, loss: 0.0687364712357521
step: 230, loss: 0.06377595663070679
step: 240, loss: 0.19981169700622559
step: 250, loss: 0.08935202658176422
step: 260, loss: 0.07559703290462494
step: 270, loss: 0.04

In [26]:
import numpy as np
import torch

def compute_masks(fc_vals, percent):
    # Convert input to numpy array
    fc_vals_array = np.array(fc_vals)
    
    # Compute statistics
    mean_vals = np.mean(np.abs(fc_vals_array), axis=0)
    std_vals = np.std(fc_vals_array, axis=0)
    min_vals = np.min(fc_vals_array, axis=0)
    max_vals = np.max(fc_vals_array, axis=0)
    
    # Normalize standard deviation
    std_vals_normalized = (std_vals - min_vals) / (max_vals - min_vals)
    
    # Convert to PyTorch tensors
    mean_vals_tensor = torch.from_numpy(mean_vals)
    std_vals_tensor = torch.from_numpy(std_vals_normalized)
    
    # Compute masks
    mask_max = compute_max_mask(mean_vals_tensor, percent)
    mask_std = compute_std_mask(std_vals_tensor, percent)
    mask_max_low_std = compute_max_low_std_mask(mean_vals_tensor, std_vals_tensor, percent)
    mask_intersection = torch.logical_or(mask_std, mask_max).float()
    
    return mask_max, mask_std, mask_intersection, mask_max_low_std

def compute_max_mask(values, percent):
    sorted_indices = torch.argsort(values, descending=True)
    mask_count = int(percent * len(values))
    mask = torch.ones_like(values)
    mask[sorted_indices[:mask_count]] = 0.0
    return mask

def compute_std_mask(values, percent):
    sorted_indices = torch.argsort(values, descending=False)
    mask_count = int(percent * len(values))
    mask = torch.ones_like(values)
    mask[sorted_indices[:mask_count]] = 0.0
    return mask

def compute_max_low_std_mask(mean_vals, std_vals, percent):
    # Get indices of bottom 50% std values
    bottom_50_percent_std_count = int(0.99 * len(std_vals))
    bottom_50_percent_std_indices = torch.argsort(std_vals)[:bottom_50_percent_std_count]
    
    # Create a mask for bottom 50% std values
    bottom_50_percent_std_mask = torch.zeros_like(std_vals, dtype=torch.bool)
    bottom_50_percent_std_mask[bottom_50_percent_std_indices] = True
    
    # Filter mean values
    mean_vals_filtered = mean_vals.clone()
    mean_vals_filtered[~bottom_50_percent_std_mask] = float('-inf')
    
    # Compute mask
    return compute_max_mask(mean_vals_filtered, percent)

In [28]:
from utilities import mask, eval
tok = 33
model.masking_layer = torch.ones(768).to("cuda")
activation_iter = data.DataLoader(dataset=train_dataset+eval_dataset,
                             batch_size=1,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

enc_dict = eval(model, test_iter, idx2tag, tag2idx, tok)

mask_max, mask_std, mask_intersection,mask_max_low_std = compute_masks(enc_dict[tok],0.3)

model = mask(model,mask_max_low_std)

enc_dict = eval(model, activation_iter, idx2tag, tag2idx, tok)

# # size of encodings_by_tag
# i=0
# for k, v in enc_dict.items():
#     print(i,k ,idx2tag[k], len(v))
#     i+=1
    
# print(idx2tag)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Overall accuracy: 0.971
Overall confidence: 0.971
<pad>: N/A (0 occurrences)
UH: N/A (0 occurrences)
SYM: N/A (0 occurrences)
FW: N/A (0 occurrences)
Specific token accuracy: 0.9643, Specific token confidence: 0.954
Other tokens accuracy: 0.9707, Other tokens confidence: 0.9718


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Overall accuracy: 0.976
Overall confidence: 0.890
<pad>: N/A (0 occurrences)
Specific token accuracy: 0.8655, Specific token confidence: 0.2313
Other tokens accuracy: 0.9786, Other tokens confidence: 0.9041


In [37]:
from utilities import compute_masks, mask, eval

model.masking_layer = torch.ones(768).to("cuda")
activation_iter = data.DataLoader(dataset=train_dataset+eval_dataset,
                             batch_size=1,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

enc_dict = eval(model, activation_iter, idx2tag, tag2idx, tok)

mask_max, _,mask_std,_ = compute_masks(enc_dict[tok],0.3)

model = mask(model,mask_max)

enc_dict = eval(model, activation_iter, idx2tag, tag2idx, tok)

Overall accuracy: 0.996
Overall confidence: 0.998
<pad>: N/A (0 occurrences)
Specific token accuracy: 0.9987, Specific token confidence: 1.0
Other tokens accuracy: 0.9957, Other tokens confidence: 0.998
Overall accuracy: 0.996
Overall confidence: 0.985
<pad>: N/A (0 occurrences)
Specific token accuracy: 0.9987, Specific token confidence: 0.6694
Other tokens accuracy: 0.9957, Other tokens confidence: 0.9874
