# Model

In [2]:
!pip install transformers



In [3]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [4]:
import torch
import torch.nn.utils.prune as prune
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification
from transformers import AdamW
# phobert = AutoModelForTokenClassification.from_pretrained("vinai/phobert-base", num_labels=9)#AutoModel.from_pretrained("vinai/phobert-base")
# Module-level PhoBERT tokenizer. TokenClassificationDataset.__getitem__ reads
# this global directly, so this cell must run before any dataset item is accessed.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

## Number of parameters

In [6]:
# sum(p.numel() for p in phobert.parameters() if p.requires_grad)

# Data

In [7]:
import io
from torch.utils.data import DataLoader, Dataset

In [8]:
# from google.colab import drive
# drive.mount('/content/drive')

In [9]:
# Directories (relative to the notebook's working directory) that hold the
# POS and NER text files read further down.
POS_PATH = 'data/POS_data/POS_data'
NER_PATH = 'data/NER_data'

In [10]:
class TokenClassificationDataset(Dataset):
  """
  Token-classification dataset that encodes one sentence per item.

  Each element of `text_data` is one sentence: newline-separated lines of the
  form WORD<TAB>LABEL. `label_map` is the list of all target labels
  (e.g. ["N", "CH"]); a label's index in that list is its class id.

  Args:
    text_data: iterable of sentence strings as described above. Blank lines
      and wholly blank sentences (e.g. the chunk left by a trailing newline
      after splitting a file on blank lines) are skipped.
    label_map: list of label strings.
    tokenizer: optional object exposing `tokenize`, `convert_tokens_to_ids`
      and `num_special_tokens_to_add`. When None, the module-level
      `tokenizer` is looked up each time an item is accessed.
    max_seq_length: length every encoded item is truncated/padded to.

  Raises:
    ValueError: if a non-blank line has no tab separator or carries a label
      that is not in `label_map`.
  """
  def __init__(self, text_data, label_map, tokenizer=None, max_seq_length=256):
    super().__init__()
    self.sentences = []
    self.labels = []
    # Same value as nn.CrossEntropyLoss's default ignore_index, so positions
    # carrying it do not contribute to the loss.
    self.pad_token_label_id = -100
    self.max_seq_length = max_seq_length
    self._tokenizer = tokenizer
    # Per-sentence word lists; kept next to `sentences` so word/label
    # alignment never depends on re-splitting a joined string.
    self._words = []

    # First occurrence wins, matching list.index() semantics.
    label_to_id = {}
    for position, label in enumerate(label_map):
      label_to_id.setdefault(label, position)

    for sent in text_data:
      words, tags = [], []
      for line in sent.split("\n"):
        if not line.strip():
          continue
        fields = line.split("\t")
        if len(fields) < 2:
          raise ValueError(f"Expected WORD<TAB>LABEL, got: {line!r}")
        if fields[1] not in label_to_id:
          raise ValueError(f"Unknown label {fields[1]!r} in line: {line!r}")
        words.append(fields[0])
        tags.append(label_to_id[fields[1]])
      if not words:
        continue
      self._words.append(words)
      self.sentences.append(" ".join(words).strip())
      self.labels.append(tags)

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    """
    Encode sentence `idx`.

    Returns a dict of three LongTensors of length `max_seq_length`:
      'ids'    - token ids, wrapped in "<s>" ... "</s>" and padded with id 1
      'masks'  - 1 for real tokens, 0 for padding
      'labels' - class id on the first sub-token of every word, and
                 `pad_token_label_id` everywhere else
    """
    tok = self._tokenizer if self._tokenizer is not None else tokenizer

    tokens = []
    label_ids = []
    for word, tag in zip(self._words[idx], self.labels[idx]):
      word_tokens = tok.tokenize(word)
      # Some tokenizers return [] for odd inputs (e.g. a bare space); such
      # words contribute neither tokens nor labels.
      if len(word_tokens) > 0:
        tokens.extend(word_tokens)
        # Only the first sub-token of a word carries the real label.
        label_ids.extend([tag] + [self.pad_token_label_id] * (len(word_tokens) - 1))

    # Leave room for the special tokens added below.
    max_content = self.max_seq_length - tok.num_special_tokens_to_add()
    if len(tokens) > max_content:
      tokens = tokens[:max_content]
      label_ids = label_ids[:max_content]

    # Literal start/end tokens of the PhoBERT (RoBERTa-style) vocabulary.
    tokens = ['<s>'] + tokens + ["</s>"]
    label_ids = [self.pad_token_label_id] + label_ids + [self.pad_token_label_id]

    input_ids = tok.convert_tokens_to_ids(tokens)

    # Right-pad to the fixed length; 1 is the pad token id used by this notebook.
    input_mask = [1] * len(input_ids)
    padding_length = self.max_seq_length - len(input_ids)
    input_ids += [1] * padding_length
    input_mask += [0] * padding_length
    label_ids += [self.pad_token_label_id] * padding_length

    return {
        'ids': torch.tensor(input_ids, dtype=torch.long),
        'masks': torch.tensor(input_mask, dtype=torch.long),
        'labels': torch.tensor(label_ids, dtype=torch.long),
    }

## POS

In [11]:
# POS label inventory. A label's position in this list is the class id the
# dataset assigns to it, so the order must stay stable between training and
# evaluation. Note that "Ny" and "NY" are distinct entries.
pos_list = ["N", "Np", "CH", "M", "R", "A", "P", "V", "Nc", "E", "L", "C", "Ny", 
            "T", "Nb", "Y", "Nu", "Cc", "Vb", "I", "X", "Z", "B", "Eb", "Vy", 
            "Cb", "Mb", "Pb", "Ab", "Ni", "Xy", "NY"]

In [12]:
# Read the raw POS splits (train / test / dev) as UTF-8 text.
with io.open(f'{POS_PATH}/VLSP2013_POS_train.txt', encoding='utf-8') as train_file:
  train_pos = train_file.read()
with io.open(f'{POS_PATH}/VLSP2013_POS_test.txt', encoding='utf-8') as test_file:
  test_pos = test_file.read()
with io.open(f'{POS_PATH}/VLSP2013_POS_dev.txt', encoding='utf-8') as dev_file:
  dev_pos = dev_file.read()

In [13]:
# Sentences are separated by a blank line; turn each raw split into a list of
# sentence strings. (Rebinds the names, so this cell cannot be run twice.)
train_pos, test_pos, dev_pos = (
    raw.split('\n\n') for raw in (train_pos, test_pos, dev_pos)
)
len(train_pos), len(dev_pos), len(test_pos)
# train, dev, test

(23907, 2010, 3482)

In [14]:
# Render the first training sentence as "word(TAG) word(TAG) ...".
first_pos_lines = train_pos[0].split('\n')
" ".join(["%s(%s)" % (p.split('\t')[0], p.split('\t')[1]) for p in first_pos_lines])

'Hải_tặc(N) eo_biển(N) Malacca(Np) ((CH) kỳ(N) 1(M) )(CH) :(CH) Eo_biển(N) không(R) yên_tĩnh(A) ...(CH)'

In [15]:
# Smoke test: build a POS dataset from the training sentences.
# The [:-1] slice drops the final chunk of the split; presumably that chunk is
# the empty remainder left by the file's trailing newline(s) - it would
# otherwise fail in the dataset's line parsing.
temp = TokenClassificationDataset(train_pos[:-1], pos_list) # remove the last redundant '\n'

In [16]:
# Inspect the first encoded example (ids / masks / labels tensors).
temp[0]

{'ids': tensor([    0, 24327,  7419, 13411, 10820, 31357,    20,   560,    99,    19,
            27, 33800,    17,  9756,   135,     2,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,  

## NER

In [17]:
# NER label inventory; a label's position in this list is its class id.
# The sample sentence printed below tags standalone entities that follow an
# "O" token with B-LOC, so here B-* opens every entity rather than only
# marking one that directly follows another entity of the same type.
# I-* presumably marks the continuation tokens - confirm against the data.
ner_list = [
    "O",       # Outside of a named entity
    "B-MISC",  # First token of a miscellaneous entity
    "I-MISC",  # Continuation of a miscellaneous entity
    "B-PER",   # First token of a person's name
    "I-PER",   # Continuation of a person's name
    "B-ORG",   # First token of an organisation
    "I-ORG",   # Continuation of an organisation
    "B-LOC",   # First token of a location
    "I-LOC"    # Continuation of a location
]


In [18]:
# Read the raw NER splits (train / test / dev) as UTF-8 text.
with io.open(f'{NER_PATH}/train.txt', encoding='utf-8') as train_file:
  train_ner = train_file.read()
with io.open(f'{NER_PATH}/test.txt', encoding='utf-8') as test_file:
  test_ner = test_file.read()
with io.open(f'{NER_PATH}/dev.txt', encoding='utf-8') as dev_file:
  dev_ner = dev_file.read()

In [19]:
# Sentences are separated by a blank line; turn each raw split into a list of
# sentence strings. (Rebinds the names, so this cell cannot be run twice.)
train_ner, test_ner, dev_ner = (
    raw.split('\n\n') for raw in (train_ner, test_ner, dev_ner)
)
len(train_ner), len(dev_ner), len(test_ner)
# train, dev, test

(14862, 2001, 2832)

In [20]:
# Render the first training sentence as "word(TAG) word(TAG) ...".
first_ner_lines = train_ner[0].split('\n')
" ".join(["%s(%s)" % (p.split('\t')[0], p.split('\t')[1]) for p in first_ner_lines])

'Đó(O) là(O) con(O) đường(O) biển(O) ngắn(O) nhất(O) để(O) đi(O) từ(O) Ấn_Độ_Dương(B-LOC) sang(O) Thái_Bình_Dương(B-LOC) ,(O) chiếm(O) đến(O) lượng(O) hàng_hoá(O) lưu_thông(O) đường_biển(O) của(O) thế_giới(O) ,(O) đó(O) là(O) hải_trình(O) lớn(O) nhất(O) từ(O) tây(O) sang(O) đông(O) với(O) 50.000(O) lượt(O) tàu_bè(O) qua_lại(O) mỗi(O) năm(O) ...(O)'

In [21]:
# Smoke test: build an NER dataset and inspect its first encoded example.
# The [:-1] slice drops the final chunk of the split; presumably that chunk is
# the empty remainder left by the file's trailing newline(s).
temp = TokenClassificationDataset(train_ner[:-1], ner_list) # remove the last redundant '\n'
temp[0]

{'ids': tensor([    0,   669,     8,    73,   109,   262,  1131,    67,    24,    57,
            39, 10148,   295,  2832,     4,   724,    30,   525,  1075,  2227,
         12033,     7,   198,     4,    37,     8, 31425,   103,    67,    39,
          2615,   295,   553,    15,  3976,  1030, 30254,  4457,   205,    29,
           135,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,  

# Model

## Train

In [22]:
class EarlyStopping:
    """
    Stop training when a monitored validation metric stops improving, and
    checkpoint the model every time it does improve.

    Args:
        patience: number of consecutive non-improving calls that are
            tolerated; `early_stop` becomes True once the counter exceeds it.
        delta: minimum change required to count as an improvement. With the
            default 0, a value equal to the best one still counts as improved.
        path: file the model's state_dict is written to on every improvement.
        mode: 'min' (default) when lower is better (e.g. a loss), 'max' when
            higher is better (e.g. an accuracy).

    Attributes:
        early_stop: True once training should be stopped.
        counter: consecutive non-improving calls seen so far.
        val_loss_min: best raw metric value checkpointed so far.
    """
    def __init__(self, patience=7, delta=0., path='model_checkpoint.pt', mode='min'):
        if mode not in ('min', 'max'):
            raise ValueError(f"mode must be 'min' or 'max', got {mode!r}")
        self.patience = patience
        self.delta = delta
        self.path = path
        self.mode = mode

        self.best_score = None
        self.early_stop = False
        self.counter = 0

        # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf if mode == 'min' else -np.inf

    def __call__(self, val_loss, model):
        """Record one validation result; save `model` if it is the best so far."""
        # Internally lower is always better; flip the sign for 'max' metrics.
        score = val_loss if self.mode == 'min' else -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score > self.best_score - self.delta:
            # Not better than the best by at least `delta`.
            self.counter += 1
            if (self.counter > self.patience):
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0
            self.save_checkpoint(val_loss, model)

    def save_checkpoint(self, val_loss, model):
        """Write the model's state_dict to `self.path` and log the improvement."""
        torch.save(model.state_dict(), self.path)
        direction = 'decrease' if self.mode == 'min' else 'increase'
        print(f'Validation score {direction} {self.val_loss_min} -> {val_loss}. Save model..')
        self.val_loss_min = val_loss

In [23]:
def align_prediction(predictions, targets):
  """
  Pair up predicted and true class ids, dropping ignored positions.

  predictions shape: batch_size x max_seq_length x num_classes
  targets shape: batch_size x seq_length (a trailing singleton dimension,
                 batch_size x seq_length x 1, is also accepted)

  example: [[0.1, 0.9], [0.2, 0.8], [0.3, 0.7]], [1, 2, -100]
           Return => [1, 1], [1, 2] (ignore -100 class)

  Returns (preds_target_list, true_target_list): two lists with one entry per
  sequence, each entry a list of plain Python ints. Plain ints keep the result
  usable whatever device the input tensors live on.
  """
  # Positions labelled with CrossEntropyLoss's ignore_index (-100) are
  # padding / non-first sub-tokens and must not be scored.
  ignore_index = nn.CrossEntropyLoss().ignore_index
  preds = torch.argmax(predictions, dim=2)

  # Drop a trailing singleton dimension so targets line up with preds.
  if targets.dim() == preds.dim() + 1:
    targets = targets.squeeze(-1)

  keep = targets != ignore_index

  preds_target_list = [row[mask].tolist() for row, mask in zip(preds, keep)]
  true_target_list = [row[mask].tolist() for row, mask in zip(targets, keep)]
  return preds_target_list, true_target_list
  
def compute_metrics(predictions, targets):
  """
  Mean per-sequence accuracy over the non-ignored positions of a batch.

  predictions shape: batch_size x max_seq_length x num_classes
  targets shape: batch_size x seq_length (optionally with a trailing x 1)

  Accuracy is computed for every sequence separately and then averaged over
  the batch. Sequences without any scored position are skipped so they cannot
  turn the batch average into NaN.

  Returns a 0-d float32 tensor (NaN only if no sequence has a scored position).
  """
  preds_target_list, true_target_list = align_prediction(predictions, targets)

  results = []
  for preds_target, true_target in zip(preds_target_list, true_target_list):
    if len(true_target) == 0:
      continue
    # int() accepts Python ints as well as 0-d / 1-element tensors, so both
    # sides become flat integer arrays of equal length before comparing.
    pred_ids = np.array([int(p) for p in preds_target])
    true_ids = np.array([int(t) for t in true_target])
    results.append((pred_ids == true_ids).mean())

  if not results:
    return torch.tensor(float('nan'))
  return torch.tensor(results).mean().float()

# Smoke test of compute_metrics on random integers: 14 sequences of length 256
# with 9 "classes".
# NOTE(review): this leaves `predictions` and `targets` behind as notebook
# globals; any function that refers to those names instead of its own
# arguments will silently pick these up.
predictions = torch.randint(low=0, high=9, size=(14, 256, 9))
targets = torch.randint(low=0, high=9, size=(14, 256, 1))
compute_metrics(predictions, targets)

tensor(0.1080)

In [24]:
def loss_fn(output, target):
  """
  Token-level cross-entropy loss.

  output shape: batch_size x max_seq_length x num_classes (logits)
  target shape: batch_size x max_seq_length (class ids; positions equal to
                CrossEntropyLoss's ignore_index, -100, are not scored)
  """
  # CrossEntropyLoss wants the class dimension second:
  # batch_size x num_classes x max_seq_length.
  output = output.permute(0, 2, 1)
  return nn.CrossEntropyLoss()(output, target.long())

In [25]:
def train_epoch(model, data_iter, optimizer, scheduler):
  """
  Run one training pass over `data_iter`.

  Every batch is a dict with 'ids', 'masks' and 'labels' tensors. The model is
  called with the labels so that it returns its own loss next to the logits.

  Args:
    model: model called as model(ids, masks, labels=labels), returning an
      object with `.loss` and `.logits`.
    data_iter: iterable of batches.
    optimizer: optimizer stepped once per batch.
    scheduler: optional LR scheduler stepped once per batch; pass None to
      disable.

  Returns:
    (accuracy, loss): both averaged over the batches of this epoch.
  """
  model.train()

  losses = []
  accuracy = []

  for data in tqdm(data_iter):
    optimizer.zero_grad()

    ids = data['ids'].to(device, non_blocking=True)
    masks = data['masks'].to(device, non_blocking=True)
    labels = data['labels'].to(device, non_blocking=True)

    output = model(ids, masks, labels=labels)
    loss = output.loss

    losses.append(loss.item())
    # Store plain floats so the epoch average does not depend on NumPy
    # implicitly converting a list of tensors.
    accuracy.append(float(compute_metrics(output.logits.detach().cpu(), labels.detach().cpu())))

    loss.backward()
    optimizer.step()

    if scheduler is not None:
      scheduler.step()

  losses = np.mean(losses)
  accuracy = np.mean(accuracy)

  return accuracy, losses

In [26]:
def evaluate_epoch(model, data_iter):
  """
  Score the model on `data_iter` without updating it.

  Args:
    model: model called as model(ids, masks), returning an object with
      `.logits`.
    data_iter: iterable of batches (dicts with 'ids', 'masks', 'labels').

  Returns:
    The compute_metrics score averaged over the batches.
  """
  model.eval()

  accuracy = []

  # No gradients are needed for evaluation; skipping the autograd graph
  # saves memory and time.
  with torch.no_grad():
    for data in data_iter:
      ids = data['ids'].to(device, non_blocking=True)
      masks = data['masks'].to(device, non_blocking=True)
      # Labels are only used for scoring on the CPU.
      labels = data['labels']

      output = model(ids, masks)

      accuracy.append(float(compute_metrics(output.logits.detach().cpu(), labels.detach().cpu())))

  accuracy = np.mean(accuracy)

  return accuracy

In [27]:
def train(model, train_iter, val_iter, optimizer, scheduler, epochs):
    """
    Train for up to `epochs` epochs with early stopping on the validation score.

    After every epoch the model is scored on `val_iter`; the early stopper
    checkpoints the best model to 'checkpoint.pt' and ends training once the
    validation score has stopped improving for more than 5 epochs.

    Returns:
        (train_scores, train_losses, val_scores): one entry per completed epoch.
    """
    train_scores, train_losses = [], []
    val_scores = []
    es = EarlyStopping(patience=5, path='checkpoint.pt')

    for epoch in range(epochs):
        train_score, train_loss = train_epoch(model, train_iter, optimizer, scheduler)
        train_scores.append(train_score)
        train_losses.append(train_loss)

        val_score = evaluate_epoch(model, val_iter)
        val_scores.append(val_score)

        # The validation score is an accuracy (higher is better) while the
        # early stopper treats lower values as better, so hand it the negated
        # score. Its log line therefore shows the negated value.
        es(-val_score, model)

        print(f'''Epoch {epoch},
      train loss: {train_loss:.2f}, train score: {train_score:.2f}, val score: {val_score:.2f}''')

        if (es.early_stop):
            print('Early stopping')
            break

    return train_scores, train_losses, val_scores

In [28]:
# Training hyperparameters.
# same as in the paper
epochs = 30
lr = 1e-5
# NOTE: batch_size = 1 is a debugging value - change this for a real training run.
batch_size = 1

In [29]:
# NOTE: only the first sentence of each split is used here (debug-sized run).
train_dataset = TokenClassificationDataset(train_pos[:1], pos_list)
val_dataset = TokenClassificationDataset(dev_pos[:1], pos_list)

train_iter = DataLoader(
    train_dataset,
    batch_size = batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

# Validation data is only scored, never trained on, so it is iterated in a
# fixed order to keep evaluation runs comparable.
val_iter = DataLoader(
    val_dataset,
    batch_size = batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

In [None]:
# Load PhoBERT with a token-classification head that has one output per POS
# label, then move it to the selected device.
num_classes = len(pos_list)
model = AutoModelForTokenClassification.from_pretrained("vinai/phobert-base", num_labels=num_classes)
model.to(device)

In [None]:
# Fine-tune with AdamW at the configured learning rate; no LR scheduler is used.
optimizer = AdamW(model.parameters(), lr)
train_scores, train_losses, val_scores = train(
    model,
    train_iter,
    val_iter,
    optimizer,
    None,
    epochs,
)

# Evaluation

In [32]:
def evaluate_test(model, test_iter):
  """
  Score the model on the test set.

  Prints the per-batch scores and returns their mean.

  Args:
    model: model called as model(ids, masks), returning an object with
      `.logits`.
    test_iter: iterable of batches (dicts with 'ids', 'masks', 'labels').
  """
  model.eval()

  accuracy = []

  # Score every batch right away on the CPU instead of collecting all logits
  # first: nothing accumulates on the device, and the metric code never
  # receives device tensors. No gradients are needed for evaluation.
  with torch.no_grad():
    for data in test_iter:
      ids = data['ids'].to(device, non_blocking=True)
      masks = data['masks'].to(device, non_blocking=True)
      # Labels are only used for scoring on the CPU.
      labels = data['labels']

      output = model(ids, masks)

      accuracy.append(float(compute_metrics(output.logits.detach().cpu(), labels.detach().cpu())))

  print(accuracy)
  accuracy = np.mean(accuracy)

  return accuracy

In [33]:
# NOTE: only the first five test sentences are used here (debug-sized run).
test_dataset = TokenClassificationDataset(test_pos[:5], pos_list)

# Test data is only scored, so it is iterated in a fixed order; this keeps the
# printed per-batch scores comparable between runs.
test_iter = DataLoader(
    test_dataset,
    batch_size = batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

In [None]:
# Print the per-batch scores and display the mean score on the test subset.
evaluate_test(model, test_iter)