In [None]:
# !pip install transformers

In [2]:
# from google.colab import drive
# drive.mount("/content/drive")

In [3]:
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
import json, os
from transformers import RobertaTokenizer, RobertaModel, AutoModel
import random
from tqdm import trange
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [5]:
!ls

Maxpool_Phobert_256_lower_bt16_1e5.pth.tar
Maxpool_Phobert_256_lower_bt16_1e5_voting.pth.tar
Maxpool_Phobert_256_lower_bt16_baseline.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_1e3_freeze.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_2e4.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_2e5.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_2e5_voting.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_3e3_freeze.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_5e3.pth.tar
Maxpool_Phobert_256_lower_bt16.pth.tar
Maxpool_Phobert_256_lower.pth.tar
Maxpool_Phobert_256.pth.tar
MaxPool_PhoBERT.ipynb
MaxPool_PhoBERT.ipynb.invalid
MaxPool_PhoBERT-Voting.ipynb
tbsa_09022022_v0
transformers


## Process Data

In [6]:
train_sentence_packs = [json.loads(line) for line in open('./tbsa_09022022_v0/train/file_1.json', 'r', encoding='utf-8')]
dev_sentence_packs = [json.loads(line) for line in open('./tbsa_09022022_v0/dev/file_1.json', 'r', encoding='utf-8')]
test_sentence_packs = [json.loads(line) for line in open('./tbsa_09022022_v0/test/file_1.json', 'r', encoding='utf-8')]

In [7]:
print(len(test_sentence_packs))

549


In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
train_packs = []
dev_packs = []
test_packs = []

def handle_sen(sentence_pack):
    tokens = sentence_pack['dos']['words']
    token_start = 0
    token_range = []
    for i, w, in enumerate(tokens):
        token_end = token_start + len(tokenizer.encode(w, add_special_tokens=False))
        token_range.append([token_start, token_end-1])
        token_start = token_end
    
    n = int((token_range[-1][-1]) / 256) + 1
    
    new_packs = []
    start_word = 0
    for sen in range(n):
        start = sen * 256
#         print('start', start)
        end = (sen+1) * 256
#         print('end', end)
        list_entity = []
        new_pack = {}
        for entity in sentence_pack['dos']['entities']:
            if token_range[entity['start_idx']][0] >= start and token_range[entity['end_idx']][1] < end:
                ent = {'start_idx': entity['start_idx'] - start_word,
                        'end_idx': entity['end_idx'] - start_word,
                        'sentiment': entity['sentiment'] }
                list_entity.append(ent)
        new_pack['entities'] = list_entity
#         end_word = list_entity[-1]['end_idx']
#         if start_word + 256 < len(tokens):
#             end_word = start_word + 256
#         else:
#             end_word = len(tokens)
        for e in range(len(tokens)):
            if token_range[e][0] >= end:
#                 print(token_range[e][0])
                end_word = e
#                 print('In',token_range[e][0])
                break;
    
#         print(token_range[len(tokens)-1][-1])
#         print(token_range[len(tokens)-1][0])
        if token_range[len(tokens)-1][-1] <= end or token_range[len(tokens)-1][0] <= end:
            end_word = len(tokens)
#         print(start_word)
#         print(end_word)
        new_pack['words'] = sentence_pack['dos']['words'][start_word: end_word]
        
#         assert len(new_pack['words']) > 0

#         try:
#             assert len(new_pack['words']) > 0
#         except AssertionError as msg:
#             print(start_word)
#             print(end_word)
        start_word = end_word
        if len(new_pack['words']) > 0 and len(new_pack['entities']) > 0:
            new_packs.append(new_pack)
#         print('New pack', new_pack)
    return new_packs

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
for sen in train_sentence_packs:
    a = handle_sen(sen)
    train_packs.extend(a)

for sen in dev_sentence_packs:
    a = handle_sen(sen)
    dev_packs.extend(a)

for sen in test_sentence_packs:
    a = handle_sen(sen)
    test_packs.extend(a)

In [10]:
import gc 
del train_sentence_packs, dev_sentence_packs, test_sentence_packs
gc.collect()

20

In [11]:
train_packs[3]

{'entities': [{'start_idx': 0, 'end_idx': 0, 'sentiment': 'neutral'}],
 'words': ['yêu_cầu', 'cần', 'bảo_vệ', ')']}

In [12]:
# train_sentence_packs[2]['dos']['words'][198:199]

In [13]:
def count_label(packs):
  labels = []
  for pack in packs:
    for o in pack['entities']:
      labels.append(o['sentiment'])
  return labels
# labels = count_label(train_sentence_packs)

In [14]:
labels = count_label(train_packs)

In [15]:
labels = pd.DataFrame(labels)

In [16]:
labels.value_counts()

neutral     28587
positive    10505
negative     5448
conflict      346
dtype: int64

In [17]:
labels.value_counts()

neutral     28587
positive    10505
negative     5448
conflict      346
dtype: int64

In [18]:
# ' '.join(train_sentence_packs[0]['dos']['words'])

In [19]:
sentiment2id = {'negative': 0, 'neutral': 1, 'positive': 2}

class Instance(object):
    def __init__(self, tokenizer, sentence_pack, max_sequence_len = 256, task='triplet'):
        self.sentence = ' '.join(sentence_pack['words']).lower()
        self.tokenizer = tokenizer
        self.tokens = self.add_special(self.sentence.strip().split())
        self.sen_length = len(self.tokens)
        self.bert_tokens_padding = torch.ones(max_sequence_len).long()
        self.aspect_tags = torch.zeros(max_sequence_len).long()
        self.sentiment_tags = {'labels':[]}
        self.mask = torch.zeros(max_sequence_len)
        self.aspect_masks = []
        self.max_len = max_sequence_len


        for triple in sentence_pack['entities']:
            if triple['sentiment'] != 'conflict' and triple['end_idx'] < max_sequence_len:
              l, r = triple['start_idx'], triple['end_idx']
              aspect_range = r-l+1
              aspect_mask = [0] * self.sen_length
              aspect_mask[l+1:r+2] = [1] * aspect_range
              self.aspect_masks.append(aspect_mask)
              self.sentiment_tags['labels'].append(sentiment2id[triple['sentiment']])
        
        self.pieces = []
        self.piece_masks = [[] for i in range(len(self.aspect_masks))]
        self.aspect_mask_tags = [torch.zeros(max_sequence_len).long() for i in range(len(self.aspect_masks))]
        for i, w, in enumerate(self.tokens):
            bpes = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(w))
            for sub_w in bpes:
              self.pieces.append(sub_w) 
            
            for j in range(len(self.piece_masks)):
              self.piece_masks[j].extend([self.aspect_masks[j][i]] * len(bpes))
#               self.sentiment_tags['labels'][j] = self.sentiment_tags['labels'][j] * len(bpes)
        
        try:
            assert len(self.piece_masks) > 0
        except AssertionError as msg:
            print('Oke',sentence_pack['entities'])
            print(sentence_pack['words'])
            
        
        try:
            assert len(self.piece_masks[0]) == len(self.pieces)
        except AssertionError as msg:
            print(self.piece_masks)
            print(self.pieces)

        if len(self.pieces) < self.max_len:
          for i in range(len(self.pieces)):
              self.bert_tokens_padding[i] = self.pieces[i]
              for j in range(len(self.aspect_masks)):
                self.aspect_mask_tags[j][i] = self.piece_masks[j][i]
          self.mask[:len(self.pieces)] = 1
        else:
          for i in range(self.max_len):
              self.bert_tokens_padding[i] = self.pieces[i]
              for j in range(len(self.aspect_masks)):
                self.aspect_mask_tags[j][i] = self.piece_masks[j][i]
          self.mask[:self.max_len] = 1
        
        for j in range(len(self.aspect_mask_tags) - 1, -1, -1):
            if self.aspect_mask_tags[j].sum() == 0:
                self.aspect_mask_tags.pop(j)
                self.sentiment_tags['labels'].pop(j)
            else:
                self.sentiment_tags['labels'][j] = [self.sentiment_tags['labels'][j]] * self.aspect_mask_tags[j].sum()
        assert len(self.aspect_mask_tags) == len(self.sentiment_tags['labels'])
        # print(len(self.bert_tokens_padding))
        # print(len(self.mask))
        # print(len(self.aspect_mask_tags))

    def add_special(self,token):
        token.insert(0,self.tokenizer.cls_token)
        token.append(self.tokenizer.sep_token)
        return token

In [20]:
# for sen in test_packs:
#     a = Instance(tokenizer, sen)

In [21]:
from transformers import AutoTokenizer

class text_dataset(Dataset):
  def __init__(self, pack):
    self.sentence_packs = pack
    self.tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
  
  def __len__(self):
    return len(self.sentence_packs)
  
  def __getitem__(self,idx):
    instance = Instance(self.tokenizer, self.sentence_packs[idx])
    # labels = torch.Tensor([-1] * len(instance.bert_tokens_padding))
    # print(labels)
    # print(len(instance.sentiment_tags))
    # for i in range(len(instance.sentiment_tags)):
    #   labels[i] = instance.sentiment_tags[i]

    return (instance.bert_tokens_padding, instance.aspect_mask_tags, instance.mask, instance.sentiment_tags)

In [22]:
# tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
# sen_len = [len(Instance(tokenizer, sen).pieces) for sen in train_sentence_packs]
# pd.Series(sen_len).hist(bins = 30)

In [23]:
train_dat = text_dataset(train_packs)
dev_dat = text_dataset(dev_packs)

train_gen = DataLoader(train_dat, batch_size=16, shuffle=True, collate_fn=lambda x: x)
dev_gen = DataLoader(dev_dat, batch_size=16, shuffle=False, collate_fn=lambda x: x)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [24]:
data = next(iter(dev_gen))

In [25]:
data[0]

(tensor([    0,  2771,  3852,  3763,  1388,  9037, 17593,  2158,   458,  2303,
          3321,  4055, 10079,  2938,  4407,   360,    13,   308,    71,  1797,
          1340,  9856,  1427,     6, 20543,  9405,    28,   119,  3628,   381,
           381,   381, 10958, 12495,  6525,    31,   175,    13,     9,   308,
             6,    94,    18,   103,     5,  7113, 13587,  1947,    95,    99,
         10958,  2771,  3852,   482, 55259, 38389,    13,   308,  3763,  1388,
         45152, 14785, 17229,  1395, 50453,  4407,    11, 12723,    39,  5373,
          1415,  2857,    20, 25399, 33496, 35435,    19,     6, 12504,  4843,
             8,  2004,   159,    24, 38774,  3802,    77,   156,    44,   371,
          9605,    48, 30875,    15,     9,   265,  2353,     5,   335, 20074,
             4,   351,  1427,     4,  7527,     4,   469,  7549,   974,    13,
           482,   308,    12, 18761, 52701,  1340,  9856,  1427,    11, 10135,
             6,  7050,     5,  6817,     4,  8992,  

In [26]:
# aspect_sample = []
# for sample in data:
#   aspect_sample.append([s for s in sample[1]])

In [27]:
# total_term = sum([len(sample) for sample in aspect_sample])
# print(total_term)
# batch_feature = torch.zeros(total_term, 256, 768)
# batch_feature_bert = torch.rand(16,256,768)
# batch_feature_bert[0]
# term_count = 0
# for i in range(len(aspect_sample)):
#   num_term = len(aspect_sample[i])
#   for j in range(len(aspect_sample[i])):
#     idx = j + term_count
#     # print(idx)
#     batch_feature[idx] = batch_feature_bert[i]
#     batch_feature[idx] = batch_feature[idx].masked_fill(aspect_sample[i][j].unsqueeze(-1).eq(0), 0)
#     # print(i,idx,batch_feature[i][idx])
#   term_count += num_term

# # batch_feature = batch_feature.max(1)[0]
# # # batch_feature = batch_feature[batch_feature.abs().sum(dim=2) != 0]

In [28]:
# total = 0
# for sample in aspect_sample:
#     for i in sample:
#         total += i.sum()

# print(total)

In [29]:
# labels = []
# for sample in data:
#     for token_label in sample[-1]['labels']:
#         labels.extend(token_label) 

In [30]:
# #sentence_ids, bert_tokens, lengths, masks, sens_lens, token_ranges, aspect_tags, tags
# _, tokens, masks, aspect_mask_tags, labels = trainset.get_batch(0)
# tokens.shape

## MaxpoolPhoBERT-Model

In [31]:
import math

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Computation device: {device}\n")

Computation device: cuda



In [32]:
BATCH_SIZE = 16
class MaxPoolBert(torch.nn.Module):
    def __init__(self, bert_feature_dim = 768, class_num = 3, max_sequence_len = 256):
        super(MaxPoolBert, self).__init__()

        self.bert_feature_dim = 768
        self.class_num = class_num
        self.max_sequence_len = max_sequence_len

        self.bert = AutoModel.from_pretrained("vinai/phobert-base")

        self.embed_dropout = nn.Dropout(0.5)
        self.ffn = nn.Sequential(
            nn.Linear(bert_feature_dim, bert_feature_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(bert_feature_dim, class_num),
        )

    def process_term(self, bert_feature, aspect_sample):
        total_term = sum([len(sample) for sample in aspect_sample])
        batch_feature = torch.zeros(total_term, self.max_sequence_len, self.bert_feature_dim).to(device)
        term_count = 0
        for i in range(len(aspect_sample)):
          num_term = len(aspect_sample[i])
          for j in range(len(aspect_sample[i])):
            idx = j + term_count
            # print(idx)
            batch_feature[idx] = bert_feature[i]
            batch_feature[idx] = batch_feature[idx].masked_fill(aspect_sample[i][j].unsqueeze(-1).eq(0), 0)
            # print(i,idx,batch_feature[i][idx])
          term_count += num_term
        
        # numterm * 256 * 768
#         batch_feature = batch_feature.max(1)[0]
#         print(batch_feature.shape)
#         print(batch_feature[batch_feature.abs().sum(dim=2) != 0].shape)
        
        return batch_feature[batch_feature.abs().sum(dim=2) != 0]

    def forward(self, tokens, masks, aspect_masks, labels = None):
        bert_feature = self.bert(tokens, masks)
        bert_feature = bert_feature.last_hidden_state

        # num_term = (labels != -1).numpy().astype(int)
        assert not isinstance(bert_feature, str)
        bert_feature = self.embed_dropout(bert_feature)
#         print(bert_feature.shape)

        batch_feature = self.process_term(bert_feature, aspect_masks)

        logits = self.ffn(batch_feature)
#         print(logits.shape)
        # print(logits.shape)

        return logits

In [33]:
# criterion = nn.CrossEntropyLoss()

# val_epoch_loss, val_epoch_f1 = eval(
#         model, dev_gen, criterion
#     )

In [34]:
# BATCH_SIZE = 6
# class MaxPoolBert(torch.nn.Module):
#     def __init__(self, bert_feature_dim = 768, class_num = 3, max_sequence_len = 256):
#         super(MaxPoolBert, self).__init__()

#         self.bert_feature_dim = 768
#         self.class_num = class_num
#         self.max_sequence_len = max_sequence_len

#         self.bert = AutoModel.from_pretrained("vinai/phobert-base")

#         self.embed_dropout = nn.Dropout(0.5)
#         self.ffn = nn.Sequential(
#             nn.Linear(bert_feature_dim, bert_feature_dim),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(bert_feature_dim, class_num),
#         )

#     def process_term(self, bert_feature, aspect_sample):
#         total_term = sum([len(sample) for sample in aspect_sample])
#         bert_feature = bert_feature.unsqueeze(1).repeat(1, total_term, 1, 1)
#         # batch_feature = torch.zeros(BATCH_SIZE, total_term, self.max_sequence_len, self.bert_feature_dim).to(device)
#         zeros = torch.zeros(self.max_sequence_len,1).to(device)
#         term_count = 0
#         for i in range(len(aspect_sample)):
#           num_term = len(aspect_sample[i])
#           for j in range(len(aspect_sample[i])):
#             idx = j + term_count
#             # print(idx)
#             # batch_feature[i][idx] = bert_feature[i]
#             # af = batch_feature[i][idx].masked_fill(aspect_sample[i][j].unsqueeze(-1).eq(0), 0)
#             bert_feature[i][idx] = bert_feature[i][idx].masked_fill(aspect_sample[i][j].unsqueeze(-1).eq(0), 0)
#             # print(i,idx,bert_feature[i][idx])
#           bert_feature[i][:term_count] = bert_feature[i][:term_count].masked_fill(zeros.eq(0), 0)
#           term_count += num_term
#           bert_feature[i][term_count:] = bert_feature[i][term_count:].masked_fill(zeros.eq(0), 0)

#         # zeros = zeros.cpu().numpy()
#         bert_feature = bert_feature.max(2)[0]
#         ## numterm * 768
#         bert_feature = bert_feature[bert_feature.abs().sum(dim=2) != 0]
        
#         return bert_feature

#     def forward(self, tokens, masks, aspect_masks, labels = None):
#         bert_feature = self.bert(tokens, masks)
#         bert_feature = bert_feature.last_hidden_state

#         # num_term = (labels != -1).numpy().astype(int)
#         assert not isinstance(bert_feature, str)
#         bert_feature = self.embed_dropout(bert_feature)
#         # print(bert_feature.shape)

#         batch_feature = self.process_term(bert_feature, aspect_masks)

#         logits = self.ffn(batch_feature)
#         # print(logits.shape)

#         return logits

In [35]:
# BATCH_SIZE = 8
# class MaxPoolBert(torch.nn.Module):
#     def __init__(self, bert_feature_dim = 768, class_num = 3, max_sequence_len = 256):
#         super(MaxPoolBert, self).__init__()

#         self.bert_feature_dim = 768
#         self.class_num = class_num
#         self.max_sequence_len = max_sequence_len

#         self.bert = AutoModel.from_pretrained("vinai/phobert-base")

#         self.embed_dropout = nn.Dropout(0.5)
#         self.ffn = nn.Sequential(
#             nn.Linear(bert_feature_dim, bert_feature_dim),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(bert_feature_dim, class_num),
#         )

#     def process_term(self, bert_feature, aspect_sample):
#         total_term = sum([len(sample) for sample in aspect_sample])

#         batch_feature = torch.zeros(BATCH_SIZE, total_term, self.max_sequence_len, self.bert_feature_dim).to(device)
#         term_count = 0
#         for i in range(len(aspect_sample)):
#           num_term = len(aspect_sample[i])
#           for j in range(len(aspect_sample[i])):
#             idx = j + term_count
#             # print(idx)
#             batch_feature[i][idx] = bert_feature[i]
#             # af = batch_feature[i][idx].masked_fill(aspect_sample[i][j].unsqueeze(-1).eq(0), 0)
#             batch_feature[i][idx] = batch_feature[i][idx].masked_fill(aspect_sample[i][j].unsqueeze(-1).eq(0), 0)
#             # print(i,idx,batch_feature[i][idx])
#           term_count += num_term

#         batch_feature = batch_feature.max(2)[0]
#         ## numterm * 768
#         batch_feature = batch_feature[batch_feature.abs().sum(dim=2) != 0]
        
#         return batch_feature

#     def forward(self, tokens, masks, aspect_masks, labels = None):
#         bert_feature = self.bert(tokens, masks)
#         bert_feature = bert_feature.last_hidden_state

#         # num_term = (labels != -1).numpy().astype(int)
#         assert not isinstance(bert_feature, str)
#         bert_feature = self.embed_dropout(bert_feature)
#         # print(bert_feature.shape)

#         batch_feature = self.process_term(bert_feature, aspect_masks)

#         logits = self.ffn(batch_feature)
#         # print(logits.shape)

#         return logits

In [36]:
model = MaxPoolBert()
model.to(device)

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


MaxPoolBert(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), 

## Training

In [37]:
class LRScheduler():
    def __init__(
        self, optimizer, patience=1, min_lr=1e-6, factor=0.1
    ):
        self.optimizer = optimizer
        self.patience = patience
        self.min_lr = min_lr
        self.factor = factor
        self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( 
                self.optimizer,
                mode='max',
                patience=self.patience,
                factor=self.factor,
                min_lr=self.min_lr,
                verbose=True
            )
    def __call__(self, val_loss):
        self.lr_scheduler.step(val_loss)

class EarlyStopping():
    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
    def __call__(self, val_loss):
        if self.best_loss == None:
            self.best_loss = val_loss
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            # reset counter if validation loss improves
            self.counter = 0
        elif self.best_loss - val_loss < self.min_delta:
            self.counter += 1
            print(f"INFO: Early stopping counter {self.counter} of {self.patience}")
            if self.counter >= self.patience:
                print('INFO: Early stopping')
                self.early_stop = True

In [38]:
from tqdm import tqdm
def fit(model, train_gen, criterion, optimizer, epoch):
  running_loss = 0.0
  train_running_loss = 0.0
  reporting_step = 200

  counter = 0
  model.train()
  for step, data in enumerate(tqdm(train_gen)):
      counter += 1
      batch_sample = [sample[0] for sample in data]
      masks_sample = [sample[2] for sample in data]
      aspect_sample = []
      labels = []
      for sample in data:
        aspect_sample.append([s.to(device) for s in sample[1]])
        for token_label in sample[-1]['labels']:
            labels.extend(token_label) 
      texts = torch.stack(batch_sample).to(device)
      masks = torch.stack(masks_sample).to(device)
      labels = torch.tensor(labels).to(device)
      preds = model(texts, masks, aspect_sample)

      loss = criterion(preds,labels)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      running_loss += loss.item()
      train_running_loss += loss.item()
      # _, preds = torch.max(outputs.data, 1)

      if step % reporting_step == reporting_step-1:
            print(f"Epoch {epoch} Step {step} ave_loss {running_loss/reporting_step:0.4f}")
            running_loss = 0.0
            
  train_loss = train_running_loss / counter

  return train_loss

In [None]:
from collections import Counter

def voting(predictions, aspect_sample, output_scores):
    voting_predictions = np.array([])
    total_term = sum([len(sample) for sample in aspect_sample])
    ot_scores = np.zeros((total_term, 3))
    total_token = 0
    total_term = 0
    start_token = 0

    for i in range(len(aspect_sample)):
        num_term = len(aspect_sample[i])
        for j in range(len(aspect_sample[i])):
            sentiment = [0] * 3
            idx1 = total_term + j
            numtoken = sum(aspect_sample[i][j].cpu())
            ot_scores[idx1] = np.sum(output_scores[start_token:start_token+numtoken], axis=0)
            for k in range(numtoken):
                idx = k + total_token
                sentiment[predictions[idx]] += 1
            voting_count = np.where(sentiment == np.amax(sentiment))
            if len(voting_count[0]) > 1:
#                 print(voting_count)
#                 print(np.argmax(ot_scores[idx1]))
#                 print(ot_scores[idx1])
                voting_predictions = np.append(voting_predictions,np.argmax(ot_scores[idx1]))
            else:
                voting_predictions = np.append(voting_predictions,np.argmax(sentiment))
            
            start_token = start_token+numtoken
            total_token += numtoken
        total_term += num_term
    return voting_predictions
        
def eval(model, val_gen, criterion):
    model.eval()
    val_running_loss = 0.0
    val_preds = np.array([])
    val_labels = np.array([])


    soft_m = nn.Softmax(dim=-1)

    counter = 0
    with torch.no_grad():
        for step, data in enumerate(val_gen):
            counter += 1
            batch_sample = [sample[0] for sample in data]
            masks_sample = [sample[2] for sample in data]
            aspect_sample = []
            labels = []
            origin_labels = []
            for sample in data:
              aspect_sample.append([s.to(device) for s in sample[1]])
              for token_label in sample[-1]['labels']:
                labels.extend(token_label) 
                origin_labels.append(token_label[0])
            texts = torch.stack(batch_sample).to(device)
            masks = torch.stack(masks_sample).to(device)
            labels = torch.tensor(labels).to(device)
            preds = model(texts, masks, aspect_sample)

            loss = criterion(preds, labels)

            output_scores = soft_m(preds)
            predictions = torch.argmax(output_scores, dim=-1)        
            predictions = predictions.cpu().numpy()
            output_scores = output_scores.cpu().numpy()
                
            voting_predictions = voting(predictions, aspect_sample, output_scores)
            val_running_loss += loss.item()
#             val_preds = torch.cat((val_preds, predictions), dim=0)
            val_preds = np.concatenate((val_preds, voting_predictions), axis=0)
#             val_labels = torch.cat((val_labels, origin_labels), dim=0)
            val_labels = np.concatenate((val_labels, origin_labels), axis=0)


        val_loss = val_running_loss / counter
        target_names = ['negative', 'neutral', 'positive']
        print(classification_report(val_labels, val_preds, target_names=target_names))

    return val_loss, classification_report(val_labels, val_preds, output_dict=True)['macro avg']['f1-score']

In [40]:
from collections import Counter
sentiment = [0, 1, 1]
voting_count = np.where(sentiment == np.amax(sentiment))
len(voting_count[0])

2

In [41]:
# criterion = nn.CrossEntropyLoss()

# val_epoch_loss, val_epoch_f1 = eval(
#         model, dev_gen, criterion
#     )

In [42]:
!ls

Maxpool_Phobert_256_lower_bt16_1e5.pth.tar
Maxpool_Phobert_256_lower_bt16_1e5_voting.pth.tar
Maxpool_Phobert_256_lower_bt16_baseline.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_1e3_freeze.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_2e4.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_2e5.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_2e5_voting.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_3e3_freeze.pth.tar
Maxpool_Phobert_256_lower_bt16_fc_5e3.pth.tar
Maxpool_Phobert_256_lower_bt16.pth.tar
Maxpool_Phobert_256_lower.pth.tar
Maxpool_Phobert_256.pth.tar
MaxPool_PhoBERT.ipynb
MaxPool_PhoBERT.ipynb.invalid
MaxPool_PhoBERT-Voting.ipynb
tbsa_09022022_v0
transformers


In [43]:
def freeze(model):  
  for param in model.bert.parameters():
    param.requires_grad = False
  
  return model

def unfreeze(model):  
  for param in model.BERT.PhoBERT.parameters():
    param.requires_grad = True
    
  return model

In [44]:
# model = freeze(model)

In [45]:
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

135,591,171 total parameters.
135,591,171 training parameters.


In [46]:
optimizer = torch.optim.Adam([
        {'params': model.bert.parameters(), 'lr': 1e-5},
        {'params': model.ffn.parameters(),'lr': 2e-5}
    ], lr=2e-5)

best_joint_f1 = 0
best_joint_epoch = 0
epochs = 20
criterion = nn.CrossEntropyLoss()
lr_scheduler = LRScheduler(optimizer)
early_stopping = EarlyStopping()

for epoch in range(epochs):
    print('Epoch:{}'.format(epoch))
    train_epoch_loss = fit(model, train_gen, criterion, optimizer, epoch)
    
    val_epoch_loss, val_epoch_f1 = eval(
        model, dev_gen, criterion
    )

    print(f"Train Loss: {train_epoch_loss:.4f}")
    print(f'Val Loss: {val_epoch_loss:.4f}')

    if val_epoch_f1 > best_joint_f1:
        print('Better ver saved')
        print(val_epoch_f1)
        model_path = './Maxpool_Phobert_256_lower_bt16_fc_2e5_voting.pth.tar'
        best_joint_f1 = val_epoch_f1
        checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch, 'best_f1':  best_joint_f1}
        torch.save(checkpoint, model_path)
        best_joint_epoch = epoch
        
    lr_scheduler(val_epoch_f1)

    early_stopping(val_epoch_loss)
    if early_stopping.early_stop:
        break
print('best epoch: {}\tbest dev triplet f1: {:.5f}\n\n'.format(best_joint_epoch, best_joint_f1))


Epoch:0


100%|█████████████████████████████████████████| 163/163 [03:55<00:00,  1.45s/it]


              precision    recall  f1-score   support

    negative       0.72      0.48      0.58      1175
     neutral       0.82      0.85      0.84      6311
    positive       0.67      0.71      0.69      2217

    accuracy                           0.78      9703
   macro avg       0.74      0.68      0.70      9703
weighted avg       0.77      0.78      0.77      9703

Train Loss: 0.7632
Val Loss: 0.5324
Better ver saved
0.7007352880140831
Epoch:1


100%|█████████████████████████████████████████| 163/163 [03:53<00:00,  1.43s/it]


              precision    recall  f1-score   support

    negative       0.70      0.65      0.67      1175
     neutral       0.86      0.85      0.86      6311
    positive       0.70      0.77      0.73      2217

    accuracy                           0.80      9703
   macro avg       0.75      0.75      0.75      9703
weighted avg       0.81      0.80      0.81      9703

Train Loss: 0.5361
Val Loss: 0.4696
Better ver saved
0.7528673253708341
Epoch:2


100%|█████████████████████████████████████████| 163/163 [03:53<00:00,  1.43s/it]


              precision    recall  f1-score   support

    negative       0.68      0.73      0.70      1175
     neutral       0.90      0.80      0.85      6311
    positive       0.66      0.84      0.74      2217

    accuracy                           0.80      9703
   macro avg       0.75      0.79      0.76      9703
weighted avg       0.82      0.80      0.81      9703

Train Loss: 0.4585
Val Loss: 0.4707
Better ver saved
0.7626602461429481
INFO: Early stopping counter 1 of 5
Epoch:3


100%|█████████████████████████████████████████| 163/163 [03:53<00:00,  1.43s/it]


              precision    recall  f1-score   support

    negative       0.69      0.74      0.71      1175
     neutral       0.89      0.85      0.87      6311
    positive       0.72      0.79      0.75      2217

    accuracy                           0.82      9703
   macro avg       0.77      0.79      0.78      9703
weighted avg       0.83      0.82      0.82      9703

Train Loss: 0.4057
Val Loss: 0.4464
Better ver saved
0.7789503966713786
Epoch:4


100%|█████████████████████████████████████████| 163/163 [03:52<00:00,  1.43s/it]


              precision    recall  f1-score   support

    negative       0.73      0.72      0.72      1175
     neutral       0.89      0.86      0.87      6311
    positive       0.72      0.81      0.76      2217

    accuracy                           0.83      9703
   macro avg       0.78      0.79      0.79      9703
weighted avg       0.83      0.83      0.83      9703

Train Loss: 0.3641
Val Loss: 0.4366
Better ver saved
0.7863101301293572
Epoch:5


100%|█████████████████████████████████████████| 163/163 [03:52<00:00,  1.43s/it]


              precision    recall  f1-score   support

    negative       0.70      0.77      0.73      1175
     neutral       0.91      0.83      0.87      6311
    positive       0.71      0.83      0.76      2217

    accuracy                           0.83      9703
   macro avg       0.77      0.81      0.79      9703
weighted avg       0.84      0.83      0.83      9703

Train Loss: 0.3243
Val Loss: 0.4514
Better ver saved
0.7878602761827094
INFO: Early stopping counter 1 of 5
Epoch:6


100%|█████████████████████████████████████████| 163/163 [03:52<00:00,  1.43s/it]


              precision    recall  f1-score   support

    negative       0.75      0.72      0.73      1175
     neutral       0.90      0.87      0.88      6311
    positive       0.75      0.81      0.78      2217

    accuracy                           0.84      9703
   macro avg       0.80      0.80      0.80      9703
weighted avg       0.84      0.84      0.84      9703

Train Loss: 0.2947
Val Loss: 0.4443
Better ver saved
0.7981567625341647
INFO: Early stopping counter 2 of 5
Epoch:7


100%|█████████████████████████████████████████| 163/163 [03:52<00:00,  1.43s/it]


              precision    recall  f1-score   support

    negative       0.68      0.78      0.73      1175
     neutral       0.91      0.84      0.87      6311
    positive       0.73      0.83      0.78      2217

    accuracy                           0.83      9703
   macro avg       0.77      0.82      0.79      9703
weighted avg       0.84      0.83      0.83      9703

Train Loss: 0.2636
Val Loss: 0.4780
INFO: Early stopping counter 3 of 5
Epoch:8


100%|█████████████████████████████████████████| 163/163 [03:52<00:00,  1.42s/it]


              precision    recall  f1-score   support

    negative       0.70      0.76      0.73      1175
     neutral       0.91      0.84      0.87      6311
    positive       0.72      0.84      0.78      2217

    accuracy                           0.83      9703
   macro avg       0.78      0.81      0.79      9703
weighted avg       0.84      0.83      0.83      9703

Train Loss: 0.2348
Val Loss: 0.4886
Epoch 00009: reducing learning rate of group 0 to 1.0000e-06.
Epoch 00009: reducing learning rate of group 1 to 2.0000e-06.
INFO: Early stopping counter 4 of 5
Epoch:9


100%|█████████████████████████████████████████| 163/163 [03:52<00:00,  1.43s/it]


              precision    recall  f1-score   support

    negative       0.70      0.76      0.73      1175
     neutral       0.90      0.86      0.88      6311
    positive       0.74      0.81      0.78      2217

    accuracy                           0.84      9703
   macro avg       0.78      0.81      0.80      9703
weighted avg       0.84      0.84      0.84      9703

Train Loss: 0.2112
Val Loss: 0.4784
INFO: Early stopping counter 5 of 5
INFO: Early stopping
best epoch: 6	best dev triplet f1: 0.79816




In [49]:
del model
import gc
gc.collect()

20

## Test model

In [53]:
## Load model after train
model_path = './Maxpool_Phobert_256_lower_bt16_fc_2e5_voting.pth.tar'
checkpoint = torch.load(model_path, map_location=device)
print(checkpoint['best_f1'])

model = MaxPoolBert()
model.load_state_dict(checkpoint['state_dict'])
model.to(device)

# test_sentence_packs = [json.loads(line) for line in open('./tbsa_09022022_v0/test/file_1.json', 'r', encoding='utf-8')]
test_dat = text_dataset(test_packs)
test_gen = DataLoader(test_dat, batch_size=16, shuffle=False, collate_fn=lambda x: x)
criterion = nn.CrossEntropyLoss()

def test(model):
    print("Evaluation on testset:")
    model.eval()
  
    val_epoch_loss, val_epoch_f1 = eval(model, test_gen, criterion)
    print(val_epoch_f1)

0.7981567625341647


Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [55]:
print(checkpoint['epoch'])

6


In [48]:
test(model)

Evaluation on testset:
              precision    recall  f1-score   support

    negative       0.71      0.75      0.73      1314
     neutral       0.89      0.86      0.88      6244
    positive       0.76      0.81      0.78      2162

    accuracy                           0.84      9720
   macro avg       0.79      0.81      0.80      9720
weighted avg       0.84      0.84      0.84      9720

0.7972226127781683


In [54]:
test(model)

Evaluation on testset:
              precision    recall  f1-score   support

    negative       0.75      0.69      0.72      1314
     neutral       0.88      0.88      0.88      6244
    positive       0.76      0.80      0.78      2162

    accuracy                           0.84      9720
   macro avg       0.80      0.79      0.79      9720
weighted avg       0.84      0.84      0.84      9720

0.7930544797629536


## Keep training

In [None]:
from tqdm import tqdm

optimizer = torch.optim.Adam([
        {'params': model.bert.parameters(), 'lr': 5e-6},
        {'params': model.ffn.parameters()}
    ], lr=2e-6)

optimizer.load_state_dict(checkpoint['optimizer'])

best_joint_f1 = checkpoint['best_f1']
best_joint_epoch = 0
epochs = 10
criterion = nn.CrossEntropyLoss()
lr_scheduler = LRScheduler(optimizer)
early_stopping = EarlyStopping()
early_stopping(best_joint_f1)

for epoch in range(epochs):
    print('Epoch:{}'.format(epoch))
    train_epoch_loss = fit(model, train_gen, criterion, optimizer, epoch)
    
    val_epoch_loss, val_epoch_f1 = eval(
        model, dev_gen, criterion
    )
    lr_scheduler(val_epoch_f1)

    early_stopping(val_epoch_f1)
    if early_stopping.early_stop:
        break

    print(f"Train Loss: {train_epoch_loss:.4f}")
    print(f'Val Loss: {val_epoch_loss:.4f}')

    if val_epoch_f1 > best_joint_f1:
        model_path = './Maxpool_Phobert_keeptrain.pth.tar'
        best_joint_f1 = val_epoch_f1
        checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch, 'best_f1':  best_joint_f1}
        torch.save(checkpoint, model_path)
        best_joint_epoch = epoch
print('best epoch: {}\tbest dev triplet f1: {:.5f}\n\n'.format(best_joint_epoch, best_joint_f1))


Epoch:0


 39%|████████████████                         | 100/255 [04:09<07:02,  2.72s/it]

Epoch 0 Step 99 ave_loss 0.0859


 78%|████████████████████████████████▏        | 200/255 [08:15<02:17,  2.50s/it]

Epoch 0 Step 199 ave_loss 0.0874


100%|█████████████████████████████████████████| 255/255 [10:31<00:00,  2.48s/it]


              precision    recall  f1-score   support

           0       0.74      0.77      0.75      1107
           1       0.91      0.89      0.90      5864
           2       0.78      0.84      0.81      2023

    accuracy                           0.86      8994
   macro avg       0.81      0.83      0.82      8994
weighted avg       0.86      0.86      0.86      8994

INFO: Early stopping counter 1 of 5
Train Loss: 0.0864
Val Loss: 0.4623
Epoch:1


 39%|████████████████                         | 100/255 [04:00<05:26,  2.11s/it]

Epoch 1 Step 99 ave_loss 0.0755


 78%|████████████████████████████████▏        | 200/255 [08:11<02:17,  2.50s/it]

Epoch 1 Step 199 ave_loss 0.0734


100%|█████████████████████████████████████████| 255/255 [10:29<00:00,  2.47s/it]


              precision    recall  f1-score   support

           0       0.74      0.77      0.75      1107
           1       0.91      0.89      0.90      5864
           2       0.78      0.84      0.81      2023

    accuracy                           0.86      8994
   macro avg       0.81      0.83      0.82      8994
weighted avg       0.86      0.86      0.86      8994

INFO: Early stopping counter 2 of 5
Train Loss: 0.0758
Val Loss: 0.4805
Epoch:2


 39%|████████████████                         | 100/255 [04:05<06:51,  2.66s/it]

Epoch 2 Step 99 ave_loss 0.0718


 78%|████████████████████████████████▏        | 200/255 [08:11<02:12,  2.41s/it]

Epoch 2 Step 199 ave_loss 0.0718


100%|█████████████████████████████████████████| 255/255 [10:25<00:00,  2.45s/it]


              precision    recall  f1-score   support

           0       0.74      0.77      0.75      1107
           1       0.91      0.89      0.90      5864
           2       0.79      0.83      0.81      2023

    accuracy                           0.86      8994
   macro avg       0.81      0.83      0.82      8994
weighted avg       0.86      0.86      0.86      8994

Train Loss: 0.0717
Val Loss: 0.4898
Epoch:3


 39%|████████████████                         | 100/255 [04:10<06:40,  2.58s/it]

Epoch 3 Step 99 ave_loss 0.0625


 78%|████████████████████████████████▏        | 200/255 [08:11<02:02,  2.23s/it]

Epoch 3 Step 199 ave_loss 0.0651


100%|█████████████████████████████████████████| 255/255 [10:22<00:00,  2.44s/it]


              precision    recall  f1-score   support

           0       0.75      0.76      0.76      1107
           1       0.91      0.89      0.90      5864
           2       0.78      0.84      0.81      2023

    accuracy                           0.86      8994
   macro avg       0.81      0.83      0.82      8994
weighted avg       0.86      0.86      0.86      8994

INFO: Early stopping counter 1 of 5
Train Loss: 0.0652
Val Loss: 0.4948
Epoch:4


 39%|████████████████                         | 100/255 [04:05<05:47,  2.24s/it]

Epoch 4 Step 99 ave_loss 0.0583


 78%|████████████████████████████████▏        | 200/255 [08:11<02:21,  2.58s/it]

Epoch 4 Step 199 ave_loss 0.0608


100%|█████████████████████████████████████████| 255/255 [10:23<00:00,  2.45s/it]


              precision    recall  f1-score   support

           0       0.77      0.75      0.76      1107
           1       0.91      0.90      0.90      5864
           2       0.79      0.83      0.81      2023

    accuracy                           0.86      8994
   macro avg       0.82      0.82      0.82      8994
weighted avg       0.86      0.86      0.86      8994

Train Loss: 0.0592
Val Loss: 0.5187
Epoch:5


 39%|████████████████                         | 100/255 [04:09<07:33,  2.93s/it]

Epoch 5 Step 99 ave_loss 0.0538


 78%|████████████████████████████████▏        | 200/255 [08:08<02:24,  2.63s/it]

Epoch 5 Step 199 ave_loss 0.0572


100%|█████████████████████████████████████████| 255/255 [10:20<00:00,  2.43s/it]


              precision    recall  f1-score   support

           0       0.76      0.76      0.76      1107
           1       0.91      0.90      0.90      5864
           2       0.80      0.83      0.81      2023

    accuracy                           0.86      8994
   macro avg       0.82      0.83      0.82      8994
weighted avg       0.86      0.86      0.86      8994

Train Loss: 0.0549
Val Loss: 0.5211
Epoch:6


 39%|████████████████                         | 100/255 [04:00<06:52,  2.66s/it]

Epoch 6 Step 99 ave_loss 0.0472


 78%|████████████████████████████████▏        | 200/255 [08:06<02:13,  2.43s/it]

Epoch 6 Step 199 ave_loss 0.0491


100%|█████████████████████████████████████████| 255/255 [10:22<00:00,  2.44s/it]


              precision    recall  f1-score   support

           0       0.74      0.77      0.75      1107
           1       0.91      0.88      0.90      5864
           2       0.78      0.84      0.81      2023

    accuracy                           0.86      8994
   macro avg       0.81      0.83      0.82      8994
weighted avg       0.86      0.86      0.86      8994

INFO: Early stopping counter 1 of 5
Train Loss: 0.0495
Val Loss: 0.5432
Epoch:7


 33%|█████████████▋                            | 83/255 [03:19<07:26,  2.59s/it]

In [51]:
!nvidia-smi

Sat Mar 26 18:40:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:5E:00.0 Off |                  N/A |
|  0%   33C    P5    33W / 250W |   3352MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:86:00.0 Off |                  N/A |
|  0%   24C    P8     4W / 250W |   3496MiB / 11019MiB |      0%      Default |
|       

In [42]:
torch.cuda.memory_allocated()

4440615424

In [50]:
torch.cuda.empty_cache()