In [1]:
import time
from transformers import BertTokenizer, BertConfig, BertModel
from TorchCRF import CRF
import torch.nn as nn
import torch
import warnings
warnings.filterwarnings("ignore")
import pickle

In [2]:
# Load the tokenizer, configuration and encoder from local directories and report how long it took.
start = time.time()
tokenizer = BertTokenizer.from_pretrained('mBert_Tokenizer', do_lower_case=False,use_fast=False)
# output_hidden_states=True: the model classes defined later concatenate the last four hidden layers.
config = BertConfig.from_pretrained('mBert_Config', output_hidden_states=True)
config.max_position_embeddings = 512

# NOTE(review): the model classes below read the per-layer hidden states as outputs[1];
# presumably that only lines up because the pooling layer is disabled here — confirm before changing.
bert_model = BertModel.from_pretrained(
                        'mBert_Model',
                        config=config,
                        add_pooling_layer=False
)
print("Loading: ",time.time()-start, " S")

file mBert_Tokenizer\config.json not found


Loading:  1.851839542388916  S


In [3]:
import torch
# Pickled train/dev/test splits and the checkpoint to evaluate (paths are relative to the working directory).
dir_train = 'dataset/vlsp21/final_7900.pkl'
dir_dev = 'dataset/vlsp21/dev.pkl'
dir_test = 'dataset/vlsp21/test.pkl'
path_model = 'model/16_bert_4_crf_ner_08750.pt'
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Entity labels. BERT_DATALOADER prepends 'PAD', so a label's index is its position in this list + 1.
# NOTE(review): the order presumably has to match the one used when the checkpoint was trained — confirm.
tag = ['ADDRESS','SKILL','EMAIL','PERSON','PHONENUMBER','MISCELLANEOUS','QUANTITY','PERSONTYPE',
              'ORGANIZATION','PRODUCT','IP','LOCATION','O','DATETIME','EVENT', 'URL']

In [4]:
########################### BERT_DATALOADER #############################################



from keras.preprocessing.sequence import pad_sequences  
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

class BERT_DATALOADER:
    """Turn a word-level tagged corpus into padded tensors wrapped in a torch DataLoader.

    ``dataset`` is a list of sentences, each a sequence of ``(word, label)``
    pairs; it is unpacked with ``split_data``. Every word is split into
    subwords with ``tokenizer`` and each subword inherits its word's label.

    Attributes set by ``__init__``:
        MAX_LEN:    sequences are padded / truncated to this many subwords.
        BATCH_SIZE: batch size of the DataLoader returned by ``create_dataloader``.
        Epoch, Patient: stored for callers; not read anywhere inside this class.
        tag_values: ``['PAD'] + tag_values`` so that index 0 is the padding label.
        tag2idx:    label -> integer index.
    """

    _DATASET_TYPES = ('train', 'dev', 'test')
    _MODES = ('training', 'evaluation')

    def __init__(self, dataset, tokenizer, tag_values,  device):
        self.dataset = dataset
        self.X , self.Y = split_data(self.dataset)
        self.MAX_LEN = 256
        self.BATCH_SIZE = 32
        self.Epoch = 60
        self.Patient = 15
        self.tag_values = ['PAD'] + tag_values
        self.tag2idx = {t: i for i, t in enumerate(self.tag_values)}
        self.device = device
        self.tokenizer = tokenizer

    def create_dataloader(self, mode = 'evaluation', type = 'train'):
        """Build the DataLoader for this dataset.

        Args:
            mode: 'training' or 'evaluation'; selects the tensor dtypes (see ``_covert2tensor``).
            type: 'train' (batches drawn with a RandomSampler) or 'dev' / 'test'
                (batches in corpus order with a SequentialSampler).

        Returns:
            A DataLoader yielding ``(input_ids, attention_mask, label_ids)`` batches.

        Raises:
            ValueError: if ``mode`` or ``type`` is not one of the supported values.
        """
        # Validate before the expensive tokenisation pass instead of silently returning None
        # (unknown type) or failing late with an UnboundLocalError (unknown mode).
        if type not in self._DATASET_TYPES:
            raise ValueError("type must be one of {}, got {!r}".format(self._DATASET_TYPES, type))
        if mode not in self._MODES:
            raise ValueError("mode must be one of {}, got {!r}".format(self._MODES, mode))

        X_subword, y_subword = self.add_subword2data()
        X_padding, y_padding, attention_masks = self._padding_data(X_subword,y_subword)
        X_tensor,y_tensor,masks = self._covert2tensor(X_padding, y_padding, attention_masks, mode)

        data = TensorDataset(X_tensor, masks, y_tensor)
        sampler = RandomSampler(data) if type == 'train' else SequentialSampler(data)
        return DataLoader(data, sampler=sampler, batch_size = self.BATCH_SIZE)

    def _add_subword(self, sentence, text_labels):
        '''
        Split every word into subwords and repeat the word's label for each piece.

        input:
            sentence = ['Phạm', 'Văn', 'Mạnh']
            text_labels = ['B-PER', 'I-PER','I-PER']

        output:
            ['Phạm', 'Văn', 'M', '##ạnh'],
            ['B-PER', 'I-PER', 'I-PER', 'I-PER']
        '''
        tokenized_sentence = []
        labels = []
        for word, label in zip(sentence, text_labels):
            subwords = self.tokenizer.tokenize(word)
            tokenized_sentence.extend(subwords)
            labels.extend([label] * len(subwords))
        return tokenized_sentence, labels

    def add_subword2data(self):
        '''
            Apply ``_add_subword`` to every sentence of the dataset.

            input:
                sentence = [['Phạm', 'Văn', 'Mạnh',..],....]
                text_labels = [['B-PER', 'I-PER','I-PER',..],...]

            output:
                [['Phạm', 'Văn', 'M', '##ạnh',..],....],
                [['B-PER', 'I-PER','I-PER','I-PER',..],...]
        '''
        tokenized_texts, labels = [], []
        for sent, labs in zip(self.X, self.Y):
            sub_tokens, sub_labels = self._add_subword(sent, labs)
            tokenized_texts.append(sub_tokens)
            labels.append(sub_labels)
        return tokenized_texts,labels

    def _padding_data(self,X_subword,y_subword):
        '''
            Convert subwords / labels to ids and pad or truncate them to MAX_LEN.

            input:
                X = [['Phạm', 'Văn', 'M', '##ạnh',..],....]
                Y = [['B-PER', 'I-PER','I-PER','I-PER',..],...]

            output:
            [[10,20,30,40,0,0,0,0,0,0,0,0...],...],   token ids, padded with 0
            [[1, 2,3,4,0,0,0,0,0,0,0,0,0,...],...],   label ids, padded with tag2idx['PAD']
            attention masks (1.0 where the token id is non-zero, else 0.0)

            raises:
                KeyError if a label is not present in tag2idx.
        '''
        X_padding = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in X_subword],
                          maxlen=self.MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

        # Index with [] rather than .get(): an unknown label then fails here, naming the label,
        # instead of becoming None and breaking obscurely inside pad_sequences.
        y_padding = pad_sequences([[self.tag2idx[l] for l in lab] for lab in y_subword],
                        maxlen=self.MAX_LEN, value=self.tag2idx["PAD"], padding="post",
                        dtype="long", truncating="post")
        # A position is real (mask 1.0) exactly when its token id is not the padding id 0.
        attention_masks = [[float(i != 0.0) for i in ii] for ii in X_padding]
        return X_padding, y_padding,attention_masks

    def _covert2tensor(self, X_padding, Y_padding, attention_masks, mode):
        """Move the padded arrays onto ``self.device`` as tensors.

        In 'training' mode the tensors keep the dtype torch infers from the
        inputs; in 'evaluation' mode all three are cast to int64.

        Returns:
            ``(X_tensor, y_tensor, masks)``.

        Raises:
            ValueError: if ``mode`` is neither 'training' nor 'evaluation'.
        """
        if mode == 'training':
            X_tensor = torch.tensor(X_padding).to(self.device)
            y_tensor = torch.tensor(Y_padding).to(self.device)
            masks = torch.tensor(attention_masks).to(self.device)
        elif mode =='evaluation':
            X_tensor = torch.tensor(X_padding).type(torch.LongTensor).to(self.device)
            y_tensor = torch.tensor(Y_padding).type(torch.LongTensor).to(self.device)
            masks = torch.tensor(attention_masks).type(torch.LongTensor).to(self.device)
        else:
            raise ValueError("mode must be one of {}, got {!r}".format(self._MODES, mode))
        return  X_tensor, y_tensor, masks


#########################################################################################################################
def split_data(data):
    """Separate (token, label) pairs into parallel per-sentence token and label lists.

    Each sentence in ``data`` is a sequence of items whose element 0 is the
    token and element 1 is its label. Returns ``(X, Y)`` where ``X[i]`` holds
    the tokens and ``Y[i]`` the labels of sentence ``i``.
    """
    X, Y = [], []
    for sentence in data:
        # Materialise the pairs once so each sentence is traversed a single time.
        pairs = [(item[0], item[1]) for item in sentence]
        X.append([token for token, _ in pairs])
        Y.append([label for _, label in pairs])
    return X, Y


In [5]:
########################### BERT_EVALUATE#############################################

import torch
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import numpy as np


def valuation_bert_multi(model, valid_dataloader,tag_values,device):
    '''
        Evaluate a token-classification model whose call returns (loss, logits, ...).

        input:
            - model: called as model(input_ids, token_type_ids=None, attention_mask=..., labels=...)
            - valid_dataloader: yields (input_ids, attention_mask, label_ids) batches
            - tag_values: index -> label name; positions labelled 'PAD' are excluded from the metrics
            - device: device every batch is moved to
        output:
            nothing is returned; the mean validation loss, the micro F1-score and a
            per-label classification report are printed.
    '''
    # Put the model into evaluation mode (disables dropout).
    model.eval()

    # An empty loader would otherwise end in a ZeroDivisionError when averaging the loss.
    if len(valid_dataloader) == 0:
        print("Validation set is empty; nothing to evaluate.")
        return

    eval_loss = 0.0
    predictions , true_labels = [], []
    for batch in tqdm(valid_dataloader, desc = 'Progress Bar'):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed for validation; this saves memory and time.
        with torch.no_grad():
            # Labels are passed, so the model returns the loss first and the logits second.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)

        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    print("Validation loss: {}".format(eval_loss))

    # Keep only the positions whose gold label is not padding.
    pred_tags, valid_tags = [], []
    for pred_row, gold_row in zip(predictions, true_labels):
        for pred_idx, gold_idx in zip(pred_row, gold_row):
            if tag_values[gold_idx] != "PAD":
                pred_tags.append(tag_values[pred_idx])
                valid_tags.append(tag_values[gold_idx])

    # sklearn's metric signature is (y_true, y_pred).
    print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags,average='micro')))
    print(classification_report(valid_tags, pred_tags,digits = 4))
    print("####################")

#################################################################################################################

def valuation_bert_4_crf(model, tokenizer, valid_dataloader,tag_values,device,mode):
    '''
        Evaluate a BERT+CRF tagger at word level and print macro-F1 plus a per-label report.

        Only the first subword of every word is scored: tokens starting with '##'
        are continuation pieces and are skipped.

        input:
            - model: exposes forward_custom(input_ids, mask, labels, token_type_ids=None)
                     returning [loss, decoded tag-index sequences]
            - tokenizer: used to map input ids back to tokens
            - valid_dataloader: yields (input_ids, attention_mask, label_ids) batches
            - tag_values: index -> label name (index 0 is 'PAD')
            - device: device every batch is moved to
            - mode: 'train' (score all labels), 'dev' or 'test' (some labels excluded, see below)
        output:
            nothing is returned; the scores are printed.
        raises:
            ValueError if mode is not 'train', 'dev' or 'test'.
    '''
    # Labels left out of the macro average for each split.
    # NOTE(review): presumably these labels are absent from the respective split — confirm.
    excluded_by_mode = {
        'train': (),
        'dev': ('IP', 'SKILL', 'PAD', 'EMAIL'),
        'test': ('IP', 'SKILL', 'PAD'),
    }
    # Fail before the (slow) evaluation pass rather than silently printing nothing after it.
    if mode not in excluded_by_mode:
        raise ValueError("mode must be one of {}, got {!r}".format(tuple(excluded_by_mode), mode))

    # Put the model into evaluation mode (disables dropout).
    model.eval()
    predictions_f1 , true_labels_f1 = [], []
    for batch in tqdm(valid_dataloader, desc = 'Progress Bar'):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed for validation; this saves memory and time.
        with torch.no_grad():
            outputs = model.forward_custom(b_input_ids, b_input_mask, b_labels, token_type_ids=None)

        batch_predictions = outputs[1]
        batch_label_ids = b_labels.to('cpu').numpy().tolist()

        for sample_ids, sample_preds, sample_labels in zip(b_input_ids, batch_predictions, batch_label_ids):
            tokens = tokenizer.convert_ids_to_tokens(sample_ids.to('cpu').numpy())
            # zip stops at the shortest input, so positions beyond the decoded sequence are dropped.
            for token, label_idx, pred_idx in zip(tokens, sample_labels, sample_preds):
                if token.startswith("##"):
                    continue
                true_labels_f1.append(tag_values[label_idx])
                predictions_f1.append(tag_values[pred_idx])

    if not true_labels_f1:
        print("Validation set is empty; nothing to evaluate.")
        return

    # labels=None lets sklearn score every label that occurs (the 'train' behaviour).
    excluded = excluded_by_mode[mode]
    report_labels = None if mode == 'train' else [t for t in tag_values if t not in excluded]
    print("Validation F1-Score: {}".format(
        f1_score(true_labels_f1, predictions_f1,labels = report_labels ,average='macro')))
    print(classification_report(true_labels_f1, predictions_f1,labels = report_labels ,digits = 4))

################################################################################################################

def valuation_bert_4_sofmax(model, valid_dataloader,tag_values,device,mode):
    '''
        Evaluate a BERT+softmax tagger at subword level and print the loss, macro-F1 and a report.

        input:
            - model: exposes forward_custom(input_ids=..., attention_mask=..., labels=..., head_mask=None)
                     returning (loss, logits, ...)
            - valid_dataloader: yields (input_ids, attention_mask, label_ids) batches
            - tag_values: index -> label name; positions labelled 'PAD' are excluded from the metrics
            - device: device every batch is moved to
            - mode: 'train' (score all labels), 'dev' or 'test' (some labels excluded, see below)
        output:
            nothing is returned; the scores are printed.
        raises:
            ValueError if mode is not 'train', 'dev' or 'test'.
    '''
    # Labels left out of the macro average for each split.
    # NOTE(review): presumably these labels are absent from the respective split — confirm.
    excluded_by_mode = {
        'train': (),
        'dev': ('IP', 'SKILL', 'PAD', 'EMAIL'),
        'test': ('IP', 'SKILL', 'PAD'),
    }
    # Fail before the (slow) evaluation pass rather than silently printing nothing after it.
    if mode not in excluded_by_mode:
        raise ValueError("mode must be one of {}, got {!r}".format(tuple(excluded_by_mode), mode))

    # Put the model into evaluation mode (disables dropout).
    model.eval()

    # An empty loader would otherwise end in a ZeroDivisionError when averaging the loss.
    if len(valid_dataloader) == 0:
        print("Validation set is empty; nothing to evaluate.")
        return

    eval_loss = 0.0
    predictions , true_labels = [], []
    for batch in tqdm(valid_dataloader, desc = 'Progress Bar'):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed for validation; this saves memory and time.
        with torch.no_grad():
            # Labels are passed, so the model returns the loss first and the logits second.
            outputs = model.forward_custom(input_ids=b_input_ids, attention_mask=b_input_mask,
                                       labels=b_labels,head_mask=None)

        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    print("Validation loss: {}".format(eval_loss))

    # Keep only the positions whose gold label is not padding.
    pred_tags, valid_tags = [], []
    for pred_row, gold_row in zip(predictions, true_labels):
        for pred_idx, gold_idx in zip(pred_row, gold_row):
            if tag_values[gold_idx] != "PAD":
                pred_tags.append(tag_values[pred_idx])
                valid_tags.append(tag_values[gold_idx])

    # labels=None lets sklearn score every label that occurs (the 'train' behaviour).
    excluded = excluded_by_mode[mode]
    report_labels = None if mode == 'train' else [t for t in tag_values if t not in excluded]
    # sklearn's metric signature is (y_true, y_pred); same order as the CRF evaluation.
    print("Validation F1-Score: {}".format(
        f1_score(valid_tags, pred_tags,labels = report_labels ,average='macro')))
    print(classification_report(valid_tags, pred_tags,labels = report_labels ,digits = 4))

#################################################################################################################
def BERT_EVALUATE(model, tokenizer, dataloader, tag_values, device, type_dataset, model_type):
    """Dispatch evaluation to the routine matching the model head.

    Args:
        model_type: 'crf' or 'softmax'.
        type_dataset: forwarded as the ``mode`` of the evaluation routine
            ('train', 'dev' or 'test').
        The remaining arguments are forwarded unchanged (``tokenizer`` is only
        used by the CRF routine).

    Raises:
        ValueError: if ``model_type`` is not supported (previously a silent no-op).
    """
    if model_type == 'crf':
        valuation_bert_4_crf(model ,tokenizer, dataloader, tag_values , device, type_dataset)
    elif model_type == 'softmax':
        valuation_bert_4_sofmax(model, dataloader, tag_values , device, type_dataset)
    else:
        raise ValueError("model_type must be 'crf' or 'softmax', got {!r}".format(model_type))

In [6]:
########################### BERT_PREDICT#############################################
import torch
from pyvi import ViTokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
############################## UTILS PREDICT ########################################################
def transform_test(test="", mode='token'):
  """Segment raw text with ViTokenizer and return a flat list of strings.

  ViTokenizer joins the syllables of one segmented unit with '_'
  (e.g. 'văn_hóa'). With mode == 'token' each unit is returned as a single
  string with '_' replaced by a space; with any other mode each unit is
  additionally split on whitespace, so every syllable is its own item.
  """
  segmented = ViTokenizer.tokenize(test).split()
  if mode == 'token':
      return [unit.replace("_", " ") for unit in segmented]
  # mode == 'word': one entry per whitespace-separated syllable
  return [syllable for unit in segmented for syllable in unit.replace("_", " ").split()]

def tokenize_predict(tokenizer, sentence):
  """Flatten ``tokenizer.tokenize(word)`` over every word of ``sentence``.

  sentence: list of words, e.g. ['văn', 'hóa', 'và']
  output:   one flat list holding the subword pieces of all words, in order.
  """
  return [piece for word in sentence for piece in tokenizer.tokenize(word)]
#####################################################################################################
def predict_crf(model, tokenizer, tag, device, texts, max_len=512):
    """Tag raw text with a BERT+CRF model and return ``[(word, label), ...]``.

    Args:
        model: exposes ``forward_custom(input_ids, mask)`` returning one decoded
            tag-index sequence per input sequence.
        tokenizer: subword tokenizer; continuation pieces start with '##'.
        tag: index -> label name (index 0 is the padding label).
        device: device the input tensors are moved to.
        texts: raw text to tag.
        max_len: maximum number of subwords fed to the model; longer inputs are
            truncated (the tail of the text is not tagged).

    Returns:
        A list of ``(word, label)`` pairs; subword pieces are merged back into
        words and each word carries the label predicted for its first piece.
        Empty when the text yields no tokens.
    """
    # torch.no_grad() does not switch off dropout; eval mode does (predict_softmax does the same).
    model.eval()

    test_sentence_token = transform_test(texts, 'word')
    # Truncate the subwords together with the ids, otherwise the merge loop below would index
    # past the end of the prediction for inputs longer than max_len.
    subwords = tokenize_predict(tokenizer, test_sentence_token)[:max_len]
    if not subwords:
        # Nothing to tag; an all-zero mask is not a meaningful CRF input.
        return []

    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(subwords)],
                              maxlen=max_len, dtype="long", value=0.0,
                              truncating="post", padding="post")
    input_ids_tensor = torch.tensor(input_ids).type(torch.LongTensor).to(device)
    # Real positions are exactly those whose token id is not the padding id 0.
    input_mask = [[float(i != 0.0) for i in ii] for ii in input_ids]
    input_mask_tensor = torch.tensor(input_mask).type(torch.LongTensor).to(device)
    with torch.no_grad():
        outputs = model.forward_custom(input_ids_tensor, input_mask_tensor)
    predict = outputs[0]  # decoded tag indices of the single input sequence

    tags_predict = [ tag[i]  for i in  predict]
    tags = []
    tests = []
    for subword, label in zip(subwords, tags_predict):
        # Same continuation test as the evaluation code: only a leading '##' marks a word piece.
        if subword.startswith("##") and tests:
            tests[-1] = tests[-1] + subword[2:]
        else:
            tags.append(label)
            tests.append(subword)
    return [(w,t) for w,t in zip(tests,tags)]


def predict_softmax(model, tokenizer, tag_values, device, test_sentence):
    """Tag raw text with a BERT+softmax model and return ``[(word, label), ...]``.

    The text is segmented, split into subwords, padded / truncated to 512 ids
    and run through ``model.forward_custom``. The arg-max label of every real
    (non-padding) position is taken, then pieces containing '##' are glued
    onto the preceding word, which keeps the label of its first piece.
    """
    model.eval()

    # Text -> words -> subword pieces -> padded id matrix of shape (1, 512)
    words = transform_test(test_sentence, 'word')
    subwords = tokenize_predict(tokenizer, words)
    subword_ids = tokenizer.convert_tokens_to_ids(subwords)
    input_ids = pad_sequences([subword_ids],
                              maxlen=512, dtype="long", value=0.0,
                              truncating="post", padding="post")
    input_ids_tensor = torch.tensor(input_ids).type(torch.LongTensor).to(device)
    input_mask = [[float(token_id != 0.0) for token_id in row] for row in input_ids]
    input_mask_tensor = torch.tensor(input_mask).type(torch.LongTensor).to(device)

    with torch.no_grad():
        outputs = model.forward_custom(input_ids=input_ids_tensor, attention_mask=input_mask_tensor,
                                       labels=None,head_mask=None)
    logits = outputs[0].detach().cpu().numpy()

    # Drop the padding positions before decoding.
    len_subword = sum(input_ids[0] != 0)
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids_tensor[0].to('cpu').numpy())
    tokens = all_tokens[:len_subword]
    predict = np.argmax(logits, axis=2)[0][:len_subword]
    tags_predict = [tag_values[label_idx] for label_idx in predict]

    # Merge word pieces back into words.
    tests, tags = [], []
    for position, token in enumerate(tokens):
        if "##" in token:
            tests[-1] = tests[-1] + token.replace("##","")
            continue
        tags.append(tags_predict[position])
        tests.append(token)

    return list(zip(tests, tags))

def BERT_PREDICT(model, tokenizer, tag, device, test_sentence, type_model):
    """Tag ``test_sentence`` with the prediction routine matching the model head.

    Args:
        type_model: 'crf' or 'softmax'.
        The remaining arguments are forwarded unchanged.

    Returns:
        The ``[(word, label), ...]`` list produced by the selected routine.

    Raises:
        ValueError: if ``type_model`` is not supported (previously ``None`` was
            returned, which only failed later inside the caller).
    """
    if type_model == 'crf':
        return predict_crf(model, tokenizer, tag, device, test_sentence)
    if type_model == 'softmax':
        return predict_softmax(model, tokenizer, tag, device, test_sentence)
    raise ValueError("type_model must be 'crf' or 'softmax', got {!r}".format(type_model))


In [7]:
########################### BERT_VISUALIZE#############################################
from spacy import displacy

# Display colour for every entity label; 'O' is deliberately absent.
COLORS ={
    'EMAIL':'gray', 
    'ADDRESS':'maroon',
    'PERSON':'red',
    'PHONENUMBER': 'purple',
    'MISCELLANEOUS':'fuchsia',
    'QUANTITY':'green',
    'PERSONTYPE':'lime',
    'ORGANIZATION':'olive',
    'PRODUCT':'yellow',
    'SKILL':'navy',
    'IP':'blue',
    'LOCATION':'teal',
    'DATETIME':'aqua',
    'EVENT':'darkorange',
    'URL':'deeppink'
}
# Labels that have a colour assigned.
NER = list(COLORS.keys())

# Options passed to displacy.render by BERT_VISUALIZE.
OPTIONS = {'ents': NER, 'colors': COLORS}
    
## visualize result
## input: predict format [(word, tag)]

def BERT_VISUALIZE(arr):
    """Render a prediction as highlighted entities with displaCy.

    Args:
        arr: prediction in the format ``[(word, tag), ...]``.

    Returns:
        ``None`` for an empty prediction, otherwise the result of
        ``displacy.render`` (called with ``jupyter=True``).

    Words are joined with single spaces; consecutive words carrying the same
    tag are merged into one span, so two adjacent entities of the same type
    are shown as a single entity.
    """
    if len(arr) < 1:
        return None
    text = ' '.join(word for word, _ in arr)

    spans = []  # [start, end, tag] character spans into `text`
    pos = 0
    for word, tag in arr:
        end = pos + len(word)
        if spans and spans[-1][2] == tag:
            spans[-1][1] = end  # extend the running span
        else:
            spans.append([pos, end, tag])
        pos = end + 1  # skip the joining space

    # Tags are label strings, so the former `!= 0` test never filtered anything;
    # drop the non-entity labels explicitly (0 is kept for index-style input).
    non_entities = ('O', 'PAD', 0)
    ents = [{'start': start, 'end': end, 'label': label}
            for start, end, label in spans if label not in non_entities]
    ex = [{'text': text, 'ents': ents}]
    return displacy.render(ex, manual=True, jupyter=True, style='ent', options = OPTIONS )

In [8]:
########################### BERT_MODEL#############################################
from TorchCRF import CRF
import torch.nn as nn
import torch
import torch.nn.functional as F
log_soft = F.log_softmax

class BERT_4_CRF(nn.Module):
    """BERT encoder followed by a linear emission layer and a CRF.

    The emission features are the concatenation of the last four hidden
    layers returned by the encoder (hence the 4*768 input width).
    """

    def __init__(self, bert_model, num_labels):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.4)
        # concatenation of the 4 last layers
        self.classifier = nn.Linear(4*768, num_labels)
        self.crf = CRF(num_labels, batch_first = True)

    def forward_custom(self, b_input_ids, b_input_mask,  b_labels=None, token_type_ids=None):
        """Decode tag sequences and, when labels are given, also compute the CRF loss.

        Returns ``[loss, prediction]`` when ``b_labels`` is provided, otherwise
        only ``prediction`` (the result of ``self.crf.decode``).
        ``token_type_ids`` is accepted but not used.
        """
        encoder_out = self.bert(b_input_ids, attention_mask=b_input_mask)
        hidden_states = encoder_out[1]
        features = torch.cat(tuple(hidden_states[-depth] for depth in (1, 2, 3, 4)), -1)
        emission = self.classifier(self.dropout(features))  # [32,256,17]

        crf_mask = b_input_mask.type(torch.uint8)
        prediction = self.crf.decode(emission, mask=crf_mask)
        if b_labels is None:
            return prediction

        # The CRF returns a log-likelihood; negate it to obtain a loss.
        loss = -self.crf(log_soft(emission, 2), b_labels, mask=crf_mask, reduction='mean')
        return [loss, prediction]



class BERT_4_SOFTMAX(nn.Module):
    """BERT encoder with a linear token classifier on the last four hidden layers."""

    def __init__(self, bert_model, num_labels):
        super().__init__()
        self.num_labels = num_labels
        self.bert = bert_model
        self.dropout = nn.Dropout(0.25)
        # concatenation of the 4 last layers
        self.classifier = nn.Linear(4*768, num_labels)

    def forward_custom(self, input_ids, attention_mask=None, labels=None, head_mask=None):
        """Return ``(logits, ...)`` or, when ``labels`` is given, ``(loss, logits, ...)``.

        The loss is cross-entropy with label index 0 ignored; when an attention
        mask is supplied only positions whose mask equals 1 take part in it.
        ``head_mask`` is accepted but not used.
        """
        encoder_out = self.bert(input_ids = input_ids, attention_mask=attention_mask)
        hidden_states = encoder_out[1]
        features = torch.cat([hidden_states[-depth] for depth in (1, 2, 3, 4)], -1)
        logits = self.classifier(self.dropout(features))  # bsz, seq_len, num_labels

        # Append whatever the encoder returned after its first two entries.
        result = (logits,) + encoder_out[2:]
        if labels is None:
            return result

        criterion = nn.CrossEntropyLoss(ignore_index=0)
        flat_logits = logits.view(-1, self.num_labels)
        flat_labels = labels.view(-1)
        if attention_mask is not None:
            active = attention_mask.view(-1) == 1
            flat_logits = flat_logits[active]
            flat_labels = flat_labels[active]
        return (criterion(flat_logits, flat_labels),) + result




######################################################################################################################
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    '''Multi-class focal loss: log-probabilities are scaled by (1 - p)**gamma before the NLL.'''

    def __init__(self, gamma=2, weight=None,ignore_index=-100):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.ignore_index=ignore_index

    def forward(self, input, target):
        """
        input: [N, C] raw scores
        target: [N, ] class indices; entries equal to ignore_index are skipped
        returns: the loss reduced by F.nll_loss (its default reduction)
        """
        log_probs = F.log_softmax(input, dim=1)
        # Down-weight well-classified examples: the factor shrinks as p approaches 1.
        focal_factor = (1 - log_probs.exp()) ** self.gamma
        return F.nll_loss(focal_factor * log_probs, target, self.weight,
                          ignore_index=self.ignore_index)


class LabelSmoothingCrossEntropy(nn.Module):
    """Cross-entropy with label smoothing.

    loss = eps/C * sum_c(-log p_c) + (1 - eps) * NLL, where C is the number of
    classes. Targets equal to ``ignore_index`` are excluded from BOTH terms
    (the smoothing term used to include them, so ignored / padding positions
    still contributed to the loss and to the mean's denominator).

    Args:
        eps: smoothing factor.
        reduction: 'mean', 'sum', or anything else for an unreduced per-sample loss.
        ignore_index: target value that does not contribute to the loss.
    """

    def __init__(self, eps=0.1, reduction='mean',ignore_index=-100):
        super(LabelSmoothingCrossEntropy, self).__init__()
        self.eps = eps
        self.reduction = reduction
        self.ignore_index = ignore_index

    def forward(self, output, target):
        """
        output: [N, C] raw scores
        target: [N, ] class indices
        """
        c = output.size()[-1]
        log_preds = F.log_softmax(output, dim=-1)

        # Uniform-target term per sample, zeroed where the target is ignored.
        keep = target != self.ignore_index
        smooth = (-log_preds.sum(dim=-1)).masked_fill(~keep, 0.0)
        if self.reduction=='sum':
            smooth = smooth.sum()
        elif self.reduction=='mean':
            # Average over the kept samples only, as F.nll_loss does for its own term.
            smooth = smooth.sum() / keep.sum().clamp(min=1)

        nll = F.nll_loss(log_preds, target, reduction=self.reduction,
                         ignore_index=self.ignore_index)
        return smooth*self.eps/c + (1-self.eps) * nll

################################################################################################################
# Define bert 4 layer
class BERT_LSTM_SOFTMAX(nn.Module):
    """BERT encoder + per-layer label projections + LSTM + linear token classifier.

    Each of the last four hidden layers is projected to ``num_labels`` scores
    by its own dropout / linear head; the four projections are concatenated,
    run through an LSTM, concatenated with the last hidden layer and classified.

    Compatibility note: the parameter names and shapes are unchanged, but the
    forward pass now (a) runs the LSTM over the sequence axis (batch_first=True)
    and (b) uses classifier_2..4 / dropout_2..4, which were defined but never
    used. Checkpoints trained with the previous wiring load without error but
    will not reproduce their old outputs and need to be retrained.
    """

    def __init__(self, bert_model, num_labels):
        super(BERT_LSTM_SOFTMAX, self).__init__()
        self.num_labels = num_labels
        self.bert = bert_model
        self.classifier_1 = nn.Linear(768, num_labels)
        self.dropout_1 = nn.Dropout(0.2)
        self.classifier_2 = nn.Linear(768, num_labels)
        self.dropout_2 = nn.Dropout(0.2)
        self.classifier_3 = nn.Linear(768, num_labels)
        self.dropout_3 = nn.Dropout(0.2)
        self.classifier_4 = nn.Linear(768, num_labels)
        self.dropout_4 = nn.Dropout(0.2)
        # input = LSTM output (256) concatenated with the last BERT layer (768)
        self.classifier = nn.Linear(256 + 768, num_labels)
        self.dropout = nn.Dropout(0.25)

        # batch_first=True: the features arrive as (batch, seq, feat). Without it the LSTM
        # treated the batch axis as time and mixed information between unrelated sentences.
        # NOTE(review): num_layers=4*num_labels gives a very deep LSTM and looks unintended;
        # left as is because changing it alters the parameter shapes — confirm.
        self.lstm = nn.LSTM(input_size = 4*num_labels ,hidden_size = 256 , num_layers = 4*num_labels,
                            dropout = 0.2, batch_first = True)

    def forward_custom(self, input_ids, attention_mask=None,
                       head_mask=None, labels=None):
        """Return ``(logits, ...)`` or, when ``labels`` is given, ``(loss, logits, ...)``.

        The loss is a FocalLoss with label index 0 ignored; when an attention
        mask is supplied only positions whose mask equals 1 take part in it.
        ``head_mask`` is accepted but not used.
        """
        outputs = self.bert(input_ids = input_ids, attention_mask=attention_mask)
        hidden_states = outputs[1]
        out_bert = hidden_states[-1]

        # One dropout / projection head per layer (previously head 1 was applied to all four).
        heads = (
            (self.dropout_1, self.classifier_1),
            (self.dropout_2, self.classifier_2),
            (self.dropout_3, self.classifier_3),
            (self.dropout_4, self.classifier_4),
        )
        projections = [
            classifier(dropout(hidden_states[-depth]))
            for depth, (dropout, classifier) in enumerate(heads, start=1)
        ]

        sequence_output = torch.cat(projections,-1)
        lstm_output = self.lstm(sequence_output)
        sequence_output = torch.cat((lstm_output[0], out_bert),-1)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output) # bsz, seq_len, num_labels
        outputs = (logits,) + outputs[2:]  # append whatever the encoder returned after its first two entries

        if labels is not None:
            loss_fct = FocalLoss(ignore_index=0)
            if attention_mask is not None:
                    active_loss = attention_mask.view(-1) == 1
                    active_logits = logits.view(-1, self.num_labels)[active_loss]
                    active_labels = labels.view(-1)[active_loss]
                    loss = loss_fct(active_logits, active_labels)
            else:
                    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs
        return outputs

In [9]:
########################### BERT_UTILS#############################################
import pickle
def read_dataset(dir_train):
    """Load a pickled list of sequences and drop the empty ones.

    Caution: ``pickle.load`` can execute arbitrary code, so only point this at
    trusted files.
    """
    with open(dir_train ,'rb') as handle:
        loaded = pickle.load(handle)
    return list(filter(lambda sequence: len(sequence) >= 1, loaded))

def split_data(data):
    """Unzip sentences of (word, label) items into ``(X, Y)``.

    ``X[i]`` lists element 0 and ``Y[i]`` element 1 of every item of sentence ``i``.
    """
    # Read each sentence exactly once, keeping only the two fields that are used.
    sentences = [[(item[0], item[1]) for item in sent] for sent in data]
    words = [[word for word, _ in sent] for sent in sentences]
    labels = [[label for _, label in sent] for sent in sentences]
    return words, labels



In [10]:
#1. Create Dataloader
# Pass the split explicitly: only the training loader is shuffled, while dev / test are read
# in corpus order (they previously fell back to the default type='train' and were shuffled too).
data_train = read_dataset(dir_train)
TRAIN_SET = BERT_DATALOADER(data_train, tokenizer, tag, device)
train_dataloader = TRAIN_SET.create_dataloader(type='train')

data_dev = read_dataset(dir_dev)
DEV_SET = BERT_DATALOADER(data_dev, tokenizer, tag, device)
dev_dataloader = DEV_SET.create_dataloader(type='dev')

data_test = read_dataset(dir_test)
TEST_SET = BERT_DATALOADER(data_test, tokenizer, tag, device)
test_dataloader = TEST_SET.create_dataloader(type='test')

In [11]:
#2. loading model
model = BERT_4_CRF(bert_model, num_labels=len(TRAIN_SET.tag2idx))
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine
# (`device` already falls back to the CPU when CUDA is unavailable).
state_dict = torch.load(path_model, map_location=device)
# strict=False tolerates key mismatches; report them so a partially loaded model is not used unnoticed.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print("Warning: checkpoint/model key mismatch - missing:", load_result.missing_keys,
          "unexpected:", load_result.unexpected_keys)
model.to(device)

BERT_4_CRF(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [12]:
#3. Evaluate model
# Dev split. tag_values is ['PAD'] + tag, the same list BERT_DATALOADER builds for its label indices.
BERT_EVALUATE(model=model, tokenizer=tokenizer, dataloader=dev_dataloader, tag_values= ['PAD']+tag, device = device, type_dataset='dev', model_type='crf')

Progress Bar: 100%|████████████████████████████████████████████████████████████████████| 81/81 [01:41<00:00,  1.25s/it]


Validation F1-Score: 0.6909863644988353
               precision    recall  f1-score   support

      ADDRESS     0.3944    0.8000    0.5283        35
       PERSON     0.9188    0.9307    0.9247      3075
  PHONENUMBER     0.2581    1.0000    0.4103         8
MISCELLANEOUS     0.4724    0.5128    0.4918       234
     QUANTITY     0.6020    0.8284    0.6973      2133
   PERSONTYPE     0.5372    0.7877    0.6388       843
 ORGANIZATION     0.7390    0.9177    0.8187      4214
      PRODUCT     0.4349    0.5962    0.5029      1149
     LOCATION     0.8433    0.8688    0.8559      3636
            O     0.9856    0.9487    0.9668    110861
     DATETIME     0.5699    0.8727    0.6896      1760
        EVENT     0.4512    0.6609    0.5363       699
          URL     0.8545    1.0000    0.9216        47

    micro avg     0.9355    0.9354    0.9354    128694
    macro avg     0.6201    0.8250    0.6910    128694
 weighted avg     0.9479    0.9354    0.9399    128694



In [13]:
# Test split; same call as for dev, with the 'test' label-exclusion rules of the evaluation routine.
BERT_EVALUATE(model=model, tokenizer=tokenizer, dataloader=test_dataloader, tag_values= ['PAD']+tag, device = device, type_dataset='test', model_type='crf')

Progress Bar: 100%|██████████████████████████████████████████████████████████████████| 133/133 [02:42<00:00,  1.22s/it]


Validation F1-Score: 0.583148808905693
               precision    recall  f1-score   support

      ADDRESS     0.0811    0.7500    0.1463        16
        EMAIL     0.0000    0.0000    0.0000         7
       PERSON     0.9438    0.9482    0.9460      6623
  PHONENUMBER     0.1875    0.5000    0.2727         6
MISCELLANEOUS     0.1683    0.3522    0.2277       247
     QUANTITY     0.8044    0.8256    0.8149      7185
   PERSONTYPE     0.6378    0.3752    0.4725      3651
 ORGANIZATION     0.7253    0.8541    0.7845      6571
      PRODUCT     0.4527    0.5191    0.4836      2886
     LOCATION     0.8603    0.8445    0.8523      3698
            O     0.9784    0.9695    0.9740    232378
     DATETIME     0.8270    0.9557    0.8867      4563
        EVENT     0.2456    0.6481    0.3562       449
          URL     0.9300    0.9637    0.9466       331

    micro avg     0.9463    0.9463    0.9463    268611
    macro avg     0.5602    0.6790    0.5831    268611
 weighted avg     0.9501

In [14]:
#4. Predict model
# Sample Vietnamese news text used to demonstrate prediction and visualisation.
test_sentence = '''
Apple đứng đầu danh sách thương hiệu tốt nhất của Interbrand năm thứ 9 liên tiếp
Năm thứ 9 liên tiếp, công ty tư vấn thương hiệu toàn cầu Interbrand đã xếp Apple vào vị trí đầu danh sách các thương hiệu giá trị nhất thế giới.
Giá trị thương hiệu được công ty Interbrand định giá cho Apple là 408,6 tỷ USD. Con số này tăng 26% so với năm ngoái khi thương hiệu của Apple được định giá 323 tỷ USD.
Amazon và Microsoft chiếm vị trí thứ hai và thứ ba với mức định giá tương ứng là 249 tỷ USD và 210 tỷ USD trong khi Google và Samsung lọt vào top 5. Định giá chung của Apple, Amazon và Microsoft chiếm 62,3% tổng giá trị của 10 thương hiệu hàng đầu được đánh giá trong báo cáo năm nay.
"Apple đã đa dạng hóa sản phẩm hơn trong lĩnh vực chăm sóc sức khỏe với Apple Watch hiện ghi lại nồng độ oxy trong máu, dịch vụ đăng ký trong giải trí, lưu trữ dữ liệu và âm nhạc. Thương hiệu này tiếp tục gắn bó với khách hàng và tiếp tục phát triển mạnh mẽ" - Báo cáo của Interbrand viết - "Cuối cùng, Apple vẫn thể hiện hướng đi đáng chú ý và giá trị thương hiệu tập trung vào việc cung cấp cho người tiêu dùng trải nghiệm đơn giản, liền mạch. Điều này đã được khẳng định khi gia tăng 26% về giá trị thương hiệu".
Interbrand tính toán định giá thương hiệu như một thước đo sức mạnh thương hiệu, tính đến nhiều yếu tố bên trong và bên ngoài chủ quan như khả năng lãnh đạo, mức độ gắn kết và mức độ liên quan. Kết quả là đánh giá tác động của một công ty đối với khách hàng, nhân viên và nhà đầu tư. Interbrand cho biết, các thương hiệu mạnh tạo ra ảnh hưởng đối với người tiêu dùng, tạo ra một lượng khách hàng trung thành, thu hút và giữ chân nhân viên cũng như giảm chi phí tài chính.
Theo dữ liệu lịch sử của Interbrand, Apple bắt đầu giành được chỗ đứng trước các đối thủ cạnh tranh trong phân khúc công nghệ và các tên tuổi gia dụng như McDonalds và GE vào năm 2012. Apple đã vượt qua Coca-Cola, trở thành thương hiệu giá trị nhất thế giới vào năm 2013.
'''
# The label list must be ['PAD'] + tag so that predicted indices map to the same labels as in evaluation.
prediction= BERT_PREDICT(model, tokenizer, ['PAD']+tag, device, test_sentence, 'crf')
# Last expression of the cell: renders the highlighted entities inline.
BERT_VISUALIZE(prediction)