In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file shipped under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import warnings
# Suppress every Python warning for the rest of the session.  This also hides
# deprecation notices emitted by the libraries imported in the following cells.
warnings.filterwarnings('ignore')

In [3]:
import pickle
import torch
from sklearn.metrics import classification_report

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig, BertModel
from transformers import XLMRobertaTokenizer, XLMRobertaConfig, XLMRobertaModel
from transformers import AutoTokenizer, AutoConfig, AutoModel

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup, get_constant_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup

from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm, trange
import numpy as np 

import torch.nn as nn



import torch.nn.functional as F
log_soft = F.log_softmax

In [4]:
# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(device)

# Maximum number of sub-word tokens per sequence and the batch size; both are
# passed to BertDataLoader as `maxlen` / `bs`.
MAX_LEN = 256
BS = 32

# Dropout rates: classification head, backbone attention probabilities and
# backbone hidden states (the latter two are written into the model config).
linear_dropout = 0.3
bert_att_dropout = 0.2
bert_hidd_dropout = 0.2

# Optimiser settings: `bert_*` apply to the pretrained backbone parameters,
# `softmax_*` to the classification head.
bert_lr = 1e-5
bert_weight_decay = 1e-5
softmax_lr = 5e-4
softmax_weight_decay = 1e-3

### Preprocessing data

In [5]:
class BertDataLoader:
    """Build a padded, batched ``DataLoader`` from a pickled NER corpus.

    The pickle is expected to hold a list of sentences, each a list of
    ``(word, tag)`` pairs.  ``create_dataloader`` runs the whole pipeline:
    read -> split -> sub-word tokenise -> cut over-long sequences ->
    pad/index -> tensors -> ``DataLoader``.

    Args:
        dir: path of the pickle file to read.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        tag_values: ordered tag names; must contain ``'PAD'``.
        device: device the tensors are moved to.
        is_train: selects the tensor dtypes in ``covert2tensor`` and whether
            the loader shuffles (training) or keeps corpus order (evaluation).
        bs: batch size.
        maxlen: maximum number of sub-word tokens per sequence.
        subword_marker: marker used by the tokenizer: ``'▁'`` (SentencePiece
            word start), ``'##'`` (WordPiece continuation) or ``'@@'``
            (BPE continuation suffix).  Used to find safe cut points.
    """

    def __init__(self, dir, tokenizer, tag_values, device, is_train = False, bs = 32, maxlen = 256,
                 subword_marker = '▁'):
        self.dir_train = dir
        self.MAX_LEN = maxlen
        self.BATCH_SIZE = bs
        self.tag_values = tag_values
        self.tag2idx = {t: i for i, t in enumerate(self.tag_values)}
        self.device = device
        self.tokenizer = tokenizer
        self.is_train = is_train
        self.subword_marker = subword_marker

    def read_dataset(self):
        """Load the pickled corpus and drop empty sentences."""
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # use it on data files you trust.
        with open(self.dir_train, 'rb') as f:
            data = pickle.load(f)
        return [sq for sq in data if len(sq) >= 1]

    def split_data(self, data):
        """Split ``[[(word, tag), ...], ...]`` into parallel word and tag lists."""
        X, Y = [], []
        for sent in data:
            X.append([word[0] for word in sent])
            Y.append([word[1] for word in sent])
        return X, Y

    def check_label(self, data):
        '''
        input: [[('Hello','O'),...],...]
        output: sorted list of the distinct tags, e.g. ['LOC', 'O', 'ORG', ...]
        '''
        # Sorted so that report rows and metric label order are reproducible
        # across runs (plain set order depends on string hashing).
        return sorted({label for sent in data for _, label in sent})

    def isNotSubword(self, x, idx, sub = '##'):
        """Return True when ``x[idx]`` is a complete word on its own.

        The test depends on the marker convention:
          * ``'##'`` (continuation prefix): neither this nor the next token
            carries the marker;
          * ``'@@'`` (continuation suffix): neither this nor the previous token
            carries the marker;
          * anything else (word-start marker such as ``'▁'``): both this and
            the next token carry the marker.
        """
        if sub == '##':
            return sub not in x[idx] and idx < len(x) - 1 and sub not in x[idx+1]
        elif sub == '@@':
            return sub not in x[idx] and idx > 0 and sub not in x[idx-1]
        return sub in x[idx] and idx < len(x) - 1 and sub in x[idx+1]

    def _is_sentence_end(self, token, punct = '.!?'):
        """Return True when ``token`` is sentence-final punctuation.

        A leading word-start marker (e.g. ``'▁.'``) is ignored so that
        punctuation tokenised as its own word is still recognised.
        """
        marker = self.subword_marker
        if marker not in ('##', '@@') and token.startswith(marker):
            token = token[len(marker):]
        return token != '' and token in punct

    def cutting_subword(self, X, y):
        """Cut one tokenised sequence into chunks of at most ``MAX_LEN`` tokens.

        Within each window of ``MAX_LEN`` tokens the cut point is chosen, in
        order of preference, right after:
          1. the last sentence-final punctuation token tagged ``'O'``;
          2. the last single-token word tagged ``'O'``;
          3. the last single-token word of any tag;
          4. the end of the window (hard cut).

        Returns:
            ``(chunks_X, chunks_y)`` — parallel lists of token and tag chunks.
        """
        res_X, res_y = [], []
        st = 0
        cur = 0

        while (st < len(X)-self.MAX_LEN):
            cut = None
            for i in range(st+self.MAX_LEN-1, st-1, -1):
                if y[i] == 'O' and self._is_sentence_end(X[i]):
                    cut = i+1
                    break
            if cut is None:
                fallback = None
                for i in range(st+self.MAX_LEN-1, st-1, -1):
                    if self.isNotSubword(X, i, sub=self.subword_marker):
                        if y[i] == 'O':
                            cut = i+1
                            break
                        if fallback is None:
                            # Latest word boundary; used if no 'O' one exists.
                            fallback = i+1
                if cut is None:
                    cut = fallback
            if cut is None:
                # No usable boundary in the window: hard cut.
                cut = st + self.MAX_LEN
            cur = cut

            res_X.append(X[st: cur])
            res_y.append(y[st: cur])
            st = cur

        res_X.append(X[cur:])
        res_y.append(y[cur:])
        return res_X, res_y

    def add_subword(self, sentence, text_labels):
        '''
        input:
            sentence = ['Phạm', 'Văn', 'Mạnh']
            text_labels = ['B-PER', 'I-PER','I-PER']

        output:
            ['Phạm', 'Văn', 'M', '##ạnh'],
            ['B-PER', 'I-PER', 'I-PER', 'I-PER']
        '''
        tokenized_sentence = []
        labels = []
        for word, label in zip(sentence, text_labels):
            subwords = self.tokenizer.tokenize(word)
            tokenized_sentence.extend(subwords)
            # Every sub-word piece inherits the tag of its word.
            labels.extend([label] * len(subwords))
        return tokenized_sentence, labels

    def add_subword2data(self, X, Y):
        '''
            input:
                sentence = [['Phạm', 'Văn', 'Mạnh',..],....]
                text_labels = [['B-PER', 'I-PER','I-PER',..],...]

            output:
                [['Phạm', 'Văn', 'M', '##ạnh',..],....],
                [['B-PER', 'I-PER','I-PER','I-PER',..],...]
        '''
        tokenized_texts, labels = [], []
        for sentence, text_labels in zip(X, Y):
            tokens, tags = self.add_subword(sentence, text_labels)
            tokenized_texts.append(tokens)
            labels.append(tags)
        return tokenized_texts, labels

    def padding_data(self, X_subword, y_subword):
        '''
            input:
                X = [['Phạm', 'Văn', 'M', '##ạnh',..],....]
                Y = [['B-PER', 'I-PER','I-PER','I-PER',..],...]

            output:
            [[10,20,30,40,0,0,0,0,0,0,0,0...],...],
            [[1, 2,3,4,5,5,5,5,5,5,5,5,5,...],...]
            plus the attention masks (1.0 for real tokens, 0.0 for padding)

            raises KeyError when a tag is not listed in ``tag_values``.
        '''
        X_padding = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in X_subword],
                          maxlen=self.MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

        # Index directly (not .get) so an unknown tag fails with its name
        # instead of a None that breaks later inside pad_sequences.
        y_padding = pad_sequences([[self.tag2idx[l] for l in lab] for lab in y_subword],
                        maxlen=self.MAX_LEN, value=self.tag2idx["PAD"], padding="post",
                        dtype="long", truncating="post")
        # Token id 0 is the padding value written above.
        attention_masks = [[float(i != 0.0) for i in ii] for ii in X_padding]

        return X_padding, y_padding, attention_masks

    def covert2tensor(self, X_padding, Y_padding, attention_masks):
        """Convert the padded arrays to tensors on ``self.device``.

        Training keeps the default dtypes (float masks); evaluation casts
        everything to ``LongTensor``.
        """
        if self.is_train:
            X_tensor = torch.tensor(X_padding).to(self.device)
            y_tensor = torch.tensor(Y_padding).to(self.device)
            masks = torch.tensor(attention_masks).to(self.device)
        else:
            X_tensor = torch.tensor(X_padding).type(torch.LongTensor).to(self.device)
            y_tensor = torch.tensor(Y_padding).type(torch.LongTensor).to(self.device)
            masks = torch.tensor(attention_masks).type(torch.LongTensor).to(self.device)
        return X_tensor, y_tensor, masks

    def create_dataloader(self):
        """Run the full pipeline.

        Returns:
            ``(dataloader, labels)`` where each batch is
            ``(input_ids, attention_mask, label_ids)`` and ``labels`` is the
            sorted list of tags present in the file.
        """
        dataset = self.read_dataset()
        labels = self.check_label(dataset)
        X, Y = self.split_data(dataset)
        X_subword, y_subword = self.add_subword2data(X, Y)
        long_subword = [seq for seq in X_subword if len(seq) > self.MAX_LEN]
        print(f"Before cutting: \nX_subword: {X_subword[0]}, \nMax_seq: {max([len(line) for line in X_subword])}\
           \nThe number of seq have len larger {self.MAX_LEN}: {len(long_subword)} \nThe number of total seq: {len(X_subword)}")
        X_subword_at, y_subword_at = [], []
        for tokens, tags in zip(X_subword, y_subword):
            res_x, res_y = self.cutting_subword(tokens, tags)
            X_subword_at += res_x
            y_subword_at += res_y
        long_subword_at = [seq for seq in X_subword_at if len(seq) > self.MAX_LEN]
        print(f"After cutting: \nX_subword: {X_subword_at[0]}, \nMax_seq: {max([len(line) for line in X_subword_at])}\
           \nThe number of seq have len larger: {self.MAX_LEN}: {len(long_subword_at)} \nThe number of total seq: {len(X_subword_at)}")
        X_padding, y_padding, attention_masks = self.padding_data(X_subword_at, y_subword_at)
        X_tensor, y_tensor, masks = self.covert2tensor(X_padding, y_padding, attention_masks)
        tensor_data = TensorDataset(X_tensor, masks, y_tensor)
        # Shuffle only for training; evaluation keeps a deterministic order.
        if self.is_train:
            sampler = RandomSampler(tensor_data)
        else:
            sampler = SequentialSampler(tensor_data)
        dataloader = DataLoader(tensor_data, sampler = sampler, batch_size = self.BATCH_SIZE)
        return dataloader, labels

In [6]:
# Pickled train / dev / test / demo splits read by BertDataLoader.
dir_train = '../input/vlsp2021-newfinal/train_full_15t12_9h45.pkl'
dir_dev = '../input/vlsp2021-newfinal/dev_full_15t12_9h45.pkl'
dir_test = '../input/vlsp2021-newfinal/test_full_15t12_9h45.pkl'
dir_demo = '../input/vlsp2021-newfinal/demo.pkl'
# Short name -> pretrained checkpoint identifier passed to `from_pretrained`.
pre_train = {
    'mbert': 'bert-base-multilingual-cased',
    'xlmr_base': 'xlm-roberta-base',
    'xlmr_large': 'xlm-roberta-large',
    'phobert': 'vinai/phobert-base'
}
# `model_name` is matched by substring ('mbert' / 'phobert' / 'xlmr' / 'crf')
# during evaluation to choose how sub-word pieces are merged back into words.
model_name = 'xlmr_softmax'
pre_trained_name = pre_train['xlmr_base']

In [7]:
# Load the tokenizer (cased, non-fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(pre_trained_name, do_lower_case=False,use_fast=False)
# Ask the backbone to return the hidden states of every layer; the classifier
# head defined later concatenates the last four of them.
config = AutoConfig.from_pretrained(pre_trained_name, output_hidden_states=True)
# Override the checkpoint's dropout rates with the values from the config cell.
config.hidden_dropout_prob = bert_hidd_dropout
config.attention_probs_dropout_prob = bert_att_dropout
# Load the pretrained backbone, requesting no pooling layer.
model = AutoModel.from_pretrained(pre_trained_name, config=config, add_pooling_layer=False)

In [8]:
# Tag inventory.  The position in this list is the integer class id, so 'PAD'
# must stay first: index 0 is the padding label and is the index ignored by the
# loss in BaseBertSoftmax.
tag_values = ['PAD', 'ADDRESS','SKILL','EMAIL','PERSON','PHONENUMBER','MISCELLANEOUS','QUANTITY','PERSONTYPE',
              'ORGANIZATION','PRODUCT','IP','LOCATION','O','DATETIME','EVENT', 'URL']
# #Merge ADDRESS
# tag_values = ['PAD','SKILL','EMAIL','PERSON','PHONENUMBER','MISCELLANEOUS','QUANTITY','PERSONTYPE',
#               'ORGANIZATION','PRODUCT','IP','LOCATION','O','DATETIME','EVENT', 'URL']
# Tag name -> class id; shown as the cell output.
tag2idx = {t: i for i, t in enumerate(tag_values)}
tag2idx

In [9]:
# One loader factory per split; only the training split is built with
# is_train=True.  Nothing is read from disk until create_dataloader() is called.
TRAIN = BertDataLoader(dir_train, tokenizer, tag_values, device, is_train = True, bs = BS, maxlen = MAX_LEN)
DEV = BertDataLoader(dir_dev, tokenizer, tag_values, device, is_train = False, bs = BS, maxlen = MAX_LEN)
TEST = BertDataLoader(dir_test, tokenizer, tag_values, device, is_train = False, bs = BS, maxlen = MAX_LEN)
#DEMO = BertDataLoader(dir_demo, tokenizer, tag_values, device, is_train = False, bs = BS, maxlen = MAX_LEN)

In [10]:
# Read, tokenise, cut and pad each split.  Each call returns the DataLoader and
# the list of distinct tags found in that split's file.
train_loader, train_labels = TRAIN.create_dataloader()
dev_loader, dev_labels = DEV.create_dataloader()
test_loader, test_labels = TEST.create_dataloader()
#demo_loader, demo_labels = DEMO.create_dataloader()

## 2. Modelling

In [None]:
# eps = ['SKILL', 'PRODUCT', 'PERSONTYPE', 'MISC', 'EVENT', 'ADDRESS']
# class_weights =  [1.2 if tag in eps else 1 for tag in tag_values]
# # converting list of class weights to a tensor
# weights= torch.tensor(class_weights,dtype=torch.float)
# weights = weights.to(device)

In [11]:
class BaseBertSoftmax(nn.Module):
    """Token classifier: pretrained encoder -> dropout -> linear layer.

    Args:
        model: pretrained encoder.  It is called as
            ``model(input_ids=..., attention_mask=...)`` and, when ``concat``
            is True, must return the hidden states of all layers.
        drop_out: dropout probability applied before the linear layer.
        num_labels: number of output classes (class 0 is treated as padding
            and ignored by the loss).
        concat: if True, the last four hidden layers are concatenated and fed
            to the classifier; otherwise only the final layer is used.
    """

    def __init__(self, model, drop_out , num_labels , concat=True):
        super(BaseBertSoftmax, self).__init__()
        self.concat = concat
        self.num_labels = num_labels
        self.model = model
        self.dropout = nn.Dropout(drop_out)

        # Take the width from the encoder config so that larger checkpoints
        # (e.g. hidden size 1024) work; 768 is kept as the fallback.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 768)
        in_features = 4 * hidden_size if self.concat else hidden_size
        self.classifier = nn.Linear(in_features, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None):
        """Standard ``nn.Module`` entry point; delegates to ``forward_custom``."""
        return self.forward_custom(input_ids, attention_mask=attention_mask,
                                   labels=labels, head_mask=head_mask)

    def forward_custom(self, input_ids, attention_mask=None,
                        labels=None, head_mask=None):
        """Compute logits and, when ``labels`` is given, the loss.

        Args:
            input_ids: token ids, one row per sequence.
            attention_mask: 1 for real tokens, 0 for padding (optional).
            labels: class ids aligned with ``input_ids`` (optional).
            head_mask: accepted for interface compatibility; not used.

        Returns:
            ``(logits, ...)`` without labels, ``(loss, logits, ...)`` with
            labels.  ``logits`` has one score per label for every token.
        """
        outputs = self.model(input_ids = input_ids, attention_mask=attention_mask)
        if self.concat:
            # Prefer the named field; fall back to position 1, which is where
            # the hidden states sit when the encoder has no pooling output.
            hidden_states = getattr(outputs, 'hidden_states', None)
            if hidden_states is None:
                hidden_states = outputs[1]
            sequence_output = torch.cat(
                (hidden_states[-1], hidden_states[-2], hidden_states[-3], hidden_states[-4]), -1)
        else:
            sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)

        logits = self.classifier(sequence_output) # bsz, seq_len, num_labels
        outputs = (logits,) + tuple(outputs[2:])  # keep any extra encoder outputs
        if labels is not None:
            # Class 0 is the padding label and does not contribute to the loss.
            loss_fct = nn.CrossEntropyLoss(ignore_index=0)
            if attention_mask is not None:
                # Only score positions that hold real tokens.
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs
        return outputs  # (loss), logits, (extra encoder outputs)

In [12]:
# `model` is rebound from the bare encoder to the classifier wrapper.  Guard the
# wrap so that re-running this cell does not wrap an already-wrapped model.
if not isinstance(model, BaseBertSoftmax):
    model = BaseBertSoftmax(model=model, drop_out=linear_dropout, num_labels=len(tag_values))
model.to(device)

#### Exploring the parameters

In [13]:
# #!pip install prettytable
# from prettytable import PrettyTable

# def count_parameters(model):
#     table = PrettyTable(["Modules", "Parameters"])
#     total_params = 0
#     for name, parameter in model.named_parameters():
#         if not parameter.requires_grad: continue
#         param = parameter.numel()
#         table.add_row([name, param])
#         total_params+=param
#     print(table)
#     print(f"Total Trainable Params: {total_params}")
#     return total_params
    
# count_parameters(model)

In [14]:
# Number of parameter tensors that belong to the pretrained encoder.  They come
# first in `model.named_parameters()` because `model.model` is registered before
# the classifier.  Derived from the model (197 for a 12-layer encoder without
# pooler) so that other encoder sizes are split correctly as well.
num_layer = len(list(model.model.named_parameters()))

# True: fine-tune encoder + head.  False: train the head only (encoder frozen).
FINETUNING = True

named_params = list(model.named_parameters())
param_optimizer1 = named_params[:num_layer]   # encoder
param_optimizer2 = named_params[num_layer:]   # classification head

for cnt, (name, param) in enumerate(named_params):
    param.requires_grad = FINETUNING or cnt >= num_layer
    print(cnt,name,'\t',param.requires_grad)

# Biases and LayerNorm weights are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight'] #['bias', 'gamma', 'beta']

def _uses_decay(param_name):
    """True when the parameter should be regularised with weight decay."""
    return not any(nd in param_name for nd in no_decay)

# NOTE: the per-group option read by AdamW is 'weight_decay'; an unknown key
# such as 'weight_decay_rate' is stored but never applied.
head_groups = [
    {'params': [p for n, p in param_optimizer2 if _uses_decay(n)],
     'weight_decay': softmax_weight_decay,
     'lr': softmax_lr},
    {'params': [p for n, p in param_optimizer2 if not _uses_decay(n)],
     'weight_decay': 0.0,
     'lr': softmax_lr},
]
if FINETUNING:
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer1 if _uses_decay(n)],
         'weight_decay': bert_weight_decay},
        {'params': [p for n, p in param_optimizer1 if not _uses_decay(n)],
         'weight_decay': 0.0},
    ] + head_groups
else:
    optimizer_grouped_parameters = head_groups

# Groups without an explicit 'lr' (the encoder) use `bert_lr`.
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=bert_lr,
    eps=1e-8
)

### Evaluate

In [15]:
def show_span_f1(dic):
    """Turn a ``{tag: {metric: value}}`` report into a DataFrame indexed by tag.

    Rows follow the iteration order of ``dic``; columns are the metric names.
    """
    tag_names = list(dic.keys())
    metric_rows = list(dic.values())
    report = pd.DataFrame(metric_rows)
    return report.set_index([pd.Index(tag_names)])

def convert_spanformat(arr):
    """Group consecutive equally-tagged words into character spans.

    Offsets are computed as if the words were joined by single spaces.
    Neighbouring words that share a tag are merged into one span.

    Args:
        arr: list of ``(word, tag)`` pairs.

    Returns:
        ``None`` for an empty ``arr``; otherwise ``{tag: [(start, end), ...]}``
        for every tag except ``'O'`` (``end`` is exclusive).
    """
    if len(arr) < 1:
        return None

    pos = 0
    runs = []  # [start, end, tag] for each run of identical tags
    for word, tag in arr:
        end = pos + len(word)
        if runs and tag == runs[-1][2]:
            runs[-1][1] = end          # extend the current run
        else:
            runs.append([pos, end, tag])
        pos = end + 1                  # +1 for the separating space

    res = dict()
    for s, e, l in runs:
        if l != 'O':
            res.setdefault(l, []).append((s, e))
    return res
 
def compare_span(span1, span2, res, strict= True):
    """Accumulate span-overlap statistics for one sequence into ``res``.

    ``res[label]`` is a 4-element list:
    ``[overlap w.r.t. span1, overlap w.r.t. span2, #span1 spans, #span2 spans]``.
    Every span of ``span1`` is compared with every span of ``span2`` that has
    the same label; with ``strict`` the overlap fractions are truncated to
    integers before being added.

    Returns:
        The same ``res`` dict, updated in place.
    """
    for label in set(list(span1.keys()) + list(span2.keys())):
        counts = res.setdefault(label, [0, 0, 0, 0])
        in_first = label in span1
        in_second = label in span2
        if in_first:
            counts[2] += len(span1[label])
        if in_second:
            counts[3] += len(span2[label])
        if not (in_first and in_second):
            continue
        for s, e in span1[label]:
            for s1, e1 in span2[label]:
                frac_first, frac_second = iou_single(s, e, s1, e1)
                if strict:
                    frac_first, frac_second = int(frac_first), int(frac_second)
                counts[0] += frac_first
                counts[1] += frac_second
    return res
 
def iou_single(s1, e1, s2, e2):
    """Return the overlap of two intervals as a fraction of each interval.

    Returns:
        ``(overlap / len(first), overlap / len(second))``; a fraction is ``0``
        when the corresponding interval has non-positive length.
    """
    overlap = max(0, min(e1, e2) - max(s1, s2))
    first_len = e1 - s1
    second_len = e2 - s2
    first_ratio = overlap / first_len if first_len > 0 else 0
    second_ratio = overlap / second_len if second_len > 0 else 0
    return first_ratio, second_ratio
 
# (token - True - pred) 
# [[ ],[ ]]           
def span_f1(arr, labels = None, strict=True, digit=4):
    """Macro-averaged span-level F1 over a list of tagged sequences.

    Args:
        arr: list of sequences, each a list of ``(token, true_tag, pred_tag)``.
            Empty sequences are skipped.
        labels: labels to report; defaults to every label found in the gold
            spans.  Labels that never occurred are reported with zeros.
        strict: passed to ``compare_span`` (truncate overlap fractions).
        digit: number of decimals in the per-label report.

    Returns:
        ``(macro_f1, report)`` where ``report`` maps each label to its
        precision, recall, f1 and support.  ``(0.0, {})`` when there is no
        label to score.
    """
    all_labels = set()
    dictt = dict()
    for ar in arr:
        if len(ar) < 1:
            # Nothing to unpack for a sequence that lost all of its tokens.
            continue
        text, gt, pred = list(zip(*ar))
        gtSpan = convert_spanformat(list(zip(text, gt)))
        predSpan = convert_spanformat(list(zip(text, pred)))
        dictt = compare_span(predSpan, gtSpan, dictt, strict)

        all_labels.update(list(gtSpan.keys()))
    classfication_rp = dict()
    f1_avg = 0
    if labels is None:
        labels = all_labels
    if len(labels) == 0:
        return 0.0, classfication_rp
    for i in labels:
        # [pred overlap, gold overlap, #pred spans, #gold spans]
        stats = dictt.get(i, [0, 0, 0, 0])
        precision = stats[0] / stats[2] if stats[2] > 0 else 0
        recall = stats[1] / stats[3] if stats[3] > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        classfication_rp[i] = {'precision': round(precision, digit), 'recall': round(recall, digit), 'f1': round(f1, digit), 'support': stats[3]}
        f1_avg += f1
    return f1_avg / len(labels), classfication_rp

def merge_subtags_3column(tokens, tags_true, tags_predict, model_name):
    """Merge sub-word pieces back into words, keeping the tags of the first piece.

    The merging rule is chosen by substring match on ``model_name``:
      * ``'mbert'``   — pieces containing ``'##'`` continue the previous word;
      * ``'phobert'`` — a piece continues the previous one while that one
        still ends in ``'@@'``;
      * ``'xlmr'``    — pieces containing ``'▁'`` start a new word.
    A continuation piece at the very start of the sequence (possible after a
    sequence was cut mid-word) starts a word of its own.

    Args:
        tokens: sub-word tokens of one sequence.
        tags_true: gold tags, indexable by token position.
        tags_predict: predicted tags, indexable by token position.
        model_name: name used to pick the merging rule.

    Returns:
        ``(words, true_tags, predicted_tags)`` — three lists of equal length.
        All three are empty when ``model_name`` matches none of the rules.
    """
    tags = []
    tests = []
    trues = []

    def start_word(text, index):
        # Open a new word and record the tags found at its first piece.
        tests.append(text)
        tags.append(tags_predict[index])
        trues.append(tags_true[index])

    if 'mbert' in model_name:
        for index, token in enumerate(tokens):
            if "##" not in token:
                start_word(token, index)
            elif len(tests) == 0:
                # Leading continuation piece: there is no word to append to.
                start_word(token.replace("##", ""), index)
            else:
                tests[-1] = tests[-1] + token.replace("##", "")
    elif 'phobert' in model_name:
        for index, token in enumerate(tokens):
            if len(tests) > 0 and "@@" in tests[-1]:
                tests[-1] = tests[-1][:-2] + token
            else:
                start_word(token, index)
    elif 'xlmr' in model_name:
        for index, token in enumerate(tokens):
            if "▁" in token:
                start_word(token[1:], index)
            elif len(tests) == 0:
                start_word(token, index)
            else:
                tests[-1] = tests[-1] + token
    return tests, trues, tags

In [16]:
def evaluate(model, dataloader, is_train = False):
    """Evaluate ``model`` on ``dataloader`` at word level.

    Prints the mean batch loss and a token-level classification report, then
    returns the result of ``span_f1`` on the merged
    ``(word, gold tag, predicted tag)`` sequences.

    Relies on the notebook globals ``device``, ``model_name``, ``tokenizer``
    and ``tag_values``.  ``is_train`` is accepted but not used.
    """
    model.eval()
    eval_loss = 0
    predictions_f1, true_labels_f1 = [], []
    out = []
    for batch in dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model.forward_custom(b_input_ids, b_input_mask, b_labels)
        eval_loss += outputs[0].mean().item()

        gold_rows = b_labels.to('cpu').numpy().tolist()
        id_rows = b_input_ids.to('cpu').numpy().tolist()
        if 'crf' in model_name:
            pred_rows = outputs[1]
        else:
            # Most likely class for every token position.
            scores = outputs[1].detach().cpu().numpy()
            pred_rows = [row for row in np.argmax(scores, axis=2)]

        for ids, pred_row, gold_row in zip(id_rows, pred_rows, gold_rows):
            # Padding positions carry token id 0; keep only the real tokens.
            n_real = sum(np.array(ids) != 0)
            pieces = tokenizer.convert_ids_to_tokens(ids)[:n_real]
            gold_tags = [tag_values[i] for i in gold_row][:n_real]
            pred_tags = [tag_values[i] for i in pred_row]
            words, word_golds, word_preds = merge_subtags_3column(pieces, gold_tags, pred_tags, model_name)
            #(token - label - pred)
            out.append(list(zip(words, word_golds, word_preds)))
            predictions_f1.extend(word_preds)
            true_labels_f1.extend(word_golds)
    eval_loss = eval_loss / len(dataloader)
    print(f"Validation loss: {eval_loss:.4f}")
    LABEL = list(set(true_labels_f1))
    print("Classification_report:\n {}".format(classification_report(true_labels_f1, predictions_f1, labels =  LABEL, digits=4)))
    return span_f1(out, labels = None, strict=True, digit=4)

### Training

In [17]:
def train_model(model, train_loader, dev_loader, scheduler, epochs=60, patience=15, max_grad_norm=1.0, PATH='ner_model.pt', dev_labels=None):
    """Train ``model`` with early stopping on the dev span-F1.

    After every epoch the model is evaluated on ``dev_loader``; the weights
    with the best span-F1 so far are saved to ``PATH``.  Training stops once
    ``patience`` consecutive epochs bring no improvement.  The best saved
    weights (if any) are loaded back before returning.

    Relies on the notebook globals ``optimizer``, ``device`` and ``tag_values``.

    Args:
        model: module exposing ``forward_custom``.
        train_loader / dev_loader: loaders yielding
            ``(input_ids, attention_mask, labels)`` batches.
        scheduler: learning-rate scheduler stepped after every batch.
        epochs: maximum number of epochs.
        patience: number of non-improving epochs tolerated.
        max_grad_norm: gradient-norm clipping threshold.
        PATH: file the best weights are written to.
        dev_labels: labels passed to ``f1_score`` for the training F1.

    Returns:
        ``(model, history)``; ``history`` holds per-epoch
        ``train_loss_values``, ``f1_train_list`` and ``f1_dev_list``.
        ``valid_loss_values`` is present but stays empty because
        ``evaluate`` only prints the validation loss and does not return it.
    """
    train_loss_values, valid_loss_values = [], []
    f1_train_list, f1_dev_list = [], []
    # The lists are filled in place, so the dict can be built once.
    history = {
        'train_loss_values': train_loss_values,
        'valid_loss_values': valid_loss_values,
        'f1_train_list': f1_train_list,
        'f1_dev_list': f1_dev_list,
    }
    f1_max = 0
    # Initialised up front: the first epoch may already fail to improve.
    epochs_no_improve = 0
    best_epoch = None
    for epoch in trange(epochs, desc="Epoch"):
        # ========================================
        #               Training
        # ========================================
        model.train()
        train_loss = 0
        train_pred , train_true = [], []
        for step, batch in enumerate(train_loader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # Clear gradients left over from the previous step.
            model.zero_grad()

            outputs = model.forward_custom(input_ids=b_input_ids, attention_mask=b_input_mask,
                                           labels=b_labels, head_mask=None)
            # Keep predictions and gold ids for the epoch-level training F1.
            logits = outputs[1].detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            train_pred.extend([list(p) for p in np.argmax(logits, axis=2)])
            train_true.extend(label_ids)

            loss = outputs[0]
            loss.backward()
            train_loss += loss.item()
            # Clip the gradient norm to limit exploding gradients.
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            optimizer.step()
            scheduler.step()
        avg_train_loss = train_loss / len(train_loader)
        train_loss_values.append(avg_train_loss)

        # Sub-word level training F1, ignoring padded positions.
        train_pred_tags = [tag_values[p_i] for p, l in zip(train_pred, train_true)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
        train_true_tags = [tag_values[l_i] for l in train_true
                                  for l_i in l if tag_values[l_i] != "PAD"]
        f1_train = f1_score(y_true=train_true_tags, y_pred=train_pred_tags, labels=dev_labels, average='macro')
        # ========================================
        #               Validation
        # ========================================
        f1_dev, result = evaluate(model, dev_loader, is_train = False)
        # Inside a function a bare expression is not displayed; print it.
        print(show_span_f1(result))

        f1_train_list.append(f1_train)
        f1_dev_list.append(f1_dev)

        if f1_dev > f1_max:
            print(f'f1_span improved from: {f1_max:.4f} to {f1_dev:.4f}')
            print(f'Best model saved to {PATH}')
            f1_max = f1_dev
            torch.save(model.state_dict(), PATH)
            epochs_no_improve = 0
            best_epoch = epoch
        else:
            print(f'f1_score dont improve from: {f1_max:.4f} to {f1_dev:.4f}')
            epochs_no_improve += 1
            if epochs_no_improve < patience:
                print(f'EarlyStopping count: {epochs_no_improve}/{patience}')
            else:
                print(f'\nEarly Stopping! Total epochs: {epoch + 1}. Best epoch: {best_epoch} with f1_score: {f1_max:.4f}')
                break
    # Restore the best checkpoint only if one was actually written.
    if best_epoch is not None:
        model.load_state_dict(torch.load(PATH), strict=False)
    return model, history

In [None]:
# Where the best checkpoint is written, and the training budget.
PATH = './xlmr_full_025_025.pt'
epochs = 50
patience = 10
max_grad_norm = 1.0
# One scheduler step per batch; the first tenth of all steps is warm-up.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=int(total_steps/10), num_training_steps=total_steps)
# `model` is rebound to the instance returned by train_model.
model, history = train_model(model, train_loader, dev_loader, scheduler, epochs, patience, max_grad_norm, PATH, dev_labels)

In [None]:
#Checkout the history
#pd.DataFrame(history).plot(figsize=(10,7), xlabel="epochs");

In [None]:
import matplotlib.pyplot as plt

# Every curve is plotted without its last 10 entries.
# NOTE(review): presumably this trims the early-stopping patience window
# (patience = 10 above) — confirm; a run shorter than 10 epochs plots nothing.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(history['train_loss_values'][:-10], label='train_loss')
ax_loss.plot(history['valid_loss_values'][:-10], label='dev_loss')
ax_loss.legend()
plt.show()

fig_f1, ax_f1 = plt.subplots()
ax_f1.plot(history['f1_train_list'][:-10], label='f1_train')
ax_f1.plot(history['f1_dev_list'][:-10], label='f1_dev')
ax_f1.set_xlabel('epochs')
ax_f1.set_ylabel('values')
ax_f1.legend()
plt.show()

In [None]:
#f1_path = './xlmr_softmax_f1.pt'
#loss_path = './xlmr_softmax_loss.pt'
def evaluation(model, weight_path, data_loader, labels):
    """Return a sub-word level classification report for ``data_loader``.

    The model is used as passed in; ``weight_path`` is accepted but no weights
    are loaded from it.  Positions whose gold label is ``"PAD"`` are excluded.
    Relies on the notebook globals ``device`` and ``tag_values``.

    Args:
        model: module exposing ``forward_custom``.
        weight_path: unused.
        data_loader: loader yielding ``(input_ids, attention_mask, labels)``.
        labels: labels to include in the report.

    Returns:
        The text produced by ``classification_report`` (4 decimals).
    """
    #model.load_state_dict(torch.load(weight_path), strict=False)
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    pred_rows, gold_rows = [], []
    for batch in data_loader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed for evaluation.  Labels are supplied, so the
        # output is (loss, logits, ...).
        with torch.no_grad():
            outputs =  model.forward_custom(input_ids=b_input_ids, attention_mask=b_input_mask,
                                       head_mask=None, labels=b_labels)

        scores = outputs[1].detach().cpu().numpy()
        gold_ids = b_labels.to('cpu').numpy()

        eval_loss += outputs[0].mean().item()
        pred_rows.extend([list(row) for row in np.argmax(scores, axis=2)])
        gold_rows.extend(gold_ids)

    # Flatten to tag names, skipping padded positions.
    pred_tags, gold_tags = [], []
    for pred_row, gold_row in zip(pred_rows, gold_rows):
        for pred_id, gold_id in zip(pred_row, gold_row):
            if tag_values[gold_id] != "PAD":
                pred_tags.append(tag_values[pred_id])
                gold_tags.append(tag_values[gold_id])
    return classification_report(y_true=gold_tags, y_pred=pred_tags, labels = labels, digits = 4)

## F1 classification_report

In [None]:
# Report on the training split.  PATH is passed for the signature only:
# evaluation() scores the model currently in memory and loads nothing from it.
train_report = evaluation(model, PATH, train_loader, train_labels)
print(f'F1_report: \n{train_report}')

In [None]:
# Report on the dev split, using the model currently in memory.
dev_report = evaluation(model, PATH, dev_loader, dev_labels)
print(f'F1_report: \n{dev_report}')

In [None]:
# Report on the test split, using the model currently in memory.
test_report = evaluation(model, PATH, test_loader, test_labels)
print(f'F1_report: \n{test_report}')

## Download File
<a href="./xlmr_full_025_025.pt"> Download File </a>

#### Evaluation