In [1]:
from utils import *
from transformers import AutoTokenizer
from transformers import BertForTokenClassification, AdamW, BertModel, BertConfig
import torch
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm, trange



In [2]:
from utilities_check_err import *
from visual_test import *

# 1. Load model

In [3]:
# Notebook-wide configuration: maximum padded sub-word length, path of the
# validation split, and the cased multilingual BERT tokenizer (slow
# implementation, no lower-casing).
MAXLEN = 256
dir_valid = 'vlsp2018/val_2018.txt'
tokenizer = AutoTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
    use_fast=False,
    do_lower_case=False,
)

In [4]:
# Tag inventory of the IO labelling scheme, followed by the pre-processed
# validation split built from it.
IO_tag_values = ['PER', 'LOC', 'ORG', 'MISC', 'O']
IO_data_valid = process_data_for_BERT(
    dir_valid,
    tokenizer,
    IO_tag_values,
    MAXLEN,
    type='IO',
)

In [5]:
# Count the visible GPUs and pick CUDA when it is available, CPU otherwise.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
# Tag-name -> label-id mapping exposed by the processed validation data; it is
# reused below to size the classification head and to pad the label sequences.
IO_tag2idx = IO_data_valid.tag2idx

In [7]:
# BERT base (multilingual, cased) with a token-classification head that has
# one output per entry of IO_tag2idx.
IO_model = BertForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(IO_tag2idx),
    output_attentions=False,
    output_hidden_states=False,
)
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; the model is moved to the target device in a later cell.
# NOTE(review): torch.load unpickles the file - only load checkpoints that come
# from a trusted source.
# strict=False tolerates key mismatches; the returned report is left as the
# cell's last expression so missing/unexpected keys stay visible.
IO_model.load_state_dict(
    torch.load('IO_BERT_MULTI.pt', map_location="cpu"),
    strict=False,
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

<All keys matched successfully>

In [8]:
# Move the model to the selected device. `.to()` returns the module, so as the
# last expression of the cell its full repr is displayed below.
IO_model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [9]:
def predict_data_train_T(model, tokenizer, tag_values, dataset , idx, tag2idx, MAXLEN):
    """
    Predict word-level tags for one sentence of ``dataset``.

    The sentence at position ``idx`` is split into sub-words, padded to
    ``MAXLEN``, run through ``model``, and the sub-word pieces are merged back
    into whole words. Each merged word keeps the gold and predicted tag of its
    first piece.

    input:
        - model: token-classification model, called as
          ``model(input_ids, token_type_ids=None, attention_mask=mask)``; the
          logits are read from the first element of its output
        - tokenizer: must provide ``convert_ids_to_tokens``; it is also handed
          to ``add_subword`` and ``padding_data``
        - tag_values: list mapping a label id to its tag name, e.g.
          ['PER', 'LOC', 'ORG', 'MISC', 'O']
        - dataset: object whose ``X`` and ``Y`` are indexable by sentence
        - idx: index of the sentence in ``dataset``
        - tag2idx: e.g. {'PER': 0, 'LOC': 1, 'ORG': 2, 'MISC': 3, 'O': 4, 'PAD': 5}
        - MAXLEN: padded sequence length (256, 512, ...)
    output:
        (words, y_true, y_predict): three lists of equal length holding the
        merged words, their gold tags and their predicted tags.

    Token id 0 is treated as padding, both for the attention mask and for
    counting the real sub-words.
    # assumes padding_data returns batch-of-one arrays padded at the end - TODO confirm
    """
    X = dataset.X[idx]
    Y = dataset.Y[idx]
    X_Sub, Y_Sub = add_subword(tokenizer, X, Y)
    X_padding, Y_padding = padding_data(tokenizer, X_Sub, Y_Sub, MAXLEN, tag2idx)

    # Run on whatever device the model lives on instead of hard-coding CUDA,
    # so the function also works on CPU-only machines.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    ids_batch = np.asarray(X_padding)
    input_ids_tensor = torch.as_tensor(ids_batch, dtype=torch.long, device=run_device)
    # Attention mask: 1 for real tokens, 0 for padding (id 0).
    input_mask_tensor = (input_ids_tensor != 0).long()

    with torch.no_grad():
        outputs = model(input_ids_tensor, token_type_ids=None, attention_mask=input_mask_tensor)
    logits = outputs[0].detach().cpu().numpy()

    # Pre-process sub-words: keep only the non-padding positions of the
    # (single) sequence in the batch.
    first_ids = ids_batch[0]
    len_subword = int((first_ids != 0).sum())
    tokens = tokenizer.convert_ids_to_tokens(first_ids[:len_subword].tolist())
    predict = np.argmax(logits, axis=2)[0][:len_subword]

    tags_predict = [tag_values[int(label_id)] for label_id in predict]
    # Only the real positions are converted, so padding label ids never need
    # an entry in tag_values.
    gold_ids = np.asarray(Y_padding)[0][:len_subword]
    tags_true = [tag_values[int(label_id)] for label_id in gold_ids]

    words = []
    y_true = []
    y_predict = []
    for token, gold_tag, predicted_tag in zip(tokens, tags_true, tags_predict):
        if token.startswith("##") and words:
            # Continuation piece: glue it onto the previous word and keep the
            # tags of the word's first piece.
            words[-1] += token[2:]
        else:
            words.append(token)
            y_true.append(gold_tag)
            y_predict.append(predicted_tag)
    return words, y_true, y_predict

## Testing on the validation set

In [16]:
# Run the model over every validation sentence and collect, per sentence, the
# merged words, gold tags, predicted tags and an exact-match flag (1/0).
X = []
Y_TRUE = []
Y_PREDICT = []
iS_TRUE = []
for i in tqdm(range(len(IO_data_valid.data)), desc="valid"):
    # MAXLEN (rather than a literal 256) keeps prediction-time padding in sync
    # with the length used when the data was pre-processed.
    x, y_true, y_predict = predict_data_train_T(
        IO_model, tokenizer, IO_tag_values, IO_data_valid, i, IO_tag2idx, MAXLEN
    )
    X.append(x)
    Y_TRUE.append(y_true)
    Y_PREDICT.append(y_predict)
    iS_TRUE.append(int(y_true == y_predict))

In [18]:
# Preview the first rows of the per-sentence validation results.
pd.DataFrame(
    {
        'Tokens': X,
        'True tag': Y_TRUE,
        'Predict tag': Y_PREDICT,
        'IS_TRUE': iS_TRUE,
    }
).head()

Unnamed: 0,Tokens,True tag,Predict tag,IS_TRUE
0,"[6, ngư, dân, Quảng, Ngãi, trên, tàu, cá, bị, ...","[O, O, O, LOC, LOC, O, O, O, O, O, O, O, O, O,...","[O, O, O, LOC, LOC, O, O, O, O, O, O, O, O, O,...",1
1,"[Sáng, 22, /, 9, ,, Trạm, Biên, phòng, Mũi, Tấ...","[O, O, O, O, O, LOC, LOC, LOC, LOC, LOC, O, LO...","[O, O, O, O, O, ORG, ORG, ORG, ORG, ORG, O, LO...",0
2,"[Ngư, dân, Quảng, Ngãi, trên, tàu, cá, bị, nạn...","[O, O, LOC, LOC, O, O, O, O, O, O, ORG, ORG, O...","[O, O, LOC, LOC, O, O, O, O, O, O, ORG, ORG, O...",1
3,"[Trước, đó, ,, tàu, cá, QNg, 95183, TS, do, ng...","[O, O, O, O, O, O, O, O, O, O, O, PER, PER, PE...","[O, O, O, O, O, O, O, O, O, O, O, PER, PER, PE...",1
4,"[Ngư, dân, Nguyễn, Văn, Trung, đã, phát, tín, ...","[O, O, PER, PER, PER, O, O, O, O, O, O, O, O, ...","[O, O, PER, PER, PER, O, O, O, O, O, O, O, O, ...",1


In [20]:
# Assemble the per-sentence validation results and write them to CSV.
df = pd.DataFrame(
    {
        'Tokens': X,
        'True tag': Y_TRUE,
        'Predict tag': Y_PREDICT,
        'IS_TRUE': iS_TRUE,
    }
)
df.to_csv('data_valid_Bert_multi_2018_TINH.csv')

In [15]:
# Sanity check: do the gold and predicted tag lists of the first sentence match exactly?
Y_TRUE[0] == Y_PREDICT[0]

True

# Train set

In [21]:
# Path of the training split, the IO tag inventory, and the pre-processed
# training data built with the same tokenizer and MAXLEN as the validation set.
dir_train = 'vlsp2018/train_2018.txt'
IO_tag_values = ['PER', 'LOC', 'ORG', 'MISC', 'O']
IO_data_train = process_data_for_BERT(
    dir_train,
    tokenizer,
    IO_tag_values,
    MAXLEN,
    type='IO',
)

In [23]:
# Run the model over every training sentence and collect, per sentence, the
# merged words, gold tags, predicted tags and an exact-match flag (1/0).
X = []
Y_TRUE = []
Y_PREDICT = []
iS_TRUE = []
for i in tqdm(range(len(IO_data_train.data)), desc="train"):
    # MAXLEN (rather than a literal 256) keeps prediction-time padding in sync
    # with the length used when the data was pre-processed.
    x, y_true, y_predict = predict_data_train_T(
        IO_model, tokenizer, IO_tag_values, IO_data_train, i, IO_tag2idx, MAXLEN
    )
    X.append(x)
    Y_TRUE.append(y_true)
    Y_PREDICT.append(y_predict)
    iS_TRUE.append(int(y_true == y_predict))

In [24]:
# Preview the first rows of the per-sentence training results.
pd.DataFrame(
    {
        'Tokens': X,
        'True tag': Y_TRUE,
        'Predict tag': Y_PREDICT,
        'IS_TRUE': iS_TRUE,
    }
).head()

Unnamed: 0,Tokens,True tag,Predict tag,IS_TRUE
0,"[Bế, mạc, Hội, nghị, các, quan, chức, cao, cấp...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1
1,"[Ngày, 22, /, 9, ,, Hội, nghị, lần, thứ, 11, c...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1
2,"[Các, đại, biểu, tham, dự, Hội, nghị, nghe, gi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1
3,"[Trong, hai, ngày, làm, việc, ,, Hội, nghị, đã...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1
4,"[Hội, nghị, đã, cơ, bản, thông, qua, Bản, khuy...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1


In [26]:
# Assemble the per-sentence training results and write them to CSV.
df = pd.DataFrame(
    {
        'Tokens': X,
        'True tag': Y_TRUE,
        'Predict tag': Y_PREDICT,
        'IS_TRUE': iS_TRUE,
    }
)
df.to_csv('data_train_Bert_multi_2018_TINH.csv')