# In [1]:
def add_subword(tokenizer, X,  Y):
    """
    Split every word of a sentence into subwords and repeat its label.

    Each word in ``X`` is passed to ``tokenizer.tokenize``; the pieces it
    returns are concatenated in order, and the word's label from ``Y`` is
    emitted once per piece so both outputs stay aligned.

    Example (the exact pieces depend on the tokenizer's vocabulary):
        X = ['Phạm', 'Văn', 'Mạnh']
        Y = ['B-PER', 'I-PER', 'I-PER']
        ->
        ['Phạm', 'Văn', 'M', '##ạnh'],
        ['B-PER', 'I-PER', 'I-PER', 'I-PER']

    Args:
        tokenizer: object exposing ``tokenize(word)`` that returns a list
            of subword strings.
        X: words of one sentence.
        Y: labels, one per word. Pairs are formed with ``zip``, so extra
            items in the longer of ``X`` / ``Y`` are ignored.

    Returns:
        ``(tokenized_sentence, labels)``: two lists of equal length.
    """
    # Tokenize each word exactly once, keeping its label next to the pieces.
    pieces_with_tag = [(tokenizer.tokenize(word), tag) for word, tag in zip(X, Y)]

    flat_pieces = [piece for pieces, _ in pieces_with_tag for piece in pieces]
    # One copy of the label per piece; a word with no pieces yields no labels.
    flat_tags = [tag for pieces, tag in pieces_with_tag for _ in range(len(pieces))]
    return flat_pieces, flat_tags

def padding_data(tokenizer,X_subword,y_subword,MAXLEN,tag2idx):
    """
    Turn one subword-tokenized sentence into fixed-length id sequences.

    The sentence is wrapped in a one-element batch before being handed to
    ``pad_sequences`` (presumably the Keras utility -- confirm against the
    file's imports), so both results carry a leading batch entry.

    Example:
        X_subword = ['Phạm', 'Văn', 'M', '##ạnh', ...]
        y_subword = ['B-PER', 'I-PER', 'I-PER', 'I-PER', ...]
        ->
        [[10, 20, 30, 40, 0, 0, 0, ...]],
        [[1, 2, 2, 2, 5, 5, 5, ...]]      # 5 == tag2idx["PAD"] here

    Args:
        tokenizer: object exposing ``convert_tokens_to_ids``.
        X_subword: subword tokens of a single sentence.
        y_subword: labels aligned with ``X_subword``.
        MAXLEN: target length passed to ``pad_sequences`` as ``maxlen``.
        tag2idx: label -> index mapping; must contain the key ``"PAD"``.
            Labels missing from the mapping become ``None`` (``dict.get``).

    Returns:
        ``(X_padding, y_padding)``: token ids padded with 0 and label ids
        padded with ``tag2idx["PAD"]``; padding and truncation both happen
        at the end of the sequence.
    """
    # Settings shared by both calls: fixed length, pad and cut at the tail.
    shared_kwargs = {
        "maxlen": MAXLEN,
        "dtype": "long",
        "padding": "post",
        "truncating": "post",
    }

    token_ids = tokenizer.convert_tokens_to_ids(X_subword)
    X_padding = pad_sequences([token_ids], value=0.0, **shared_kwargs)

    label_ids = [tag2idx.get(tag) for tag in y_subword]
    y_padding = pad_sequences([label_ids], value=tag2idx["PAD"], **shared_kwargs)

    return X_padding, y_padding

def predict_data_train(model, tokenizer, tag_values, dataset , idx, tag2idx, MAXLEN, device):
    """
    Run the model on one labelled sentence and return word-level results.

    The sentence ``dataset[idx]`` is split into subwords, padded to a fixed
    length, fed through ``model.forward_custom``, and the per-subword
    predictions are folded back to one tag per word (the tag of the word's
    first subword).

    Args:
        model: object exposing ``forward_custom(input_ids, attention_mask=...)``
            whose first output holds the logits.
            Assumes logits are laid out as (batch, sequence, tag) -- the code
            takes ``argmax`` over axis 2 and then the first batch entry.
        tokenizer: tokenizer exposing ``tokenize``, ``convert_tokens_to_ids``
            and ``convert_ids_to_tokens``; continuation pieces are expected to
            start with ``"##"``.
        tag_values: list of tag names, e.g. ``['O', 'PER', 'LOC', ...]``.
            It is indexed with both predicted indices and ``tag2idx`` values,
            so it must be index-aligned with ``tag2idx``.
        dataset: indexable collection of sentences, each a sequence of
            ``(word, tag)`` pairs, e.g. ``[[('anh', 'O'), ...], ...]``.
        idx: index of the sentence inside ``dataset``.
        tag2idx: tag -> index mapping including ``'PAD'``, e.g.
            ``{'PER': 0, 'LOC': 1, 'ORG': 2, 'MISC': 3, 'O': 4, 'PAD': 5}``.
        MAXLEN: padded sequence length (256, 512, ...).
        device: torch device the input tensors are moved to.

    Returns:
        ``(rows, y_predict)`` where ``rows`` is a list of
        ``(word, '?', true_tag)`` tuples and ``y_predict`` is the list of
        predicted tags, one per word. The ``'?'`` is a fixed placeholder
        (presumably for a column this function does not fill in -- confirm
        with the consumer of the result).
    """
    # Pick the requested sentence directly; there is no need to unpack every
    # sentence of the dataset just to index one of them.
    sentence = dataset[idx]
    X = [word for word, _ in sentence]
    Y = [tag for _, tag in sentence]

    X_Sub, Y_Sub = add_subword(tokenizer, X, Y)
    X_padding, Y_padding = padding_data(tokenizer, X_Sub, Y_Sub, MAXLEN, tag2idx)

    input_ids_tensor = torch.tensor(X_padding).type(torch.LongTensor).to(device)
    # padding_data pads token ids with 0, so non-zero positions are attended to.
    input_mask_tensor = torch.tensor(X_padding != 0).type(torch.LongTensor).to(device)

    with torch.no_grad():
        outputs = model.forward_custom(input_ids_tensor, attention_mask=input_mask_tensor)
    logits = outputs[0].detach().cpu().numpy()

    # Number of real (non-padding) subwords. Use the known token count capped
    # by the padded width rather than counting non-zero ids, which would
    # under-count if a real token happened to map to id 0.
    len_subword = min(len(X_Sub), X_padding.shape[1])

    # Round-trip through ids so the tokens match what the model actually saw.
    tokens = tokenizer.convert_ids_to_tokens(X_padding[0][:len_subword].tolist())
    predict = np.argmax(logits, axis=2)[0][:len_subword]

    tags_predict = [tag_values[i] for i in predict]
    # Only the real positions are mapped: the padded tail holds the PAD index,
    # which tag_values is not guaranteed to contain.
    tags_true = [tag_values[i] for i in Y_padding[0][:len_subword]]

    # Fold subwords back into words; a word keeps the tags of its first piece.
    y_predict = []
    words = []
    y_true = []
    for token, predicted_tag, true_tag in zip(tokens, tags_predict, tags_true):
        # A continuation piece is marked by a leading "##". The `words` guard
        # keeps a leading continuation piece from indexing an empty list.
        if token.startswith("##") and words:
            words[-1] += token[2:]
        else:
            y_predict.append(predicted_tag)
            y_true.append(true_tag)
            words.append(token)
    return [(w, '?', t) for w, t in zip(words, y_true)], y_predict