### Setup

#### Start pdf-converter OCR service

Using this tool:
https://github.com/D2P-APPS/pdf-ocr-tool

Run the following command to run the container and start the webservice:

    docker-compose up

#### Set up BERT masked language prediction

In [1]:
# Predicting neighbors to a word in sentence using BERTMaskedLM. 
# Neighbors are from BERT vocab (which includes subwords and full words) 
import torch
from transformers import BertTokenizer, BertForMaskedLM,  AdamW
from collections import OrderedDict
import logging
import random
import sys,os
p = os.path.dirname(os.getcwd())  #获取要导入模块的上上级目录
sys.path.append(p)
from dataModule import process_imdb
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.environ["CUDA_VISIBLE_DEVICES"] = '3,4,5'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import numpy as np
DEFAULT_MODEL_PATH='/home/zhangh/dataset/bert/bert-base-uncased'
params_dir ='/home/zhangh/workspace/Attack-Word/data/model/bert_base_uncased/bert_base_model_beta.pkl'
    
DEFAULT_TO_LOWER=False
DEFAULT_TOP_K = 10
ACCRUE_THRESHOLD = 1

def init_model(model_path,to_lower):
    """
    Initiate BERTForMaskedLm model.
    """
    logging.basicConfig(level=logging.INFO)
    tokenizer = BertTokenizer.from_pretrained(model_path,do_lower_case=to_lower)
    model = BertForMaskedLM.from_pretrained(model_path)
    #model.load_state_dict(torch.load(params_dir))
    return model,tokenizer

2021-11-24 08:28:02.606111: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-24 08:28:02.606146: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
model, tokenizer = init_model(DEFAULT_MODEL_PATH,to_lower=False)

Some weights of the model checkpoint at /home/zhangh/dataset/bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
tokenizer.tokenize('[PAD]')

['[PAD]']

In [3]:
from copy import deepcopy
from random import *
import random
def mask_words(words_list, max_pred=20):
    '''
    将文本中10%单词masked
    words_list:存放一个文本中所有单词，开头是'[CLS]',结尾是'[SEP]'
    '''
    words = deepcopy(words_list)
    #padding length=300,故为了防止mask到被截断的单词，需要处理超出长度的文本
    if len(words)>300:
        words = words[:301]
    n_pred =  min(max_pred, max(1, int(len(words) * 0.15))) 
    # 15 % of tokens in one sentence
    cand_maked_pos = [i for i, token in enumerate(words)
                          if token != '[CLS]' and token != '[SEP]']
    # candidate masked position
    shuffle(cand_maked_pos)
    masked_tokens, masked_pos = [], []
    invalid_replaced = []
    for token in ['[CLS]', '[SEP]','[PAD]']:    
        invalid_replaced.append(tokenizer.convert_tokens_to_ids(token))
    for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(words[pos])
            if random.random() < 0.8:  # 80%
                words[pos] = '[MASK]' # make mask
            elif random.random() > 0.5:  # 10%
                index = randint(0, tokenizer.vocab_size-1) 
                # random index in vocabulary
                while index in invalid_replaced: # can't involve 'CLS', 'SEP', 'PAD'
                   index = randint(0, tokenizer.vocab_size-1) 
                words[pos] = tokenizer.convert_ids_to_tokens(index) # replace
    return words, masked_tokens, masked_pos

In [4]:
def create_dataset(traindata,tokenizer,max_len):
    '''
    imput
    traindata:原始输入数据，存放str文本，list
    tokenizer:模型的tokenizer
    output
    input_idxs:模型训练的输入indexs
    target_idxs:模型训练的输出indexs
    masked_tokens:masked的token
    masked_pos:masked的token对应文本中的位置
    '''
    input_idxs = []
    target_idxs = []
    masked_tokens_all = []
    masked_pos_all = []
    for text in traindata:
        org_tokenized_text = tokenizer.tokenize(text)
        org_tokenized_text.insert(0, '[CLS]')
        org_tokenized_text.append('[SEP]')

        masked_tokenized_text,masked_tokens,masked_pos\
                            = mask_words(org_tokenized_text)
        if len(org_tokenized_text)!= len(masked_tokenized_text):
            print("error")
            continue
        
        #Padding
        if len(org_tokenized_text)<max_len:
            tmp=['[PAD]']*(max_len-len(org_tokenized_text))
            org_tokenized_text.extend(tmp)
            masked_tokenized_text.extend(tmp)
        elif len(org_tokenized_text)>max_len:
            org_tokenized_text = org_tokenized_text[:max_len]
            masked_tokenized_text = masked_tokenized_text[:max_len]
        
        
        target_indexed_tokens = tokenizer.convert_tokens_to_ids(org_tokenized_text)
        input_indexed_tokens = tokenizer.convert_tokens_to_ids(masked_tokenized_text)
        target_idxs.append(target_indexed_tokens)
        input_idxs.append(input_indexed_tokens)
        masked_tokens_all.append(masked_tokens)
        masked_pos_all.append(masked_pos)
    return input_idxs,target_idxs,masked_tokens_all,masked_pos_all

In [5]:
train_data_temp = ["The capital of France is Paris. The capital of France is Paris.",\
"Although the recipe for forward pass needs to be defined within this function, \
one should call the Module instance afterwards instead of this since the former\
 takes care of running the pre and post processing steps while the latter silently \
 ignores them."]
input_idxs,target_idxs,masked_tokens,masked_pos = create_dataset(train_data_temp,tokenizer,max_len=30)
print(input_idxs,target_idxs,masked_tokens,masked_pos)

[[101, 100, 3007, 103, 12410, 2003, 100, 1012, 100, 3007, 1997, 100, 2003, 100, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 100, 1996, 17974, 2005, 103, 103, 3791, 2000, 2022, 4225, 2306, 2023, 103, 1010, 103, 2323, 2655, 1996, 100, 6013, 5728, 2612, 1997, 2023, 2144, 1996, 2280, 3138, 2729]] [[101, 100, 3007, 1997, 100, 2003, 100, 1012, 100, 3007, 1997, 100, 2003, 100, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 100, 1996, 17974, 2005, 2830, 3413, 3791, 2000, 2022, 4225, 2306, 2023, 3853, 1010, 2028, 2323, 2655, 1996, 100, 6013, 5728, 2612, 1997, 2023, 2144, 1996, 2280, 3138, 2729]] [['of', '[UNK]'], ['pass', 'forward', 'one', 'while', 'function', 'processing']] [[3, 4], [6, 5, 15, 38, 13, 36]]


In [118]:
from torch.utils.data import TensorDataset, DataLoader
def loader(datas,tokenizer,batch_size=8):
    input_idxs,target_idxs,masked_tokens,masked_pos = \
        create_dataset(datas,tokenizer,max_len=300)
    train_set = TensorDataset(torch.LongTensor(input_idxs), torch.LongTensor(target_idxs))
    train_loader = DataLoader(dataset=train_set,
                          batch_size=batch_size,
                          shuffle=True
                          )
    #print("batch_size: ",batch_size)
    return train_loader,masked_tokens,masked_pos
    

In [32]:
x1=[1,3,4,4]
x2=["sf","aa","bc","are"]
for item1,item2 in zip(enumerate(x1),x2):
    print(item1,item2)

(0, 1) sf
(1, 3) aa
(2, 4) bc
(3, 4) are


In [10]:
def train_model(model,tokenizer,dataset,epoch=4,batch_size= 4):
    ''''''
    model.train()  # 将模型设置为训练模式
    model.to(device)
    
    train_loader,masked_tokens,masked_pos = loader(dataset,tokenizer,batch_size)
    
    avg_loss = []
    
    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    for e in range(epoch):
        for batch_idx,(input_idxs,target_idxs) in enumerate(train_loader):
            input_idxs,target_idxs=input_idxs.to(device),target_idxs.to(device)
            output = model(input_idxs,labels = target_idxs)
            loss,logits = output[0],output[1]
            loss = loss / batch_size  # 梯度积累
            avg_loss.append(loss.item())
            loss.backward()
            if ((batch_idx + 1) % batch_size) == 0:
                # 每 8 次更新一下网络中的参数
                optimizer.step()
                optimizer.zero_grad()
            if batch_idx % 5 == 0:
                logging.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss:{:.6f}'.format(
                    e + 1, batch_idx, len(train_loader), 100. *
                    batch_idx / len(train_loader), np.array(avg_loss).mean()
                ))
    print('Finished Training')
    return model



In [11]:
model, tokenizer = init_model(DEFAULT_MODEL_PATH,to_lower=False)
train_texts,train_labels = process_imdb.read_file('train',clearn_flag=False)


Some weights of the model checkpoint at /home/zhangh/dataset/bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


read train files: 25000


In [15]:
#train_model(model,tokenizer,train_texts[:5],epoch=4,batch_size= 1)

In [122]:
params_dir = '/home/zhangh/workspace/Attack-Word/data/model/mask_LM/bert_base_LM_IMDB.pkl'
model.load_state_dict(torch.load(params_dir))

In [14]:
test_texts,trest_labels = process_imdb.read_file('test',clearn_flag=False)

read test files: 25000


In [121]:
def test_model(model,tokenizer,test_set):
    model.eval()
    model= model.to(device)
    print("Loading test dataset...")
    test_loader,masked_tokens,masked_pos = loader(test_set,tokenizer,batch_size=1)
    print("Load test dataset!")
    #print(len(test_loader), len(masked_tokens),len(masked_posed))
    total = 0
    correct = 0
    acc_sum = 0
    with torch.no_grad():
        for idx,(input,_) in enumerate(test_loader):
            text_cor = 0
            print(f"test index = {idx}")
            input = input.to(device)
            output = model(input)
            pred_tokens = predict(tokenizer,output.logits[0],masked_pos[idx])#
            # print(f"mask num : {masked_pos[idx]}")
            # print(f"result: {result_idx_tokens}")
            # print(f"masked_token: {masked_tokens[idx]}")
            for i in range(len(masked_tokens)):
                if masked_tokens[idx][i]==pred_tokens[i]: 
                    text_cor+= 1 
            #text_acc = text_cor/len(input)
            #print(f"")
            correct+= text_cor
            total += (len(masked_pos[idx]))
        print(f"正确预测的单词数量 {correct}，总数 {total},准确率 {100.*correct/total:.3f}%")

    


In [114]:
def predict(tokenizer,logits,masked_pos,topK_sample=0,max_sample=1):
    """
    Guess masked tokens.
    """
    #result_idx_tokens = {}
    result = []
    #print(f"masked_pos: {masked_pos}")
    for pos in masked_pos:
        #print(f"pos:{pos}")
        # print(len(logits[pos])-1)
        # print(len(logits[pos].tolist()))
        preds = dict(zip(range(0,len(logits[pos])-1),logits[pos].tolist()))
        #print(f"preds: {preds}")
        #print(predictions.logits[0][idx])
        #print("Average score: ",torch.mean(predictions.logits[0][idx],dim=0))
        sorted_pred = OrderedDict(sorted(preds.items(), 
            key=lambda kv: kv[1], reverse=True))
        idx = sample(sorted_pred,topK_sample,max_sample)
        result.append(tokenizer.convert_ids_to_tokens(idx))
        #result_idx_tokens[pos]=tokenizer.convert_ids_to_tokens(idx)
    return result

    


In [84]:
#idx = sample(sorted_pred,topK_sample,max_sample)
def softmax(v):
    l1 = list(map(lambda x: np.exp(x), v))
    return list(map(lambda x: x / sum(l1), l1))

def sample(score_dict,topK_sample,max_sample):
    ''''''
    
    if max_sample > 0 :#最大值采样
        return list(score_dict.keys())[0]
    else:#top_k 采样
        pred_scores = list(score_dict.values())[:topK_sample]
        pred_idxs = list(score_dict.keys())[:topK_sample]
        probs = np.array(softmax(pred_scores))
        pred_id = np.random.choice(a=pred_idxs, size=1, replace=True, p=probs)[0]
        return pred_id

In [120]:
test_model(model,tokenizer,test_texts[:5])

Loading test dataset...
error
Load test dataset!
test index = 0
test index = 1
test index = 2
test index = 3
正确预测的单词数量 6，总数 70,准确率 8.571%


In [104]:
list1 = ['creepy', 'the', 'full', 'the', 'holm', 'script']
list2 = ['creepy', 'this', 'full', 'time', 'holm', 'script']
count=0


4


In [99]:
tokenizer.convert_ids_to_tokens(17109)


'creepy'

In [26]:
def predict_mask(model,tokenizer,text,top_k=0,accrue_threshold=1):
    """
    Guess masked tokens.
    """
    tokenized_text = tokenizer.tokenize(text)
    #print(tokenized_text)
    print(len(tokenized_text))
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    #print(indexed_tokens)
    print(len(indexed_tokens))
    
    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)
    masked_index = []
    for i in range(len(tokenized_text)):
        if (tokenized_text[i] == "[MASK]"):
            masked_index.append(i) 
    # print(segments_ids)
    # print(masked_index)
    
    results_dict = {}
    accrue_threshold = 1
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)
        # print(predictions.logits.shape) #1,81,30523
        # print(predictions.logits[0].shape)
        for idx in masked_index:
            result_idx = {}
            #print(predictions.logits[0][idx])
            #print("Average score: ",torch.mean(predictions.logits[0][idx],dim=0))
            count = 0
            for i in range(len(predictions.logits[0][idx])):#30522
                score_i = float(predictions.logits[0][idx][i].tolist())
                if score_i > accrue_threshold :
                #将位置i对应的token找到，并放入字典
                    tok = tokenizer.convert_ids_to_tokens([i])[0]
                    result_idx[tok] = float(score_i)
                    count += 1
            #print(f"Valid prediction num {count} for idx: {idx}")
            results_dict[idx]=result_idx
    sample(results_dict,top_k)

    


In [18]:
def predict(model,tokenizer,top_k,accrue_threshold,text):
    """
    Guess masked tokens.
    """
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)

    masked_index = 0

    for i in range(len(tokenized_text)):
        if (tokenized_text[i] == "[MASK]"):
            masked_index = i
            break

    #print(tokenized_text)
    #print(masked_index)
    results_dict = {}

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)
        for i in range(len(predictions[0][0,masked_index])):
            if (float(predictions[0][0,masked_index][i].tolist()) > accrue_threshold):
                tok = tokenizer.convert_ids_to_tokens([i])[0]
                results_dict[tok] = float(predictions[0][0,masked_index][i].tolist())

    k = 0
    sorted_d = OrderedDict(sorted(results_dict.items(), key=lambda kv: kv[1], reverse=True))
    for i in sorted_d:
        print(i,sorted_d[i])
        k += 1
        if (k > top_k):
            break

In [None]:
def softmax(v):
    l1 = list(map(lambda x: np.exp(x), v))
    return list(map(lambda x: x / sum(l1), l1))
def max_sample():
    ''''''
def top_k_sample():
    ''''''
def sample(score_dict,top_k=0):
    ''''''
    
    # if top_k > 0:
    #     sampler = top_k_sample()
    # else:
    #     sampler = top_k_sample()
    result = []
    for idx in score_dict:
        preds = score_dict[idx]
        if top_k > 0 :
            k=0
            sorted_pred = OrderedDict(sorted(preds.items(), 
            key=lambda kv: kv[1], reverse=True))
            for i in sorted_pred:
                k += 1
                if (k > top_k):
                    sorted_pred[i] = 0
            pred = sorted_pred           
        scores = list(pred.values())
        probs = np.array(softmax(scores))
        pred_idxs = np.array(range(0,len(preds)))
        tokens = list(pred.keys())
        pred_id = np.random.choice(a=pred_idxs, size=1, replace=True, p=probs)[0]
        #print("pred_id: ",pred_id)
        result.append(tokens[pred_id])
    print(result)
    return result

## Guess redactions with BERT masked language prediction

In [4]:
# sample sentence from bay of pigs
text = """Although it cannot be determined accurately at
what height any of the Brigade's B-26's actually were
flying, Gar Thorsrud is of the opinion that they
probably would have been cruising at 8,000'-10,000'
for the early part of the trip, dropping down to
2,000' when approximately 15 miles off the target
by which time they would have been well past the
Essex."""

In [5]:
# mask "height"

text = """Although it cannot be determined accurately at
what [MASK] any of the Brigade's B-26's actually were
flying, Gar Thorsrud is of the opinion that they
probably would have been cruising at 8,000'-10,000'
for the early part of the trip, dropping down to
2,000' when approximately 15 miles off the target
by which time they would have been well past the
Essex."""

In [6]:
# mask "target"

text = """Although it cannot be determined accurately at
what height any of the Brigade's B-26's actually were
flying, Gar Thorsrud is of the opinion that they
probably would have been cruising at 8,000'-10,000'
for the early part of the trip, dropping down to
2,000' when approximately 15 miles off the [MASK]
by which time they would have been well past the
Essex."""

In [7]:
predict(model,tokenizer,DEFAULT_TOP_K,ACCRUE_THRESHOLD,text)

coast 8.16816234588623
point 7.087375164031982
ground 6.919796943664551
road 6.64233922958374
shore 6.514383792877197
mark 6.252679824829102
surface 6.111354827880859
end 5.635773181915283
target 5.60432767868042
sea 5.528745174407959
coastline 5.495854377746582


In [9]:
train_texts,train_labels = process_imdb.read_file('train',clearn_flag=False)
print(train_texts[0])
# tokenizer.tokenize(train_texts[0])

read train files: 25000
famous was "famous" for their tension and release style of cartoon where the semi-main character is in terrible peril, only to be rescued by the hero at the last second. this particular casper is the only one i can remember where death actually takes a hand. but even in death, there is still a happy ending.the constant in famous studios cartoons is that "virtue always triumphs". popeye always gets to his spinach in time, baby huey always out-foxes the fox, little audery always "learns her lesson". and some fs cartoons are really dark and depressing.you have to give them credit. as much as i love looney tunes and "tom and jerry" i don't think anyone was putting out a better cartoon product at that time than paramount. color, animation, music (the great winston sharples), editing, voices. they were consistent and a glowing example of the best that the art form had to offer.


In [10]:
sentence_t = train_texts[66]
print(sentence_t)

this enjoyable minor noir boasts a top cast, and many memorable scenes. the big distraction is the complete disregard for authentic accents. the spanish characters in the film are played by a frenchman (boyer), a belgian (francen), a greek (paxinou) and a hungarian (lorre)! and to top it all off bacall is supposed to be an english aristocrat! despite these absurdities, the performances are all very good - especially those of paxinou and lorre. but the scene in which boyer, paxinou and lorre meet, and talk in wildly different accents, is a real hoot! and i guess, seeing as how they were alone, that they should actually have been speaking in spanish anyway! it seems pretty weird that the brothers warner couldn't find any spanish speaking actors in los angeles! of course hollywood has often had an "any old accent will do" policy - my other favorite is greta garbo (swedish) as mata hari (dutch), who falls in love with a russian soldier played by a mexican (ramon novarro). maybe they should

In [11]:
def mask_sentence(sentence, rate=0.1,mask_num=0):
    words = sentence.split()
    if mask_num < 1:
        mask_num = int(len(words)*rate) if int(len(words)*rate)>0 else 1
    mask_idxs = random.sample(range(0,len(words)-1), mask_num)
    mask_info = {}
    for idx in mask_idxs:
        mask_info[idx] = words[idx]
        words[idx] = '[MASK]'
    return " ".join(words), mask_info


In [16]:
mask_sent_t,mask_info  = mask_sentence(sentence_t,0.1)
print(mask_sent_t,mask_info )

this enjoyable minor noir boasts a [MASK] cast, and many memorable scenes. [MASK] [MASK] distraction [MASK] the complete disregard for authentic accents. the spanish characters in the film are played by a frenchman (boyer), a belgian (francen), a greek (paxinou) and [MASK] hungarian (lorre)! and to top it all [MASK] bacall is [MASK] to be an english aristocrat! despite these absurdities, the performances are all very good [MASK] especially those of paxinou and lorre. but the scene in which boyer, paxinou and lorre meet, [MASK] talk in wildly different accents, is a real hoot! and i guess, seeing as how they were alone, [MASK] they should actually [MASK] been speaking in spanish anyway! it seems [MASK] weird that [MASK] brothers warner couldn't find [MASK] spanish speaking [MASK] in los angeles! of course hollywood [MASK] often had [MASK] "any old accent will do" policy - my other favorite is greta garbo (swedish) as mata hari [MASK] who falls in love with a russian soldier played by a 

In [123]:
text

"Although it cannot be determined accurately at\nwhat height any of the Brigade's B-26's actually were\nflying, Gar Thorsrud is of the opinion that they\nprobably would have been cruising at 8,000'-10,000'\nfor the early part of the trip, dropping down to\n2,000' when approximately 15 miles off the [MASK]\nby which time they would have been well past the\nEssex."

In [21]:
predict_mask(model,tokenizer,mask_sent_t,top_k=10,accrue_threshold=0)

268
268
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[6, 14, 15, 17, 55, 67, 72, 92, 118, 142, 146, 155, 158, 165, 168, 176, 179, 203, 238, 247]
pred_id:  3
pred_id:  0
pred_id:  0
pred_id:  0
pred_id:  0
pred_id:  0
pred_id:  0
pred_id:  0

In [122]:
predict(model,tokenizer,DEFAULT_TOP_K,ACCRUE_THRESHOLD,text)

coast 8.16816234588623
point 7.087375164031982
ground 6.919796943664551
road 6.64233922958374
shore 6.514383792877197
mark 6.252679824829102
surface 6.111354827880859
end 5.635773181915283
target 5.60432767868042
sea 5.528745174407959
coastline 5.495854377746582
