In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F

import transformers
from transformers import GPT2Config, GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
from transformers import BertTokenizer, BertTokenizerFast

import numpy as np
import os
from pathlib import Path
import re
from tqdm import tqdm_notebook, tnrange

In [None]:
# Directory holding the fine-tuned experiment checkpoints; joined with a run name when model_path is built.
exp_dir = "/mnt/disk3/m10615110/gpt2_chinese/exp/"
# NOTE(review): not referenced anywhere else in the visible notebook — confirm before removing.
gpt2_pretrain_path = "/home/m10615110/gpt2/"

# Tokenizer used for the Chinese prompts; its vocab_size is what the GPT-2 embeddings get resized to.
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
# Stock GPT-2 tokenizer; in the visible notebook it is only used by commented-out code.
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

In [None]:
# Build the gpt2-medium architecture (pretrained weights are replaced later by the fine-tuned checkpoint).
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
# Resize the token embeddings to the BERT vocabulary so ids from bert_tokenizer index valid rows.
model.resize_token_embeddings(bert_tokenizer.vocab_size)

In [None]:
# start_sent = '10 + 20 = '
# gpt_token = gpt_tokenizer.tokenize(start_sent)
# gpt_idxs = gpt_tokenizer.convert_tokens_to_ids(gpt_token)
# gpt_seqs = torch.LongTensor(gpt_idxs)

# with torch.no_grad():
#     output_sequences = model.generate(gpt_seqs.unsqueeze(0), 
#                                       max_length=bert_seqs.size(0)+20, 
#                                       top_k=beam_size, 
#                                       top_p=top_p, 
#                                       repetition_penalty=repetition_penalty,
#                                       temperature=temperature,
#                                       do_sample = False,
#                                       num_return_sequences=1, 
#                                       no_repeat_ngram_size=no_repeat_ngram_size,
#                                       pad_token_id=gpt_tokenizer.pad_token_id,
#                                       bos_token_id=bert_tokenizer.bos_token_id,
#                                       eos_token_id=bert_tokenizer.eos_token_id
#                                      )
    
# print(' '.join(gpt_tokenizer.convert_ids_to_tokens(output_sequences[0])))

In [None]:
# Restrict CUDA to device index 1 for this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
USE_CUDA = torch.cuda.is_available()
# NOTE(review): this unconditionally overrides the detection above, so every
# `if USE_CUDA:` branch is skipped and inference runs on CPU — confirm this is intended.
USE_CUDA = False

In [None]:
# Fine-tuned checkpoint to evaluate: epoch 8 of the gpt2-medium run stored under exp_dir.
model_path = Path(exp_dir) / 'gpt2_medium_noptt_len512_batch_4_2020-08-11 02:45:14' / 'epoch_8.mdl'

from collections import OrderedDict

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location='cpu' keeps the load working on machines without a GPU.
state_dict = torch.load(model_path, map_location='cpu')

# Keys saved from an nn.DataParallel wrapper carry a `module.` prefix that the bare
# model does not expect. Strip it only where it is actually present, so a checkpoint
# saved without the wrapper is not corrupted by blindly dropping seven characters.
module_prefix = 'module.'
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    name = k[len(module_prefix):] if k.startswith(module_prefix) else k
    new_state_dict[name] = v

model.load_state_dict(new_state_dict)

# Inference mode: disables dropout.
model.eval()

if USE_CUDA:
    model.cuda()

In [None]:
# Report model size: all parameters, then only those that receive gradients.
all_params = list(model.parameters())
total_count = sum(tensor.numel() for tensor in all_params)
trainable_count = sum(tensor.numel() for tensor in all_params if tensor.requires_grad)

print('total parms : ', total_count)
print('trainable parms : ', trainable_count)

In [None]:
# Decoding hyperparameters shared by the model.generate() cell and the manual beam search.

# Number of tokens to generate beyond the prompt (added to the prompt length / used as the step count).
max_length = 200
# Beam width of the manual beam search; also passed as top_k for the top-k filter.
beam_size = 50
# Logits of already-seen tokens are divided (if positive) or multiplied (if negative) by this factor.
repetition_penalty = 8.0
# Logits are divided by this value before filtering.
temperature = 5.0
# Nucleus (top-p) cumulative-probability threshold.
top_p = 0.7
# Passed to model.generate() only; the manual beam search does not use it.
no_repeat_ngram_size = 5

In [None]:
# Prompt to continue; it is split on 。 / ， / space and wrapped in [CLS]/[SEP] before tokenisation.
start_sent = '總統發表宣示致詞，'


In [None]:
# Break the prompt into clauses on 。 / ， / space and discard the empty pieces
# that the split leaves behind (e.g. after a trailing comma).
sents = [piece for piece in re.split('。|，| ', start_sent) if piece != '']

# BERT-style framing: [CLS] up front, [SEP] closing every clause.
bert_sent = '[CLS]' + ''.join(piece + '[SEP]' for piece in sents)
print(bert_sent)

# Text -> tokens -> vocabulary ids -> 1-D LongTensor used as the generation prompt.
bert_tokens = bert_tokenizer.tokenize(bert_sent)
bert_idxs = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
bert_seqs = torch.LongTensor(bert_idxs)

In [None]:
%%time
model.eval()

# Decoding options gathered in one place and handed to generate() as keyword arguments.
# NOTE(review): do_sample=False requests deterministic decoding; top_k / top_p /
# temperature are sampling controls and are likely ignored in that mode — confirm
# against the installed transformers version.
# eos_token_id is not passed, so [SEP] is not configured as a stop token here.
generation_options = dict(
    max_length=bert_seqs.size(0) + max_length,
    top_k=beam_size,
    top_p=top_p,
    repetition_penalty=repetition_penalty,
    temperature=temperature,
    do_sample=False,
    num_return_sequences=1,
    no_repeat_ngram_size=no_repeat_ngram_size,
    pad_token_id=bert_tokenizer.pad_token_id,
    bos_token_id=bert_tokenizer.cls_token_id,
)

# generate() expects a batch dimension, so the 1-D prompt becomes shape (1, prompt_len).
prompt_batch = bert_seqs.unsqueeze(0)

with torch.no_grad():
    output_sequences = model.generate(prompt_batch, **generation_options)

In [None]:
# Map the first returned sequence (num_return_sequences=1) back to tokens, space-separated.
print(' '.join(bert_tokenizer.convert_ids_to_tokens(output_sequences[0])))

In [2]:
# Rebuild the prompt ids from start_sent: split into clauses on 。 / ， / space
# and discard the empty pieces the split leaves behind.
sents = [piece for piece in re.split('。|，| ', start_sent) if piece != '']

# BERT-style framing: [CLS] up front, [SEP] closing every clause.
bert_sent = '[CLS]' + ''.join(piece + '[SEP]' for piece in sents)
print(bert_sent)

# Text -> tokens -> vocabulary ids (a plain Python list, as the beam search expects).
bert_tokens = bert_tokenizer.tokenize(bert_sent)
bert_idxs = bert_tokenizer.convert_tokens_to_ids(bert_tokens)




NameError: name 're' is not defined

In [3]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Mask a 1-D logits vector with top-k and/or nucleus (top-p) filtering.

    The tensor is modified in place and the same tensor object is returned.

    Args:
        logits: 1-D tensor of logits over the vocabulary (batch size 1 only).
        top_k: if > 0, keep only the ``top_k`` highest logits.
        top_p: if > 0.0, keep the smallest set of highest-probability tokens whose
            cumulative probability reaches ``top_p``; the token that crosses the
            threshold is kept. Nucleus filtering is described in Holtzman et al.
            (http://arxiv.org/abs/1904.09751).
        filter_value: value written over every filtered-out logit.

    Returns:
        ``logits`` itself, with filtered entries overwritten by ``filter_value``.

    Adapted from https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    assert logits.dim() == 1  # batch size 1 for now

    # Never ask topk for more entries than the vocabulary holds.
    k = min(top_k, logits.size(-1))
    if k > 0:
        # topk returns (values, indices) sorted descending; the last value is the
        # k-th largest logit, and everything strictly below it is filtered out.
        kth_largest = torch.topk(logits, k)[0][-1]
        logits[logits < kth_largest] = filter_value

    if top_p > 0.0:
        # Sort in descending order and accumulate the softmax probability mass.
        ordered_logits, ordered_ids = logits.sort(descending=True)
        cumulative_mass = F.softmax(ordered_logits, dim=-1).cumsum(dim=-1)

        # Positions whose cumulative mass already exceeds the threshold.
        over_threshold = cumulative_mass > top_p

        # Shift the mask one slot to the right so the first token that crosses the
        # threshold survives; the most likely token is always kept.
        drop_sorted = over_threshold.clone()
        drop_sorted[1:] = over_threshold[:-1]
        drop_sorted[0] = False

        # Translate sorted positions back to vocabulary ids and mask them.
        logits[ordered_ids[drop_sorted]] = filter_value

    return logits

# Manual beam search over next-token candidates.
# Each beam is [token_ids, accumulated_log_prob]; the search starts from a copy of the
# tokenised prompt so the global `bert_idxs` is left untouched and the cell can be re-run.
beams = [[list(bert_idxs), 0.0]]

# Inference only: without no_grad every forward pass would build an autograd graph.
with torch.no_grad():
    for _ in tnrange(max_length):

        results = []

        for beam_idxs, beam_score in beams:

            input_ids = torch.LongTensor(beam_idxs)
            if USE_CUDA:
                # Tensor.cuda() returns a copy instead of moving in place, so keep the result.
                input_ids = input_ids.cuda()

            outputs = model(input_ids)
            # Logits for the token that follows the last position of this beam.
            next_token_logits = outputs[0][-1, :]
            # Never emit [UNK].
            next_token_logits[bert_tokenizer.unk_token_id] = -np.inf

            # Penalise tokens already present in this beam: shrink positive logits,
            # push negative logits further down.
            for idx in set(beam_idxs):
                if next_token_logits[idx] > 0:
                    next_token_logits[idx] /= repetition_penalty
                else:
                    next_token_logits[idx] *= repetition_penalty

            next_token_logits = next_token_logits / temperature

            next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=beam_size, top_p=top_p)

            next_token_prob = F.log_softmax(next_token_logits, dim=0)
            probs, idxs = torch.topk(next_token_prob, k=beam_size)

            for prob, idx in zip(probs.tolist(), idxs.tolist()):
                # Filtered-out tokens have log-probability -inf; they are not real
                # candidates, so do not let them become beams.
                if prob == -np.inf:
                    continue
                results.append([beam_idxs + [idx], beam_score + prob])

        # Keep the beam_size highest-scoring continuations, best first.
        beams = sorted(results, key=lambda candidate: candidate[1], reverse=True)[:beam_size]
    
    


NameError: name 'bert_idxs' is not defined

In [None]:
# beams is ordered best-first, so beams[0][0] holds the token ids of the highest-scoring sequence.
print(' '.join(bert_tokenizer.convert_ids_to_tokens(beams[0][0])))