In [12]:
from transformers import BertTokenizer

In [13]:
# Build a BERT tokenizer from the project's vocabulary file.
tokenizer = BertTokenizer(vocab_file="vocabulary/vocab_small.txt")
# Display the tokenizer's length as reported by len().
len(tokenizer)

13317

In [26]:
# Sample text mixing plain Chinese characters with bracketed marker tokens.
words= "[Goal1]你好[KG][Goal5][Goal4]"
# Bracketed markers to register with the tokenizer as additional special tokens.
dic = {'additional_special_tokens':["[Goal1]","[KG]","[Goal2]","[Goal3]","[Goal4]","[Goal5]","[A]","[Q]","[RE]"]}
tokenizer.add_special_tokens(dic)
# Display the tokenizer's length after the registration call.
len(tokenizer)

13321

In [23]:
# Look up an id for each element obtained by iterating over `words` directly.
# NOTE(review): when `words` is a str this iterates character by character,
# so bracketed markers are not looked up as whole tokens.
dialogue_ids = [tokenizer.convert_tokens_to_ids(piece) for piece in words]
dialogue_ids

[138, 100, 157, 143, 154, 122, 140, 872, 1962, 138, 100, 100, 140]

In [11]:
# Pass the whole of `words` in one call instead of element by element;
# the recorded output for this cell is a single id rather than a list.
tokenizer.convert_tokens_to_ids(words)

100

In [31]:
# Round trip: text -> tokens -> ids -> tokens.
out = tokenizer.tokenize(words)
# One id lookup per token produced by tokenize().
dialogue_ids = [tokenizer.convert_tokens_to_ids(token) for token in out]
# Map the ids back to token strings to compare against `out`.
text = tokenizer.convert_ids_to_tokens(dialogue_ids)
text

['[Goal1]', '你', '好', '[KG]', '[Goal5]', '[Goal4]']

In [28]:
# Display the current value of `out` (assigned from tokenizer.tokenize(words) in another cell).
out

['[Goal1]', '你', '好', '[KG]', '[Goal5]', '[Goal4]']

In [26]:
import numpy as np
import torch
n_position = 200  # number of positions a table built from this function will cover
d_hid = 10  # number of angle terms produced per position


def get_position_angle_vec(position):
    """Return the list of ``d_hid`` angle terms for a single position.

    Entry ``j`` is ``position / 10000 ** (2 * (j // 2) / d_hid)``, so each
    consecutive pair of entries (2i, 2i + 1) shares the same value.
    ``d_hid`` is read from the module-level variable at call time.

    :param position: position index (any number).
    :return: list of ``d_hid`` values.
    """
    angles = []
    for hid_j in range(d_hid):
        exponent = 2 * (hid_j // 2) / d_hid
        angles.append(position / np.power(10000, exponent))
    return angles

In [24]:
# Stack one angle vector per position: one row per position, one column per angle term.
sinusoid_table = np.array(list(map(get_position_angle_vec, range(n_position))))
# Overwrite the angles in place: sine on even-indexed columns (dim 2i) ...
np.sin(sinusoid_table[:, 0::2], out=sinusoid_table[:, 0::2])
# ... and cosine on odd-indexed columns (dim 2i+1).
np.cos(sinusoid_table[:, 1::2], out=sinusoid_table[:, 1::2])
sinusoid_table.shape

(200, 10)

In [28]:
# Convert the table to a torch float tensor, rebinding the same name.
sinusoid_table = torch.FloatTensor(sinusoid_table)
# Display the size along dimension 1 (the per-position vector length).
sinusoid_table.size(1)

10

In [1]:
def collate_fn(batch):
    """
    Pad every sample of a batch to the length of the longest sample and
    stack the result into one tensor.

    The samples in ``batch`` are left untouched: padded copies are built
    instead of extending the caller's lists in place, so the same sample
    objects can be collated again without accumulating padding.

    :param batch: sequence of samples, each a list of integer token ids.
    :return: ``torch.long`` tensor with one row per sample; shorter samples
        are right-padded with the module-level ``pad_id``.
    """
    # `pad_id` is a module-level name (assigned in the tokenizer setup cell);
    # it is only read here, so no `global` declaration is needed.
    # default=0 keeps an empty batch from raising in max().
    max_input_len = max((len(sample) for sample in batch), default=0)
    input_ids = [
        list(sample) + [pad_id] * (max_input_len - len(sample))
        for sample in batch
    ]
    return torch.tensor(input_ids, dtype=torch.long)


In [2]:
from transformers import BertTokenizer
import torch
# Tokenizer built from the project's vocabulary file.
tokenizer = BertTokenizer(vocab_file='vocabulary/vocab_small.txt')
# Register the bracketed markers as additional special tokens.
dic = {
    'additional_special_tokens': [
        "[Goal1]", "[KG]", "[Goal2]", "[Goal3]", "[Goal4]", "[Goal5]",
        "[A]", "[Q]", "[RE]",
    ]
}
tokenizer.add_special_tokens(dic)
# Size of the tokenizer's vocabulary after the registration call.
vocab_size = len(tokenizer)
# Id of the padding token; read as a module-level name by collate_fn.
pad_id = tokenizer.convert_tokens_to_ids('[PAD]')

In [3]:
import transformers
from transformers import GPT2LMHeadModel
def create_model(vocab_size, config_path='config/model_config_dialogue_small.json'):
    """
    Build a GPT-2 LM-head model from a JSON config file and resize its
    token embeddings to the tokenizer's vocabulary.

    The model is constructed from the config only; no pretrained weights
    are loaded here.

    :param vocab_size: number of entries in the tokenizer vocabulary.
    :param config_path: path to the GPT-2 JSON config file; defaults to the
        small dialogue config this notebook has always used.
    :return: the ``GPT2LMHeadModel`` instance.
    """
    # GPT2Config is taken from the package's top-level namespace rather than
    # the private `transformers.modeling_gpt2` module path, which newer
    # transformers releases no longer provide.
    model_config = transformers.GPT2Config.from_json_file(config_path)
    model = GPT2LMHeadModel(config=model_config)
    # Match the embedding matrix to the tokenizer's vocabulary size.
    model.resize_token_embeddings(vocab_size)
    return model

# Instantiate the model with its embeddings sized to `vocab_size`.
model = create_model(vocab_size)

In [4]:
# Read the whole training file into one string (UTF-8).
with open('data/train_tokenized.txt', "r", encoding="utf8") as f:
    data = f.read()

In [5]:
# Split the file contents into one entry per line.
# NOTE(review): this rebinds `data` from str to list, so re-running this cell
# without first re-running the read cell fails (a list has no .split()).
data = data.split('\n')
# Number of entries; a trailing newline in the file would add an empty
# final entry — TODO confirm.
len(data)

19858

In [6]:
from dataset import MyDataset
from torch.utils.data import Dataset, DataLoader
# Wrap the list of lines in the project's dataset class.
train_dataset = MyDataset(data)
# Shuffled batches of four samples, padded and stacked by collate_fn.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)

In [7]:
# Run a single batch through the model as a smoke test; `break` stops after
# the first batch, leaving `input_ids` and `outputs` bound for later cells.
for batch_id, input_ids in enumerate(train_dataloader):
    outputs = model.forward(input_ids=input_ids)
    # Number of rows in the batch tensor.
    print(len(input_ids))
    #print(input_ids[0])
    break

4


In [22]:
# First element of the model output is used as the logits.
logits = outputs[0]
logits.shape
# Align predictions with targets for next-token prediction: drop the last
# position from the logits and the first position from the labels.
# Shapes depend on the sampled batch; a recorded run printed
# torch.Size([4, 555, 13318]) for shift_logits.
shift_logits = logits[...,:-1,:].contiguous() # all positions except the last
shift_labels = input_ids[...,1:].contiguous() # all positions except the first
from torch.nn import CrossEntropyLoss

In [43]:
# Summed cross-entropy that skips padding targets. The ignored index is taken
# from `pad_id` (the id of '[PAD]') instead of a hard-coded 0, so it stays
# consistent with the padding mask used when counting accuracy.
loss_fct = CrossEntropyLoss(ignore_index = pad_id, reduction = 'sum')
print(shift_logits.shape)
# Id of the '[RE]' marker token.
print(tokenizer.convert_tokens_to_ids('[RE]'))

torch.Size([4, 555, 13318])
13


In [62]:
import copy 
import numpy as np
loss = 0.0
accuracy = 0.0
num_targets_sum = 0.0
correct = 0.0  # running total of correctly predicted non-pad tokens over the batch
# Look the marker id up instead of hard-coding it, so it follows the vocabulary.
re_id = tokenizer.convert_tokens_to_ids('[RE]')
for index, shift_label in enumerate(shift_labels):
    ### loss: only targets after the first '[RE]' marker contribute
    re_positions = np.flatnonzero(shift_label.detach().cpu().numpy() == re_id)
    if re_positions.size > 0:
        start = int(re_positions[0]) + 1
        loss += loss_fct(shift_logits[index][start:].view(-1, shift_logits.size(-1)),
                         shift_label[start:].view(-1))
    # A sample without '[RE]' has no targets after the marker and adds nothing
    # to the loss (previously this raised IndexError).

    ### accuracy: counted over every non-pad target of the sample
    _, preds = shift_logits[index].max(dim=-1)
    not_ignore = shift_label.ne(pad_id)
    num_targets_sum += not_ignore.long().sum().item()
    # Accumulate across samples; assigning here would keep only the last sample.
    correct += ((shift_label == preds) & not_ignore).float().sum()

# Fraction of non-pad targets predicted correctly; stays 0.0 when there are none.
if num_targets_sum > 0:
    accuracy = float(correct) / num_targets_sum

print(loss, correct)

tensor(861.5113, grad_fn=<AddBackward0>) tensor(0.)


In [60]:
# max over the last dimension returns (values, indices); the indices are
# kept as the predicted ids for every position in the batch.
_, preds = shift_logits.max(dim=-1)
preds.shape

torch.Size([4, 555])

In [3]:
import os
# The previous expression took the directory of the quoted literal "__file__",
# which is resolved against the working directory, so it only ever produced
# the working directory. Ask for that directly.
CUR_DIR = os.getcwd()
CUR_DIR

'/home/jiahao/GPT2-chitchat'