In [8]:
import torch
import numpy as np
from GPT2 import GPT2Model, GPT2Tokenizer

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
device = 'cuda' #'cuda'

model = GPT2Model(
    vocab_size=30000,
    layer_size=12,
    block_size=1024,
    embedding_dropout=0.0,
    embedding_size=768,
    num_attention_heads=12,
    attention_dropout=0.0,
    residual_dropout=0.0)

state_dict = torch.load('save_distill.pth', map_location='cpu')

model.load_state_dict(state_dict)
model.to(device)
model.eval()

tokenizer = GPT2Tokenizer(
    'GPT2/bpe/vocab.json',
    'GPT2/bpe/chinese_vocab.model',
    max_len=512)

In [3]:
def sample(text, max_len=10):
    ids = tokenizer.encode(text)
    input_id = torch.tensor((np.array(ids).reshape(1, -1).astype('int64'))).to(device)
    output, cached_kvs = model(input_id, use_cache=True)
    nid = int(np.argmax(output[0, -1].detach().cpu().numpy()))
    ids += [nid]
    out = [nid]
    for i in range(max_len):
        input_id = torch.tensor(np.array([nid]).reshape(1, -1).astype('int64')).to(device)
        output, cached_kvs = model(input_id, cached_kvs, use_cache=True)
        nid = int(np.argmax(output[0, -1].detach().cpu().numpy()))
        ids += [nid]
        if nid==3:
            break
        out.append(nid)
    print(tokenizer.decode(out))

def ask_question(question, max_len=10):
    sample('''问题：中国的首都是哪里？
    答案：北京。
    问题：李白在哪个朝代？
    答案：唐朝。
    问题：%s
    答案：''' % question, max_len)

In [38]:
from data.samplers import DistributedBatchSampler, RandomSampler
from tqdm import tqdm

class GenDataset(torch.utils.data.Dataset):
    def __init__(self, data_path, split, tokenizer: GPT2Tokenizer, seq_length=1024, ratio=1):
        self.split = split
        self.tokenizer = tokenizer
        self.ratio = ratio

        self.pad_id = tokenizer.encoder['<pad>']
        self.eod_token = tokenizer.encoder['<eod>']
        self.seq_length = seq_length
        
        with open(data_path, "r") as f:
            data = f.readlines()
        self.samples = self.process(data)
        

    def process(self, data):
        samples = []
        for doc in tqdm(data[:int(self.ratio * len(data))]):
            token_ids = self.tokenizer.encode(doc)
            token_ids.append(self.eod_token)
            start = 0
            while start + self.seq_length + 1 < len(token_ids):
                samples.append(token_ids[start: start + self.seq_length + 1])
                start = start + self.seq_length + 1
            samples.append(token_ids[start:] + [self.pad_id] * (self.seq_length + 1 - (len(token_ids) - start)))

        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

    def collate(self, samps):
        bs = len(samps)

        # # triangle attention mask
        # attn_mask = torch.tril(torch.ones((self.seq_length, self.seq_length))).unsqueeze(0)
        # position_ids = torch.arange(self.seq_length, dtype=torch.long).unsqueeze(0).repeat(bs, 1)

        # the data that need to go through the model
        batch_sample = {
            "input_ids": torch.ones(bs, self.seq_length).long() * self.pad_id,
            # "attention_mask": attn_mask.unsqueeze(1),
            # "position_ids": position_ids,
        }

        # the data that do not need to go through the model
        no_model_sample = {
            "labels": torch.ones(bs, self.seq_length).long() * self.pad_id,
            "loss_mask": torch.zeros(bs, self.seq_length).float()
        }

        for i, samp in enumerate(samps):
            assert len(samp) == self.seq_length + 1, (len(samp), self.seq_length)
            batch_sample["input_ids"][i] = torch.tensor(samp[:-1], dtype=torch.long)
            no_model_sample["labels"][i] = torch.tensor(samp[1:], dtype=torch.long)
            no_model_sample["loss_mask"][i] = (no_model_sample["labels"][i] != self.pad_id).float()

        return batch_sample, no_model_sample


def load_data(data_path, data_type, tokenizer, ratio=1):

    # Dataset
    filename = os.path.join(data_path, data_type + '.txt')
    dataset = GenDataset(filename, data_type, tokenizer, ratio=ratio)
    
    # Use a random sampler with distributed batch sampler.
    if data_type == 'train':
        sampler = RandomSampler(dataset)
    else:
        sampler = torch.utils.data.SequentialSampler(dataset)
    
    # Torch dataloader.
    return torch.utils.data.DataLoader(dataset,
                                       sampler=sampler,
                                       num_workers=0,
                                       pin_memory=True,
                                       collate_fn=dataset.collate), dataset
                                       

def evaluate(model, dataloader, device, mode="dev"):
    model.eval()
    all_losses = []
    with torch.no_grad():
        for batch, no_model_batch in tqdm(dataloader):
            for k in batch.keys():
                batch[k] = batch[k].to(device)
            for k in no_model_batch.keys():
                no_model_batch[k] = no_model_batch[k].to(device)

            output = model(batch['input_ids'])
            labels = no_model_batch["labels"]

            # cross_entropy loss
            losses = loss_fcn(output.contiguous().float().reshape(-1,30000), labels.reshape(-1))
            loss_mask = no_model_batch["loss_mask"]
                
            loss = torch.sum(losses * loss_mask, dim=-1) / loss_mask.sum(dim=-1)

            all_losses.extend(loss.tolist())
    return np.mean(all_losses)


def train(model, dataloader, optimizer, device, mode="train"):
    model.train()
    for batch, no_model_batch in tqdm(dataloader):
        optimizer.zero_grad()
        for k in batch.keys():
            batch[k] = batch[k].to(device)
        for k in no_model_batch.keys():
            no_model_batch[k] = no_model_batch[k].to(device)
        output = model(batch['input_ids'])
        labels = no_model_batch["labels"]
        losses = loss_fcn(output.contiguous().float().reshape(-1,30000), labels.reshape(-1))
        loss_mask = no_model_batch["loss_mask"]
        loss = torch.sum(losses * loss_mask, dim=-1) / loss_mask.sum(dim=-1)
        loss.backward()
        optimizer.step()

In [33]:
train_dataloader, dataset = load_data('../nlpdata/data/STC', 'train', tokenizer)


100%|██████████| 21161/21161 [00:10<00:00, 2016.98it/s]


In [37]:
import torch.nn as nn
loss_fcn = nn.CrossEntropyLoss()
loss_fcn.to(device)

import transformers
optimizer = transformers.AdamW(model.parameters(), lr=1.5e-6, eps=1.0e-9)  # lr=1.5e-4, eps=1.0e-9

In [None]:
evaluate(model, train_dataloader, device, mode="dev")

In [39]:
train(model, train_dataloader, optimizer, device)

100%|██████████| 21161/21161 [2:32:57<00:00,  2.31it/s]


In [40]:
torch.save(model.state_dict(), "trained_dialog.pth")  # 只保存模型的参数

In [1]:
import torch
import numpy as np
from GPT2 import GPT2Model, GPT2Tokenizer

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
device = 'cuda' #'cuda'

model = GPT2Model(
    vocab_size=30000,
    layer_size=12,
    block_size=1024,
    embedding_dropout=0.0,
    embedding_size=768,
    num_attention_heads=12,
    attention_dropout=0.0,
    residual_dropout=0.0)

state_dict = torch.load('trained_dialog.pth', map_location='cpu')

model.load_state_dict(state_dict)
model.to(device)
model.eval()

tokenizer = GPT2Tokenizer(
    'GPT2/bpe/vocab.json',
    'GPT2/bpe/chinese_vocab.model',
    max_len=512)

In [4]:
ask_question('保险', max_len=50)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.593 seconds.
Prefix dict has been built successfully.





In [7]:
sample('对话上文:老年人该买保险吗？ 回复:', max_len=50)

我觉得,还是买保险吧,因为我是老百姓,我的钱是我的,我的钱是我的,我的钱是我的,我的钱是我的,
