In [2]:
import torch
import numpy as np
from GPT2 import GPT2Model, GPT2Tokenizer

import os
# Expose only GPU 0 to this process; this is set before any CUDA call is made.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Fall back to the CPU so the notebook still runs on hosts without a usable GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Architecture settings for the network; load_state_dict below has to find
# parameters matching the checkpoint, so these must agree with how it was trained.
model = GPT2Model(
    vocab_size=30000,
    layer_size=12,
    block_size=1024,
    embedding_dropout=0.0,
    embedding_size=768,
    num_attention_heads=12,
    attention_dropout=0.0,
    residual_dropout=0.0)

# Load on the CPU first, then move the whole model to `device` afterwards.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
state_dict = torch.load('save_distill.pth', map_location='cpu')

model.load_state_dict(state_dict)
model.to(device)
model.eval()

tokenizer = GPT2Tokenizer(
    'GPT2/bpe/vocab.json',
    'GPT2/bpe/chinese_vocab.model',
    max_len=512)

In [2]:
def sample(text, max_len=10, eod_id=3):
    """Greedily continue ``text`` and print the decoded continuation.

    The prompt is encoded with the notebook-level ``tokenizer`` and run once
    through the notebook-level ``model`` with ``use_cache=True``. Every later
    step feeds only the newest token together with the cache returned by the
    previous call.

    Args:
        text: Prompt string handed to ``tokenizer.encode``.
        max_len: Number of incremental decoding steps after the prompt pass.
            Because the prompt pass already yields one token, at most
            ``max_len + 1`` tokens are produced (unchanged from before).
        eod_id: Token id that stops generation. Defaults to 3, the value that
            used to be hard-coded here. It is never part of the output.

    Returns:
        None. The decoded continuation is printed.
    """
    prompt_ids = tokenizer.encode(text)
    input_id = torch.as_tensor(prompt_ids, dtype=torch.long, device=device).reshape(1, -1)
    steps = max(max_len, 0)

    out = []
    # Pure inference: without no_grad every step would record an autograd graph.
    with torch.no_grad():
        output, cached_kvs = model(input_id, use_cache=True)
        for step in range(steps + 1):
            # Greedy choice from the logits of the last position.
            nid = int(torch.argmax(output[0, -1]))
            # Checked for every token, including the one predicted from the
            # prompt itself, so a stop token never leaks into the output.
            if nid == eod_id:
                break
            out.append(nid)
            if step == steps:
                break
            input_id = torch.tensor([[nid]], dtype=torch.long, device=device)
            output, cached_kvs = model(input_id, cached_kvs, use_cache=True)
    print(tokenizer.decode(out))

def ask_question(question, max_len=10):
    """Print the model's answer to ``question`` using a two-example prompt.

    Two worked question/answer pairs are placed in front of the new question
    and the resulting prompt is handed to ``sample``.

    Args:
        question: Text substituted into the ``%s`` slot of the prompt.
        max_len: Forwarded unchanged to ``sample``.
    """
    # The continuation lines of this literal keep their leading spaces, so
    # that indentation is part of the prompt the model receives.
    few_shot_template = '''问题：中国的首都是哪里？
    答案：北京。
    问题：李白在哪个朝代？
    答案：唐朝。
    问题：%s
    答案：'''
    prompt = few_shot_template % question
    sample(prompt, max_len)

In [3]:
from data.samplers import DistributedBatchSampler, RandomSampler
from tqdm import tqdm
import json

class CHIDDataset(torch.utils.data.Dataset):
    """Tokenized CHID records arranged for next-token prediction.

    Each item is a ``(sample, size)`` pair. ``collate`` pads every batch to
    the length of the longest sample in the whole dataset.
    """

    def __init__(self, data_path, split, tokenizer, ratio=1):
        """Read ``data_path`` and build the sample list.

        Args:
            data_path: JSON file holding a two-element sequence
                ``[cand_ids, data]``.
            split: Name of the split; only stored on the instance.
            tokenizer: Object whose ``encoder`` maps '<pad>' and '<eod>' to ids.
            ratio: Fraction of the leading records to keep.
        """
        self.split = split
        self.tokenizer = tokenizer
        self.ratio = ratio
        self.pad_id = tokenizer.encoder['<pad>']
        self.eod_token = tokenizer.encoder['<eod>']

        with open(data_path, "r") as f:
            payload = json.load(f)
        # cand_ids: the candidate label ids, namely ids of "0", "1", ..., "9"
        # records: preprocessed (tokenized) data
        self.cand_ids, records = payload
        self.samples, self.sizes = self.process(records)

        # Longest sample overall; collate() pads every batch to this width.
        self.max_size = max(self.sizes)

    def process(self, data):
        """Turn raw records into model-ready samples.

        Args:
            data: Sequence of dicts with a "sent" token-id list and a "truth"
                label.

        Returns:
            ``(samples, sizes)`` where ``sizes[i]`` is the input length of
            ``samples[i]``.
        """
        keep = int(self.ratio * len(data))
        samples, sizes = [], []
        for record in tqdm(data[:keep]):
            tokens = record["sent"]
            n_inputs = len(tokens) - 1
            samples.append({
                # Inputs and labels are the same sentence shifted by one token.
                "input_ids": tokens[:-1],
                # Only the last position contributes to the loss.
                "loss_mask": [0] * (n_inputs - 1) + [1],
                "labels": tokens[1:],
                # Label of the sentence; should be an integer in [0, 9].
                "truth": record["truth"],
            })
            sizes.append(n_inputs)

        return samples, sizes

    def __len__(self):
        """Number of samples."""
        return len(self.sizes)

    def __getitem__(self, idx):
        """Return the ``(sample, size)`` pair stored at ``idx``."""
        return self.samples[idx], self.sizes[idx]

    def collate(self, x):
        """Pad a list of ``(sample, size)`` pairs into batch tensors.

        Args:
            x: Items as returned by ``__getitem__``.

        Returns:
            ``(batch_sample, no_model_sample)``: the first dict holds what is
            fed to the model (input_ids, attention_mask, position_ids), the
            second what is only needed for loss/metrics (labels, truth,
            loss_mask).
        """
        bs = len(x)
        # Every batch is padded to the dataset-wide maximum length.
        width = self.max_size

        # Lower-triangular (causal) mask with two leading singleton dims.
        causal = torch.tril(torch.ones((width, width)))
        attention_mask = causal.unsqueeze(0).unsqueeze(1)
        position_ids = torch.arange(width, dtype=torch.long).unsqueeze(0).repeat(bs, 1)

        # Start fully padded, then overwrite the leading part of each row.
        input_ids = torch.full((bs, width), self.pad_id, dtype=torch.long)
        labels = torch.full((bs, width), self.pad_id, dtype=torch.long)
        truth = torch.zeros(bs).long()
        loss_mask = torch.zeros(bs, width).float()

        padded = (("input_ids", input_ids), ("labels", labels), ("loss_mask", loss_mask))
        for row, (samp, _size) in enumerate(x):
            for key, target in padded:
                values = samp[key]
                target[row, :len(values)] = torch.tensor(values)
            truth[row] = torch.tensor(samp["truth"])

        # the data that need to go through the model
        batch_sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
        }
        # the data that do not need to go through the model
        no_model_sample = {
            "labels": labels,
            "truth": truth,
            "loss_mask": loss_mask,
        }
        return batch_sample, no_model_sample


def load_data(data_path, data_type, tokenizer, ratio=1):
    """Build the dataset for one split and wrap it in a DataLoader.

    Args:
        data_path: Directory containing ``<data_type>.json``.
        data_type: Split name; 'train' is sampled randomly, anything else
            is read sequentially.
        tokenizer: Forwarded to ``CHIDDataset``.
        ratio: Forwarded to ``CHIDDataset``.

    Returns:
        ``(dataloader, dataset)``.
    """
    json_path = os.path.join(data_path, data_type + '.json')
    dataset = CHIDDataset(json_path, data_type, tokenizer, ratio=ratio)

    # Shuffle only the training split; every other split keeps file order.
    sampler_cls = RandomSampler if data_type == 'train' else torch.utils.data.SequentialSampler
    sampler = sampler_cls(dataset)

    # No batch_size is given, so the DataLoader default applies; batches are
    # assembled by the dataset's own collate method.
    loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        num_workers=0,
        pin_memory=True,
        collate_fn=dataset.collate,
    )
    return loader, dataset


def train(model, dataloader, optimizer, device, mode="train"):
    """Run one pass over ``dataloader``, stepping the optimizer on every batch.

    Only the positions flagged in ``loss_mask`` contribute: the logits and the
    labels are both reduced over the sequence axis with that mask before the
    loss is computed. The criterion is the notebook-level ``loss_fcn``.

    Args:
        model: Called as ``model(input_ids)``.
            # assumes it returns (batch, seq, vocab) logits - TODO confirm
        dataloader: Yields ``(batch, no_model_batch)`` dicts of tensors;
            ``batch`` needs "input_ids", ``no_model_batch`` needs "labels"
            and "loss_mask".
        optimizer: Optimizer over ``model``'s parameters.
        device: Device every tensor of the batch is moved to.
        mode: Unused; kept so existing calls keep working.

    Returns:
        None.
    """
    model.train()
    for batch, no_model_batch in tqdm(dataloader):
        optimizer.zero_grad()
        for k in batch.keys():
            batch[k] = batch[k].to(device)
        for k in no_model_batch.keys():
            no_model_batch[k] = no_model_batch[k].to(device)

        loss_mask = no_model_batch["loss_mask"]
        mask_total = torch.sum(loss_mask, -1)

        # Masked average over the sequence axis: one row of logits per sample.
        output = model(batch['input_ids'])
        output = torch.sum(output * loss_mask.unsqueeze(-1), 1) / mask_total.unsqueeze(-1)

        # The same masked average picks the label at the flagged position.
        labels = no_model_batch["labels"].float()
        labels = (torch.sum(labels * loss_mask, 1) / mask_total).long()

        # Cross-entropy expects (batch, classes) logits and (batch,) class ids.
        # Adding a singleton axis at dim 1 to both would make dim 1 the class
        # axis (a single class) and the target shape would be rejected.
        losses = loss_fcn(output.contiguous().float(), labels)
        # No-op for a mean-reduced criterion; reduces a per-sample one.
        loss = torch.mean(losses)

        loss.backward()
        optimizer.step()

In [4]:
def evaluate(model, dataloader, cand_ids, device, mode="dev"):
    """Score ``model`` on ``dataloader`` by picking the best candidate token.

    For every sample the logits at the position flagged by ``loss_mask`` are
    restricted to the columns listed in ``cand_ids``; the index of the largest
    one is the prediction and is compared with ``truth``.

    Args:
        model: Called as ``model(input_ids)``.
            # assumes it returns (batch, seq, vocab) logits - TODO confirm
        dataloader: Yields ``(batch, no_model_batch)`` dicts of tensors.
        cand_ids: Index tensor selecting the candidate columns of the logits.
        device: Device the batch tensors and the returned accuracy are put on.
        mode: Only used in the progress-bar description.

    Returns:
        ``(acc, all_truth, all_preds)``: accuracy as a 0-dim tensor on
        ``device`` (0.0 when the loader yields nothing), and the flat lists of
        true labels and predicted candidate indices.
    """
    model.eval()
    all_truth, all_preds = [], []
    with torch.no_grad():
        for batch, no_model_batch in tqdm(dataloader, desc="Evaluating {}".format(mode)):
            for k in batch:
                batch[k] = batch[k].to(device)
            for k in no_model_batch:
                no_model_batch[k] = no_model_batch[k].to(device)

            loss_mask = no_model_batch["loss_mask"]

            # Masked average over the sequence axis: one row of logits per sample.
            output = model(batch['input_ids'])
            output = torch.sum(output * loss_mask.unsqueeze(-1), 1) / torch.sum(loss_mask, -1).unsqueeze(-1)

            # Take the vocabulary width from the logits rather than assuming it.
            scores = output.reshape(-1, output.size(-1))
            scores = scores[:, cand_ids]
            preds = torch.argmax(scores, dim=-1)

            truth = no_model_batch["truth"].view(-1)

            all_truth.extend(truth.detach().cpu().tolist())
            all_preds.extend(preds.detach().cpu().tolist())

    n_correct = sum(int(p == l) for p, l in zip(all_preds, all_truth))
    # An empty loader would otherwise divide by zero.
    acc = n_correct / len(all_truth) if all_truth else 0.0
    acc = torch.tensor(acc).to(device)

    return acc, all_truth, all_preds

In [8]:
# Both splits are read from the same CHID directory.
chid_dir = '../nlpdata/data/chid'
train_dataloader, trainset = load_data(chid_dir, 'train', tokenizer)
# The spelling `test_dataloadar` is kept because a later cell refers to it.
test_dataloadar, testset = load_data(chid_dir, 'test', tokenizer)

100%|██████████| 577157/577157 [00:20<00:00, 28718.55it/s]
100%|██████████| 23209/23209 [00:00<00:00, 80879.17it/s]


In [9]:
import torch.nn as nn
# Criterion that train() looks up by its notebook-level name.
loss_fcn = nn.CrossEntropyLoss()
loss_fcn.to(device)

import transformers
# transformers.AdamW is deprecated in favour of torch.optim.AdamW, so the
# PyTorch optimizer is used directly. weight_decay=0.0 is passed explicitly
# because the transformers class defaulted to 0.0 while PyTorch defaults to 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=1.5e-6, eps=1.0e-9, weight_decay=0.0)  # lr=1.5e-4, eps=1.0e-9

In [None]:
# Candidate label token ids, placed on the model's device for column indexing.
cand_ids = torch.tensor(testset.cand_ids, device=device)
acc, all_truth, all_preds = evaluate(
    model=model,
    dataloader=test_dataloadar,
    cand_ids=cand_ids,
    device=device,
    mode="dev",
)

In [None]:
# One pass over the training split, updating `model` in place.
train(model=model, dataloader=train_dataloader, optimizer=optimizer, device=device)

In [11]:
# Save only the model parameters (its state_dict), not the module object.
trained_state = model.state_dict()
torch.save(trained_state, "trained_fill.pth")

In [32]:
# Few-shot question answering demo; the answer is printed by sample().
ask_question(question='保险好吗？', max_len=50)

保险好。


In [38]:
# Free-form continuation demo; the generated text is printed.
sample(text='去北京画室参加', max_len=20)

集训,回来后,我就开始画了,当时画的是
