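A small demo that uses minGPT to fine-tune a pretrained GPT-2 (124M parameters) on the 24 game: given four numbers, the model learns to output an arithmetic expression built from them, or `no answer` when the puzzle has no solution. On CPU each training iteration takes roughly 0.5–0.7 s, so the configured 2000 iterations take on the order of 20 minutes rather than a minute.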

In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.utils import set_seed
import numpy as np
# Fix the random seed before any shuffling or training happens.
# NOTE(review): the train/test split below uses np.random.shuffle, so this assumes
# mingpt's set_seed also seeds NumPy — confirm.
set_seed(3407)

In [2]:
# Tokenizer setup: build the encoder from mingpt.bpe used to turn text into token ids.
from mingpt.bpe import Encoder, get_encoder
e = get_encoder()
# Scratch notes on how one (x, y) pair would be tokenised. x and y are not defined
# at this point in the notebook, so these lines are intentionally left commented out:
# x_gpt = [e.encode(d) for d in x.split()]
# y_gpt = e.encode_equation(y)
# calcu = eval(y)

In [3]:
raw_data_path = '24/answer.txt'
# The answer file contains the non-ASCII multiplication sign '×', so decode it
# explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open(raw_data_path, 'r', encoding='utf-8') as f:
    raw_data = f.readlines()

# Each line has the form "<numbers> : <solution>"; collect (puzzle, solution) pairs.
raw_set = list()
for item in raw_data:
    item = item.strip()
    if not item:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        continue
    # split on the first colon only, so a stray colon in the solution cannot
    # break the two-value unpacking
    x, y = item.split(':', 1)
    x, y = x.strip(), y.strip()
    if 'nope' in y:
        # unsolvable puzzles are marked with 'nope' in the file
        y = 'no answer'
    else:
        # use the ASCII multiplication operator in equations
        y = y.replace('×', '*')
    raw_set.append((x, y))

raw_set[515:525]

[('2 2 8 11', 'no answer'),
 ('2 2 8 12', '12+2+2+8'),
 ('2 2 8 13', 'no answer'),
 ('2 2 9 9', 'no answer'),
 ('2 2 9 10', '(9-2)*2+10'),
 ('2 2 9 11', '11+2+2+9'),
 ('2 2 9 12', '(2+2)*9-12'),
 ('2 2 9 13', 'no answer'),
 ('2 2 10 10', '10+10+2+2'),
 ('2 2 10 11', '(11*2-10)*2')]

In [4]:
class FixedLenDataset(Dataset):
    """
    Dataset for the 24-game problem. Each raw item is a (puzzle, solution) pair
    of strings, e.g.
    Input: '2 2 9 10' -> Output: '(9-2)*2+10'   (or 'no answer')
    Both parts are tokenised, the solution is padded with the end-of-text token
    so that every example has the same length, and the pair is fed into the
    transformer concatenated as (N = number token, S = solution token,
    E = end-of-text token):
    input:  N N N N S S S E E E ...
    output: I I I S S S E I I I ...
    where I is "ignore": the loss is masked while the transformer is reading
    the puzzle and on all padding after the first end-of-text token.
    """

    EOS_TOKEN = 50256    # token id appended after the solution and used as padding
    IGNORE_INDEX = -1    # target value at positions that must not contribute to the loss

    def __init__(self, raw_set, max_len=14):
        """
        raw_set: sequence of (puzzle, solution) string pairs.
        max_len: maximum number of puzzle + solution tokens (14 was measured on the data).
        """
        self.raw_set = raw_set
        self.tokenizer = get_encoder()
        self.max_len = max_len

    def __len__(self):
        return len(self.raw_set)

    def get_vocab_size(self):
        return 50257

    def get_block_size(self):
        # fixed length version:
        # return self.length * 2 - 1
        # return 4 + 4 + 3 + 2 - 1  # 4 numbers + 4 numbers + 3 operators + 2 parentheses - 1 = 12

        # padding version:
        return self.max_len

    def __getitem__(self, idx):
        """
        Return the (x, y) LongTensor pair for example idx, each of length max_len.

        Raises ValueError when the tokenised example does not fit into max_len.
        """
        inp, sol = self.raw_set[idx]
        # one token per number: keep only the first token of each encoded number
        inpp = [self.tokenizer.encode(d)[0] for d in inp.split()]
        if 'no answer' in sol:
            soll = self.tokenizer.encode(sol)
        else:
            soll = self.tokenizer.encode_equation(sol)
        soll = list(soll)  # private copy, so padding never mutates a list owned by the tokenizer

        # pad so that puzzle + solution + padding is always max_len + 1 tokens long
        padding_len = self.max_len - len(inpp) - len(soll) + 1
        if padding_len < 0:
            raise ValueError(
                "example %r needs %d tokens, which exceeds max_len=%d"
                % (idx, len(inpp) + len(soll) - 1, self.max_len)
            )
        soll += [self.EOS_TOKEN] * padding_len

        inpp, soll = torch.tensor(inpp, dtype=torch.long), torch.tensor(soll, dtype=torch.long)

        # concatenate the problem specification and the solution
        cat = torch.cat((inpp, soll), dim=0)

        # the inputs to the transformer will be the offset sequence
        x = cat[:-1].clone()
        y = cat[1:].clone()
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:max(len(inpp) - 1, 0)] = self.IGNORE_INDEX
        # keep the first end-of-text token as a target and ignore the remaining padding.
        # The guard matters: with exactly one padding token a slice of y[-0:] would
        # select (and mask) the entire target sequence.
        ignored_padding = padding_len - 1
        if ignored_padding > 0:
            y[-ignored_padding:] = self.IGNORE_INDEX
        return x, y

In [5]:
# Shuffle the example indices, hold out the last NUM_TEST of them for testing
# and train on everything else.
NUM_TEST = 200
ixes = list(range(len(raw_set)))
np.random.shuffle(ixes)
train_ixes, test_ixes = ixes[:-NUM_TEST], ixes[-NUM_TEST:]
train_set = [raw_set[i] for i in train_ixes]
test_set = [raw_set[i] for i in test_ixes]
train_dataset, test_dataset = FixedLenDataset(train_set), FixedLenDataset(test_set)

In [6]:
# Peek at one training example: its length, then every (input token, target token) pair.
x, y = train_dataset[516]
print(len(x))
for inp_tok, tgt_tok in zip(x.tolist(), y.tolist()):
    print(inp_tok, tgt_tok)

14
17 -1
21 -1
21 -1
940 940
940 10
10 17
17 10
10 21
21 10
10 21
21 50256
50256 -1
50256 -1
50256 -1


In [7]:
# Sanity check: the longest input sequence produced by the training dataset.
lengths = []
for x, y in train_dataset:
    lengths.append(len(x))
max_len = max(lengths, default=0)
print(max_len)

14


In [8]:
# create a GPT instance
from mingpt.model import GPT

# Only model_type is read from this config below; vocab_size / block_size are left
# commented out because the model is built by from_pretrained, not from the config.
model_config = GPT.get_default_config()
model_config.model_type = 'gpt2'
# model_config.vocab_size = train_dataset.get_vocab_size()
# model_config.block_size = train_dataset.get_block_size()
# Build the 'gpt2' model via from_pretrained (reported as 124.44M parameters when run).
model = GPT.from_pretrained(model_config.model_type)

number of parameters: 124.44M


In [9]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-5 # learning rate used to fine-tune the model built by GPT.from_pretrained
train_config.max_iters = 2000
train_config.batch_size = 8
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cpu


In [10]:
def eval_split(dataset):
    """
    Compute the average loss of the global `model` over `dataset`.

    The dataset is evaluated in batches of 10 on `trainer.device` without
    gradient tracking, and the per-batch losses are averaged.

    Returns a Python float (nan when the dataset is empty). The model's
    train/eval mode is restored to whatever it was before the call.
    """
    was_training = model.training
    model.eval()
    losses = []
    loader = DataLoader(dataset, batch_size=10, num_workers=0, drop_last=False)
    try:
        with torch.no_grad():
            for x, y in loader:
                x, y = x.to(trainer.device), y.to(trainer.device)
                # forward the model
                _, loss = model(x, y)
                # .item() copies the scalar to the host as a Python float; collecting the
                # tensors themselves makes np.mean fail for tensors that are not on the CPU
                losses.append(loss.item())
    finally:
        # hand the model back in the mode the caller left it in
        model.train(was_training)

    return float(np.mean(losses)) if losses else float('nan')

In [None]:
def batch_end_callback(trainer):
    """Every 100 iterations, report the iteration time, train loss and test loss."""
    if trainer.iter_num % 100 != 0:
        return  # stay quiet between reporting steps
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
    # evaluate on the held-out split
    test_loss = eval_split(test_dataset)
    print(f"    test loss {test_loss:.5f}")
# Register the logging callback under the trainer's 'on_batch_end' hook.
trainer.set_callback('on_batch_end', batch_end_callback)

# Start training with the configuration set up above.
trainer.run()

iter_dt 0.00ms; iter 0: train loss 4.96339
    test loss 4.41681
iter_dt 630.67ms; iter 100: train loss 0.96286
    test loss 0.82988
iter_dt 617.86ms; iter 200: train loss 0.76551
    test loss 0.75350
iter_dt 568.27ms; iter 300: train loss 0.45483
    test loss 0.66477
iter_dt 619.60ms; iter 400: train loss 0.62636
    test loss 0.64712
iter_dt 640.05ms; iter 500: train loss 0.62744
    test loss 0.56431
iter_dt 658.75ms; iter 600: train loss 0.51872
    test loss 0.58311
iter_dt 679.89ms; iter 700: train loss 0.54847
    test loss 0.57948
iter_dt 502.54ms; iter 800: train loss 0.54337
    test loss 0.55968
iter_dt 494.92ms; iter 900: train loss 0.56315
    test loss 0.53850
iter_dt 575.26ms; iter 1000: train loss 0.71471
    test loss 0.54550
iter_dt 561.92ms; iter 1100: train loss 0.42306
    test loss 0.53585


In [None]:
##################### until now #####################

In [None]:
# NOTE(review): exit() stops execution here, so the statement below and the
# following cells do not run on "Run All" until this line is removed —
# presumably a deliberate stop marker; confirm before deleting.
exit()
# now let's perform some evaluation
model.eval()

In [None]:
def eval_split(trainer, split, max_batches):
    """
    Measure exact-match accuracy of greedy generation on one dataset split.

    NOTE: this definition replaces the earlier eval_split(dataset) loss helper
    of the same name; batch_end_callback calls that earlier signature, so do not
    resume training after this cell has been run.

    trainer:     object whose `.device` the batches are moved to.
    split:       'train' or 'test'; selects train_dataset / test_dataset.
    max_batches: stop after this many batches of 100 (None = whole split).

    A prediction counts as correct when every generated token matches the
    target at all positions that carry a real target (i.e. are not -1).
    Returns the number of correct examples as a 0-d float tensor.
    """
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    n_inp = 4    # every puzzle consists of exactly four number tokens
    ignore = -1  # target value the dataset uses for positions without a label
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        # isolate the input pattern alone
        inp = x[:, :n_inp]
        # y is x shifted left by one position, so the targets that follow the
        # puzzle start at index n_inp - 1 (solution, end-of-text, then ignored padding)
        sol = y[:, n_inp - 1:]
        # let the model sample the rest of the sequence
        cat = model.generate(inp, sol.size(1), do_sample=False) # using greedy argmax, not sampling
        sol_candidate = cat[:, n_inp:] # isolate the filled in sequence
        # compare the predicted sequence to the true sequence, skipping ignored positions:
        # whatever the model emits after the end-of-text token is irrelevant
        scored = sol != ignore
        correct = ((sol_candidate == sol) | ~scored).all(1).cpu()
        for i in range(x.size(0)):
            results.append(int(correct[i]))
            if not correct[i] and mistakes_printed_already < 3: # only print up to 3 mistakes to get a sense
                mistakes_printed_already += 1
                print("GPT claims that the solution for %s is %s but gt is %s" % (inp[i].tolist(), sol_candidate[i].tolist(), sol[i].tolist()))
        if max_batches is not None and b+1 >= max_batches:
            break
    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# run a lot of examples from both train and test through the model and verify the output correctness
# (gradient tracking is switched off for the duration of both calls)
with torch.no_grad():
    train_score = eval_split(trainer, 'train', max_batches=50)
    test_score  = eval_split(trainer, 'test',  max_batches=50)

In [None]:
# let's run a single 24-game puzzle through the model as well
puzzle = '2 2 9 10'
tokenizer = train_dataset.tokenizer
eos_token = 50256  # token id the dataset appends after every solution
# tokenise the numbers the same way FixedLenDataset.__getitem__ does (first token of each number)
inp = torch.tensor([[tokenizer.encode(d)[0] for d in puzzle.split()]], dtype=torch.long).to(trainer.device)
n = inp.size(1)
with torch.no_grad():
    # leave room for the longest solution plus its terminating token
    cat = model.generate(inp, train_dataset.get_block_size() - n + 1, do_sample=False)
sol_candidate = cat[0, n:].tolist()
# keep only what was generated before the first terminating token
if eos_token in sol_candidate:
    sol_candidate = sol_candidate[:sol_candidate.index(eos_token)]
# NOTE(review): assumes the encoder exposes decode(list_of_token_ids) -> str — confirm
predicted = tokenizer.decode(sol_candidate)
# reference answer from the raw data; a puzzle may have other valid solutions,
# so a mismatch here does not by itself mean the prediction is wrong
sol = dict(raw_set).get(puzzle)
print('input puzzle      :', puzzle)
print('predicted solution:', predicted)
print('reference solution:', sol)
print('matches reference :', predicted.strip() == sol)