In [None]:
# Fetch the minGPT sources; the cells below import the `mingpt` package from this checkout.
# NOTE(review): the clone is not pinned to a commit, so a re-run may pick up a different upstream revision.
!git clone https://github.com/karpathy/minGPT.git

Cloning into 'minGPT'...
remote: Enumerating objects: 489, done.[K
remote: Total 489 (delta 0), reused 0 (delta 0), pack-reused 489[K
Receiving objects: 100% (489/489), 1.44 MiB | 23.38 MiB/s, done.
Resolving deltas: 100% (260/260), done.


In [None]:
# Work from inside the checkout so that the `mingpt` package imported below can be found.
# The explicit %cd magic is used instead of a bare `cd`: the bare form depends on
# IPython's automagic, which is only applied to single-line cells.
%cd minGPT

/content/minGPT


In [None]:
"""
Trains a GPT to add n-digit numbers.
"""

import os
import sys
import json

import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

from mingpt.model import GPT
from mingpt.trainer import Trainer
from mingpt.utils import set_seed, setup_logging, CfgNode as CN

In [None]:
import random
# -----------------------------------------------------------------------------

def get_config():
    """
    Assemble the full experiment configuration.

    The returned node has four children, attached in this order:
    - system:  random seed and the working directory for run artifacts
    - data:    the AdditionDataset defaults
    - model:   the GPT defaults, with the model type switched to 'gpt-nano'
    - trainer: the Trainer defaults, with a raised learning rate
    """
    # system-level settings
    system_cfg = CN()
    system_cfg.seed = 3407
    system_cfg.work_dir = './out/adder'

    # dataset settings
    data_cfg = AdditionDataset.get_default_config()

    # model settings
    model_cfg = GPT.get_default_config()
    model_cfg.model_type = 'gpt-nano'

    # trainer settings; the model is small enough that a faster learning rate is fine
    trainer_cfg = Trainer.get_default_config()
    trainer_cfg.learning_rate = 5e-4

    # attach the pieces to the root node
    root = CN()
    root.system = system_cfg
    root.data = data_cfg
    root.model = model_cfg
    root.trainer = trainer_cfg
    return root

# -----------------------------------------------------------------------------

class AdditionDataset(Dataset):
    """
    Creates n-digit addition problems. For example, if n=2, then an example
    addition problem would be to add 85 + 50 = 135. This problem would be
    represented as the following string for the GPT:

    "8550531"

    This is because:
    - we are discarding the + and =, which are not necessary. We just encode the digits
      of the input numbers concatenated together.
    - the result 135 is encoded backwards to make the addition easier to learn for the
      GPT model, because of how the addition algorithm works.

    As one more example, the problem 6 + 39 = 45 would be encoded as:

    "0639054"

    where you will notice that we are padding with zeros to make sure that we always
    produce strings of the exact same size: n + n + (n + 1). When n=2, this is 7.
    At test time, we will feed in an addition problem by giving the first 2n digits,
    and hoping that the GPT model completes the sequence with the next (n+1) digits
    correctly.

    Problems are not enumerated up front. Every __getitem__ call samples a fresh
    problem with the global `random` module, and the item index only selects how
    large the operands may be (see _CURRICULUM).

    NOTE(review): because items are sampled, the 'train' and 'test' splits differ only
    in their length. The test split has 500 items, so all of its indices fall into the
    first curriculum stage (operands 0..10). Confirm that this is the intended
    evaluation set.
    """

    # curriculum stages as (exclusive upper bound on the item index, largest operand);
    # indices beyond the last stage use the full ndigit range
    _CURRICULUM = ((2000, 10), (3000, 500), (5000, 1000))

    @staticmethod
    def get_default_config():
        """Return the default dataset config (ndigit: number of digits per operand)."""
        C = CN()
        C.ndigit = 10
        return C

    def __init__(self, config, split):
        """
        Args:
            config: object with an `ndigit` attribute (digits per operand).
            split: 'test' selects the test-sized split; anything else is treated as train.
        """
        self.config = config
        self.split = split # train/test

        # nominal number of items across both splits; it only determines the split lengths
        num = 10000
        num_test = min(int(num * 0.2), 500)  # 20% of the whole dataset, or only up to 500
        self.test_start = 0
        self.test_end = num_test
        self.train_start = num_test
        self.train_end = num

    def get_vocab_size(self):
        """Number of distinct tokens."""
        return 10 # digits 0..9

    def get_block_size(self):
        """Length of the token sequence fed to the model."""
        # a,b,a+b, and +1 due to potential carry overflow,
        # but then also -1 because very last digit doesn't ever plug back
        # as there is no explicit <EOS> token to predict, it is implied
        return 3*self.config.ndigit + 1 - 1

    def __len__(self):
        if self.split == 'test':
            return self.test_end - self.test_start
        else:
            return self.train_end - self.train_start

    def _max_operand(self, idx):
        """Largest operand value that may be sampled for item index `idx`."""
        # random.randint is inclusive on both ends, so the largest value that still
        # fits into ndigit digits is 10**ndigit - 1
        widest = 10 ** self.config.ndigit - 1
        for idx_bound, cap in self._CURRICULUM:
            if idx < idx_bound:
                # never exceed the encoding width, even when ndigit is small
                return min(cap, widest)
        return widest

    # generate data on the fly
    def __getitem__(self, idx):
        """
        Sample one addition problem and return it as an (x, y) pair of long tensors.

        x holds every token of the rendered problem except the last one; y holds the
        next-token targets, with the positions that fall inside the operands set to -1
        so that the loss is only computed on the digits of the sum.
        """
        ndigit = self.config.ndigit
        max_operand = self._max_operand(idx)
        a = random.randint(0, max_operand)
        b = random.randint(0, max_operand)

        # calculate the "label" of the addition problem a + b
        c = a + b
        # encode the digits of a, b, c into zero-padded strings
        astr = f'{a:0{ndigit}d}'
        bstr = f'{b:0{ndigit}d}'
        cstr = f'{c:0{ndigit + 1}d}'[::-1]  # reverse c to make addition easier
        render = astr + bstr + cstr
        dix = [int(s) for s in render]  # convert each character to its token index

        # x will be input to GPT and y will be the associated expected outputs
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)  # predict the next token in the sequence
        y[:ndigit * 2 - 1] = -1  # we will only train in the output locations. -1 will mask loss to zero
        return x, y


In [None]:
# build the experiment config from the defaults in get_config()
# (the optional overrides below are currently disabled)
config = get_config()
# config.trainer.batch_size = 256

# config.merge_from_args(sys.argv[1:])
# NOTE(review): the saved output of this cell reports model_type gpt2 and batch_size 256,
# which does not match the code shown here ('gpt-nano', override disabled); the output
# looks like it is from an earlier run - re-run to refresh it.
print(config)
setup_logging(config)
set_seed(config.system.seed)

# construct train and test datasets
train_dataset = AdditionDataset(config.data, split='train')
test_dataset  = AdditionDataset(config.data, split='test')

system:
    seed: 3407
    work_dir: ./out/adder
data:
    ndigit: 10
model:
    model_type: gpt2
    n_layer: None
    n_head: None
    n_embd: None
    vocab_size: None
    block_size: None
    embd_pdrop: 0.1
    resid_pdrop: 0.1
    attn_pdrop: 0.1
trainer:
    device: auto
    num_workers: 4
    max_iters: None
    batch_size: 256
    learning_rate: 0.0005
    betas: (0.9, 0.95)
    weight_decay: 0.1
    grad_norm_clip: 1.0



In [None]:
# sanity check: number of items the train split reports through __len__
len(train_dataset)

9500

In [None]:
# construct the model
# the vocabulary size and the block size are taken from the dataset before the GPT is built
config.model.vocab_size = train_dataset.get_vocab_size()
config.model.block_size = train_dataset.get_block_size()
model = GPT(config.model)

number of parameters: 85.09M


In [None]:
# construct the trainer object from the trainer config, the model and the train split
trainer = Trainer(config.trainer, model, train_dataset)

# helper function for the evaluation of a model
def eval_split(trainer, split, max_batches=None):
    """
    Measure how many addition problems of one split the model completes correctly.

    Reads the notebook-level `config`, `model`, `train_dataset` and `test_dataset`.

    Args:
        trainer: only `trainer.device` is used, to place the tensors.
        split: 'train' or 'test'; any other value raises KeyError.
        max_batches: stop after this many batches of 100 problems; None means the whole split.

    Returns:
        The number of correctly solved problems, as a 0-dim float tensor.
        Up to 5 wrong answers and a summary line are printed along the way.
    """
    ndigit = config.data.ndigit
    available = {'train':train_dataset, 'test':test_dataset}
    dataset = available[split]

    # place values 10**ndigit .. 10**0 (most significant first), shape (1, ndigit+1)
    place_values = torch.tensor([[10**p for p in range(ndigit, -1, -1)]]).to(trainer.device)
    operand_places = place_values[:, 1:]  # the operands only have ndigit digits

    outcomes = []
    shown_mistakes = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for batch_no, (x, _) in enumerate(loader):
        x = x.to(trainer.device)
        # the prompt is the 2*ndigit digits of the two operands
        prompt = x[:, :ndigit*2]
        # let the model complete the sum greedily (argmax, no sampling)
        completed = model.generate(prompt, ndigit+1, do_sample=False)
        # the sum is generated least-significant digit first; flip it into normal order
        answer_digits = completed[:, -(ndigit+1):].flip(1)
        # decode the integers from the individual digits
        first = (prompt[:, :ndigit] * operand_places).sum(1)
        second = (prompt[:, ndigit:ndigit*2] * operand_places).sum(1)
        predicted = (answer_digits * place_values).sum(1)
        expected = first + second  # ground truth computed directly
        # compare the model's answers with the ground truth
        is_correct = (predicted == expected).cpu()
        for row, hit in enumerate(is_correct):
            outcomes.append(int(hit))
            if hit or shown_mistakes >= 5:
                continue
            # only the first 5 mistakes are printed, to get a sense of the errors
            shown_mistakes += 1
            print("GPT claims that %d + %d = %d but gt is %d" % (first[row], second[row], predicted[row], expected[row]))
        # checked after the batch so that no extra batch is drawn from the loader
        if max_batches is not None and batch_no + 1 >= max_batches:
            break

    rt = torch.tensor(outcomes, dtype=torch.float)
    n_correct = rt.sum()
    print("%s final score: %d/%d = %.2f%% correct" % (split, n_correct, len(outcomes), 100*rt.mean()))
    return n_correct

# iteration callback
top_score = 0
def batch_end_callback(trainer):
    """
    Per-batch hook: log the training loss and periodically evaluate and checkpoint.

    Every 10 iterations the iteration time and the current loss are printed. Every 500
    iterations (including iteration 0) the model is scored on the train and test
    splits; if the combined score beats the best one seen so far, the weights are
    written to <work_dir>/model.pt.

    Reads the notebook-level `config` and `model`, and updates the global `top_score`.
    """
    global top_score

    if trainer.iter_num % 10 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")

    if trainer.iter_num % 500 == 0:
        # evaluate both the train and test score
        # for ndigit <= 2 we can afford the whole train set; for anything larger
        # (including ndigit values not listed here) only score the first 5 batches
        train_max_batches = {1: None, 2: None, 3: 5}.get(config.data.ndigit, 5)
        model.eval()
        with torch.no_grad():
            train_score = eval_split(trainer, 'train', max_batches=train_max_batches)
            test_score  = eval_split(trainer, 'test',  max_batches=None)
        score = train_score + test_score
        # save the model if this is the best score we've seen so far
        if score > top_score:
            top_score = score
            print(f"saving model with new top score of {score}")
            # make sure the target directory exists before writing the checkpoint
            os.makedirs(config.system.work_dir, exist_ok=True)
            ckpt_path = os.path.join(config.system.work_dir, "model.pt")
            torch.save(model.state_dict(), ckpt_path)
        # revert model to training mode
        model.train()

# register the hook under the 'on_batch_end' event
trainer.set_callback('on_batch_end', batch_end_callback)

# run the optimization
# NOTE(review): the printed config shows max_iters: None and the saved run ends with a
# KeyboardInterrupt, so this presumably trains until it is stopped by hand - confirm.
trainer.run()


running on device cuda




iter_dt 0.00ms; iter 0: train loss 2.32139
GPT claims that 0 + 6 = 0 but gt is 6
GPT claims that 2 + 5 = 0 but gt is 7
GPT claims that 3 + 6 = 0 but gt is 9
GPT claims that 4 + 9 = 0 but gt is 13
GPT claims that 5 + 3 = 0 but gt is 8
train final score: 19/9500 = 0.20% correct
GPT claims that 1 + 8 = 0 but gt is 9
GPT claims that 8 + 2 = 0 but gt is 10
GPT claims that 3 + 8 = 0 but gt is 11
GPT claims that 3 + 3 = 0 but gt is 6
GPT claims that 8 + 2 = 0 but gt is 10
test final score: 4/500 = 0.80% correct
saving model with new top score of 23.0
iter_dt 1277.82ms; iter 10: train loss 2.03177
iter_dt 1279.19ms; iter 20: train loss 1.75505
iter_dt 1282.59ms; iter 30: train loss 1.46019
iter_dt 1284.49ms; iter 40: train loss 1.43966
iter_dt 1283.41ms; iter 50: train loss 1.38887
iter_dt 1281.54ms; iter 60: train loss 1.17438
iter_dt 1282.63ms; iter 70: train loss 1.26614
iter_dt 1288.04ms; iter 80: train loss 1.33078
iter_dt 1280.21ms; iter 90: train loss 1.24493
iter_dt 1279.11ms; iter 100

KeyboardInterrupt: ignored

In [None]:
def convert_to_list(number, context_length):
    """
    Split a non-negative integer into its decimal digits, left-padded with zeros.

    Args:
        number: the value to split; must not be negative.
        context_length: minimum length of the result. Shorter digit lists are padded
            with leading zeros; longer ones are returned unpadded and untruncated.

    Returns:
        The digits of `number`, most significant first.

    Raises:
        ValueError: if `number` is negative (a negative value has no digit encoding
            here and would otherwise silently turn into all zeros).
    """
    if number < 0:
        raise ValueError(f"number must be non-negative, got {number}")

    # collect the digits least-significant first, then flip once at the end
    digits = []
    while number > 0:
        number, last_digit = divmod(number, 10)
        digits.append(last_digit)
    # a negative repeat count yields an empty list, so long numbers get no padding
    digits.extend([0] * (context_length - len(digits)))
    digits.reverse()
    return digits

In [None]:
def render_equation(a, b, c):
    """
    Format three digit sequences as a readable "a + b = c" string.

    Each argument is an iterable of digits, most significant first. Leading zeros
    are dropped; a sequence that is empty or all zeros is shown as "0".
    """
    def as_number_text(digits):
        # join the digits and strip the zero padding, keeping a single "0" for zero
        text = "".join(map(str, digits)).lstrip('0')
        return text if text else '0'

    parts = [as_number_text(seq) for seq in (a, b, c)]
    return "{} + {} = {}".format(*parts)

In [None]:
def reverse_last_digits(arr, ndigits):
    """
    Return the last `ndigits + 1` entries of `arr` in reversed order.

    The result is a new sequence; `arr` itself is left untouched. If `arr` has fewer
    than `ndigits + 1` entries, all of them are returned (reversed).
    """
    tail_len = ndigits + 1
    return arr[-tail_len:][::-1]

In [None]:
# demo problem 999 + 1: turn each operand into a zero-padded list of ndigit digits
a = convert_to_list(999, train_dataset.config.ndigit)
b = convert_to_list(1, train_dataset.config.ndigit)

In [None]:
# The training callback switches the model back to train mode after every evaluation,
# so select eval mode explicitly here; otherwise dropout stays active and can perturb
# the greedy decoding of the demo problem.
model.eval()
# the prompt is the digits of a followed by the digits of b, as a batch of one
x = torch.tensor([a + b], dtype=torch.long).to(trainer.device)
# let the model append the ndigit+1 digits of the sum (greedy, no sampling)
result = model.generate(x, train_dataset.config.ndigit+1, do_sample=False)

In [None]:
# the last ndigit+1 generated tokens hold the sum in reversed digit order; flip them back
c = reverse_last_digits(list(result[0].cpu().numpy()), train_dataset.config.ndigit)

In [None]:
# show the prompt and the model's answer as a readable equation
render_equation(a, b, c)