# Initialize a GPT Model

In [1]:
from transformers import AutoConfig, AutoModelForCausalLM

# Vocabulary: the ten digits, "+", "=" and the padding token.
vocab_size = 13
# Number of tokens in a prompt such as "1 + 1 =".
sequence_length = 4
# The answer is always written with two digit tokens.
result_length = 2
context_length = sequence_length + result_length

# Start from the "gpt2" configuration and override the size-related fields.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=vocab_size,
    n_ctx=context_length,
    n_head=12,
    n_layer=6,
    n_positions=context_length,
    n_embd=192,
)
# Instantiate the model from the configuration object.
model = AutoModelForCausalLM.from_config(config)

  from .autonotebook import tqdm as notebook_tqdm


The model has the same architecture as GPT-2, with a few modifications to make it smaller (for example, 6 layers and an embedding size of 192). The vocabulary size is 13 because the model only handles the ten digits plus the padding token, "+", and "=". The context window supports only 6 tokens, since we are only interested in adding two single-digit numbers.

In [2]:
def model_size(model):
    """Return the total number of scalar entries over all of ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

# Report the parameter count in millions (divided by 1000**2), with one decimal.
print(f'Model size: {model_size(model)/1000**2:.1f}M parameters')

Model size: 2.7M parameters


This model has 2.7 million weights instead of the 124 million parameters of the "gpt2" default config.

In [3]:
# Checkpoint name; later cells append it to "models/" when saving and pushing.
model_ckpt = 'addition_model'

In [4]:
# Save the model (not yet trained at this point) under models/<model_ckpt>;
# push_to_hub=True additionally requests an upload to the Hugging Face Hub.
model.save_pretrained("models/" + model_ckpt, push_to_hub=True)

pytorch_model.bin: 100%|██████████| 10.7M/10.7M [00:15<00:00, 684kB/s] 


# Custom tokenizer to encode numbers and the padding token

In [5]:
class NumberTokenizer:
  """Whitespace tokenizer over a fixed 13-symbol vocabulary.

  The vocabulary holds the operators ``+`` and ``=``, the padding marker ``-1``
  and the digits ``0``-``9``; a token's id is its position in that list.
  """

  def __init__(self, numbers_qty=10):
    # numbers_qty is only recorded on the instance; the vocabulary below is fixed.
    self.numbers_qty = numbers_qty
    self.pad_token = '-1'
    symbols = ['+', '=', '-1', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    self.encoder = {}
    self.decoder = {}
    for index, symbol in enumerate(symbols):
      self.encoder[str(symbol)] = index
      self.decoder[index] = str(symbol)
    self.pad_token_id = self.encoder[self.pad_token]

  def decode(self, token_ids):
    """Map every id back to its symbol and join the symbols with single spaces."""
    pieces = [self.decoder[token_id] for token_id in token_ids]
    return ' '.join(pieces)

  def __call__(self, text):
    """Split ``text`` on whitespace and return the id of each piece.

    Raises KeyError when a piece is not in the vocabulary.
    """
    ids = []
    for piece in text.split():
      ids.append(self.encoder[piece])
    return ids

### Example of the tokenization for the model

In [6]:
# NOTE(review): vocab_size is passed as numbers_qty, which NumberTokenizer only
# stores on the instance — confirm this is the intended argument.
tokenizer = NumberTokenizer(vocab_size)
# Encode a sample expression to inspect the resulting token ids.
tokenizer("1 + 1 = 2")

[4, 0, 4, 1, 5]

In [7]:
# Display the id -> token lookup table.
tokenizer.decoder

{0: '+',
 1: '=',
 2: '-1',
 3: '0',
 4: '1',
 5: '2',
 6: '3',
 7: '4',
 8: '5',
 9: '6',
 10: '7',
 11: '8',
 12: '9'}

# Build the dataset

In [8]:
import numpy as np
import torch
from torch.utils.data import Dataset

class AdditionDataset(Dataset):
    """
    Randomly generated single-digit addition problems, returned already tokenized.

    Each item is a pair of token-id tensors built from text of the form:
    Input:  "2 + 3 = 0"     (the prompt "2 + 3 =" followed by the first answer digit)
    Target: "-1 -1 -1 0 5"  (padding token at the first three positions, then the answer digits)
    A one-digit sum is left-padded with 0 so the answer has two digits.
    """

    def __init__(self, split, length=6):
        assert split in {'train', 'test'}
        # Both values are only stored; item generation does not read them.
        self.split = split
        self.length = length

    def __len__(self):
        # Fixed nominal size: 10.000 examples per split
        return 10000

    def __getitem__(self, idx):
        # Examples are synthesized on demand; idx does not influence the draw.

        # Every vocabulary entry that is a plain number (padding token excluded).
        digit_pool = [
            int(tok)
            for tok in tokenizer.decoder.values()
            if tok != tokenizer.pad_token and str(tok).isnumeric()
        ]
        # Draw the operands at random.
        operands = torch.tensor(np.random.choice(digit_pool, size=result_length))

        # Split the sum into its decimal digits, left-padding a single digit with 0.
        sum_digits = torch.tensor([int(ch) for ch in str(operands.sum().item())])
        left_pad = 1 if sum_digits.size()[0] == 1 else 0
        sum_digits = torch.nn.functional.pad(sum_digits, (left_pad, 0), 'constant', 0)

        # Problem followed by solution; input and target are this sequence offset by one.
        full = torch.cat((operands, sum_digits), dim=0)
        shifted_in = full[:-1].clone()
        shifted_out = full[1:].clone()
        # Mask the target slot that still belongs to the problem statement.
        shifted_out[:1] = int(tokenizer.pad_token)

        # Render both sequences as the space-separated text the tokenizer expects.
        in_parts = [str(shifted_in[k].item()) for k in range(3)]
        out_parts = [str(shifted_out[k].item()) for k in range(3)]
        input_text = f"{in_parts[0]} + {in_parts[1]} = {in_parts[2]}"
        target_text = ' '.join(['-1', out_parts[0], '-1', out_parts[1], out_parts[2]])

        # Tokenize and hand back as tensors.
        input_ids = tokenizer(input_text)
        target_ids = tokenizer(target_text)
        return torch.tensor(input_ids), torch.tensor(target_ids)

### Create the training and testing datasets

In [9]:
# NOTE(review): AdditionDataset only stores `split` and `length`; both datasets
# generate their items with the same random draw.
train_dataset = AdditionDataset('train', length=sequence_length)
test_dataset = AdditionDataset('test', length=sequence_length)

In [10]:
# Fetch one (input, target) pair and print both decoded back to text.
x, y = train_dataset[0]
print(tokenizer.decode(x.numpy()))
print(tokenizer.decode(y.numpy()))

1 + 2 = 0
-1 -1 -1 0 3


# Training loop

In [12]:
import torch
import torch.nn.functional as F
from accelerate import Accelerator

accelerator = Accelerator()

batch_size = 40
num_epochs = 1

dataset = train_dataset
optimizer = torch.optim.Adam(model.parameters())
data = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=batch_size)

# Let Accelerate prepare the model, optimizer and data loader before training.
model, optimizer, data = accelerator.prepare(model, optimizer, data)

model.train()
i = 0  # running step counter; it keeps counting across epochs
for epoch in range(num_epochs):
  for source, targets in data:
    i += 1
    optimizer.zero_grad()
    # Merge the first two dimensions so logits and targets line up element-wise.
    flat_logits = model(source).logits.flatten(end_dim=1)
    flat_targets = targets.flatten(end_dim=1)
    # Targets equal to the padding id are excluded from the loss.
    loss = F.cross_entropy(flat_logits, flat_targets, ignore_index=tokenizer.pad_token_id)
    accelerator.backward(loss)
    optimizer.step()
    if i % 100 != 0:
      continue
    print(f'Step: {i} loss: {loss.item()}')
  # The value reported here is the loss of the last batch of the epoch.
  print(f'Epoch: {epoch+1}/{num_epochs} loss: {loss.item()}')

Step: 100 loss: 0.26241305470466614
Step: 200 loss: 0.31865572929382324
Epoch: 1/1 loss: 0.07760939747095108


# Evaluating the model

In [13]:
def generate_solution(input, solution_length=6, model=model):
  """Greedily decode ``solution_length`` tokens that continue the prompt ``input``.

  Args:
    input: Space-separated prompt text (e.g. "1 + 1 ="), tokenized with the
      notebook-level ``tokenizer``.
    solution_length: Number of tokens to generate. Every generated token is
      appended to the running sequence before the next forward pass, so the
      prompt length plus ``solution_length - 1`` must stay within the number of
      positions the model was configured with.
    model: Causal LM to query; the default is bound when this cell is executed.

  Returns:
    The generated tokens decoded to a space-separated string.
  """
  model.eval()
  token_ids = torch.tensor(tokenizer(input)).to(accelerator.device)
  solution = []
  # Inference only: disable autograd so no graph is built for the forward passes.
  with torch.no_grad():
    for _ in range(solution_length):
      logits = model(token_ids).logits
      # The sequence is passed unbatched (1-D), so logits[-1] is taken as the
      # score vector for the token that follows the last position.
      predicted = logits[-1].argmax()
      token_ids = torch.cat((token_ids, predicted.unsqueeze(0)), dim=0)
      solution.append(predicted.cpu().item())
  return tokenizer.decode(solution)

In [14]:
def evaluate_model(num_samples=1000, log=False):
  """Print the exact-match accuracy of ``generate_solution`` on ``test_dataset``.

  Args:
    num_samples: How many items (indices 0..num_samples-1) to evaluate.
    log: When True, also print one line per evaluated sample.
  """
  hits = 0
  for sample_idx in range(num_samples):
    input_ids, target_ids = test_dataset[sample_idx]
    input_ids = input_ids.cpu().numpy()
    target_ids = target_ids.cpu().numpy()
    # Prompt = the first sequence_length tokens; expected answer = the target tail.
    prompt = tokenizer.decode(input_ids[:sequence_length])
    expected = tokenizer.decode(target_ids[sequence_length-1:])
    predicted = generate_solution(prompt, solution_length=result_length, model=model)

    matched = expected == predicted
    if matched:
      hits += 1
    if not log:
      continue
    if matched:
      print(f'CORRECT  Input: {prompt} Target: {expected} Predicted: {predicted}')
    else:
      print(f'Input: {prompt} Target: {expected} Predicted: {predicted}')

  print(f'Accuracy: {hits/num_samples}')

In [15]:
# Accuracy over 1000 test samples, without per-sample logging.
evaluate_model(num_samples=1000, log=False)

Accuracy: 1.0


In [16]:
# Small run with per-sample logging to inspect individual predictions.
evaluate_model(num_samples=10, log=True)

CORRECT  Input: 6 + 9 = Target: 1 5 Predicted: 1 5
CORRECT  Input: 1 + 1 = Target: 0 2 Predicted: 0 2
CORRECT  Input: 9 + 6 = Target: 1 5 Predicted: 1 5
CORRECT  Input: 8 + 9 = Target: 1 7 Predicted: 1 7
CORRECT  Input: 1 + 8 = Target: 0 9 Predicted: 0 9
CORRECT  Input: 4 + 4 = Target: 0 8 Predicted: 0 8
CORRECT  Input: 0 + 2 = Target: 0 2 Predicted: 0 2
CORRECT  Input: 0 + 8 = Target: 0 8 Predicted: 0 8
CORRECT  Input: 2 + 6 = Target: 0 8 Predicted: 0 8
CORRECT  Input: 6 + 5 = Target: 1 1 Predicted: 1 1
Accuracy: 1.0


In [17]:
# Upload the trained model to the Hugging Face Hub.
model.push_to_hub("models/" + model_ckpt)

pytorch_model.bin: 100%|██████████| 10.7M/10.7M [00:10<00:00, 1.05MB/s]


CommitInfo(commit_url='https://huggingface.co/Manuel2011/addition_model/commit/5cfb2cfd1d72b612db88985d84be0f1427608b6a', commit_message='Upload model', commit_description='', oid='5cfb2cfd1d72b612db88985d84be0f1427608b6a', pr_url=None, pr_revision=None, pr_num=None)