# Initialize a GPT Model

In [8]:
from transformers import AutoConfig, AutoModelForCausalLM, GPT2Tokenizer
from transformers import GPT2Tokenizer

vocab_size = 4
sequence_length = 4
context_length = sequence_length*2 - 1
config = AutoConfig.from_pretrained("gpt2", vocab_size=vocab_size, n_ctx=context_length, n_head=4, n_layer=2)
model = AutoModelForCausalLM.from_config(config)

The model will have the same architecture as GPT 2 but with a few modifications for making it smaller. The main changes are the size of the vocabulary that it is 4 because it will only handle numbers plus a token for padding, and the context window will only support 7 tokens since we are only interested in sorting numbers of a fix length of 4.

In [9]:
def model_size(model):
    return sum(t.numel() for t in model.parameters())

print(f'Model size: {model_size(model)/1000**2:.1f}M parameters')

Model size: 15.0M parameters


This model have 15 million weights instead of the 111 million parameters of the "gpt2" default config.

In [10]:
model_ckpt = 'sortingLLM'

In [None]:
model.save_pretrained("models/" + model_ckpt, push_to_hub=True)

# Custom tokenizer to encode numbers and the padding token

In [11]:
class NumberTokenizer:
  def __init__(self, numbers_qty=10):
    self.numbers_qty = numbers_qty
    self.pad_token = '-1'
    self.encoder = {str(v):i for i,v in enumerate(range(-1, numbers_qty-1))}
    self.decoder = {i:str(v) for i,v in enumerate(range(-1, numbers_qty-1))}
    self.pad_token_id = self.encoder[self.pad_token]

  def decode(self, token_ids):
    return ' '.join(self.decoder[t] for t in token_ids)

  def __call__(self, text):
    return [self.encoder[t] for t in text.split()]

### Example of the tokenization for the model

In [12]:
tokenizer = NumberTokenizer(vocab_size)
tokenizer("1 0 1 1 2")

[2, 1, 2, 2, 3]

In [13]:
tokenizer.decoder

{0: '-1', 1: '0', 2: '1', 3: '2'}

# Built dataset with unsorted and sorted sequences of numbers

In [14]:
import numpy as np
import torch
from torch.utils.data import Dataset

class SortDataset(Dataset):
    """ 
    Dataset with example of the form:
    Input: "0 3 1 0 2 1 0 0 1 1 2" where the first 6 digits represent the initial input sequence
    Output: "3 1 0 2 1 0 0 1 1 2 3" where the last ditis represents the sorted sequence and the 5 first digits are ignored during training
    The result is a dataset of tokenized sequences of numbers
    """

    def __init__(self, split, length=6):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
    
    def __len__(self):
        return 1000000 # 1M examples per split
    

    def __getitem__(self, idx):
        # The dataset is generated on the fly

        available_numbers = [int(n) for n in tokenizer.decoder.values() if n != tokenizer.pad_token]
        # For training we will generate easy examples, i.e. with a small number of possible digits
        # if self.split == 'train':
        #     available_numbers = available_numbers[:-2]
        # generate some random integers
        inp = torch.tensor(np.random.choice(available_numbers, size=self.length))
        # solve the task
        sol = torch.sort(inp)[0]

        # concatenate the problem specification and the solution
        cat = torch.cat((inp, sol), dim=0)

        # the inputs to the transformer will be the offset sequence
        x = cat[:-1].clone()
        y = cat[1:].clone()
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:self.length-1] = int(tokenizer.pad_token)

        # Convert the tensors to the input expected by the model
        x, y = ' '.join(map(str, x.tolist())), ' '.join(map(str, y.tolist()))
        # tokenize the input and targets
        tokenized_input = tokenizer(x)
        tokenized_output = tokenizer(y)
        return torch.tensor(tokenized_input), torch.tensor(tokenized_output)

### Create the training and testing datasets

In [15]:
train_dataset = SortDataset('train', length=sequence_length)
test_dataset = SortDataset('test', length=sequence_length)

In [16]:
print(train_dataset[0])
print(test_dataset[0])

(tensor([3, 3, 3, 2, 2, 3, 3]), tensor([0, 0, 0, 2, 3, 3, 3]))
(tensor([1, 3, 1, 2, 1, 1, 2]), tensor([0, 0, 0, 1, 1, 2, 3]))


# Generating solutions

In [17]:
def generate_solution(input, solution_length=6, model=model):
  model.eval()
  input = torch.tensor(tokenizer(input))
  input = input.to(accelerator.device)
  solution = []
  for i in range(solution_length):
    output = model(input)
    predicted = output.logits[-1].argmax()
    input = torch.cat((input, predicted.unsqueeze(0)), dim=0)
    solution.append(predicted.cpu().item())
  return tokenizer.decode(solution)

# Training loop

In [18]:
import torch
import torch.nn.functional as F
from accelerate import Accelerator

accelerator = Accelerator()

batch_size = 100
num_epochs = 2

optimizer = torch.optim.Adam(model.parameters())
dataset = train_dataset
data = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=batch_size)

model, optimizer, data = accelerator.prepare(model, optimizer, data)

model.train()
for epoch in range(num_epochs):
  for source, targets in data:
    optimizer.zero_grad()
    loss = F.cross_entropy(model(source).logits.flatten(end_dim=1), targets.flatten(end_dim=1), ignore_index=tokenizer.pad_token_id)
    accelerator.backward(loss)
    optimizer.step()
    loss = F.cross_entropy(model(source).logits.flatten(end_dim=1), targets.flatten(end_dim=1), ignore_index=tokenizer.pad_token_id)
  print(f'Epoch: {epoch+1}/{num_epochs} loss: {loss.item()}')

Epoch: 1/2 loss: 0.05919044837355614
Epoch: 2/2 loss: 0.0646464079618454


# Evaluating the model

In [22]:
def evaluate_model(num_samples=1000, log=False):
  correct = 0
  for i in range(num_samples):
    input, target = test_dataset[i]
    input = input.cpu().numpy()
    target = target.cpu().numpy()
    input = tokenizer.decode(input[:sequence_length])
    target = tokenizer.decode(target[sequence_length-1:])
    predicted = generate_solution(input, solution_length=sequence_length, model=model)
    if target == predicted:
      correct += 1
      if log:
        print(f'CORRECT  Input: {input} Target: {target} Predicted: {predicted}')
    else:
      if log:
        print(f'Input: {input} Target: {target} Predicted: {predicted}')

  print(f'Accuracy: {correct/num_samples}')

In [23]:
evaluate_model(num_samples=1000, log=False)

Accuracy: 1.0


In [24]:
evaluate_model(num_samples=10, log=True)

CORRECT  Input: 0 1 0 0 Target: 0 0 0 1 Predicted: 0 0 0 1
CORRECT  Input: 0 0 1 0 Target: 0 0 0 1 Predicted: 0 0 0 1
CORRECT  Input: 1 1 0 1 Target: 0 1 1 1 Predicted: 0 1 1 1
CORRECT  Input: 1 2 0 0 Target: 0 0 1 2 Predicted: 0 0 1 2
CORRECT  Input: 1 0 1 2 Target: 0 1 1 2 Predicted: 0 1 1 2
CORRECT  Input: 1 0 1 0 Target: 0 0 1 1 Predicted: 0 0 1 1
CORRECT  Input: 2 1 1 1 Target: 1 1 1 2 Predicted: 1 1 1 2
CORRECT  Input: 1 1 0 1 Target: 0 1 1 1 Predicted: 0 1 1 1
CORRECT  Input: 2 2 1 1 Target: 1 1 2 2 Predicted: 1 1 2 2
CORRECT  Input: 2 0 2 2 Target: 0 2 2 2 Predicted: 0 2 2 2
Accuracy: 1.0


In [26]:
model.push_to_hub("models/" + model_ckpt)

pytorch_model.bin: 100%|██████████| 59.9M/59.9M [00:25<00:00, 2.37MB/s]


CommitInfo(commit_url='https://huggingface.co/Manuel2011/sortingLLM/commit/f495c49854785f6a8dac60f25dad5da1fd7fe77a', commit_message='Upload model', commit_description='', oid='f495c49854785f6a8dac60f25dad5da1fd7fe77a', pr_url=None, pr_revision=None, pr_num=None)