# Imports and configuration

In [119]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
from matplotlib import pyplot as plt
import time
import pandas as pd
import random
from dataclasses import dataclass, field
from torch.utils.data import DataLoader, TensorDataset

In [120]:
# Create a config file for the cup shuffling task
@dataclass
class MASTER_CONFIG:
    # Training
    seed: int = 1337
    batch_size: int = 32
    training_split: float = 0.8
    
    epochs = 1
    batch_eval_internal = 10
    learning_rate = 2e-5
    eval_iters = 50
    
    # Model parameters
    dtype: torch.dtype = torch.float32
    d_model: int = 128 # This is the size of the embedding
    l_max: int = 128 # Max sequence length
    n_heads: int = 8 # Number of heads in the multi-head attention
    n_layers: int = 8 # Number of layers in the transformer of MHA blocks
    dropout: float = 0.1 # Dropout rate
    causal: bool = True # Whether to use a causal mask in the attention layer
    
    # Data parameters
    n_cups: int = 3
    n_moves: int = 4
    n_samples: int = 100

    # Tokenizer
    vocab: list[str] = field(default_factory = lambda: [])
    vocab_size: int = 0
    
    # Use CUDA or MPS if available else CPU
    if (torch.cuda.is_available()):
        device = torch.device("cuda")
        print("Using CUDA")
    elif (torch.backends.mps.is_available()):
        device = torch.device("mps")
        print("Using Apple Silicon MPS")
    else:
        device = torch.device("cpu")
        print("Using CPU")

Using Apple Silicon MPS


# Tokenizer

In [121]:
# Tokenizer, we want BOS, EOS tokens
vocab = ['<MASK>', ' 1', ' 2', ' 3', ' 4', ' 5', ' 6', ' 7', ' 8', ' 9', '\n', 'Ball', 'There are', ' is in', 'Switch', ' and', ' cup', ' cups', '<PAD>']

setattr(MASTER_CONFIG, "vocab", vocab)
setattr(MASTER_CONFIG, "vocab_size", len(vocab))

idx_to_s = {i:ch for i, ch in enumerate(vocab)}
s_to_idx = {ch:i for i, ch in enumerate(vocab)}


def encode_tokens(s: str) -> list[int]:
    ids = []
    i = 0
    while i < len(s):
        max_len = -1
        max_token = None
        for token in s_to_idx.keys():
            token_len = len(token)
            if s[i:i+token_len] == token:
                if token_len > max_len:
                    max_len = token_len
                    max_token = token
        if max_token:
            ids.append(s_to_idx[max_token])
            i += max_len
        else:
            print(f"Unrecognized sequence at index {i}, {s[i:i+1]}")
            
            break

    return ids

def decode_tokens(ids: list[int]) -> str:
    return "".join([idx_to_s[i] for i in ids])

# Ball Shuffler

In [122]:
import random

def initial_ball_position(n=3):
    return random.randint(1, n)

def generate_shuffle_moves(n=3, num_moves=3):
    moves = []
    
    for _ in range(num_moves):
        # Randomly pick two different cups
        cup1, cup2 = random.sample(range(1, n + 1), 2)
        moves.append((cup1, cup2))
    
    return moves

def final_ball_position(initial_position, shuffle_moves):
    position = initial_position
    for move in shuffle_moves:
        # If the ball's current position matches one of the cups in the move, swap it.
        if position == move[0]:
            position = move[1]
        elif position == move[1]:
            position = move[0]
    
    return position

# Method for generating data and labels for batches.
# TODO: This can be done much better
def generate_batch(split: str, config=MASTER_CONFIG):
    B = config.batch_size
    vocab_size = config.vocab_size
    
    # Generate the data
    data = generate_batch_cup_data(n_cups = config.n_cups,num_moves = config.n_moves, num_examples = config.n_samples)
    # data[0] is of shape (n_samples, max sequence length), retrieve max_sequence length and save as T
    assert config.l_max > data[0].shape[1], "l_max must be greater than the max sequence length from the batch generator"
    T = data[0].shape[1]
    # Split the data into training and validation sets
    if split == "train":
        data = data[0][:int(config.training_split * len(data[0]))], data[1][:int(config.training_split * len(data[1]))], data[2][:int(config.training_split * len(data[2]))] 
    elif split == "val":
        data = data[0][int(config.training_split * len(data[0])):], data[1][int(config.training_split * len(data[1])):], data[2][int(config.training_split * len(data[2])):]
        print(f"len(data) val: {len(data)}")
    else:
        raise ValueError("split must be either 'train' or 'val'")
    
    # Return input and output tensors
    inputs = data[0]
    outputs = data[1]
    masked_positions = data[2]
    for i in range(0, len(inputs), B):
        yield {
            "inputs": inputs[i:i+B],
            "targets": outputs[i:i+B],
            "masked_positions": masked_positions[i:i+B]
        }
    
# Generate batches of data
def generate_batch_cup_data(n_cups = 3, num_examples=1000, num_moves=3, verbose=False):
    inputs_idx = []
    targets_idx = []
    pad_token = encode_tokens('<PAD>')
    
    for _ in range(num_examples):
        n_moves = random.choice(range(1,num_moves))

        input, target = generate_masked_cup_shuffling_scenario(n_cups=n_cups, n_moves=n_moves)
        inputs_idx.append(encode_tokens(input))

        # Finds the last cup number in the string
        targets_idx.append(encode_tokens(target))
    
    max_inputs_len = max(len(input) for input in inputs_idx)

    #print(f' Input idx shape: {np.array(inputs_idx).shape}, Target idx shape: {np.array(targets_idx).shape}')
    
    padded_input_ids = [input + pad_token * (max_inputs_len - len(input)) for input in inputs_idx]
    target_ids = [targets for targets in targets_idx]
        
    input_tensor = torch.tensor(padded_input_ids, dtype=torch.long)
    target_tensor = torch.tensor(target_ids, dtype=torch.long)
    # Masked token positions, masked token is "<MASK>" and is not necessarily the last token in the sequence.
    masked_tensor = torch.where(input_tensor == s_to_idx['<MASK>'], torch.ones_like(input_tensor), torch.zeros_like(input_tensor))
    # Add padding tokens to the masked_positions tensor. The padding token is the token with position pad_token.
    masked_tensor = masked_tensor + torch.where(input_tensor == pad_token[0], torch.ones_like(input_tensor), torch.zeros_like(input_tensor))
    
    if verbose:
        for i in range(num_examples):
            print(f"Shapes: {input_tensor.shape}, {target_tensor.shape}, {masked_tensor.shape}")
            print(f"Input string: {decode_tokens(input_tensor[i].tolist())}")
            print(f"Target string: {decode_tokens(target_tensor[i].tolist())}")
            print(f"Masked positions: {masked_tensor[i]}")
            print("----")
    
    return input_tensor, target_tensor, masked_tensor
    
def dim(a):
    if not type(a) == list:
        return []
    return [len(a)] + dim(a[0])

def dims(a):
    for i in range(len(a)):
        print(f"Len of dimension {i}: {len(a[i])}")

def generate_cup_shuffling_scenario(n=3, num_moves=3):
    # Generate initial ball position and shuffle movesx
    initial_position = initial_ball_position(n)
    shuffle_moves = generate_shuffle_moves(n, num_moves)
    
    # Calculate the final ball position
    final_position = final_ball_position(initial_position, shuffle_moves)
    
    # Construct the input and output strings
    string = f"There are {n} cups\n"
    string += f"Ball is in cup {initial_position}\n"
    string += "\n".join([f"Switch cup {move[0]} and cup {move[1]}" for move in shuffle_moves])
    string += f"\nBall is in cup {final_position}"
    
    return string

def generate_masked_cup_shuffling_scenario(n_cups=3, n_moves=3):
    # Generate initial ball position and shuffle moves
    initial_position = initial_ball_position(n_cups)
    shuffle_moves = generate_shuffle_moves(n_cups, n_moves)
    
    # Calculate the final ball position for tokenizing
    final_position = " " + str(final_ball_position(initial_position, shuffle_moves))
    
    # Construct the input and output strings
    input = f"There are {n_cups} cups\n"
    input += f"Ball is in cup {initial_position}\n"
    input += "\n".join([f"Switch cup {move[0]} and cup {move[1]}" for move in shuffle_moves])
    input += f"\nBall is in cup<MASK>"
    
    return input, final_position

In [123]:
generate_batch_cup_data(num_examples=5, verbose=True)

Shapes: torch.Size([5, 27]), torch.Size([5, 1]), torch.Size([5, 27])
Input string: There are 3 cups
Ball is in cup 2
Switch cup 2 and cup 3
Switch cup 1 and cup 3
Ball is in cup<MASK>
Target string:  1
Masked positions: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1])
----
Shapes: torch.Size([5, 27]), torch.Size([5, 1]), torch.Size([5, 27])
Input string: There are 3 cups
Ball is in cup 1
Switch cup 3 and cup 1
Switch cup 1 and cup 2
Ball is in cup<MASK>
Target string:  3
Masked positions: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1])
----
Shapes: torch.Size([5, 27]), torch.Size([5, 1]), torch.Size([5, 27])
Input string: There are 3 cups
Ball is in cup 3
Switch cup 1 and cup 3
Ball is in cup<MASK><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
Target string:  1
Masked positions: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1])
----
Shapes: torch.Size(

(tensor([[12,  3, 17, 10, 11, 13, 16,  2, 10, 14, 16,  2, 15, 16,  3, 10, 14, 16,
           1, 15, 16,  3, 10, 11, 13, 16,  0],
         [12,  3, 17, 10, 11, 13, 16,  1, 10, 14, 16,  3, 15, 16,  1, 10, 14, 16,
           1, 15, 16,  2, 10, 11, 13, 16,  0],
         [12,  3, 17, 10, 11, 13, 16,  3, 10, 14, 16,  1, 15, 16,  3, 10, 11, 13,
          16,  0, 18, 18, 18, 18, 18, 18, 18],
         [12,  3, 17, 10, 11, 13, 16,  2, 10, 14, 16,  2, 15, 16,  3, 10, 14, 16,
           2, 15, 16,  3, 10, 11, 13, 16,  0],
         [12,  3, 17, 10, 11, 13, 16,  3, 10, 14, 16,  3, 15, 16,  2, 10, 11, 13,
          16,  0, 18, 18, 18, 18, 18, 18, 18]]),
 tensor([[1],
         [3],
         [1],
         [2],
         [2]]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1

In [124]:

s = generate_cup_shuffling_scenario(4, 5)
print(s)

print(f"The string is {len(encode_tokens(s))} tokens long:\n\n{s}\n\nAnd has the following tokenization:\n{encode_tokens(s)}")


There are 4 cups
Ball is in cup 4
Switch cup 2 and cup 1
Switch cup 4 and cup 1
Switch cup 1 and cup 4
Switch cup 4 and cup 1
Switch cup 3 and cup 2
Ball is in cup 1
The string is 48 tokens long:

There are 4 cups
Ball is in cup 4
Switch cup 2 and cup 1
Switch cup 4 and cup 1
Switch cup 1 and cup 4
Switch cup 4 and cup 1
Switch cup 3 and cup 2
Ball is in cup 1

And has the following tokenization:
[12, 4, 17, 10, 11, 13, 16, 4, 10, 14, 16, 2, 15, 16, 1, 10, 14, 16, 4, 15, 16, 1, 10, 14, 16, 1, 15, 16, 4, 10, 14, 16, 4, 15, 16, 1, 10, 14, 16, 3, 15, 16, 2, 10, 11, 13, 16, 1]


In [125]:
initial_position = initial_ball_position()
print(initial_position)

shuffle_moves = generate_shuffle_moves()
print(shuffle_moves)

final_position = final_ball_position(initial_position, shuffle_moves)
print(final_position)

scenario = generate_masked_cup_shuffling_scenario()
print(scenario)


#print(generate_cup_data(n_cups=3, num_moves=range(1,5), num_examples=1))

3
[(3, 1), (3, 1), (2, 3)]
2
('There are 3 cups\nBall is in cup 3\nSwitch cup 1 and cup 3\nSwitch cup 1 and cup 2\nSwitch cup 2 and cup 1\nBall is in cup<MASK>', ' 1')


In [126]:
print(generate_batch_cup_data(4, 2, 5))

(tensor([[12,  4, 17, 10, 11, 13, 16,  2, 10, 14, 16,  1, 15, 16,  2, 10, 14, 16,
          4, 15, 16,  3, 10, 11, 13, 16,  0],
        [12,  4, 17, 10, 11, 13, 16,  4, 10, 14, 16,  4, 15, 16,  3, 10, 14, 16,
          3, 15, 16,  4, 10, 11, 13, 16,  0]]), tensor([[1],
        [4]]), tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1]]))


# Models, functions and layers

In [127]:
class DummyModel(nn.Module):
    """
    Dummy model for testing purposes. It takes a sequence of tokens and returns a probability distribution over the vocabulary.
    
    Input: x a sequence of tokens of shape (B, T)
    Output: a probability distribution over the vocabulary of shape (B, T, vocab_size)
    Parameters: 
    """
    def __init__(self, config=MASTER_CONFIG):
        super().__init__()
        self.config = config
        
        vocab_size = config.vocab_size
        d_model = config.d_model
        
        # Embedding layer from the vocabulary to the d_model dimension, parameter matrix W_e in R^{vocab_size x d_model}
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Linear layer from d_model to vocab_size dimension. Parameter matrices W_l1, W_l2 in R^{d_model x d_model}, R^{d_model x vocab_size}
        self.linear = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, vocab_size)
        )
        
        # Returns the number of parameters in the model
        self.num_parameters = sum(p.numel() for p in self.parameters() if p.requires_grad)
        
    def forward(self, x, targets=None):
        # x is of shape (B, T), each element is an integer in the range [0, vocab_size) representing a token. By seeing each token as a one-hot vector, x can be seen as a tensor of shape (B, T, vocab_size)
        B, T = x.shape
        
        # Embedd x into the d_model dimension which makes x of shape (B, T, d_model)
        x = self.embedding(x)
        
        # MLP layer into an unembedding layer, which makes x of shape (B, T, vocab_size)
        x = self.linear(x)
        
        # Softmax over the vocabulary dimension which is the last dimension, returns a probability distribution over the vocabulary, it is of shape (B, T, vocab_size)
        logits = F.softmax(x, dim=-1)
        
        if targets is not None:
            # Calculate the loss
            
            # Flatten the logits and targets to be of shape (B*T, vocab_size) and (B*T) respectively
            x = x.view(B*T, self.vocab_size)
            targets = targets.view(B*T)
            
            loss = F.cross_entropy(logits, targets)
            return loss, logits
        
        return logits

In [128]:
class positional_encoding(nn.Module):
    """ 
    Positional encoding according to [VSP17] paper "Attention is all you need" based on sine and cosine functions.
    
    B = batch size
    T = sequence length
    d_model = embedding dimension
    
    Input: x a sequence of tokens of shape (B, T, d_model)
    Output: p, where p is the positional encoding, of shape (B, T, d_model)
    """
    def __init__(self, config=MASTER_CONFIG):
        super().__init__()
        d_model = config.d_model
        l_max = config.l_max
        dtype = config.dtype
        
        self.p = torch.zeros((1, l_max, d_model)) #TODO: Is this really necessary? Should this be l_max, d_model instead?
        num = torch.arange(l_max, dtype=dtype).reshape(-1, 1) # Creates X = [[0], [1], ..., [l_max - 1]]
        denum = torch.pow(10000, torch.arange(0, d_model, 2, dtype=dtype) / d_model) # Creates Y = [10000^0/d_model, 10000^2/d_model, ..., 10000^(d_model - 1)/d_model]
        self.p[:, :, 0::2] = torch.sin(num / denum)
        self.p[:, :, 1::2] = torch.cos(num / denum)
        
    def forward(self, x):
        return self.p[:, :x.shape[1], :].to(x.device)

class AttentionHead(nn.Module):
    def __init__(self, d_q, d_v, d_attn, d_out, bias=False, config=MASTER_CONFIG, mask=None):
        super().__init__()

        # Linear layers for Q, K, V of dimensions dq, dv, dv respectively
        # TODO: More efficient to do the linear transformation together and then split the result?
        # TODO: Allow a toggle for whether to use bias or not
        self.linear_q = nn.Linear(d_attn, d_q, bias=bias)
        self.linear_k = nn.Linear(d_attn, d_v, bias=bias)
        self.linear_v = nn.Linear(d_out, d_v, bias=bias)
        
    def forward(self, x, mask=None):
        _, _, D = x.shape # x is of shape (B, T, d_model) TODO: Maybe should make this dynamic through reshaping -1?
        
        q = self.linear_q(x) # (B, T, d_model)
        k = self.linear_k(x) # (B, T, d_model)
        v = self.linear_v(x) # (B, T, d_model)
        
        S = torch.bmm(q, k.transpose(1, 2)) / np.sqrt(D) # Calculate the attention weights (B, T, d_model) * (B, d_model, T) = (B, T, T)
        
        if mask == None:
            weights = S
        else:
            weights = S.masked_fill(mask == 0, -1e9)
            
        weights = F.softmax(weights, dim=-1) # Apply the softmax on the last dimension, meaning the last dimension sums to 1
        v_bar = torch.bmm(weights, v) # Apply the attention weights to the values (B, T, T) * (B, T, d_model) = (B, T, d_model)
        # Y_t = att(X_t W_h^Q, X_t W_h^K, X_t W_h^V) = softmax((X_t W_h^Q)(X_t W_h^K)^t / sqrt(d_model)) (X_t W_h^V)

        return v_bar
        

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads, mask=None):
        super().__init__()
        self.n_heads = n_heads
        
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        
        head_size = d_model // n_heads
        
        self.heads = nn.ModuleList(
            [AttentionHead(d_attn = head_size, d_out = head_size, d_q=d_model, d_v=d_model, mask=mask) for _ in range(n_heads)]
        )
        
        self.linear_o = nn.Linear(d_model * n_heads, d_model)
        
    def forward(self, x):
        H = self.n_heads

        B, T, _ = x.shape # x is of shape (B, T, d_model)
        
        x = x.view(B, T, H, -1) # Reshape x to (B, T, n_heads, d_model/n_heads)
        x = x.transpose(1, 2) # Transpose to get shape (B, n_heads, T, d_model/n_heads)
        
        v = [head(x[:, i, :, :]) for i, head in enumerate(self.heads)] # Apply attention heads to shape (B, 1, T, d_model/n_heads)

        v = torch.stack(v, dim=1) # Stack heads to get shape (B, n_heads, T, d_model/n_heads)
        v = v.transpose(1, 2).contiguous().view(B, T, -1) # Reshape to (B, T, d_model)
        #TODO: Understand what this transformation does
        
        v_bar = self.linear_o(v) # Apply the linear layer (B, T, d_model) -> (B, T, d_model)
        
        return v_bar
    
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, d_model, n_heads, mask=None):
        super().__init__()

        self.ln_mha = nn.LayerNorm(d_model)
        self.mha = MultiHeadAttention(d_model=d_model, n_heads=n_heads, mask=mask)
        
        self.fcn = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Linear(d_model, d_model)
        )
        
    def forward(self, x):
        # x is of shape (B, T, d_model)
        
        x = self.ln_mha(x) # (B, T, d_model) -> (B, T, d_model)
        x = self.mha(x) + x # (B, T, d_model) -> (B, T, d_model)
        x = self.fcn(x) + x # (B, T, d_model) -> (B, T, d_model)
        
        return x

class cup_GPT(nn.Module):
    def __init__(self, config=MASTER_CONFIG):
        super().__init__()
        self.config = config
        
        self.d_model = config.d_model
        self.n_heads = config.n_heads
        self.d_model = config.d_model
        self.vocab_size = config.vocab_size  # Define vocab_size here
        self.n_layers = config.n_layers
        self.l_max = config.l_max
        self.causal = config.causal
        
        d_model = config.d_model
        n_heads = config.n_heads
        d_model = config.d_model
        vocab_size = config.vocab_size
        n_layers = config.n_layers
        l_max = config.l_max
        causal = config.causal
        
        if causal: #Causal mask
            mask = torch.triu(torch.ones((l_max, l_max)), diagonal=1)
        else:
            mask = None

        
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos_enc = positional_encoding(config)
        
        self.mha_layers = nn.ModuleList(
            [MultiHeadAttentionLayer(d_model=d_model, n_heads=n_heads, mask=mask) for _ in range(n_layers)]
        )
        
        self.ln = nn.LayerNorm(d_model)
        self.unembed = nn.Linear(d_model, vocab_size)
        
    def forward(self, x, targets=None):
        B, T = x.shape # x is of shape (B, T)
        print(f'x shape: {x.shape} before embedding')
        x = self.embed(x) # Embed the tokens (B, T) -> (B, T, d_model)
        print(f'x shape: {x.shape} after embedding')
        x = x + self.pos_enc(x) # Add the positional encoding (B, T, d_model) -> (B, T, d_model)

        for mha_layer in self.mha_layers: 
            x = mha_layer(x) # Apply the MHA layers (B, T, d_model) -> (B, T, d_model)
        
        x = self.ln(x) # Apply the layer norm (B, T, d_model) -> (B, T, d_model)
        unnorm_logits = self.unembed(x) # Apply the unembedding layer (B, T, d_model) -> (B, T, vocab_size), unnormalized logits
        
        if targets is not None:            
            # Flatten the logits and targets to make the calculation more efficient and return a single value
            print(f'x shape: {unnorm_logits.shape} before flattening, targets shape: {targets.shape}')
            print(f"unnorm_logits shape: {unnorm_logits.shape}")
            #unnorm_logits = unnorm_logits.view(B*T, self.vocab_size) # (B, T, vocab_size) -> (B*T, vocab_size)
  
            print(f"targets shape: {targets.shape}")
            #targets = targets.view(B) # (B, T) -> (B*T)
            print(f'x shape: {unnorm_logits.shape} after flattening, targets shape: {targets.shape}')
            targets = targets.view(B)
            # Cross entropy loss in PyTorch takes the unnormalized logits and the targets, so we don't need to softmax the logits.
            loss = F.cross_entropy(unnorm_logits, targets, ignore_index = encode_tokens('<PAD>')[0])
            
        else:
            loss = None
            
        return unnorm_logits, loss
    
# TODO: Next steps are to implement the decoder part of the transformer.
# TODO: Understand how cross entropy loss works in PyTorch

# Testing batch generator

In [129]:
input_tensor, target_tensor = generate_batch_cup_data(n_cups=MASTER_CONFIG.n_cups, num_moves=MASTER_CONFIG.n_moves, num_examples=MASTER_CONFIG.n_samples)

dataset = TensorDataset(input_tensor, target_tensor)

dataloader = DataLoader(dataset, batch_size=MASTER_CONFIG.batch_size, shuffle=True)

print(input_tensor.shape, target_tensor.shape)

ValueError: too many values to unpack (expected 2)

In [None]:
# Show the first batch
print(decode_tokens(input_tensor[0].tolist()))
print(decode_tokens(target_tensor[0].tolist()))

There are 3 cups
Ball is in cup 3
Switch cup 3 and cup 2
Switch cup 2 and cup 1
Switch cup 1 and cup 2
Ball is in cup<MASK>
 2


# Initializing dummy model

In [None]:
config = MASTER_CONFIG
data = generate_batch_cup_data(config.n_cups, config.n_moves, config.n_samples)

model = DummyModel(config)
#model.to(config.device)

optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

print(MASTER_CONFIG.vocab_size)

# Test model
x = torch.randint(MASTER_CONFIG.vocab_size, (1, 3))
y = model(x)

print(x, y)

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
19
tensor([[10,  4,  9]]) tensor([[[0.0460, 0.0479, 0.0520, 0.0560, 0.0429, 0.0613, 0.0441, 0.0564,
          0.0454, 0.0876, 0.0580, 0.0435, 0.0487, 0.0478, 0.0532, 0.0507,
          0.0420, 0.0556, 0.0611],
         [0.0457, 0.0343, 0.0508, 0.0327, 0.0476, 0.0558, 0.0747, 0.0513,
          0.0687, 0.0417, 0.0539, 0.0606, 0.0579, 0.0405, 0.0585, 0.0412,
          0.0532, 0.0705, 0.0603],
         [0.0381, 0.0598, 0.0417, 0.0610, 0.0452, 0.0514, 0.0463, 0.0512,
          0.0541, 0.0833, 0.0743, 0.0406, 0.0595, 0.0465, 0.0536, 0.0539,
          0.0405, 0.0561, 0.0430]]], grad_fn=<SoftmaxBackward0>)


In [None]:
config = MASTER_CONFIG
print(f'Generating data with n_cups={config.n_cups}, n_moves={config.n_moves}, n_samples={config.n_samples}, batch_size={config.batch_size}, vocab_size={config.vocab_size}')
data = generate_batch_cup_data(n_cups=config.n_cups, num_moves=config.n_moves, num_examples=config.n_samples)

model = cup_GPT(config)
#model.to(config.device)

optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
pad_token = encode_tokens('<PAD>')[0]

print(MASTER_CONFIG.vocab_size)

# Test model
x = torch.randint(MASTER_CONFIG.vocab_size, (1, 3))
y = model(x)

print(x, y)

# Method for evaluating the loss of the PyTorch model on the validation set without defining the model.
@torch.no_grad()
def evaluate_model(model):
    out = {}
    model.eval()
    
    for split in ["train", "val"]:
        total_loss = 0
        total_tokens = 0
        
        for batch in generate_batch(split=split):
            # Get the inputs and targets
            inputs = batch["inputs"]
            targets = batch["targets"]
            # Get the model outputs
            logits, loss = model(inputs)
            print(split)
            print(batch)
            print(inputs)
            print(targets)
            print(loss)
            # Update the total loss and tokens
            total_loss += loss.item()
            total_tokens += targets.shape[0] * targets.shape[1]
        
        # Calculate the average loss
        avg_loss = total_loss / total_tokens
        
        # TODO: Maybe consider using total_loss.mean() instead?
        
        # Store the average loss
        out[split] = avg_loss
    
    model.train()
    return out

# Train the model with evaluate_model
def train_model(model, optimizer, config=MASTER_CONFIG):
    # Set the model to train mode
    model.train()
    
    # Initialize the losses
    losses = []
    
    # Initialize the timer
    start = time.time()
    
    # Loop over the training data
    for i, batch in enumerate(generate_batch(split="train")):
        # Get the inputs and targets
        inputs = batch["inputs"]
        targets = batch["targets"]
        print(f'inputs shape: {inputs.shape}, targets shape: {targets.shape}')

        # Get the model outputs
        logits, _ = model(inputs)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)),targets.squeeze(1), ignore_index = pad_token)
        # Backpropagate the loss
        loss.backward()
        
        # Update the parameters
        optimizer.step()
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Append the loss
        losses.append(loss.item())
        
        # Print the loss every 100 iterations
        if i % 100 == 0:
            print(f"Iteration {i}, loss = {loss.item()}")
            
        # Evaluate the model every 1000 iterations
        if i % 1000 == 0:
            print("Evaluating model...")
            eval_out = evaluate_model(model)
            print(f"Train loss = {eval_out['train']}, val loss = {eval_out['val']}")
            
    # Evaluate the model at the end of training
    print("Evaluating model...")
    eval_out = evaluate_model(model)
    print(f"Train loss = {eval_out['train']}, val loss = {eval_out['val']}")
    
    # Print the total time
    print(f"Total time: {time.time() - start} seconds")
    
    # Plot the losses
    plt.plot(losses)
    plt.show()
    
    return model

Generating data with n_cups=3, n_moves=4, n_samples=100, batch_size=32, vocab_size=19
tensor([[0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
19
x shape: torch.Size([1, 3]) before embedding
x shape: torch.Size([1, 3, 128]) after embedding
tensor([[17, 12,  0]]) (tensor([[[ 1.0792,  0.5924, -0.8365, -0.4285, -1.0817,  0.5186, -0.1577,
          -0.7170, -0.3342,  0.0555, -0.0124, -0.1945,  0.8093, -0.2641,
          -0.6206, -0.2486,  0.3515,  0.5582,  0.0696],
         [ 1.4278,  0.2523, -0.5568,  0.6184,  0.2891,  1.2540, -0.0104,
          -0.1949,  0.2749, -0.0530, -0.1097, -0.2526,  0.6322, -1.1689,
           0.8515, -0.0402,  0.5456,  0.3345, -0.4231],
         [ 0.7413,  0.7185, -0.9885, -0.9926,  0.1738,  0.4904, -0.1655,
          -0.1895, -0.3779,  0.4334, -0.2087, -0.2027,  0.6373, -0.3425,
           0.0191,  0.049

In [None]:
train_model(model, optimizer, config=MASTER_CONFIG)

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
inputs shape: torch.Size([32, 34]), targets shape: torch.Size([32, 1])
x shape: torch.Size([32, 34]) before embedding
x shape: torch.Size([32, 34, 128]) after embedding


ValueError: Expected input batch_size (1088) to match target batch_size (32).

# Tests

In [None]:
def test_auto_regressive(model, config=MASTER_CONFIG):
    '''
    Tests if the model is auto-regressive by comparing the output of the model when given the entire input sequence and when given the input sequence one token at a time.
    '''
    # TODO: Not sure this works.
    # shape vocab_size, batch_size, block_size
    x = torch.randint(config.vocab_size, (1, 3))
    y1 = model(x)
    
    y2 = torch.zeros_like(y1)
    for b in range(x.size(1)):
        y_b = model(x[:, :b + 1])
        y2[:, b] = y_b[:, b]
            
    error = ((y1 - y2).norm() / (y1.norm() + y2.norm())).item()
    
    if error < 1e-5:
        print("Auto-regressive test passed")
        print(error)
    else:
        print("Auto-regressive test failed")
        print(error)
        
    # print(f"Error={error}")
    
test_auto_regressive(model)



x shape: torch.Size([1, 3]) before embedding
x shape: torch.Size([1, 3, 128]) after embedding


TypeError: zeros_like(): argument 'input' (position 1) must be Tensor, not tuple

# Training model

# Miscelanea

In [None]:
def positional_embedding_naive(t: int, config=MASTER_CONFIG) -> torch.Tensor:
    """
    Input: t the position of the token in a sequence
    Output: the positional embedding of the position
    Parameters: W_p in R^{d_model x l_max} where l_max is the maximum sequence length or context size
    Return: e_p in R^{d_model}

    Using the sine positional embedding from [VSP17] paper "Attention is all you need".
    """
    d_model = config.d_model
    l_max = config.l_max
    
    # Create the positional embedding matrix
    W_p = torch.zeros((d_model, l_max))
    for t in range(l_max):
        for i in range(d_model):
            if i % 2 == 0:
                W_p[i, t] = np.sin(t / 10000 ** (i / d_model)) # TODO: check if this is correct, or should it be l_max instead of 10000?
            else:
                W_p[i, t] = np.cos(t / 10000 ** ((i - i % 2) / d_model))
    
    # Return the positional embedding vector
    return W_p[:, t]
