# Imports and configuration

In [276]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
from matplotlib import pyplot as plt
import time
import pandas as pd
import random
from dataclasses import dataclass, field
from torch.utils.data import DataLoader, TensorDataset

In [277]:
# Create a config file for the cup shuffling task
@dataclass
class MASTER_CONFIG:
    # Training
    seed: int = 1337
    batch_size: int = 32
    training_split: float = 0.8
    
    epochs = 1
    batch_eval_internal = 10
    learning_rate = 2e-5
    eval_iters = 50
    
    # Model parameters
    dtype: torch.dtype = torch.float32
    d_model: int = 128 # This is the size of the embedding
    l_max: int = 128 # Max sequence length
    n_heads: int = 8 # Number of heads in the multi-head attention
    n_layers: int = 8 # Number of layers in the transformer of MHA blocks
    dropout: float = 0.1 # Dropout rate
    
    # Data parameters
    n_cups: int = 3
    n_moves: int = 3
    n_samples: int = 1000

    # Tokenizer
    vocab: list[str] = field(default_factory = lambda: [])
    vocab_size: int = 0
    
    # Use CUDA or MPS if available else CPU
    if (torch.cuda.is_available()):
        device = torch.device("cuda")
        print("Using CUDA")
    elif (torch.backends.mps.is_available()):
        device = torch.device("mps")
        print("Using Apple Silicon MPS")
    else:
        device = torch.device("cpu")
        print("Using CPU")

Using Apple Silicon MPS


# Ball Shuffler

In [278]:
import random

def initial_ball_position(n=3):
    return random.randint(1, n)

def generate_shuffle_moves(n=3, num_moves=3):
    moves = []
    
    for _ in range(num_moves):
        # Randomly pick two different cups
        cup1, cup2 = random.sample(range(1, n + 1), 2)
        moves.append((cup1, cup2))
    
    return moves

def final_ball_position(initial_position, shuffle_moves):
    position = initial_position
    for move in shuffle_moves:
        # If the ball's current position matches one of the cups in the move, swap it.
        if position == move[0]:
            position = move[1]
        elif position == move[1]:
            position = move[0]
    
    return position

# Method for generating data and labels for batches.
# TODO: This can be done much better
def generate_batch(split: str, config=MASTER_CONFIG):
    B = config.batch_size
    T = config.l_max
    vocab_size = config.vocab_size
    
    # Generate the data
    data = generate_cup_data(config.n_cups, config.n_moves, config.n_samples)
    
    # Split the data into training and validation sets
    if split == "train":
        data = data[:int(config.training_split * len(data))]
    elif split == "val":
        data = data[int(config.training_split * len(data)):]
    else:
        raise ValueError("split must be either 'train' or 'val'")
    
    # Return input and output tensors
    inputs = torch.zeros((B, T), dtype=torch.long)
    outputs = torch.zeros((B, T), dtype=torch.long)
    
    

def generate_cup_shuffling_scenario(n=3, num_moves=3):
    # Generate initial ball position and shuffle moves
    initial_position = initial_ball_position(n)
    shuffle_moves = generate_shuffle_moves(n, num_moves)
    
    # Calculate the final ball position
    final_position = final_ball_position(initial_position, shuffle_moves)
    
    # Construct the input and output strings
    string = f"There are {n} cups\n"
    string += f"Ball is in cup {initial_position}\n"
    string += "\n".join([f"Switch cup {move[0]} and cup {move[1]}" for move in shuffle_moves])
    string += f"\nBall is in cup {final_position}"
    
    return string

def generate_masked_cup_shuffling_scenario(n_cups=3, n_moves=3):
    # Generate initial ball position and shuffle moves
    initial_position = initial_ball_position(n_cups)
    shuffle_moves = generate_shuffle_moves(n_cups, n_moves)
    
    # Calculate the final ball position for tokenizing
    final_position = " " + str(final_ball_position(initial_position, shuffle_moves))
    
    # Construct the input and output strings
    input = f"There are {n_cups} cups\n"
    input += f"Ball is in cup {initial_position}\n"
    input += "\n".join([f"Switch cup {move[0]} and cup {move[1]}" for move in shuffle_moves])
    input += f"\nBall is in cup<MASK>"
    
    return input, final_position

In [279]:
print(generate_masked_cup_shuffling_scenario(4,5))

('There are 4 cups\nBall is in cup 2\nSwitch cup 1 and cup 2\nSwitch cup 1 and cup 2\nSwitch cup 3 and cup 4\nSwitch cup 3 and cup 1\nSwitch cup 4 and cup 1\nBall is in cup<MASK>', ' 2')


In [280]:
initial_position = initial_ball_position()
print(initial_position)

shuffle_moves = generate_shuffle_moves()
print(shuffle_moves)

final_position = final_ball_position(initial_position, shuffle_moves)
print(final_position)

scenario = generate_masked_cup_shuffling_scenario()
print(scenario)


#print(generate_cup_data(n_cups=3, num_moves=range(1,5), num_examples=1))

3
[(2, 3), (3, 2), (1, 3)]
1
('There are 3 cups\nBall is in cup 3\nSwitch cup 3 and cup 2\nSwitch cup 1 and cup 3\nSwitch cup 3 and cup 2\nBall is in cup<MASK>', ' 3')


# Tokenizer

In [281]:
# Tokenizer, we want BOS, EOS tokens
vocab = ['<MASK>', ' 1', ' 2', ' 3', ' 4', ' 5', ' 6', ' 7', ' 8', ' 9', '\n', 'Ball', 'There are', ' is in', 'Switch', ' and', ' cup', ' cups', '<PAD>']

setattr(MASTER_CONFIG, "vocab", vocab)
setattr(MASTER_CONFIG, "vocab_size", len(vocab))

idx_to_s = {i:ch for i, ch in enumerate(vocab)}
s_to_idx = {ch:i for i, ch in enumerate(vocab)}

def encode_tokens(s: str) -> list[int]:
    ids = []
    i = 0
    while i < len(s):
        max_len = -1
        max_token = None
        for token in s_to_idx.keys():
            token_len = len(token)
            if s[i:i+token_len] == token:
                if token_len > max_len:
                    max_len = token_len
                    max_token = token
        if max_token:
            ids.append(s_to_idx[max_token])
            i += max_len
        else:
            print(f"Unrecognized sequence at index {i}, {s[i:i+1]}")
            
            break
    return ids

def decode_tokens(ids: list[int]) -> str:
    return "".join([idx_to_s[i] for i in ids])

s = generate_cup_shuffling_scenario(4, 5)

print(f"The string is {len(encode_tokens(s))} tokens long:\n\n{s}\n\nAnd has the following tokenization:\n{encode_tokens(s)}")


The string is 48 tokens long:

There are 4 cups
Ball is in cup 3
Switch cup 1 and cup 2
Switch cup 4 and cup 3
Switch cup 1 and cup 4
Switch cup 2 and cup 3
Switch cup 3 and cup 4
Ball is in cup 1

And has the following tokenization:
[12, 4, 17, 10, 11, 13, 16, 3, 10, 14, 16, 1, 15, 16, 2, 10, 14, 16, 4, 15, 16, 3, 10, 14, 16, 1, 15, 16, 4, 10, 14, 16, 2, 15, 16, 3, 10, 14, 16, 3, 15, 16, 4, 10, 11, 13, 16, 1]


In [282]:
# Generate batches of data
def generate_batch_cup_data(n_cups = 3, num_examples=1000, num_moves=3):
    data = []
    inputs_idx = []
    targets_idx = []
    moves = random.choice(range(num_moves))
    pad_token_id = encode_tokens('<PAD>')
    
    for _ in range(num_examples):
        input, target = generate_masked_cup_shuffling_scenario(n_cups, moves)
        inputs_idx.append(encode_tokens(input))

        # Finds the last cup number in the string
        targets_idx.append(encode_tokens(target))
    
    max_inputs_len = max(len(input) for input in inputs_idx)
    max_targets_len = max(len(target) for target in targets_idx)

    padded_input_ids = [input + [pad_token_id] * (max_inputs_len - len(input)) for input in inputs_idx]
    padded_target_ids = [targets + [pad_token_id] * (max_targets_len - len(targets)) for targets in targets_idx]
    
    input_tensor = torch.tensor(padded_input_ids, dtype=torch.long)
    target_tensor = torch.tensor(padded_target_ids, dtype=torch.long)
    
    return input_tensor, target_tensor

# Models, functions and layers

In [283]:
class DummyModel(nn.Module):
    """
    Dummy model for testing purposes. It takes a sequence of tokens and returns a probability distribution over the vocabulary.
    
    Input: x a sequence of tokens of shape (B, T)
    Output: a probability distribution over the vocabulary of shape (B, T, vocab_size)
    Parameters: 
    """
    def __init__(self, config=MASTER_CONFIG):
        super().__init__()
        self.config = config
        
        vocab_size = config.vocab_size
        d_model = config.d_model
        
        # Embedding layer from the vocabulary to the d_model dimension, parameter matrix W_e in R^{vocab_size x d_model}
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Linear layer from d_model to vocab_size dimension. Parameter matrices W_l1, W_l2 in R^{d_model x d_model}, R^{d_model x vocab_size}
        self.linear = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, vocab_size)
        )
        
        # Returns the number of parameters in the model
        self.num_parameters = sum(p.numel() for p in self.parameters() if p.requires_grad)
        
    def forward(self, x, targets=None):
        # x is of shape (B, T), each element is an integer in the range [0, vocab_size) representing a token. By seeing each token as a one-hot vector, x can be seen as a tensor of shape (B, T, vocab_size)
        B, T = x.shape
        
        # Embedd x into the d_model dimension which makes x of shape (B, T, d_model)
        x = self.embedding(x)
        
        # MLP layer into an unembedding layer, which makes x of shape (B, T, vocab_size)
        x = self.linear(x)
        
        # Softmax over the vocabulary dimension which is the last dimension, returns a probability distribution over the vocabulary, it is of shape (B, T, vocab_size)
        logits = F.softmax(x, dim=-1)
        
        if targets is not None:
            # Calculate the loss
            
            # Flatten the logits and targets to be of shape (B*T, vocab_size) and (B*T) respectively
            x = x.view(B*T, self.vocab_size)
            targets = targets.view(B*T)
            
            loss = F.cross_entropy(logits, targets)
            return loss, logits
        
        return logits

In [435]:
class positional_encoding(nn.Module):
    """ 
    Positional encoding according to [VSP17] paper "Attention is all you need" based on sine and cosine functions.
    
    Input: x a sequence of tokens of shape (B, T, d_model)
    Output: p, where p is the positional encoding, of shape (B, T, d_model)
    """
    def __init__(self, config=MASTER_CONFIG):
        super().__init__()
        d_model = config.d_model
        l_max = config.l_max
        dtype = config.dtype
        
        self.p = torch.zeros((1, l_max, d_model)) #TODO: Is this really necessary? Should this be l_max, d_model instead?
        
        # Creates X = [[0], [1], ..., [l_max - 1]]
        num = torch.arange(l_max, dtype=dtype).reshape(-1, 1)
        # Creates Y = [10000^0/d_model, 10000^2/d_model, ..., 10000^(d_model - 1)/d_model]
        denum = torch.pow(10000, torch.arange(0, d_model, 2, dtype=dtype) / d_model)
        
        self.p[:, :, 0::2] = torch.sin(num / denum)
        self.p[:, :, 1::2] = torch.cos(num / denum)
        
    def forward(self, x):
        return self.p[:, :x.shape[1], :].to(x.device)

class AttentionHead(nn.Module):
    def __init__(self, d_q, d_v, d_attn, d_out, bias=False, config=MASTER_CONFIG, mask=None):
        super().__init__()

        # Linear layers for Q, K, V of dimensions dq, dv, dv respectively
        # TODO: More efficient to do the linear transformation together and then split the result?
        # TODO: Allow a toggle for whether to use bias or not
        self.linear_q = nn.Linear(d_attn, d_q, bias=bias)
        self.linear_k = nn.Linear(d_attn, d_v, bias=bias)
        self.linear_v = nn.Linear(d_out, d_v, bias=bias)
        # Y_t = att(X_t W_h^Q, X_t W_h^K, X_t W_h^V) = softmax((X_t W_h^Q)(X_t W_h^K)^t / sqrt(d_model)) (X_t W_h^V)
        
    def forward(self, x, mask=None):
        # x is of shape (B, T, d_model)
        B, T, D = x.shape #TODO: Maybe should make this dynamic through reshaping -1?
        
        q = self.linear_q(x) # (B, T, d_model)
        k = self.linear_k(x) # (B, T, d_model)
        v = self.linear_v(x) # (B, T, d_model)
        
        S = torch.bmm(q, k.transpose(1, 2)) / np.sqrt(D) # (B, T, d_model) * (B, d_model, T) = (B, T, T)
        
        # (B, T, d_model) * (B, d_model, T) = (B, T, T)
        if mask == None:
            weights = S
        else:
            weights = S.masked_fill(mask == 0, -1e9)
            
        

        # Apply the softmax on the last dimension, meaning the last dimension sums to 1
        weights = F.softmax(weights, dim=-1)
        
        # Apply the attention weights to the values
        # (B, T, T) * (B, T, d_model) = (B, T, d_model)
        v_bar = torch.bmm(weights, v)
        
        return v_bar
        

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads, mask=None):
        super().__init__()
        self.n_heads = n_heads
        
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        
        head_size = d_model // n_heads
        
        self.heads = nn.ModuleList(
            [AttentionHead(d_attn = head_size, d_out = head_size, d_q=d_model, d_v=d_model, mask=mask) for _ in range(n_heads)]
        )
        
        self.linear_o = nn.Linear(d_model * n_heads, d_model)
        
    def forward(self, x):
        # x is of shape (B, T, d_model)
        B, T, _ = x.shape
        H = self.n_heads
        
        # Reshape x to (B, T, n_heads, d_model/n_heads)
        x = x.view(B, T, H, -1)
        
        # Transpose to get shape (B, n_heads, T, d_model/n_heads)
        x = x.transpose(1, 2)
        
        # Apply attention heads to shape (B, 1, T, d_model/n_heads) = (B, T, d_model/n_heads)
        v = [head(x[:, i, :, :]) for i, head in enumerate(self.heads)] 
        
        # Stack and reshape
        v = torch.stack(v, dim=1)
        v = v.transpose(1, 2).contiguous().view(B, T, -1)
        #TODO: Understand what this transformation does
        
        # Apply the linear layer
        v_bar = self.linear_o(v)
        
        return v_bar
    
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, d_model, n_heads, mask=None):
        super().__init__()

        self.ln_mha = nn.LayerNorm(d_model)
        self.mha = MultiHeadAttention(d_model=d_model, n_heads=n_heads, mask=mask)
        
        self.fcn = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Linear(d_model, d_model)
        )
        
    def forward(self, x):
        # x is of shape (B, T, d_model)
        
        x = self.ln_mha(x) # (B, T, d_model) -> (B, T, d_model)
        x = self.mha(x) + x
        x = self.fcn(x) + x
        
        return x

class cup_GPT(nn.Module):
    def __init__(self, config=MASTER_CONFIG):
        super().__init__()
        self.config = config
        
        d_model = config.d_model
        n_heads = config.n_heads
        d_model = config.d_model
        vocab_size = config.vocab_size
        n_layers = config.n_layers
        l_max = config.l_max
        
        #Causal mask
        causal_mask = torch.tril(torch.ones((l_max, l_max)))
        causal_mask = None
        
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos_enc = positional_encoding(config)
        
        self.mha_layers = nn.ModuleList(
            [MultiHeadAttentionLayer(d_model=d_model, n_heads=n_heads, mask=causal_mask) for _ in range(n_layers)]
        )
        
        self.ln = nn.LayerNorm(d_model)
        self.unembed = nn.Linear(d_model, vocab_size)
        
    def forward(self, x, targets=None):
        # x is of shape (B, T)
        B, T = x.shape
        
        # Embed the tokens (B, T) -> (B, T, d_model)
        x = self.embed(x)
        
        # Add the positional encoding (B, T, d_model) -> (B, T, d_model)
        x = x + self.pos_enc(x)
        
        # Apply the MHA layers (B, T, d_model) -> (B, T, d_model)
        for mha_layer in self.mha_layers:
            x = mha_layer(x)
        
        # Apply the layer norm (B, T, d_model) -> (B, T, d_model)
        x = self.ln(x)
        
        # Apply the unembedding layer (B, T, d_model) -> (B, T, vocab_size)
        logits = self.unembed(x)
        
        if targets is not None:
            # Calculate the loss
            
            # Flatten the logits and targets to be of shape (B*T, vocab_size) and (B*T) respectively to make the calculation more efficient and return a single value
            x = x.view(B*T, self.vocab_size)
            targets = targets.view(B*T)
            
            loss = F.cross_entropy(logits, targets)
            return loss, logits
        
        return logits
    
# TODO: Next steps are to implement the decoder part of the transformer.
# Implement:
# - Training loop
# TODO: Understand how cross entropy loss works in PyTorch

# Training pass

In [285]:
# Training of DummyModel
def train(model, optimizer, config=MASTER_CONFIG):
    # Set the model to training mode
    model.train()
    
    # Generate a batch of data
    inputs, targets = generate_batch("train", config)
    
    # Forward pass
    loss, logits = model(inputs, targets)
    
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return loss.item()

In [286]:
input_tensor, target_tensor = generate_batch_cup_data(n_cups=MASTER_CONFIG.n_cups, num_moves=MASTER_CONFIG.n_moves, num_examples=MASTER_CONFIG.n_samples)

dataset = TensorDataset(input_tensor, target_tensor)

dataloader = DataLoader(dataset, batch_size=MASTER_CONFIG.batch_size, shuffle=True)

# Initializing dummy model

In [333]:
config = MASTER_CONFIG
data = generate_batch_cup_data(config.n_cups, config.n_moves, config.n_samples)

model = DummyModel(config)
#model.to(config.device)

optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

print(MASTER_CONFIG.vocab_size)

# Test model
x = torch.randint(MASTER_CONFIG.vocab_size, (1, 3))
y = model(x)

print(x, y)

19
tensor([[16,  8,  0]]) tensor([[[0.0622, 0.0487, 0.0572, 0.0471, 0.0498, 0.0572, 0.0705, 0.0542,
          0.0512, 0.0604, 0.0496, 0.0571, 0.0549, 0.0596, 0.0456, 0.0318,
          0.0394, 0.0503, 0.0532],
         [0.0581, 0.0750, 0.0656, 0.0529, 0.0541, 0.0619, 0.0436, 0.0465,
          0.0356, 0.0670, 0.0327, 0.0665, 0.0472, 0.0885, 0.0374, 0.0494,
          0.0330, 0.0488, 0.0364],
         [0.0750, 0.0558, 0.0572, 0.0458, 0.0527, 0.0545, 0.0467, 0.0650,
          0.0658, 0.0827, 0.0489, 0.0366, 0.0408, 0.0532, 0.0381, 0.0462,
          0.0400, 0.0506, 0.0444]]], grad_fn=<SoftmaxBackward0>)


In [359]:
config = MASTER_CONFIG
data = generate_batch_cup_data(config.n_cups, config.n_moves, config.n_samples)

model = cup_GPT(config)
#model.to(config.device)

optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

print(MASTER_CONFIG.vocab_size)

# Test model
x = torch.randint(MASTER_CONFIG.vocab_size, (1, 3))
y = model(x)

print(x, y)

19
tensor([[16, 14, 11]]) tensor([[[0.0595, 0.0769, 0.1297, 0.0388, 0.0156, 0.0673, 0.0383, 0.0167,
          0.0482, 0.0522, 0.0327, 0.0278, 0.0347, 0.0247, 0.1177, 0.0564,
          0.0537, 0.0770, 0.0318],
         [0.0318, 0.0361, 0.0613, 0.0833, 0.0542, 0.0443, 0.0340, 0.1034,
          0.0397, 0.0405, 0.0526, 0.0800, 0.0457, 0.0454, 0.0462, 0.1053,
          0.0409, 0.0290, 0.0264],
         [0.0395, 0.0554, 0.0500, 0.0385, 0.0554, 0.0246, 0.0258, 0.0450,
          0.0296, 0.0644, 0.0528, 0.0795, 0.1179, 0.0270, 0.0322, 0.0669,
          0.1300, 0.0405, 0.0251]]], grad_fn=<SoftmaxBackward0>)


# Tests

In [508]:
def test_auto_regressive(model, config=MASTER_CONFIG):
    '''
    Tests if the model is auto-regressive by comparing the output of the model when given the entire input sequence and when given the input sequence one token at a time.
    '''
    
    # shape vocab_size, batch_size, block_size
    x = torch.randint(config.vocab_size, (1, 3))
    y1 = model(x)
    
    y2 = torch.zeros_like(y1)
    for b in range(x.size(1)):
        y_b = model(x[:, :b + 1])
        y2[:, b] = y_b[:, b]
            
    error = ((y1 - y2).norm() / (y1.norm() + y2.norm())).item()
    
    if error < 1e-5:
        print("Auto-regressive test passed")
        print(error)
    else:
        print("Auto-regressive test failed")
        print(error)
        
    # print(f"Error={error}")
    
test_auto_regressive(model)

#zero input test
def zero_input_test(model, config=MASTER_CONFIG):
    '''
    Tests if the model outputs the same distribution when given a sequence of zeros as input.
    '''
    
    x = torch.zeros((1, 3), dtype=torch.long)
    print(x)
    y = model(x)
    print(y)
    
    error = y.norm().item()
    
    if error < 1e-5:
        print("Zero input test passed")
        print(error)
    else:
        print("Zero input test failed")
        print(error)
        
    # print(f"Error={error}")

zero_input_test(model)

def test_auto_reg_2(model, config=MASTER_CONFIG):
    sequence1 = [random.randint(0, config.vocab_size-1) for _ in range(3)]
    sequence2 = [random.randint(0, config.vocab_size-1)] + sequence1  # Add a new token at the beginning

    output1 = model(torch.tensor(sequence1).unsqueeze(0))
    output2 = model(torch.tensor(sequence2).unsqueeze(0))

    prob1 = F.softmax(output1[0, -1, :], dim=-1)[sequence1[-1]]
    prob2 = F.softmax(output2[0, -1, :], dim=-1)[sequence1[-1]]
    
    print(prob1 - prob2)
    
test_auto_reg_2(model)

Auto-regressive test failed
0.11075954884290695
tensor([[0, 0, 0]])
tensor([[[0.0215, 0.0186, 0.0224, 0.0702, 0.0663, 0.0275, 0.0104, 0.1458,
          0.0258, 0.1290, 0.0560, 0.0589, 0.0472, 0.0439, 0.0618, 0.0230,
          0.1201, 0.0300, 0.0213],
         [0.0207, 0.0192, 0.0225, 0.0675, 0.0680, 0.0260, 0.0105, 0.1554,
          0.0256, 0.1197, 0.0559, 0.0620, 0.0499, 0.0429, 0.0568, 0.0235,
          0.1250, 0.0281, 0.0208],
         [0.0190, 0.0194, 0.0225, 0.0676, 0.0688, 0.0243, 0.0112, 0.1612,
          0.0258, 0.1086, 0.0576, 0.0663, 0.0516, 0.0433, 0.0547, 0.0233,
          0.1256, 0.0285, 0.0208]]], grad_fn=<SoftmaxBackward0>)
Zero input test failed
0.49578845500946045
tensor(0.0001, grad_fn=<SubBackward0>)


In [290]:
# Method for evaluating the loss of the PyTorch model on the validation set without defining the model.
@torch.no_grad()
def evaluate_model(model, criterion):
    out = {}
    model.eval()
    
    for split in ["train", "val"]:
        total_loss = 0
        total_tokens = 0
        
        for batch in generate_batch(split=split):
            # Get the inputs and targets
            inputs = batch["inputs"]
            targets = batch["targets"]
            
            # Get the model outputs
            outputs = model(inputs)
            
            # Calculate the loss
            loss = criterion(outputs, targets)
            
            # Update the total loss and tokens
            total_loss += loss.item()
            total_tokens += targets.shape[0] * targets.shape[1]
        
        # Calculate the average loss
        avg_loss = total_loss / total_tokens
        
        # Store the average loss
        out[f"{split}_loss"] = avg_loss
    

# Miscelanea

In [291]:
def positional_embedding_naive(t: int, config=MASTER_CONFIG) -> torch.Tensor:
    """
    Input: t the position of the token in a sequence
    Output: the positional embedding of the position
    Parameters: W_p in R^{d_model x l_max} where l_max is the maximum sequence length or context size
    Return: e_p in R^{d_model}

    Using the sine positional embedding from [VSP17] paper "Attention is all you need".
    """
    d_model = config.d_model
    l_max = config.l_max
    
    # Create the positional embedding matrix
    W_p = torch.zeros((d_model, l_max))
    for t in range(l_max):
        for i in range(d_model):
            if i % 2 == 0:
                W_p[i, t] = np.sin(t / 10000 ** (i / d_model)) # TODO: check if this is correct, or should it be l_max instead of 10000?
            else:
                W_p[i, t] = np.cos(t / 10000 ** ((i - i % 2) / d_model))
    
    # Return the positional embedding vector
    return W_p[:, t]
