# Assignment 3 - Autoregressive Language Modeling with Transformers

In [41]:
%pip install torch==2.4.1 datasets==3.1.0

Note: you may need to restart the kernel to use updated packages.


In [59]:
import os
import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np

from datasets import load_dataset

# Data

In [60]:
import torch
from datasets import load_dataset

class MovieDataset(torch.utils.data.Dataset):
    def __init__(self, block_size=256):
        # Load dataset
        ds = load_dataset("Pablinho/movies-dataset")
        data = ds['train'].to_pandas()

        # Convert to pandas and create string format
        text_data = ""
        for _, row in data.iterrows():
            text_data += f"{row['Title']}: {row['Overview']}\n"

        # Create character mappings
        chars = sorted(list(set(text_data)))
        self.string_to_int = {ch:i for i,ch in enumerate(chars)}
        self.int_to_string = {i:ch for i,ch in enumerate(chars)}

        # Encode text to integers
        encoded_data = [self.string_to_int[c] for c in text_data]
        
        # Convert to tensor
        self.data = torch.tensor(encoded_data, dtype=torch.long)
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        x = chunk[:-1]  # all but last
        y = chunk[1:]   # all but first
        return x, y

    def decode(self, ids):
        return ''.join([self.int_to_string[i.item()] for i in ids])

# Model

In [61]:
@dataclass
class GPTConfig:
    vocab_size: int # Number of unique tokens in the vocabulary
    block_size: int = 256 # Sequence length
    n_block: int = 6 # Number of blocks in the transformer
    n_head: int = 6 # Number of attention heads
    n_embd: int = 384 # Embedding dimensionality
    dropout: float = 0.2 # Dropout rate
    bias: bool = True # If True, we add a bias to the LayerNorm and Linear layers.

In [75]:
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0, f"Embedding dimension {config.n_embd} must be divisible by number of heads {config.n_head}"

        self.n_head = config.n_head # Number of attention heads
        self.n_embd = config.n_embd # Embedding dimensionality
        self.dropout = config.dropout # Dropout rate

        # Maps embedding into Q, K, V. We'll use one layer to generate these matrices for all heads at once.
        self.qkv_map = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)

        # After performing attention for each head individually, we concat the results 
        # and feed them through this linear layer.
        self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)

        # Regularization
        self.final_dropout = nn.Dropout(self.dropout)


    def forward(self, x):
        B, T, C = x.shape # Batch size, sequence length, n_embd
        d_k = C // self.n_head # Dimension of the query, key, and value vectors (within a head)

        # TODO: Implement Causal Self Attention
        # Hint: The output of the qkv_map is a tensor of shape (B, T, 3*C).
        # We need to split this tensor into Q, K, and V tensors of shape (B, T, C) each.
        # Afterwards, reshape and transpose them to the correct shape (see assert statements),
        # such that we have (smaller) Q, K, and V matrices for each head.
        qkv_tensor = self.qkv_map(x)
        #splitting accross the 2nd dimension (C*3), so that every new tensor has dimension C there
        Q, K, V = torch.split(qkv_tensor, C, dim=2)
        #reshaping data, to have d_k and n_head as separate dimensions -> C = d_k*n_head
        #also, T and self.n_head dimensions need to be swapped
        dim1 = 1
        dim2 = 2
        #"view" only works if sizes are compatible, "reshape" returns copy if not compatible
        Q = Q.reshape(B,T,self.n_head,d_k).transpose(dim1, dim2)
        K = K.reshape(B,T,self.n_head,d_k).transpose(dim1, dim2)
        V = V.reshape(B,T,self.n_head,d_k).transpose(dim1, dim2)

        for M in [Q, K, V]:
            assert M.shape == (B, self.n_head, T, d_k), f"Expected shape (B, self.n_head, T, d_k), but got {M.shape}"

        # TODO: Compute the attention weights and aggregated values as specified in the assignment sheet.
        # Hint: Broadcasted matrix multiplication can be implemented using the @ operator.
        # Hint: `torch.tril` may help you with masking the attention scores.
        
        #similarities Q*K^T
        s = Q @ K.transpose(2,3)

        #normalized attention weights
        s_normalized = s/torch.sqrt(torch.tensor(d_k))
        weights_normalized = F.softmax(s_normalized, dim = 3)
        
        #weighted sum of values
        aggregated_vals = weights_normalized @ V # this is the output of each attention head, which is a weighted sum of the values in V
        assert aggregated_vals.shape == (B, self.n_head, T, d_k), f"Expected aggregated_vals shape (B, self.n_head, T, d_k), but got {aggregated_vals.shape}"

        # Combine all head outputs into the last dimension
        out = aggregated_vals.transpose(1, 2).reshape(B, T, C)
        out = self.proj(out) # This combines the outputs of all heads
        out = self.final_dropout(out) # This is the final dropout layer

        return out

You can test your implementation of the `CausalSelfAttention` class by running the following code:

In [77]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = GPTConfig(vocab_size=10, block_size=8, n_block=6, n_head=6, n_embd=12, dropout=0.0, bias=True)

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

x = torch.randn(2, 8, 12).to(device)
attention = CausalSelfAttention(config).to(device)
att_out = attention(x)

# Read expected output from file
att_out_expected = torch.load('CausalSelfAttention_out.pt', map_location=device)

assert torch.allclose(att_out, att_out_expected)

att_out: tensor([[[-0.1410, -0.1456,  0.0279,  0.1063,  0.3435,  0.1737, -0.1079,
          -0.2904,  0.0837,  0.0774, -0.0432,  0.0253],
         [-0.1748, -0.1900,  0.0187,  0.0313,  0.3244,  0.2348, -0.0639,
          -0.3467,  0.0300,  0.0499,  0.0102,  0.0375],
         [-0.1192, -0.2192,  0.0257,  0.0251,  0.2824,  0.2528, -0.1883,
          -0.3818,  0.0285, -0.0104, -0.0395,  0.0161],
         [-0.1759, -0.1602,  0.0831,  0.2396,  0.5030,  0.1127, -0.0007,
          -0.2570,  0.0935,  0.1121, -0.0365,  0.0257],
         [-0.1949, -0.2670,  0.1070,  0.1451,  0.4692,  0.2010, -0.1052,
          -0.4021, -0.0586,  0.1342, -0.0678,  0.1223],
         [-0.2087, -0.2168,  0.1145,  0.2018,  0.4810,  0.1007, -0.0047,
          -0.2621,  0.0955,  0.1357, -0.1080,  0.0965],
         [-0.1492, -0.3028,  0.1388,  0.1465,  0.4636,  0.2449, -0.1317,
          -0.4205, -0.0678,  0.0621, -0.0884,  0.1299],
         [-0.1202, -0.1567,  0.0320,  0.1135,  0.3432,  0.1806, -0.1261,
          -0.29

  att_out_expected = torch.load('CausalSelfAttention_out.pt', map_location=device)


AssertionError: 

In [6]:
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        # TODO: Implement the MLP
        # It should consist of a linear layer, a GELU activation function, and a final linear layer.
        # After the final linear layer, apply dropout with dropout rate config.dropout.
        # The first linear layer should map from config.n_embd to 4 * config.n_embd.
        # The second linear layer should map from 4 * config.n_embd back to config.n_embd.
        # The linear layers should have a bias term if config.bias is True, and no bias term otherwise.

    def forward(self, x):
        # TODO: Implement the forward pass of the MLP
        pass

In [None]:
class Block(nn.Module): # -> exactly what's shown in the slides
    def __init__(self, config):
        super().__init__()
        self.layernorm_1 = nn.LayerNorm(config.n_embd, bias=config.bias)
        self.attention = CausalSelfAttention(config)
        self.layernorm_2 = nn.LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attention(self.layernorm_1(x))
        x = x + self.mlp(self.layernorm_2(x))
        return x

In [None]:
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            embed_token = nn.Embedding(config.vocab_size, config.n_embd),
            embed_position = nn.Embedding(config.block_size, config.n_embd),
            dropout = nn.Dropout(config.dropout),
            blocks = nn.ModuleList([Block(config) for _ in range(config.n_block)]),
            layernorm = nn.LayerNorm(config.n_embd, bias=config.bias),
        ))

        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # We use the same weights for the token embeddings and the final linear layer.
        # This is a form of "weight tying", see https://paperswithcode.com/method/weight-tying
        self.transformer.embed_token.weight = self.head.weight

        # Initialize all linear layers using our custom init function
        self.apply(self._init_params)

        # report number of parameters
        print(f"Number of parameters in GPT: {self.get_num_params()/1e6:.2f}M")


    def _init_params(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)


    def get_num_params(self):
        return sum(p.numel() for p in self.parameters())


    def forward(self, idx, targets=None): 
        device = idx.device
        b, t = idx.shape #-> first dimension is batch dimension
        assert t <= self.config.block_size, f"Cannot process sequence of length {t}, block size is only {self.config.block_size}"
        position_idxs = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)

        # TODO: Implement the forward pass of the GPT model
        # Embed the tokens and positions using the embedding layers self.transformer.embed_token and self.transformer.embed_position.
        # Add the token embeddings and position embeddings together and pass the result through the dropout layer.
        # Pass the result through all the transformer blocks.
        # Apply layer normalization and finally obtain the logits by project the result to 
        # the vocabulary space using the head layer.
        logits = ...

        if targets is not None:
            # We calculate the loss if targets are provided (i.e., during training)
            loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), targets.view(-1), ignore_index=-1)
        else:
            loss = None

        return logits, loss


    def get_optimizer(self, weight_decay, learning_rate, betas, device):
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}

        # We will decay all parameters that are 2D or higher dimensional. 
        # This includes all weight matrices and embeddings.
        decay_params = [p for n, p in param_dict.items() if len(p.shape) >= 2]
        # We will not decay biases and layernorm parameters (which are 1D).
        nodecay_params = [p for n, p in param_dict.items() if len(p.shape) < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        fused = (device == 'cuda')
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, fused=fused)

        return optimizer


    @torch.no_grad()
    def sample(self, idx, max_new_tokens, temperature=1.0):
        # idx is of shape (batch_size, sequence_length)

        for _ in range(max_new_tokens):
            # If the sequence context is growing too long we must crop it at block_size
            idx_input = idx if idx.shape[1] <= self.config.block_size else idx[:, -self.config.block_size:]
            # TODO: Push idx_input through the model to get the logits for the next token in the sequence
            # Hint: The logits that are returned by the model are of shape (batch_size, sequence_length, vocab_size).
            # To predict the next token, we only need the logits for the last position in the sequence.
            # Next, divide the logits by the desired temperature and apply the softmax function to convert them to probabilities.
            # Finally, sample the next token from this probability distribution.

            next_token = ...
            assert next_token.shape == (idx.shape[0], 1), f"Expected next_token shape (batch_size, 1), but got {next_token.shape}"
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, next_token), dim=1)

        return idx

# Training 

In [31]:
@torch.no_grad()
def estimate_train_val_loss(model, train_loader, val_loader, val_iters, device):
    model.eval()
    losses = {}
    for split, loader in [('train', train_loader), ('val', val_loader)]:
        total_loss = 0
        for i, (X, Y) in enumerate(loader):
            if i >= val_iters:
                break
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            total_loss += loss.item()
        losses[split] = total_loss / val_iters
    model.train()
    return losses

In [37]:
# Set seeds for reproducibility
torch.manual_seed(1337)

block_size = 128
batch_size = 128

# Create datasets and dataloaders
data = MovieDataset(block_size)

# split into train and validation sets
train_len = int(len(data) * 0.8)
val_len = len(data) - train_len
train_dataset, val_dataset = torch.utils.data.random_split(data, [train_len, val_len])

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0
)

val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0
)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

out_dir = 'MovieGPT'
checkpoint_path = os.path.join(out_dir, 'checkpoint.pt')
os.makedirs(out_dir, exist_ok=True)  # Create output directory

# Eval/Logging
val_interval = 500 # Number of iterations between evaluations
val_iters = 20 # Number of iterations for evaluation
log_interval = 10 # Number of iterations between logging

# Optimizer settings
learning_rate = 1e-3 # Larger networks typically require a learning rate that is smaller than this
max_iters = 5_000 # Number of iterations to train for
weight_decay = 1e-1 # Weight decay for regularization (on the weights/embeddings)
beta1, beta2 = 0.9, 0.99 # Beta1, Beta2 for AdamW optimizer
grad_clip = 1.0 # Clip gradients at this value, or disable if == 0.0

# Compile model
compile_model = True # Compile the model for faster execution

# Model config
vocab_size = ... # TODO: Use the dataset `data` to determine the vocabulary size
config = GPTConfig(
    block_size=block_size, 
    vocab_size=vocab_size, 
    n_block=4, 
    n_head=4, 
    n_embd=128, 
    dropout=0.0, 
    bias=False
) # This is a relatively small model

model = GPT(config).to(device)

if compile_model:
    print("Compiling the model...")
    model = torch.compile(model) # Needs PyTorch >= 2.0
    print("Done compiling")

# Initialize optimizer
optimizer = model.get_optimizer(weight_decay, learning_rate, (beta1, beta2), device)

# Training loop
iter_num = 0
best_val_loss = float('inf')

for _ in range(max_iters):
    for X, Y in train_loader:
        # Get batch and move to device
        X, Y = X.to(device), Y.to(device)
        
        # Forward pass
        logits, loss = model(X, targets=Y)
        
        # Backward pass
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        if grad_clip != 0.0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        
        # Logging
        if iter_num % log_interval == 0:
            print(f"iter {iter_num}: loss {loss.item():.4f}")
            
        # Evaluation
        if iter_num % val_interval == 0:
            losses = estimate_train_val_loss(model, train_loader, val_loader, val_iters, device)
            print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
            
            # Save best model
            if losses['val'] < best_val_loss:
                best_val_loss = losses['val']
                if iter_num > 0:
                    print(f"Saving checkpoint to {out_dir}")
                    model_to_save = model._orig_mod if compile_model else model
                    torch.save({
                        'model': model_to_save.state_dict(),
                        'model_args': config,
                    }, checkpoint_path)
        
        iter_num += 1
        if iter_num >= max_iters:
            break
    
    if iter_num >= max_iters:
        break

# Sample from the model

In [None]:
num_samples = 5  # Number of samples to draw
max_new_tokens = 500  # Number of tokens generated in each sample
temperature = 0.8  # TODO: Use different temperature values and qualitatively report on the results

# Set seed for reproducibility
seed = 345  
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

# Load the model
checkpoint = torch.load(checkpoint_path, map_location=device)
config = checkpoint['model_args']
model = GPT(config)
model.load_state_dict(checkpoint['model'])
model.eval()
model.to(device)

# Create dataset to get encoder/decoder
dataset = MovieDataset(block_size=config.block_size)
encode = lambda s: [dataset.string_to_int[c] for c in s]
decode = dataset.decode

# Generate samples
print('-'*20)
with torch.no_grad():
    for k in range(num_samples):
        start_prompt = "\n"  # Start prompt
        prompt_ids = encode(start_prompt)
        x = torch.tensor(prompt_ids, dtype=torch.long, device=device)[None, ...]
    
        y = model.sample(x, max_new_tokens, temperature=temperature)
        print(decode(y[0]))
        print('-'*20)

### a) Analyzing the Dataset

In [None]:
# Data Analysis
dataset = MovieDataset()

README.md:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


9000plus.csv:   0%|          | 0.00/4.21M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9837 [00:00<?, ? examples/s]

In [40]:
#extracting samples
dataset = MovieDataset()
print(f"Size of dataset: {len(dataset)}")
print(f"Size of contained items: {len(dataset[0])}")
print(f"Sample size: {len(dataset[0][0])}")

seed = 345  
torch.manual_seed(seed)

offset= torch.randint(0, len(dataset), (1,))+92791

#print out some random samples
for chosen_index in [offset, offset+1, offset+2]:
    
    sample = dataset.__getitem__(chosen_index)
    numbers_1 = ', '.join([str(x.item()) for x in sample[0]])
    print(numbers_1)
    sentence_1 = [dataset.int_to_string[x.item()] for x in sample[0]]
    print("".join(sentence_1))

    numbers_2 = ', '.join([str(x.item()) for x in sample[0]])
    print(numbers_2)
    sentence_2 = [dataset.int_to_string[x.item()] for x in sample[1]]
    print("".join(sentence_2))




Size of dataset: 3001845
Size of contained items: 2
Sample size: 256
68, 70, 62, 75, 81, 2, 84, 66, 79, 66, 84, 76, 73, 67, 14, 2, 62, 75, 65, 2, 66, 83, 66, 75, 2, 81, 69, 66, 2, 67, 66, 79, 76, 64, 70, 76, 82, 80, 2, 68, 76, 65, 65, 66, 80, 80, 2, 39, 66, 64, 62, 81, 66, 2, 69, 66, 79, 80, 66, 73, 67, 16, 1, 44, 70, 74, 70, 64, 2, 20, 28, 2, 54, 69, 66, 75, 2, 62, 2, 64, 76, 64, 72, 79, 76, 62, 64, 69, 15, 80, 77, 79, 66, 62, 65, 2, 77, 73, 62, 68, 82, 66, 2, 81, 69, 79, 66, 62, 81, 66, 75, 66, 65, 2, 81, 76, 2, 65, 66, 64, 70, 74, 62, 81, 66, 2, 81, 69, 66, 2, 64, 69, 70, 73, 65, 2, 77, 76, 77, 82, 73, 62, 81, 70, 76, 75, 2, 76, 67, 2, 45, 66, 84, 2, 56, 76, 79, 72, 2, 34, 70, 81, 86, 2, 70, 75, 2, 81, 69, 66, 2, 76, 79, 70, 68, 70, 75, 62, 73, 2, 44, 70, 74, 70, 64, 14, 2, 63, 70, 76, 73, 76, 68, 70, 80, 81, 2, 50, 82, 80, 62, 75, 2, 51, 86, 73, 66, 79, 2, 62, 75, 65, 2, 69, 66, 79, 2, 79, 66, 80, 66, 62, 79, 64, 69, 2, 62, 80, 80, 76, 64, 70, 62, 81, 66, 80, 2, 65, 66, 83, 66, 73,