In [None]:
!pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0

# THE NOTES BELOW ARE FOR THE MAIN FILE V2, "gpt.py"

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
# -- data / batching --
batch_size = 64   # how many independent sequences are processed in parallel
block_size = 256  # maximum context length used for predictions
# -- training schedule --
max_iters = 10000    # number of optimisation steps in the training loop
eval_interval = 500  # the loss is estimated every this many steps
eval_iters = 200     # number of batches averaged per split in each loss estimate
learning_rate = 3e-4
# -- model size --
n_embd = 384   # embedding width
n_head = 6     # attention heads per transformer block
n_layer = 6    # number of transformer blocks
dropout = 0.2  # dropout probability
# -- hardware: use the GPU when one is available --
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# ------------

In [None]:
# Fix the seed of PyTorch's random number generator before any random sampling or weight initialisation below.
torch.manual_seed(1337)

In [None]:
# The corpus is expected as input.txt in the working directory. It can be fetched with:
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
# vocabulary: every distinct character that occurs in the text, in sorted order
# (ALREADY NOTED, REFER TO PREVIOUS V1 FILE FOR WORKINGS)
chars = sorted(set(text))
vocab_size = len(chars)

# lookup tables between characters and their integer ids
itos = dict(enumerate(chars))             # integer -> character
stoi = {ch: i for i, ch in itos.items()}  # character -> integer

def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]

def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)

In [None]:
# Train and validation splits: encode the whole text once, then cut it at the 90% mark
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # index where the validation part starts
train_data, val_data = data[:n], data[n:]

In [None]:
# data loading
def get_batch(split):
    """Sample a random batch of inputs x and next-token targets y.

    split: 'train' draws from train_data; any other value draws from val_data.
    Returns x and y, each a stack of batch_size windows of block_size tokens,
    moved to `device`. Each row of y is the matching row of x shifted one
    position to the right.
    """
    source = val_data if split != 'train' else train_data
    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = [source[s:s+block_size] for s in starts]
    targets = [source[s+1:s+block_size+1] for s in starts]
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

In [None]:
@torch.no_grad() # disables gradient calculation (used during evaluation)
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    For each split, eval_iters random batches are drawn with get_batch and the
    model's loss is averaged over them. The model is switched to evaluation
    mode for the measurement (so dropout is turned off) and is afterwards put
    back into whichever mode it was in on entry, even if evaluation raises.

    Returns a dict with keys 'train' and 'val', each holding the mean loss as a
    0-dim tensor.
    """
    out = {} # average loss per data split, keyed by split name

    was_training = model.training # remember the caller's mode so it can be restored afterwards
    model.eval() # evaluation mode: layers such as dropout behave differently here than during training
    try:
        for split in ['train', 'val']: # evaluating on both splits lets you monitor overfitting and general performance
            losses = torch.zeros(eval_iters) # one slot per evaluation batch
            for k in range(eval_iters):
                X, Y = get_batch(split) # a batch of inputs X and targets Y from the current split
                logits, loss = model(X, Y) # the model returns the logits (raw outputs before softmax) and the loss for this batch
                losses[k] = loss.item() # store the scalar loss value at index k
            out[split] = losses.mean() # mean over all recorded batches for this split
    finally:
        model.train(was_training) # restore the original mode instead of unconditionally forcing training mode
    return out

## Notes for class Head

This section explains what the `nn.Linear` layer in PyTorch does and how it is configured in the `Head` class below.

### nn.Linear Function
`nn.Linear` is a PyTorch module that applies a linear transformation to the incoming data. It's essentially a fully connected neural network layer. Here’s what each parameter in the `nn.Linear` initialization means:

- **n_embd (input features):** This is the size of each input sample. For instance, if `n_embd` is 512, each input to the layer should have 512 features.

- **head_size (output features):** This is the size of each output sample. The layer transforms the input dimension (`n_embd`) into the `head_size` dimension. For example, if `head_size` is 64, each output from this layer will have 64 features.

- **bias (Boolean):** This is a flag that indicates whether a bias vector should be added to the output. If `bias=False`, no bias is added. If `bias=True` (the default), a learnable bias vector is created and added to the outputs. PyTorch does not initialize it to zero: it is drawn from a uniform distribution over $\pm 1/\sqrt{\text{in\_features}}$.

### Role in the Attention Head
In the context of your `Head` class within a transformer, these layers (`self.key`, `self.query`, `self.value`) are used to transform the input into three different representations:
- **Keys (k):** Used to interact with queries to compute attention scores.
- **Queries (q):** Used to interact with keys to fetch the most relevant information across the sequence.
- **Values (v):** Once the relevant positions are identified using keys and queries, the values at these positions are combined to produce the output.

### Working Mechanism
Here's what happens when you use `nn.Linear` in the context of your self-attention head:

1. **Input Dimensionality:** The input `x` to your `forward` function in the `Head` class has the shape `[B, T, C]`, where `B` is the batch size, `T` is the sequence length (number of time steps), and `C` is the number of channels (here, `C` is `n_embd`, the embedding size).

2. **Transformation:**
   - When `x` is passed through `self.key(x)`, the layer transforms the `[B, T, n_embd]` input into `[B, T, head_size]`. It does this by multiplying each `n_embd`-dimensional vector in `x` by a learned weight matrix (PyTorch stores it as `W` with shape `[head_size, n_embd]` and computes `x @ W.T`), and since `bias=False`, no bias is added. `self.query(x)` and `self.value(x)` apply the same kind of transformation, each with its own weights, to produce queries and values, respectively.

3. **Output Dimensionality:** Each of the transformed outputs (`k`, `q`, `v`) now has the shape `[B, T, head_size]`, aligning with the required dimensions to compute attention scores and ultimately, the weighted sum of values.

This transformation allows each head to project the input embeddings into a different subspace, helping the model to focus on different features of the input at different positions, enhancing its ability to capture complex relationships in the data.

## Back to Coding

In [None]:
class Head(nn.Module):
    """ a single head of causal self-attention """

    def __init__(self, head_size):
        super().__init__()
        # three bias-free learned projections, each taking [B, T, n_embd] to [B, T, head_size]
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones of size (block_size, block_size), registered as a buffer.
        # forward() uses it as a mask so that a position only attends to itself and earlier
        # positions and cannot "look ahead".
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)
        # dropout applied to the attention weights, with probability `dropout`
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x:      (batch, time-step, channels)
        # return: (batch, time-step, head size)
        _, T, _ = x.shape
        q = self.query(x) # (B,T,hs)
        k = self.key(x)   # (B,T,hs)
        v = self.value(x) # (B,T,hs)

        # attention scores ("affinities"), scaled by 1/sqrt(head size) as in the attention(Q, K, V) equation
        head_dim = k.shape[-1]
        scores = q @ k.transpose(-2, -1) * head_dim**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)

        # positions where the mask is zero are set to -inf, so softmax gives them zero weight
        hidden = self.tril[:T, :T] == 0
        scores = scores.masked_fill(hidden, float('-inf')) # (B, T, T)
        weights = self.dropout(F.softmax(scores, dim=-1)) # (B, T, T)

        # weighted aggregation of the values
        return weights @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)

## Notes on MultiHeadAttention and nn.Module

The `MultiHeadAttention` class is a crucial component of Transformer models, extending the concept of self-attention by incorporating multiple heads. Each head can potentially learn to pay attention to different parts of the input, making the model more powerful and versatile. Here’s a detailed breakdown of this class:

### Class Definition
```python
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """
```
- The class `MultiHeadAttention` inherits from `nn.Module`, which is a base class for all neural network modules in PyTorch. It manages multiple heads of attention that process the input in parallel, allowing the model to capture different aspects of information simultaneously.

### Constructor
```python
def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)]) # creates x number of parallel self attention heads
    self.proj = nn.Linear(head_size * num_heads, n_embd)
    self.dropout = nn.Dropout(dropout)
```
- **ModuleList of Heads:**
  - `self.heads` is a `nn.ModuleList` containing several instances of the `Head` class defined earlier. The list comprehension `[Head(head_size) for _ in range(num_heads)]` creates `num_heads` instances of `Head`, each capable of transforming the input independently.
  - `head_size` defines the size of each head's output.

- **Projection Layer:**
  - `self.proj` is a linear layer that projects the concatenated outputs of all attention heads back to the original embedding dimension (`n_embd`). This is necessary because each head outputs `head_size` features, and concatenating `num_heads` of them results in `head_size * num_heads` features.
  - This layer maps the combined features back to the expected size for compatibility with other components in the Transformer architecture.

- **Dropout:**
  - `self.dropout` is a dropout layer that randomly zeroes elements of the output tensor with a probability defined by `dropout`. This regularization technique helps prevent overfitting.

### Forward Pass
```python
def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out)) # projection back into the residual pathway
    return out
```
- **Concatenating Heads Outputs:**
  - The forward pass starts by applying each head in `self.heads` to the input `x`. The list comprehension `[h(x) for h in self.heads]` computes the outputs from all heads.
  - `torch.cat([...], dim=-1)` concatenates these outputs along the last dimension. If each head's output has dimensions `[B, T, head_size]` and there are `num_heads` heads, the result will have dimensions `[B, T, head_size * num_heads]`.

- **Projection and Dropout:**
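  - The concatenated output is then passed through the `self.proj` linear transformation, which maps it from `head_size * num_heads` features to `n_embd` features, aligning it with the rest of the network. (In this notebook `head_size = n_embd // n_head`, so the two sizes are equal and the projection mixes the heads' outputs rather than shrinking them.)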
  - After the projection, dropout is applied for regularization.

The `MultiHeadAttention` module effectively combines information from multiple representational spaces. By doing so, it allows the model to jointly attend to information from different representation subspaces at different positions, which is a powerful mechanism in many NLP tasks. This architecture is one of the key reasons why Transformers excel in handling complex dependencies in sequence data.

`nn.Module` is one of the core building blocks in PyTorch, a popular deep learning framework. It serves as the base class for all neural network modules, and most of the functionality of neural networks in PyTorch is built on top of this class. Here’s a detailed breakdown of what `nn.Module` is, what it contains, and what it does:

### Definition of nn.Module
- **Base Class for All Networks:** `nn.Module` is the base class for all neural network modules in PyTorch. Any new neural network component should inherit from `nn.Module` to get all its functionality.

### Key Features and Functions
1. **Parameter Management:**
   - **Automatic Parameter Registration:** When you define instance attributes that are `nn.Parameter` or `nn.Module` types, they are automatically added to the list of parameters (or sub-modules) that the module knows about. This includes weights, biases, and other parameters which are used in forward passes and are necessary for backpropagation.
   - **Easy Access to Parameters:** `nn.Module` provides methods like `.parameters()` and `.named_parameters()` to iterate over all parameters of the model, which is very useful for optimization, saving, loading, etc.

2. **Sub-modules Management:**
   - **Hierarchical Structure:** You can nest `nn.Module` instances inside one another. This hierarchical organization allows building complex architectures easily. Methods like `.children()` and `.modules()` help in accessing these sub-modules at different levels of hierarchy.

3. **Forward Pass Definition:**
   - **Forward Method:** Each `nn.Module` subclass typically implements a `forward()` method. When you call the module (like a function call with `module(input)`), it internally calls `forward()` with the input. This method is where you define the computation performed by the module.

4. **Gradient Computation and Backpropagation:**
   - **Support for Autograd:** `nn.Module` seamlessly integrates with PyTorch’s autograd system. Parameters of the module are automatically registered for gradient computation. When used in a training loop, gradients are computed when calling `.backward()`, and PyTorch takes care of all the gradient flow calculations through modules.

5. **Utilities for Training:**
   - **to(device):** You can move all module parameters to a specified device (CPU or GPU) with a single call to `.to(device)`.
   - **train() and eval():** Switch between training and evaluation modes. This affects the behavior of certain layers like dropout (active during training and inactive during evaluation) and batch normalization (uses running statistics during evaluation).

6. **Serialization and Deserialization:**
   - **Save and Load Models:** `nn.Module` provides convenient methods for saving (`torch.save(module.state_dict(), PATH)`) and loading (`module.load_state_dict(torch.load(PATH))`) the parameters of a model, allowing for model persistence and transfer.

### Practical Example
When you define a new class that inherits from `nn.Module`, you typically:
1. Initialize the parent class in your constructor.
2. Define any layers or parameters your module needs.
3. Implement the `forward()` method to specify how the module processes input.

Here is a simple example:
```python
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.linear = nn.Linear(10, 5)  # A simple linear layer

    def forward(self, x):
        return self.linear(x)
```
In this example, `SimpleModel` contains a single linear layer, and the `forward` method defines how the model processes input `x` through that layer.

### Summary
`nn.Module` is essentially what makes building, training, and using neural networks in PyTorch straightforward and flexible. It provides the infrastructure for assembling layers and parameters into a complete model, managing their states, and using them efficiently during training or inference.

## Back to Coding

In [None]:
class MultiHeadAttention(nn.Module): # nn.Module is the base class for all NN modules in PyTorch
    """ several self-attention heads applied to the same input, then merged """

    def __init__(self, num_heads, head_size): # class constructor
        super().__init__()
        # num_heads independent instances of the Head class, each transforming the input on its own
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        # Each head outputs head_size features, so the concatenation has head_size * num_heads
        # features. This layer projects that back to the embedding dimension n_embd.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout) # dropout layer

    def forward(self, x):
        # run every head on x; each result is [B, T, head_size]
        per_head = [head(x) for head in self.heads]
        # join along the last dimension -> [B, T, head_size * num_heads]
        merged = torch.cat(per_head, dim=-1)
        # project to n_embd features, then apply dropout for regularization
        projected = self.proj(merged)
        return self.dropout(projected)

In [None]:
class FeedFoward(nn.Module):
    """ position-wise MLP: widen, apply a non-linearity, project back, dropout """

    def __init__(self, n_embd):
        super().__init__()
        # the inner layer is 4x wider than the embedding, following the FFN in
        # the paper "Attention Is All You Need"
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd), # projection layer going back into the residual pathway
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

In [None]:
class Block(nn.Module):
    """ Transformer block: self-attention (communication), then feed-forward (computation) """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension
        # n_head: number of attention heads; each head gets n_embd // n_head features
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd) # applied after attention
        self.ln1 = nn.LayerNorm(n_embd) # normalises the input to attention
        self.ln2 = nn.LayerNorm(n_embd) # normalises the input to the feed-forward net

    def forward(self, x):
        # both sub-layers see a layer-normalised input and add their output back onto x (residual)
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [None]:
class GPTLanguageModel(nn.Module):
    """Transformer language model over the character vocabulary.

    Token and position embeddings are summed, passed through n_layer Block
    modules and a final LayerNorm, then projected to vocab_size logits.
    """

    def __init__(self):
        super().__init__()
        # a token is represented by its identity (token embedding) plus its position (position embedding)
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)]) # multiple blocks can be seen here
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size) # short for language model head

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        idx:     (B, T) tensor of token indices, with T no larger than the
                 position embedding table.
        targets: optional (B, T) tensor of token indices.

        Returns (logits, loss). Without targets, logits is (B, T, vocab_size)
        and loss is None. With targets, logits is flattened to
        (B*T, vocab_size) and loss is the cross-entropy against targets.

        Raises IndexError if T exceeds the number of learned positions.
        """
        B, T = idx.shape

        # fail early with a readable message instead of an opaque embedding lookup error
        max_context = self.position_embedding_table.num_embeddings
        if T > max_context:
            raise IndexError(f"sequence length {T} exceeds the maximum context length {max_context}")

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # build the position indices on the same device as the input, so the model
        # works wherever its input and parameters live
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C) // at this point x not only contains token identity, but also
                              # positional identity from the position embedding
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy takes (N, C) logits and (N,) targets, so flatten batch and time
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad() # sampling needs no gradients
    def generate(self, idx, max_new_tokens):
        """Extend idx by max_new_tokens sampled tokens and return the longer sequence.

        idx: (B, T) tensor of token indices forming the starting context.
        Returns a (B, T + max_new_tokens) tensor.

        Sampling runs in evaluation mode so that dropout does not perturb the
        predicted distribution. The module's previous mode is restored
        afterwards.
        """
        max_context = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop idx to the last max_context tokens
                idx_cond = idx[:, -max_context:]
                # get the predictions
                logits, loss = self(idx_cond)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx

In [None]:
model = GPTLanguageModel()
m = model.to(device) # move the parameters and buffers to the selected device
# report the model size in millions of parameters
n_params = sum(p.numel() for p in m.parameters())
print(n_params/1e6, 'M parameters')

In [None]:
# create a PyTorch optimizer: AdamW over all model parameters, using the learning_rate hyperparameter
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Training loop. The counter is called `step` rather than `iter` so that the built-in iter()
# is not shadowed in the notebook's namespace once this cell has run.
for step in range(max_iters):

    # every once in a while (and on the final step) evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then clear old gradients, backpropagate and update the parameters
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [None]:
# generate from the model
# the starting context is a batch of one sequence holding the single token id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = m.generate(context, max_new_tokens=500)
sample_ids = generated[0].tolist() # token ids of the first (only) sequence in the batch
print(decode(sample_ids))
#open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))