#### Torch Import

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

#### CUDA Check

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

cuda


#### Hyperperameters

In [17]:
dropout = 0.2
block_size = 64
n_layer = 4 
n_head = 8
n_embd = 256

print(n_embd // n_head) # Test purpose

32


#### Basic Test

In [18]:
chars = ""

# Read the contents of the file 'infinite_in_modern_thought.txt' and store it in the variable 'text'
with open('infinite_in_modern_thought.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Print the first 1000 characters of the text
# print(text[:1000])

# Create a list of unique characters in the text and sort them
chars = sorted(list(set(text)))
# print(chars)

# Print the number of unique characters
# print(len(chars))

# print(text[:1000])
chars = sorted(list(set(text)))
# print(chars)
# print(len(chars))

vocab_size = len(chars)

#### Simple Mapping

In [19]:
# Create a dictionary that maps each character to its index in the 'chars' list
string_to_index = {ch: i for i, ch in enumerate(chars)}

# Create a dictionary that maps each index to its corresponding character in the 'chars' list
index_to_string = {i: ch for i, ch in enumerate(chars)}

# Define the 'endcode' function that converts a string to a list of character indices
endcode = lambda s: [string_to_index[c] for c in s]

# Define the 'decode' function that converts a list of character indices to a string
decode = lambda l: ''.join([index_to_string[i] for i in l])

print(endcode("Hello, World!"))

print(decode(endcode("Hello, World!")))

[35, 61, 68, 68, 71, 11, 1, 50, 71, 74, 68, 60, 2]
Hello, World!


#### Torch Mapping

In [20]:
# Create a dictionary that maps each character to its index in the 'chars' list
string_to_index = {ch: i for i, ch in enumerate(chars)}

# Create a dictionary that maps each index to its corresponding character in the 'chars' list
index_to_string = {i: ch for i, ch in enumerate(chars)}

# Define the 'endcode' function that converts a string to a list of character indices
endcode = lambda s: [string_to_index[c] for c in s]

# Define the 'decode' function that converts a list of character indices to a string
decode = lambda l: ''.join([index_to_string[i] for i in l])

# Create a tensor from the encoded text using torch.tensor
data = torch.tensor(endcode(text), dtype=torch.long)

print(data[:100])

tensor([97, 47, 64, 61,  1, 43, 74, 71, 66, 61, 59, 76,  1, 34, 77, 76, 61, 70,
        58, 61, 74, 63,  1, 61, 29, 71, 71, 67,  1, 71, 62,  1, 42, 70,  1, 76,
        64, 61,  1, 76, 64, 61, 71, 74, 81,  1, 71, 62,  1, 76, 64, 61,  1, 65,
        70, 62, 65, 70, 65, 76, 61,  1, 65, 70,  1, 69, 71, 60, 61, 74, 70,  1,
        76, 64, 71, 77, 63, 64, 76,  0,  1,  1,  1,  1,  0, 47, 64, 65, 75,  1,
        61, 58, 71, 71, 67,  1, 65, 75,  1, 62])


#### Train Validation Split

In [21]:
# Split the data into train and validation sets
data_size = int(0.9 * len(data))  # Calculate the size of the train data

train_data = data[:data_size]  # Assign the first 90% of the data to the train_data variable
val_data = data[data_size:]  # Assign the remaining 10% of the data to the val_data variable

#### Tensor Process Flow

In [22]:
x = train_data[:block_size]
y = train_data[1:block_size + 1]

for t in range(block_size):
    # Get the context by slicing the input sequence up to the current position
    context = x[:t + 1]
    
    # Get the target by selecting the next character in the input sequence
    target = y[t]
    
    # Print the context and target
    print(f"when input is {context}, the target: {target}")

when input is tensor([97]), the target: 47
when input is tensor([97, 47]), the target: 64
when input is tensor([97, 47, 64]), the target: 61
when input is tensor([97, 47, 64, 61]), the target: 1
when input is tensor([97, 47, 64, 61,  1]), the target: 43
when input is tensor([97, 47, 64, 61,  1, 43]), the target: 74
when input is tensor([97, 47, 64, 61,  1, 43, 74]), the target: 71
when input is tensor([97, 47, 64, 61,  1, 43, 74, 71]), the target: 66
when input is tensor([97, 47, 64, 61,  1, 43, 74, 71, 66]), the target: 61
when input is tensor([97, 47, 64, 61,  1, 43, 74, 71, 66, 61]), the target: 59
when input is tensor([97, 47, 64, 61,  1, 43, 74, 71, 66, 61, 59]), the target: 76
when input is tensor([97, 47, 64, 61,  1, 43, 74, 71, 66, 61, 59, 76]), the target: 1
when input is tensor([97, 47, 64, 61,  1, 43, 74, 71, 66, 61, 59, 76,  1]), the target: 34
when input is tensor([97, 47, 64, 61,  1, 43, 74, 71, 66, 61, 59, 76,  1, 34]), the target: 77
when input is tensor([97, 47, 64, 61

#### Split Dataset

In [23]:
# Split the data into train and validation sets
data_size = int(0.9 * len(data))  # Calculate the size of the train data

train_data = data[:data_size]  # Assign the first 90% of the data to the train_data variable
val_data = data[data_size:]  # Assign the remaining 10% of the data to the val_data variable

# Define the get_batch function that returns a batch of data
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# Set the batch size
batch_size = 128

# Get a batch of training data
x, y = get_batch('train')

# Print the shape of x and y
print(x.shape, y.shape)

# Print the values of x and y
print(x)
print(y)

torch.Size([128, 64]) torch.Size([128, 64])
tensor([[61, 70, 76,  ..., 76, 74, 71],
        [61,  1, 57,  ..., 65, 75,  1],
        [65, 70,  1,  ...,  1, 71, 74],
        ...,
        [61,  1, 75,  ..., 75,  1, 69],
        [75,  1, 71,  ..., 71, 70, 78],
        [60, 74, 61,  ..., 71, 70,  1]], device='cuda:0')
tensor([[70, 76,  1,  ..., 74, 71, 70],
        [ 1, 57, 75,  ..., 75,  1, 57],
        [70,  1, 81,  ..., 71, 74,  1],
        ...,
        [ 1, 75, 61,  ...,  1, 69, 77],
        [ 1, 71, 62,  ..., 70, 78, 61],
        [74, 61, 75,  ..., 70,  1, 90]], device='cuda:0')


#### Bigram (Autoregressive Model)

In [24]:
class Head(nn.Module):
    """One head of self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Input shape: (batch, time-step, channels)
        # Output shape: (batch, time-step, head size)
        B, T, C = x.shape
        k = self.key(x)   # (B, T, hs)
        q = self.query(x) # (B, T, hs)
        # compute attention scores ("affinities")
        # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)

        # (B, T, hs) @ (B, T, T) -> (B, hs, T)
        v = self.value(x) # (B, T, hs)
        out = wei @ v # (B, T, hs)

        return out

In [25]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention layer."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(
            [Head(head_size) for _ in range(num_heads)]
        )
        self.proj = nn.Linear(head_size * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # (B,T, F) -> (B,T, (h1, h1, h2, h2, h3, h3, h4, h4))
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [26]:
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [34]:
class Block(nn.Module):
    """Transformer block: A multi-head attention layer followed by a feedforward layer."""

    def __init__(self, n_embd, n_head):
        # n_embd is the embedding dimension, n_head is the number of attention heads
        # n_head is the number of attention heads

        super().__init__()
        head_size = n_embd // n_head
        self.sa = nn.MultiheadAttention(embed_dim=n_embd, num_heads=n_head)

        # self.ffn = nn.Sequential(
        #     nn.Linear(n_embd, 4 * n_embd),
        #     nn.ReLU(),
        #     nn.Linear(4 * n_embd, n_embd),
        # )

        self.ffw = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ffw(x)
        x = self.ln2(x + y)
        return x

In [35]:
class GPTLanguageModel(nn.Module):
    # Initialize the model with the vocabulary size
    def __init__(self, vocab_size):
        # Call the parent class's constructor
        super().__init__()
        # Create an embedding table to map token indices to vectors
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # Create a multi-head attention layer (GPU/CPU)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head)
              for _ in range(n_layer)]
        )

        self.lm_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self.__init_weights)

    def __init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    # Define the forward pass
    def forward(self, idx, targets=None):
        # Get the logits (unnormalized probabilities) for the input indices
        # logits = self.token_embedding_table(idx)

        B, T = idx.shape
        
        # idx and targets are both (B, T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        x = self.blocks(x) # (B, T, C)
        x = self.lm_f(x) # (B, T, C)
        logits = self.lm_head(x) # (B, T, vocab_size)

        # If targets are not provided, set the loss to None
        if targets is None:
            loss = None
        # If targets are provided, calculate the cross-entropy loss
        else:
            # Get the shape of the logits tensor (batch size, sequence length, vocab size)
            B, T, C = logits.shape
            # Reshape the logits tensor to (batch size * sequence length, vocab size)
            logits = logits.view(B*T, C)
            # Reshape the targets tensor to (batch size * sequence length)
            targets = targets.view(B*T)
            # Calculate the cross-entropy loss
            loss = F.cross_entropy(logits, targets)
        # Return the logits and loss
        return logits, loss

    # Define the generate method
    def generate(self, idx, max_new_tokens):
        # Generate new tokens one at a time
        for _ in range(max_new_tokens):
            # Get the logits and loss for the current input indices
            logits, loss = self.forward(idx)
            # Get the logits for the last token in the sequence
            logits = logits[:, -1, :]
            # Convert the logits to probabilities
            probs = F.softmax(logits, dim=-1)
            # Sample the next token from the probabilities
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append the next token to the current sequence
            idx = torch.cat((idx, idx_next), dim=1)
        # Return the generated sequence
        return idx
    
model = GPTLanguageModel(vocab_size).to(device)

In [37]:
max_iter = 500
learning_rate = 3e-4

eval_interval = 100

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()  # Set the model to evaluation mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_interval)  # Initialize a tensor to store the losses
        for i in range(eval_interval):
            x, y = get_batch(split)  # Get a batch of data
            T = x.shape[1]  # Define the variable 'T' as the sequence length
            logits, loss = model(x, y)  # Forward pass through the model
            losses[i] = loss.item()  # Store the loss value
        out[split] = losses.mean()  # Calculate the mean loss for the split
    model.train()  # Set the model back to training mode
    return out


for iter in range(max_iter):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    # Get a batch of training data
    x, y = get_batch('train')
    # Get the logits and loss
    logits, loss = model(x, y)
    # Backpropagate the loss
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # Print the loss every 100 iterations
    # if iter % 100 == 0:
    #     print(f"loss: {loss.item()}")

print(loss.item())

TypeError: MultiheadAttention.forward() missing 2 required positional arguments: 'key' and 'value'

#### Some Optimization Algorithms and Loss Functions

**MSE (Mean Squared Error):** MSE is a commonly used loss function in regression problems. It measures the average squared difference between the predicted and actual values. The lower the MSE, the better the model's performance.

**GD (Gradient Descent):** GD is an optimization algorithm used to minimize the loss function of a model. It iteratively updates the model's parameters in the direction of steepest descent of the loss function. GD can be slow for large datasets as it requires computing the gradients for the entire dataset in each iteration.

**Momentum:** Momentum is an extension of GD that helps accelerate convergence and overcome local minima. It introduces a momentum term that accumulates the gradients of previous iterations and uses it to update the model's parameters. This helps the model to continue moving in the right direction even when the gradients are small.

**RMSprop (Root Mean Square Propagation):** RMSprop is an optimization algorithm that adapts the learning rate for each parameter based on the average of recent squared gradients. It helps to speed up convergence by reducing the learning rate for parameters with large gradients and increasing it for parameters with small gradients.

**Adam:** Adam (Adaptive Moment Estimation) is an optimization algorithm that combines the benefits of both momentum and RMSprop. It maintains a running average of both the gradients and the squared gradients, and uses them to update the model's parameters. Adam is known for its fast convergence and good performance on a wide range of problems.

**AdamW:** AdamW is a variant of the Adam optimizer that incorporates weight decay regularization. Weight decay helps prevent overfitting by adding a penalty term to the loss function that discourages large parameter values. AdamW is particularly effective when dealing with models with large numbers of parameters.

In [None]:
# Create a context tensor with a single token (index 0)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Generate a sequence of 100 tokens starting from the context
generated_chars = decode(m.generate(context, max_new_tokens=100)[0].tolist())
# Print the generated sequence
print(generated_chars)

#### Block Size and Batch Size

In [None]:
"""
NOTE:
- The block size determines the size of each block in the tensor.
- The batch size determines the number of blocks to be processed in parallel.
"""