In [None]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
# Load the whole corpus into memory as a single string
with open("input.txt", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [None]:
# Number of characters in the corpus
print("Length of dataset:", len(text))

In [None]:
# Peek at the beginning of the corpus (first 1000 characters)
print(text[:1_000])

In [None]:
# The vocabulary is the sorted set of distinct characters that occur in the dataset
chars = sorted(set(text))
vocabulary_size = len(chars)
print("Unique characters: ", "".join(chars))
print(f"Size: {vocabulary_size}")

In [None]:
# Time to tokenize!
# Tokenizing means converting a string, i.e. a sequence of characters, into a sequence of
# integers according to some vocabulary of possible elements.
# In our case, each character is a token.

# Lookup tables between characters and their integer ids (ids follow the order of `chars`)
stoi = {c: i for i, c in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encode a string (a sequence of characters) into a list of integer token ids.

    Raises:
        KeyError: if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode an iterable of integer token ids back into a string.

    Raises:
        KeyError: if `l` contains an id that is not a key of `itos`.
    """
    return "".join(itos[i] for i in l)


# Round trip: decoding the encoding gives back the original string
print(encode("Hi there"))
print(decode(encode("Hi there")))

In [None]:
# Tokenize the full corpus once and keep the ids in a 1-D int64 tensor
import torch

data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)
# The same 1000 characters we printed earlier, now as the integer ids the model will consume
print(data[:1_000])

In [None]:
# Train on the first 90% of the tokens and hold out the remaining 10% for evaluation
train_size = int(0.9 * len(data))
train_data, eval_data = data[:train_size], data[train_size:]

print(train_data.shape, eval_data.shape)

In [None]:
# Context length (a.k.a. block size): the amount of text the transformer
# will see at once during training
block_size = 8
# Display the first block_size + 1 tokens of the training data
train_data[: block_size + 1]

In [None]:
# Show every (context, target) pair contained in the first block_size + 1 tokens:
# the target for a context of length i + 1 is the token that immediately follows it
x = train_data[:block_size]
y = train_data[1 : block_size + 1]

for i, target in enumerate(y):
    context = x[: i + 1]
    print(f"When input is {context} the target is: {target}")

In [None]:
# Introduce a batch dimension: several chunks of text are stacked into one tensor
torch.manual_seed(1337)  # fixed seed so the sampled batches are reproducible
batch_size, block_size = 4, 8  # chunks per batch, tokens per chunk

def get_batch(split: str, batch_size: int):
    """Sample a random batch of (inputs, targets) chunks.

    Args:
        split: "train" selects `train_data`; any other value selects `eval_data`.
        batch_size: number of chunks to sample.

    Returns:
        A pair of [batch_size, block_size] tensors. The targets are the inputs
        shifted one position to the right in the underlying data.
    """
    source = train_data if split == "train" else eval_data
    # Random start offsets; the upper bound leaves room for block_size inputs plus one target
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        # block_size + 1 consecutive tokens: all but the last are inputs, all but the first are targets
        window = source[start:start + block_size + 1]
        inputs.append(window[:-1])
        targets.append(window[1:])
    return torch.stack(inputs), torch.stack(targets)
    
xs, ys = get_batch("train", batch_size)
for title, batch in (("Inputs: ", xs), ("Targets: ", ys)):
    print(title)
    print(batch.shape)
    print(batch)
print("------")

# Each row of the batch yields one training example per position in the row
for b in range(batch_size):
    for i, target in enumerate(ys[b]):
        context = xs[b, : i + 1]
        print(f"When input is {context.tolist()} the target is: {target}")

In [None]:
print(xs)  # The batch of input token ids that will be fed to the transformer

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed the global RNG before the model is created and sampled from in this cell
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: predicts the next character based on a single character.

    The only parameter is a [vocabulary_size, vocabulary_size] embedding table; looking up
    a token id returns the logits for the token that follows it.
    """

    def __init__(self, vocabulary_size):
        super().__init__()
        # Each token directly reads the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocabulary_size, vocabulary_size)

    def forward(self, idx, targets=None) -> "tuple[torch.Tensor, torch.Tensor | None]":
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: [batch_size, block_size] tensor of integer token ids.
            targets: optional tensor with the same shape as `idx` holding the expected
                next token id for every position.

        Returns:
            A `(logits, loss)` tuple. Without targets, `logits` has shape
            [batch_size, block_size, vocabulary_size] and `loss` is None. With targets,
            `logits` is returned flattened to [batch_size * block_size, vocabulary_size]
            and `loss` is the cross-entropy between those logits and the targets.
        """
        logits = self.token_embedding_table(idx)  # [batch_size, block_size, vocabulary_size]

        if targets is None:
            return logits, None

        # Reshape to [batch_size * block_size, vocabulary_size] according to what cross_entropy wants
        batch_size, block_size, vocabulary_size = logits.shape
        logits = logits.view(batch_size * block_size, vocabulary_size)
        # reshape (rather than view) also accepts non-contiguous targets
        targets = targets.reshape(batch_size * block_size)

        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so do not record an autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens, one at a time.

        Args:
            idx: [batch_size, block_size] tensor of token ids in the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            A [batch_size, block_size + max_new_tokens] tensor: the given context followed
            by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # The full sequence is passed on every step, although only the logits at the
            # last position are used.
            logits, _ = self(idx)  # Get predictions
            logits = logits[:, -1, :]  # Focus on last element, i.e. what comes next
            probabilities = F.softmax(logits, dim=-1)  # Softmax logits to get probabilities
            next_idx = torch.multinomial(probabilities, num_samples=1)  # [batch_size, 1]
            idx = torch.cat((idx, next_idx), dim=1)  # [batch_size, block_size + 1]

        return idx
            
            
    
# Run the freshly created model on the batch sampled earlier
model = BigramLanguageModel(vocabulary_size)
out, loss = model(xs, ys)
print(f"Output shape: {out.shape}")
print(loss)

# Generate 100 new tokens starting from a single token with id 0 and print them as text
idx = torch.zeros(1, 1, dtype=torch.long)
sample = model.generate(idx, max_new_tokens=100)
print(decode(sample[0].tolist()))

In [None]:
# AdamW optimizer over all parameters of the model
learning_rate = 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
batch_size = 32
max_steps = 10_000

for step in range(max_steps):
    # Sample a fresh random batch from the training split
    xs, ys = get_batch("train", batch_size)

    # Clear old gradients, then forward pass, backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xs, ys)
    loss.backward()
    optimizer.step()

# Loss of the last training batch only
print(loss.item())

In [None]:
# Sample 300 new tokens from the model and print them as text
generated = model.generate(idx, max_new_tokens=300)
print(decode(generated[0].tolist()))