## Creating a tiny LLM using an open source textbook

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Load the text file
with open("textbook.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Create a character-level vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}  # String to Integer mapping
itos = {i: ch for i, ch in enumerate(chars)}  # Integer to String mapping

# Encode the text into integers


def encode(s):
    return [stoi[c] for c in s]


def decode(l):
    return ''.join([itos[i] for i in l])


data = torch.tensor(encode(text), dtype=torch.long)

# Split into training and validation sets (90% train, 10% val)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [4]:
class GPT(nn.Module):
    def __init__(self, vocab_size, block_size, n_embd, n_layer, n_head):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([
            TransformerBlock(n_embd, n_head) for _ in range(n_layer)
        ])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device).unsqueeze(0)

        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(pos)
        x = tok_emb + pos_emb

        for block in self.blocks:
            x = block(x)

        x = self.ln_f(x)
        logits = self.head(x)

        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
            return logits, loss
        return logits

class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.attn = nn.MultiheadAttention(n_embd, n_head, batch_first=True)
        self.ln1 = nn.LayerNorm(n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
        )
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attn_out, _ = self.attn(x, x, x)
        x = x + attn_out
        x = self.ln1(x)

        mlp_out = self.mlp(x)
        x = x + mlp_out
        x = self.ln2(x)

        return x

In [5]:
import time

# Hyperparameters
batch_size = 64
block_size = 128
n_embd = 256
n_layer = 6
n_head = 4
learning_rate = 3e-4
max_iters = 100
eval_interval = 500


# Function to get a batch of data


def get_batch(split):
    try:
        data = train_data if split == 'train' else val_data
        ix = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in ix])
        y = torch.stack([data[i+1:i+block_size+1] for i in ix])
        return x, y
    except Exception as e:
        print(f"Error in get_batch: {e}")
        raise


# Initialize the model
try:
    print("Initializing model...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GPT(vocab_size, block_size, n_embd, n_layer, n_head).to(device)
    print(f"Model initialized on {device}.")
except Exception as e:
    print(f"Error in model initialization: {e}")
    raise

# Optimizer
try:
    print("Initializing optimizer...")
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    print("Optimizer initialized.")
except Exception as e:
    print(f"Error in optimizer initialization: {e}")
    raise

# Training loop with time estimation
start_time = time.time()  # Record the start time
for step in range(max_iters):
    try:
        step_start_time = time.time()  # Start time of this step

        # Fetch training batch
        xb, yb = get_batch('train')
        xb, yb = xb.to(device), yb.to(device)

        # Forward pass
        logits, loss = model(xb, yb)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Debug info for training
        print(f"Step {step}: Train loss {loss.item():.4f}")

        # Evaluate periodically
        if step % eval_interval == 0 or step == max_iters - 1:
            with torch.no_grad():
                val_loss = sum(model(xb.to(device), yb.to(device))[1].item()
                               for xb, yb in [get_batch('val')] * 10) / 10

            # Calculate elapsed time and estimated remaining time
            elapsed_time = time.time() - start_time
            steps_completed = step + 1
            avg_step_time = elapsed_time / steps_completed
            remaining_time = avg_step_time * (max_iters - steps_completed)

            print(
                f"Step {step}: Train loss {loss.item():.4f}, Val loss {val_loss:.4f}")
            print(
                f"Elapsed time: {elapsed_time / 60:.2f} minutes, Estimated remaining time: {remaining_time / 60:.2f} minutes")

    except Exception as e:
        print(f"Error at step {step}: {e}")
        raise

Initializing model...
Model initialized on cpu.
Initializing optimizer...
Optimizer initialized.
Step 0: Train loss 5.0050
Step 0: Train loss 5.0050, Val loss 4.2136
Elapsed time: 0.05 minutes, Estimated remaining time: 4.71 minutes
Step 1: Train loss 4.1758
Step 2: Train loss 3.7469
Step 3: Train loss 3.4840
Step 4: Train loss 3.3160
Step 5: Train loss 3.2350
Step 6: Train loss 3.1645
Step 7: Train loss 3.1556
Step 8: Train loss 3.1149
Step 9: Train loss 3.0985
Step 10: Train loss 3.0278
Step 11: Train loss 2.9369
Step 12: Train loss 2.9870
Step 13: Train loss 2.8686
Step 14: Train loss 2.8919
Step 15: Train loss 2.8452
Step 16: Train loss 2.7909
Step 17: Train loss 2.7752
Step 18: Train loss 2.7746
Step 19: Train loss 2.7626
Step 20: Train loss 2.7040
Step 21: Train loss 2.7198
Step 22: Train loss 2.8075
Step 23: Train loss 2.6874
Step 24: Train loss 2.6396
Step 25: Train loss 2.6340
Step 26: Train loss 2.6656
Step 27: Train loss 2.6486
Step 28: Train loss 2.6267
Step 29: Train loss 

### The results are essentially giberish because to train a model from scratch it would take an enourmous amout of resoucres. The case study used A100 GPUs for 4 straight days.

In [7]:
def generate(model, start, max_new_tokens):
    model.eval()
    with torch.no_grad():
        idx = torch.tensor(encode(start), dtype=torch.long,
                           device=device).unsqueeze(0)
        for _ in range(max_new_tokens):
            logits = model(idx)[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
        return decode(idx[0].tolist())


# Generate text
print(generate(model, start="food and", max_new_tokens=100))

food and o“akuk
rabyd0, devalcod iove, ioven ofroroustg

tale
teres Irofr cewefevo f 3. momand,Vύ0è. oficö.0
