# Tiny LLM Example

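This notebook walks through training a tiny LLM end to end using the components implemented in this project's `src` package: a byte tokenizer, Transformer blocks, a data loader, a trainer, and an evaluation helper. It finishes by sampling text from the trained model.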

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from src.models.transformer import TransformerBlock
from src.tokenizers.byte_tokenizer import ByteTokenizer
from src.train.data import create_data_loader
from src.train.trainer import Trainer
from src.train.evaluation import evaluate_model

## Define Model Architecture

In [None]:
class TinyLLM(nn.Module):
    """
    Minimal language model: token embedding, a stack of ``TransformerBlock``
    layers, and a linear head that maps hidden states to vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and width of the
            output logits.
        d_model: Width of the embeddings / hidden states.
        num_heads: Forwarded to every ``TransformerBlock``.
        num_layers: How many ``TransformerBlock`` instances to stack.
        d_ff: Forwarded to every ``TransformerBlock``.

    NOTE(review): this class adds no positional encoding and passes no
    attention mask; confirm whether ``TransformerBlock`` handles either.
    """

    def __init__(self, vocab_size=256, d_model=64, num_heads=4, num_layers=2, d_ff=128):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size

        # Token ids -> d_model-dimensional vectors
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Stack of identically configured transformer blocks
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(TransformerBlock(d_model, num_heads, d_ff))

        # Hidden states -> one logit per vocabulary entry
        self.output_projection = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """
        Map a tensor of token ids to logits whose last dimension is
        ``vocab_size``.
        """
        hidden = self.embedding(x)
        for block in self.layers:
            hidden = block(hidden)
        return self.output_projection(hidden)

## Prepare Data

In [None]:
# Batching hyperparameters handed to create_data_loader below
seq_length = 16
batch_size = 4

# ByteTokenizer instance shared by data preparation and text generation
tokenizer = ByteTokenizer()

# Ten short sentences make up the whole training corpus
training_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning is a subset of artificial intelligence.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning models have revolutionized many fields.",
    "Transformers are a powerful architecture for sequence modeling.",
    "Attention mechanisms allow models to focus on relevant parts of input.",
    "Large language models can generate human-like text.",
    "Tokenization is the process of converting text into tokens.",
    "Neural networks learn patterns from data.",
    "PyTorch is a popular deep learning framework."
]

# Tokenize every sentence, requesting BOS and EOS markers on each
tokenized_texts = tokenizer.encode_batch(training_texts, add_bos=True, add_eos=True)

# Build the loader; how sequences are windowed/batched is defined in src.train.data
data_loader = create_data_loader(tokenized_texts, seq_length, batch_size)

# Summarize the resulting dataset and batching configuration
print(
    f"Number of training samples: {len(data_loader.dataset)}",
    f"Batch size: {batch_size}",
    f"Sequence length: {seq_length}",
    sep="\n",
)

## Initialize Model and Training Components

In [None]:
# Seed PyTorch's global RNG before any parameters are created so that the
# randomly initialised nn.Embedding / nn.Linear weights -- and later draws from
# the same generator, such as torch.multinomial during generation -- are
# repeatable when the notebook is re-run from a fresh kernel.
SEED = 42
torch.manual_seed(SEED)

# Initialize model (vocabulary size comes from the tokenizer)
model = TinyLLM(vocab_size=tokenizer.vocab_size, d_model=64, num_heads=4, num_layers=2)

# Initialize optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Initialize trainer
trainer = Trainer(model, optimizer)

# Report the total number of scalar parameters in the model
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

## Train Model

In [None]:
# Train the model on data_loader with num_epochs set to 3.
# NOTE(review): log_interval=5 presumably controls how often Trainer.train
# reports progress -- confirm against src.train.trainer.
num_epochs = 3
trainer.train(data_loader, num_epochs, log_interval=5)

## Evaluate Model

In [None]:
# Compute metrics with evaluate_model. Note that this is the same data_loader
# that was used for training, so these numbers measure fit to the training
# data rather than performance on held-out text.
metrics = evaluate_model(model, data_loader)

# Emit the header and the three metrics, one per line
print(
    f"Evaluation Metrics:",
    f"  Loss: {metrics['loss']:.4f}",
    f"  Perplexity: {metrics['perplexity']:.4f}",
    f"  Accuracy: {metrics['accuracy']:.4f}",
    sep="\n",
)

## Generate Text (Simple Example)

In [None]:
def generate_text(model, tokenizer, prompt, max_length=50, temperature=1.0):
    """
    Autoregressively generate a continuation of ``prompt`` with ``model``.

    The prompt is encoded with a BOS token, then up to ``max_length`` tokens
    are appended one at a time. Each step re-runs the model on the whole
    sequence so far and picks the next token from the logits at the last
    position. Generation stops early when ``tokenizer.eos_token_id`` is
    produced.

    Args:
        model: Module called with a ``(1, seq)`` long tensor; its output is
            indexed as ``[0, -1, :]`` to obtain next-token logits.
        tokenizer: Object providing ``encode(text, add_bos=True)``,
            ``decode(ids)`` and ``eos_token_id``.
        prompt: Text to condition generation on.
        max_length: Maximum number of new tokens to append.
        temperature: Divisor applied to the logits before softmax sampling.
            ``0`` selects greedy decoding (argmax) instead of dividing by
            zero.

    Returns:
        ``tokenizer.decode`` applied to the full id sequence: the encoded
        prompt followed by every generated token (including EOS if one was
        produced).

    Raises:
        ValueError: If ``temperature`` is negative or the encoded prompt is
            empty.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    model.eval()

    # Encode prompt
    input_ids = tokenizer.encode(prompt, add_bos=True)
    if len(input_ids) == 0:
        # Without at least one token there is no "last position" to read logits from.
        raise ValueError("encoded prompt is empty; cannot generate from zero tokens")

    # Create the input on the same device as the model's parameters so the
    # function also works when the model has been moved off the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    input_tensor = torch.tensor([input_ids], dtype=torch.long, device=device)

    # Generate text
    with torch.no_grad():
        for _ in range(max_length):
            # Logits for the last position of the only sequence in the batch
            next_token_logits = model(input_tensor)[0, -1, :]

            if temperature == 0:
                # Greedy decoding: dividing by zero would yield NaN probabilities.
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            else:
                probabilities = torch.softmax(next_token_logits / temperature, dim=-1)
                next_token = torch.multinomial(probabilities, 1)

            # Append the chosen token (shape (1,) -> (1, 1)) to the sequence
            input_tensor = torch.cat([input_tensor, next_token.unsqueeze(0)], dim=1)

            # Stop if we generate an EOS token
            if next_token.item() == tokenizer.eos_token_id:
                break

    # Decode the prompt plus everything generated
    return tokenizer.decode(input_tensor[0].tolist())

# Sample a short continuation for a fixed prompt and show it next to the prompt
prompt = "Machine learning"
generated_text = generate_text(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_length=30,
)
print(f"Prompt: {prompt}", f"Generated: {generated_text}", sep="\n")