# Custom Language Model Exploration

This notebook demonstrates how to build and train a custom language model from scratch.

In [None]:
import sys
import os
import torch
import numpy as np
import matplotlib.pyplot as plt

# Make the project root (the parent of the notebook's working directory)
# importable so the `src.*` imports in later cells resolve. The membership
# check keeps this cell idempotent: re-running it does not pile up duplicate
# entries on sys.path.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Seed NumPy and PyTorch once, up front, so that steps drawing on their
# global random number generators (batch shuffling, weight initialisation,
# sampling during generation) can be repeated after Restart & Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

## 1. Load and Explore the Data

First, let's define a small sample text to train our model on. (In a real project, you would load a much larger corpus from files.)

In [None]:
# Small inline corpus used as the only training text in this notebook.
# In a real project, you would load data from files.
# Note: the literal starts and ends with a newline and some lines carry a
# trailing space; all of these characters are part of the text that gets
# tokenized, so they are left exactly as written.
sample_text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence 
concerned with the interactions between computers and human language, in particular how to program computers 
to process and analyze large amounts of natural language data. The goal is a computer capable of understanding 
the contents of documents, including the contextual nuances of the language within them.

The technology can then accurately extract information and insights contained in the documents as well as 
categorize and organize the documents themselves.
"""

# Report the corpus size in characters (whitespace and newlines included).
print(f"Sample text length: {len(sample_text)} characters")

## 2. Tokenization

Let's create a simple character-level tokenizer for our model.

In [None]:
from src.data.tokenizer import CharacterTokenizer

# Fit the tokenizer on the sample text
tokenizer = CharacterTokenizer()
tokenizer.train([sample_text])

# Round-trip the text: text -> token ids -> text
encoded = tokenizer.encode(sample_text)
decoded = tokenizer.decode(encoded)

# Vocabulary size is the number of entries in the token -> id mapping
vocab_size = len(tokenizer.token_to_id)

# Report the results (same output, in the same order, as before)
print(f"Encoded length: {len(encoded)}")
print(f"First 20 tokens: {encoded[:20]}")
print(f"\nDecoded sample: {decoded[:50]}...")
print(f"\nVocabulary size: {vocab_size}")

## 3. Create a Dataset

In [None]:
from src.data.dataset import TextDataset
from torch.utils.data import DataLoader

# Data configuration
block_size = 64  # Context length
batch_size = 4

# Wrap the encoded text in a dataset and serve it in shuffled mini-batches
dataset = TextDataset([encoded], block_size=block_size)
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

# Pull a single batch and report the tensor shapes
batch = next(iter(dataloader))
print(f"Input shape: {batch['input_ids'].shape}")
print(f"Labels shape: {batch['labels'].shape}")

# Decode one example of the batch (inputs and labels) back into text
sample_idx = 0
input_text, target_text = (
    tokenizer.decode(batch[key][sample_idx].tolist())
    for key in ("input_ids", "labels")
)

print(f"\nInput: {input_text[:30]}...")
print(f"Target: {target_text[:30]}...")

## 4. Build the Model

Now let's create our transformer-based language model.

In [None]:
from src.model.transformer import CustomLanguageModel

# Model hyperparameters
d_model = 128       # Embedding dimension
num_heads = 4       # Number of attention heads
num_layers = 3      # Number of transformer layers
d_ff = 512          # Feed-forward dimension
dropout = 0.1       # Dropout rate

# Gather every constructor argument in one mapping, then build the model.
# The maximum sequence length is tied to the dataset's context length.
model_kwargs = dict(
    vocab_size=vocab_size,
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    max_seq_length=block_size,
    dropout=dropout,
)
model = CustomLanguageModel(**model_kwargs)

# Model summary: total number of parameter elements
print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")

# Smoke-test the forward pass on the batch inspected earlier
input_ids = batch['input_ids']
outputs = model(input_ids)
print(f"Output shape: {outputs.shape}")

## 5. Training Setup

In [None]:
import torch.optim as optim
from src.training.trainer import Trainer

# Optimizer over all model parameters
optimizer = optim.AdamW(model.parameters(), lr=5e-4)

# Cosine-annealing learning rate schedule attached to that optimizer.
# NOTE(review): T_max=10 while training below runs for 5 epochs; whether that
# is intended depends on how often Trainer steps the scheduler. Confirm.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Everything the trainer needs, collected in one place
trainer_kwargs = dict(
    model=model,
    train_dataloader=dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    checkpoint_dir="../checkpoints",
    use_wandb=False,  # Set to True if you want to use Weights & Biases
)
trainer = Trainer(**trainer_kwargs)

## 6. Train the Model

Let's train our model for a few epochs.

In [None]:
# Number of epochs handed to the trainer; a handful is enough for this demo
# on the short sample text.
num_epochs = 5
# Run training with the Trainer configured in the training-setup cell.
trainer.train(num_epochs=num_epochs)

## 7. Generate Text

Now let's use our trained model to generate some text.

In [None]:
# Set model to evaluation mode (PyTorch disables dropout layers in this mode)
model.eval()

# Build the prompt on whichever device the model's weights currently live on,
# so generation also works if training moved the model off the CPU.
model_device = next(model.parameters()).device
prompt = "Natural language"
prompt_ids = torch.tensor(
    [tokenizer.encode(prompt)], dtype=torch.long, device=model_device
)

# Generation settings
# NOTE(review): max_length (100) is larger than the max_seq_length the model
# was built with (block_size = 64). This presumably relies on generate()
# cropping the context window internally; confirm.
max_length = 100
temperature = 0.8

# Sampling is inference only: disable autograd so no computation graph is
# recorded while tokens are generated.
with torch.no_grad():
    generated_ids = model.generate(
        prompt_ids, max_length=max_length, temperature=temperature
    )

# Decode the generated text
generated_text = tokenizer.decode(generated_ids[0].tolist())
print(f"Generated text:\n{generated_text}")

## 8. Evaluate the Model

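Let's calculate perplexity on our sample text. Note that this is the same text the model was trained on, so the result measures how well the model fits its training data, not how well it generalizes; use a held-out dataset to estimate the latter.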

In [None]:
import torch.nn.functional as F

def calculate_perplexity(model, dataloader, device="cpu", pad_token_id=0):
    """Compute the token-level perplexity of ``model`` over ``dataloader``.

    Perplexity is ``exp(total cross-entropy / number of counted tokens)``,
    accumulated over every batch.

    Args:
        model: Callable module mapping ``input_ids`` to logits whose last
            dimension is the vocabulary. It is switched to eval mode and left
            in that mode. It is NOT moved to ``device``; the caller must make
            sure it already lives there.
        dataloader: Iterable of dicts with ``"input_ids"`` and ``"labels"``
            tensors.
        device: Device the batch tensors are moved to before the forward pass.
        pad_token_id: Label id excluded from both the loss and the token
            count. Defaults to 0, the previously hard-coded value. Pass
            ``None`` to count every token.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If no tokens were counted (empty dataloader, or every
            label equals ``pad_token_id``), since perplexity is undefined.
    """
    model.eval()
    # -100 is F.cross_entropy's default ignore_index; it never matches a real
    # (non-negative) token id, so it effectively disables the masking.
    ignore_index = -100 if pad_token_id is None else pad_token_id
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            # reshape (rather than view) also accepts non-contiguous tensors
            labels = batch["labels"].to(device).reshape(-1)

            logits = model(input_ids)
            logits = logits.reshape(-1, logits.size(-1))

            # Sum (not mean) so batches of different sizes are weighted by
            # their number of counted tokens.
            loss = F.cross_entropy(
                logits, labels, ignore_index=ignore_index, reduction="sum"
            )

            total_loss += loss.item()
            total_tokens += (labels != ignore_index).sum().item()

    if total_tokens == 0:
        raise ValueError(
            "Cannot compute perplexity: no tokens were counted "
            "(empty dataloader or every label equals pad_token_id)."
        )

    # Calculate perplexity
    avg_loss = total_loss / total_tokens
    return np.exp(avg_loss)

# Calculate perplexity on the device the model's weights currently live on.
# calculate_perplexity only moves the batch tensors, so passing the model's
# own device avoids a device mismatch if training moved the model off the CPU.
eval_device = next(model.parameters()).device
perplexity = calculate_perplexity(model, dataloader, device=eval_device)
print(f"Perplexity: {perplexity:.2f}")

## 9. Save the Model and Tokenizer

In [None]:
# torch.save does not create missing parent directories, so make sure the
# checkpoint directory exists before anything is written into it.
os.makedirs("../checkpoints", exist_ok=True)

# Save the model (weights only, via its state_dict)
torch.save(model.state_dict(), "../checkpoints/character_model.pt")

# Save the tokenizer
tokenizer.save("../checkpoints/character_tokenizer.json")

print("Model and tokenizer saved successfully!")

## 10. Next Steps

Here are some ways to improve the model:

1. Train on a larger dataset
2. Use a more sophisticated tokenizer (BPE, WordPiece)
3. Increase model size (more layers, larger embedding dimension)
4. Implement techniques like flash attention for efficiency
5. Add regularization techniques
6. Experiment with different architectures (GPT, BERT, etc.)