# Training a Small Language Model

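This notebook demonstrates how to train a small language model from scratch: it trains a BPE tokenizer on WikiText, builds a 4-layer Transformer language model, trains it for one epoch with TensorBoard logging, and saves the result to `./trained_model`.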

In [None]:
import torch
from llm_trainer.models import TransformerLM
from llm_trainer.config import ModelConfig, TrainingConfig, DataConfig
from llm_trainer.tokenizer import create_tokenizer
from llm_trainer.training import Trainer

# Seed PyTorch's global random number generator so that torch-based
# randomness in the cells below starts from the same state on every
# "Restart Kernel -> Run All".
# NOTE(review): this does not cover randomness drawn from other libraries
# (e.g. `random`, numpy) — seed those too if llm_trainer relies on them.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
from datasets import load_dataset

# Build an (untrained) BPE tokenizer; its vocabulary is learned below.
tokenizer = create_tokenizer("bpe")

# Learn the vocabulary from the same WikiText-2 configuration that the
# DataConfig later in this notebook selects for model training, so the
# tokenizer and the model see the same corpus. Only `max_samples=1000` is
# requested below, so fetching the much larger WikiText-103 archive for this
# step is unnecessary.
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
text_column = 'text'

# NOTE(review): 3200 is a very small BPE vocabulary — confirm it is intended
# for this demo and not a typo for 32000.
tokenizer.train(
    dataset=dataset,
    vocab_size=3200,
    max_samples=1000,
    text_column=text_column,
    verbose=True
)

# Report the vocabulary size the tokenizer exposes after training; the model
# configuration reads this same attribute.
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")

In [None]:
# Hyperparameters for the demo Transformer. The vocabulary size is taken from
# the trained tokenizer instead of being hard-coded.
model_config = ModelConfig(
    # Vocabulary and context window
    vocab_size=tokenizer.vocab_size,
    max_seq_len=512,
    # Transformer stack
    n_layers=4,
    n_heads=4,
    d_model=256,
    d_ff=1024,  # 4 x d_model
    # Regularisation
    dropout=0.1,
)

# Instantiate the language model from the configuration and report its size.
model = TransformerLM(model_config)
print(f"Model parameters: {model.get_num_params():,}")

In [None]:
# Optimisation, logging and checkpoint settings for the demo run.
training_config = TrainingConfig(
    # Optimisation
    num_epochs=1,  # kept small for the demo
    learning_rate=1e-4,
    batch_size=4,
    # presumably yields an effective batch of 4 x 4 = 16 — confirm against Trainer
    gradient_accumulation_steps=4,
    # Logging and checkpointing
    report_to=["tensorboard"],  # enable TensorBoard logging
    logging_steps=10,
    save_steps=100,
    checkpoint_dir="./checkpoints",
)

# Select the raw WikiText-2 configuration as the training corpus. The
# max_length of 512 is the same value passed as max_seq_len in the model
# configuration.
data_config = DataConfig(
    max_length=512,
    dataset_config="wikitext-2-raw-v1",
    dataset_name="wikitext",
)

In [None]:
# Bundle the model, the tokenizer and the training settings into a Trainer.
trainer = Trainer(
    config=training_config,
    tokenizer=tokenizer,
    model=model,
)

# Start training, handing over the model and data configurations built above.
trainer.train_from_config(model_config, data_config)

In [None]:
# Write the trained model to a local directory, then confirm on stdout.
save_dir = "./trained_model"
trainer.save_model(save_dir)
print("Model saved!")