# Msingi1 Training Notebook

This notebook trains the Msingi1 Swahili language model, which uses a Mixture of Experts (MoE) architecture.

## Setup Steps
1. Mount Google Drive
2. Clone repository and install dependencies
3. Load dataset and tokenizer
4. Initialize model
5. Train model with checkpointing
6. Generate text with the trained model

## 1. Mount Google Drive
First, mount Google Drive to save checkpoints and model artifacts:

In [None]:
from google.colab import drive

# Mount point under which Google Drive becomes visible inside the Colab VM;
# the checkpoint directory used later in the notebook lives below this path.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

## 2. Clone Repository and Setup Environment

In [None]:
# Clone the project repository into the current Colab working directory.
# NOTE(review): "your-username" looks like a placeholder -- replace it with the
# account that actually hosts the msingi1 repository before running.
!git clone https://github.com/your-username/msingi1.git
# Switch into the checkout so the relative paths used by later cells
# (src/, tokenizer/, data/) resolve.
%cd msingi1

# Run the repository's Colab setup script (per the original note, it installs
# dependencies and prepares the environment).
!python setup_colab.py

## 3. Load Dataset and Tokenizer

In [None]:
import torch
from torch.utils.data import DataLoader
from tokenizers import ByteLevelBPETokenizer
from src.data_processor import SwahiliDataset, extract_dataset

# Byte-level BPE tokenizer restored from its vocabulary and merges files
# (paths are relative to the repository root).
tokenizer = ByteLevelBPETokenizer("tokenizer/vocab.json", "tokenizer/merges.txt")

# Raw texts pulled out of the zipped corpus.
texts = extract_dataset("data/swahili_dataset.zip")

# Wrap the texts in the project's dataset class.
# NOTE(review): max_length/stride presumably control the token window size and
# the step between consecutive windows -- confirm in src/data_processor.
dataset = SwahiliDataset(texts=texts, tokenizer=tokenizer, max_length=1024, stride=512)

# Batching options for training; lower batch_size if the GPU runs out of memory.
loader_options = dict(batch_size=4, shuffle=True, num_workers=2)
train_loader = DataLoader(dataset, **loader_options)

## 4. Initialize Model

In [None]:
from src.model import Msingi1, MsingiConfig

# Initialize model configuration
# NOTE(review): vocab_size is hard-coded; it presumably has to match the
# vocabulary size of the tokenizer loaded earlier -- confirm.
config = MsingiConfig(
    vocab_size=32000,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    num_experts=8,
    expert_capacity=32
)

# Initialize model
model = Msingi1(config)
model = model.cuda()  # Move to GPU

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Learning rate scheduler with warmup
from transformers import get_linear_schedule_with_warmup

# Schedule length inputs. These must stay in sync with the values the
# training loop uses for its epoch count and gradient accumulation.
num_epochs = 50
gradient_accumulation_steps = 4

# The training loop calls scheduler.step() once per optimizer update, i.e. once
# per `gradient_accumulation_steps` batches -- not once per batch. Counting
# batches here would make the schedule ~4x too long: warmup would cover ~40% of
# the real run and the learning rate would never decay to its end value.
# Ceil division so a trailing partial accumulation window is counted as well.
updates_per_epoch = -(-len(train_loader) // gradient_accumulation_steps)
num_training_steps = updates_per_epoch * num_epochs
num_warmup_steps = num_training_steps // 10  # 10% warmup

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

## 5. Training Loop with Checkpointing

In [None]:
import os
from tqdm.notebook import tqdm
import wandb

# Initialize wandb
wandb.init(project="msingi1", name="training_run_1")

# Training parameters
num_epochs = 50
gradient_accumulation_steps = 4
checkpoint_dir = "/content/drive/MyDrive/msingi1/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Start from clean gradients so re-running this cell after an interrupted run
# does not fold stale gradients into the first update.
optimizer.zero_grad()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    num_batches = len(train_loader)
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for step, batch in enumerate(progress_bar):
        # Move batch to GPU
        input_ids = batch["input_ids"].cuda()
        labels = batch["labels"].cuda()

        # Forward pass; the loss is scaled down so that gradients summed over
        # one accumulation window average the micro-batch losses.
        outputs = model(input_ids=input_ids, labels=labels)
        batch_loss = outputs.loss
        (batch_loss / gradient_accumulation_steps).backward()

        # Update weights once per accumulation window, and also on the final
        # batch of the epoch. Without the second condition, when the number of
        # batches is not a multiple of gradient_accumulation_steps the trailing
        # gradients would leak into the next epoch's first update and be
        # dropped entirely after the last epoch. (That trailing update is
        # proportionally smaller because each loss is still divided by
        # gradient_accumulation_steps.)
        window_full = (step + 1) % gradient_accumulation_steps == 0
        last_batch = (step + 1) == num_batches
        if window_full or last_batch:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Update progress with the running mean of the unscaled batch loss
        loss_value = batch_loss.item()
        total_loss += loss_value
        progress_bar.set_postfix({"loss": total_loss / (step + 1)})

        # Log to wandb
        wandb.log({
            "loss": loss_value,
            "learning_rate": scheduler.get_last_lr()[0]
        })

    # Save checkpoint every 5 epochs
    if (epoch + 1) % 5 == 0:
        checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch+1}.pt")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            # Mean per-batch loss over the epoch that just finished
            'loss': total_loss / len(train_loader),
        }, checkpoint_path)
        print(f"\nCheckpoint saved: {checkpoint_path}")

print("Training completed!")
wandb.finish()

## 6. Generate Text with Trained Model

In [None]:
def generate_text(prompt: str, max_length: int = 100):
    """Generate text from the notebook's global model for a prompt.

    Puts the global ``model`` into eval mode, encodes ``prompt`` with the
    global ``tokenizer``, moves the ids to the GPU as a batch of one, and
    calls ``model.generate`` with gradient tracking disabled.

    Args:
        prompt: Text to encode and feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The tokenizer-decoded first sequence returned by ``model.generate``.
        NOTE(review): whether that sequence includes the prompt tokens depends
        on the project's ``generate`` implementation -- confirm.
    """
    model.eval()

    # Fixed sampling settings; only max_length is caller-controlled.
    generation_options = {
        "max_length": max_length,
        "num_return_sequences": 1,
        "no_repeat_ngram_size": 2,
        "temperature": 0.7,
    }

    with torch.no_grad():
        prompt_ids = tokenizer.encode(prompt).ids
        # Extra list level adds the batch dimension (one prompt per call).
        prompt_tensor = torch.tensor([prompt_ids]).cuda()

        sequences = model.generate(input_ids=prompt_tensor, **generation_options)

        first_sequence = sequences[0]
        return tokenizer.decode(first_sequence.tolist())

# Smoke-test generation with a short Swahili prompt and show both the prompt
# and the model's output.
prompt = "Habari ya leo?"
generated_text = generate_text(prompt)
print(f"Prompt: {prompt}", f"Generated: {generated_text}", sep="\n")