# Msingi1 MoE Training on Google Colab

This notebook:
1. Sets up GPU and dependencies
2. Loads the pre-trained tokenizer
3. Trains the Msingi1 Swahili language model with Mixture of Experts
4. Saves checkpoints and logs metrics with Weights & Biases

In [None]:
# Print NVIDIA GPU status to confirm that a GPU runtime is attached.
!nvidia-smi

## Setup

In [None]:
!pip install -q torch tokenizers wandb fastmoe
!git clone https://github.com/Msingi-AI/msingi1.git
%cd msingi1
!pip install -r requirements.txt

## Imports

In [None]:
import os

import tokenizers
import torch
import wandb
from google.colab import drive

from src.model import Msingi1Model
from src.train import train_model
from src.data_processor import SwahiliDataset

## Google Drive, Weights & Biases, and Model Configuration

In [None]:
# Hyperparameters and paths for this run. The same dict is later passed to
# Msingi1Model and train_model, which define how each key is interpreted.
config = {
    'vocab_size': 32000,
    'hidden_size': 768,
    'num_experts': 8,
    'expert_capacity': 32,
    'moe_layers': [2, 4],  # presumably the layer indices that use MoE; confirm in src.model
    'intermediate_size': 3072,
    'num_attention_heads': 12,
    'batch_size': 32,
    'num_epochs': 100,
    'learning_rate': 3e-4,
    'save_every': 5,  # presumably a checkpoint interval in epochs; confirm in src.train
    'checkpoint_dir': '/content/drive/MyDrive/msingi1_checkpoints'
}

# Mount Google Drive; the checkpoint directory and the tokenizer file live there.
drive.mount('/content/drive')

# Start the Weights & Biases run. Passing the config at init time records the
# hyperparameters on the run from the start.
wandb.init(project="msingi1-moe", name="moe_training_run", config=config)

## Initialize Model

In [None]:
# Create the checkpoint directory if it does not exist yet.
os.makedirs(config['checkpoint_dir'], exist_ok=True)

# The model is moved to the GPU below, so fail early with an actionable
# message instead of an opaque CUDA error after the model has been built.
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA device is available. In Colab, select "
        "Runtime > Change runtime type > GPU and re-run the notebook."
    )

# Build the model from the config dict and move it to the GPU.
model = Msingi1Model(config)
model = model.cuda()

## Training

In [None]:
# Load the pre-trained tokenizer from Google Drive. Check for the file first
# so that a missing mount or a wrong path produces a clear error.
tokenizer_path = '/content/drive/MyDrive/msingi1_tokenizer/tokenizer.json'
if not os.path.isfile(tokenizer_path):
    raise FileNotFoundError(
        f"Tokenizer file not found at {tokenizer_path!r}; "
        "mount Google Drive and check the path before training."
    )
tokenizer = tokenizers.Tokenizer.from_file(tokenizer_path)

# Build the dataset from the training text and wrap it in a shuffled loader.
dataset = SwahiliDataset('data/Swahili data/train.txt', tokenizer)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=config['batch_size'],
    shuffle=True,
)

# Run training. The wandb module itself is handed to train_model.
train_model(
    model=model,
    dataloader=dataloader,
    config=config,
    wandb=wandb,
)
