In [None]:
# Print a fixed confirmation message when this cell is executed.
print('Setup complete.')

# Lab 04: Running, Monitoring, and Evaluating Fine-Tuning Jobs

## Learning Objectives
- Structure a fine-tuning job into a reusable pipeline
- Monitor and log training metrics like loss and accuracy
- Implement advanced evaluation metrics for text generation (e.g., BLEU score)
- Save and load model checkpoints

## Setup

In [None]:
import numpy as np
import json
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple, Any
from collections import Counter
import math
import os

## Part 1: Structuring the Training Pipeline

In [None]:
# We'll reuse and build upon the mock model from previous labs
class MockModel:
    """Minimal stand-in for a model: a single random weight matrix.

    Attributes:
        weights: Array of shape ``(dim, vocab_size)`` drawn from a standard
            normal distribution and scaled by 0.1.
    """

    def __init__(self, vocab_size=256, dim=32):
        self.weights = np.random.randn(dim, vocab_size) * 0.1

    def save_checkpoint(self, path: str):
        """Write the weight matrix to *path* in NumPy ``.npy`` format.

        ``np.save`` appends ``.npy`` to *path* when it does not already end
        with that extension, so the file on disk may be ``path + '.npy'``.
        """
        np.save(path, self.weights)

    def load_checkpoint(self, path: str):
        """Replace the weights with the array stored at *path*.

        Accepts the same *path* that was given to ``save_checkpoint``: if
        *path* has no ``.npy`` extension and no file exists under that exact
        name, the extension that ``np.save`` would have added is tried.
        """
        path = os.fspath(path)
        # Mirror np.save's naming so save('ckpt') / load('ckpt') round-trips.
        if not path.endswith('.npy') and not os.path.exists(path):
            path += '.npy'
        self.weights = np.load(path)

class TrainingPipeline:
    """Mock fine-tuning loop that logs metrics and writes checkpoints.

    Attributes:
        model: Object exposing a NumPy ``weights`` array and a
            ``save_checkpoint(path)`` method.
        dataset: Sized iterable of training examples; only its length and
            iteration count are used by the mock training step.
        lr: Step size applied to the mock gradient.
        history: Per-epoch metrics, ``{'loss': [...], 'accuracy': [...]}``.
    """

    def __init__(self, model, dataset, lr=0.01):
        self.model = model
        self.dataset = dataset
        self.lr = lr
        self.history = {'loss': [], 'accuracy': []}  # For monitoring

    def run(self, epochs: int, checkpoint_every: int = 5):
        """Train for *epochs* epochs, recording loss and accuracy each epoch.

        Args:
            epochs: Number of passes over the dataset.
            checkpoint_every: Save ``model_epoch_<k>.npy`` every this many
                epochs. A value of 0 or less disables checkpointing.

        Raises:
            ValueError: If the dataset is empty, since the average loss
                would otherwise divide by zero.
        """
        num_examples = len(self.dataset)
        if num_examples == 0:
            raise ValueError('Cannot train on an empty dataset.')

        print(f'Starting training for {epochs} epochs...')
        for epoch in range(epochs):
            total_loss = self._train_one_epoch()
            avg_loss = total_loss / num_examples

            # Mock evaluation for accuracy
            accuracy = self._evaluate()

            # Log metrics
            self.history['loss'].append(avg_loss)
            self.history['accuracy'].append(accuracy)

            print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2%}')

            # Save a checkpoint periodically
            if checkpoint_every > 0 and (epoch + 1) % checkpoint_every == 0:
                self.model.save_checkpoint(f'model_epoch_{epoch+1}.npy')
                print(f'Saved checkpoint at epoch {epoch+1}')

    def _train_one_epoch(self) -> float:
        """Apply one mock update per dataset item and return the summed loss."""
        total_loss = 0.0
        for _ in self.dataset:
            # Mock forward/backward pass: the "gradient" is random noise.
            loss_grad = np.random.randn(*self.model.weights.shape) * 0.01
            self.model.weights -= self.lr * loss_grad
            # Mock loss; converted so history holds plain Python floats.
            total_loss += float(np.mean(loss_grad**2))
        return total_loss

    def _evaluate(self) -> float:
        """Return a simulated accuracy that rises with completed epochs.

        The value starts at 0.1, grows by 0.01 per epoch already recorded in
        ``history['loss']``, is capped at 0.95, and then has uniform noise
        in [-0.05, 0.05) added.
        """
        base_accuracy = 0.1
        improvement = len(self.history['loss']) / 100  # Simple linear improvement
        # NumPy's generator is used because the rest of the pipeline already
        # draws its randomness from np.random.
        noise = np.random.uniform(-0.05, 0.05)
        return float(min(base_accuracy + improvement, 0.95) + noise)

## Part 2: Monitoring and Logging

In [None]:
def plot_history(history: Dict[str, List[float]]):
    """Show the recorded loss and accuracy curves in two side-by-side panels.

    Args:
        history: Mapping with ``'loss'`` and ``'accuracy'`` keys, each holding
            one value per epoch.
    """
    _, axes = plt.subplots(1, 2, figsize=(12, 5))

    # One entry per panel: history key, title, legend/y-axis label, line styling.
    panels = (
        ('loss', 'Training Loss', 'Loss', {}),
        ('accuracy', 'Training Accuracy', 'Accuracy', {'color': 'orange'}),
    )
    for axis, (key, title, label, line_style) in zip(axes, panels):
        axis.plot(history[key], label=label, **line_style)
        axis.set_title(title)
        axis.set_xlabel('Epoch')
        axis.set_ylabel(label)
        axis.legend()

    plt.tight_layout()
    plt.show()

## Part 3: Advanced Evaluation Metrics (BLEU Score)

In [None]:
def simple_bleu(reference: str, candidate: str, n: int = 4) -> float:
    """Score *candidate* against *reference* with a simplified BLEU.

    Both strings are lower-cased and split on whitespace. Clipped n-gram
    matches for every order from 1 to *n* are pooled into a single precision
    (rather than combined per order), which is then multiplied by the
    brevity penalty.

    Args:
        reference: The ground-truth text.
        candidate: The generated text to score.
        n: Highest n-gram order to include.

    Returns:
        The score, or 0.0 when the candidate yields no n-grams at all.
    """
    def ngram_counts(tokens, size):
        # Count every contiguous run of `size` tokens, joined by spaces.
        return Counter(
            ' '.join(tokens[start:start + size])
            for start in range(len(tokens) - size + 1)
        )

    ref_words = reference.lower().split()
    hyp_words = candidate.lower().split()

    matched = 0
    proposed = 0
    for order in range(1, n + 1):
        hyp_counts = ngram_counts(hyp_words, order)
        if not hyp_counts:
            continue
        # Counter intersection keeps min(candidate count, reference count).
        overlap = hyp_counts & ngram_counts(ref_words, order)
        matched += sum(overlap.values())
        proposed += sum(hyp_counts.values())

    if proposed == 0:
        return 0.0

    # Brevity penalty: only candidates no longer than the reference are scaled.
    hyp_len = len(hyp_words)
    ref_len = len(ref_words)
    if hyp_len > ref_len:
        penalty = 1.0
    elif hyp_len > 0:
        penalty = math.exp(1 - ref_len / hyp_len)
    else:
        penalty = 0.0

    return penalty * (matched / proposed)

# Example BLEU score calculation: score two candidates against one reference.
reference_text = "The quick brown fox jumps over the lazy dog"
candidate_1 = "The fast brown fox jumps over the lazy dog"  # High similarity
candidate_2 = "A cat sits on the mat"  # Low similarity

bleu_1 = simple_bleu(reference_text, candidate_1)
bleu_2 = simple_bleu(reference_text, candidate_2)

# The header string previously opened with ' and closed with ", which is a
# SyntaxError; it also has no placeholders, so no f-prefix is needed.
print('--- BLEU Score Example ---')
print(f'BLEU for candidate 1: {bleu_1:.4f}')
print(f'BLEU for candidate 2: {bleu_2:.4f}')

## Part 4: Running the Pipeline

In [None]:
# Initialize model and pipeline
model = MockModel()
mock_dataset = [1] * 20  # 20 dummy examples
pipeline = TrainingPipeline(model, mock_dataset, lr=0.01)

# Run training
pipeline.run(epochs=20)

# Plot the results
plot_history(pipeline.history)

# Now, let's load a checkpoint and inspect it
checkpoint_path = 'model_epoch_20.npy'
if os.path.exists(checkpoint_path):
    new_model = MockModel()
    new_model.load_checkpoint(checkpoint_path)
    # The leading newline must be the escape sequence: a literal line break
    # inside a single-quoted string is a SyntaxError.
    print(f'\nSuccessfully loaded model from {checkpoint_path}')
    print(f'Weights are the same: {np.allclose(model.weights, new_model.weights)}')

    # Clean up only the checkpoints this pipeline writes (model_epoch_<k>.npy),
    # so unrelated .npy files in the working directory are left untouched.
    for filename in os.listdir():
        if filename.startswith('model_epoch_') and filename.endswith('.npy'):
            os.remove(filename)

## Exercises

1. **Implement ROUGE Score**: ROUGE is another popular metric that focuses on recall (while BLEU focuses on precision). Implement a simple version of ROUGE-N: count the n-grams shared by the reference and the candidate (clipping each n-gram's count to the smaller of the two), then divide by the total number of n-grams in the reference.
2. **Add Learning Rate Scheduling**: Modify the `TrainingPipeline` to include a simple learning rate scheduler, such as one that decreases the learning rate by a factor of 10 every 5 epochs. How does this affect the training curve?
3. **Save Best Checkpoint Only**: Modify the checkpointing logic to only save the model if its evaluation accuracy has improved since the last save. This is a common strategy to avoid filling up disk space.

## Summary

You learned:
- How to create a structured pipeline for running fine-tuning jobs.
- The importance of monitoring metrics and how to visualize them.
- How to implement a text generation evaluation metric like BLEU from scratch.
- A practical approach to saving and loading model checkpoints during training.