# TODO

---

## Prerequisites


In [None]:
%%capture
# Bootstrap the packaging tools for the interpreter that backs this kernel.
# The %%capture magic on the first line keeps the installer output out of the notebook.
import sys
python = sys.executable  # path of the running interpreter, interpolated as {python} below

# Each `!{python} -m ...` command runs against the kernel's own interpreter.
!{python} -m ensurepip --upgrade
!{python} -m pip install --upgrade pip setuptools wheel
!{python} -m pip install --upgrade --force-reinstall python-dotenv
# %pip installs into the kernel's environment as well.
%pip install aiohttp

Restart the Kernel

In [None]:
from IPython.display import clear_output
clear_output(wait=True)

import IPython
# Ask the running kernel to shut down; the True argument requests a restart.
# NOTE(review): every variable defined so far (including `python` from the
# install cell) is lost after this call — confirm later cells do not rely on them.
IPython.Application.instance().kernel.do_shutdown(True)

Clone the repository

In [None]:
# Clone the blueprint repository into ./llm-router, relative to the current directory.
!git clone https://github.com/NVIDIA-AI-Blueprints/llm-router.git
# Each `!` line runs in its own subshell, so this `cd` only affects the checkout command.
!cd llm-router && git checkout ces-dev # TODO: Remove once ces-dev is merged to main

In [None]:
# 1. Setup the repository script
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash

# 2. Install Git LFS
!sudo apt-get install git-lfs

# 3. Initialize Git LFS
!git lfs install

!git lfs fetch
!git lfs checkout

In [None]:
import os

# Enter the cloned repository so the relative paths used below
# (e.g. "multimodal_router/...") resolve. The guard makes the cell safe to
# re-run: a second os.chdir('llm-router') from inside the clone would raise
# FileNotFoundError.
if os.path.basename(os.getcwd()) != 'llm-router':
    os.chdir('llm-router')

---

## 1. Setup and Imports

First, we install the project's dependencies and import the required libraries.

In [None]:
# Install the `uv` package manager into the kernel's environment; the next cell invokes it.
%pip install uv

In [None]:
# %%capture
# %pip install -r requirements.txt -r multimodal_router/src/nat_sfc_router/training/requirements.txt
# Install the multimodal_router package from the repository checkout using uv.
# NOTE(review): `uv pip install` normally targets an active virtual environment;
# if the kernel does not run inside one this may need `--system` or
# `--python <kernel interpreter>` — confirm on the target platform.
!cd multimodal_router && uv pip install .
# Client library used in Section 3 to talk to the CLIP embedding server.
%pip install clip-client

In [None]:
import json
import random
import os
from datasets import load_from_disk
from tqdm import tqdm
import numpy as np
from PIL import Image
import base64
import io
from openai import AzureOpenAI, OpenAI
import time
from functools import wraps


---

## 2. [OPTIONAL] Data Preparation (`prepare_hf_data.py`)

This section prepares training data for the multimodal router by:

1. **Loading a multimodal dataset** from HuggingFace (images + text)
2. **Generating responses** from multiple models (GPT-5, Qwen-VL, Nemotron)
3. **Judging responses** using LLM-as-judge to determine correctness
4. **Creating evaluation data** with model success/failure labels for training

The output is JSON files containing prompts, images (base64), and per-model success indicators.

**This section is OPTIONAL because the train and test datasets have already been pre-generated for your convenience. You can verify this with the cell below.**

### 2.1 Load Dataset

Load the multimodal dataset containing images and text prompts with ground truth answers.

> **⚠️ Important:** This section requires the `finevision_combined_images_text` dataset to be available locally. 
> 
> **If you don't have this dataset**, you can **skip Section 2 entirely** and proceed to Section 3 (Embedding Generation), provided the pre-computed evaluation files (`hf_evaluations_train.json` and `hf_evaluations_test.json`) already exist in `multimodal_router/src/nat_sfc_router/training/`.
>
> To check if you can skip this section, run:
> ```python
> import os
> print("Can skip Section 2:", os.path.exists("multimodal_router/src/nat_sfc_router/training/hf_evaluations_train.json"))
> ```


In [None]:
# Decide whether Section 2 can be skipped: it can when both pre-computed
# evaluation files are already present in the repository checkout.
EVAL_TRAIN_PATH = "multimodal_router/src/nat_sfc_router/training/hf_evaluations_train.json"
EVAL_TEST_PATH = "multimodal_router/src/nat_sfc_router/training/hf_evaluations_test.json"

eval_files_exist = os.path.exists(EVAL_TRAIN_PATH) and os.path.exists(EVAL_TEST_PATH)

if eval_files_exist:
    print("✓ Pre-computed evaluation files found!")
    # Print the same paths that were checked so the message cannot drift from the test.
    print(f"  - {EVAL_TRAIN_PATH}")
    print(f"  - {EVAL_TEST_PATH}")
    print("\n⚠️ You can SKIP the rest of Section 2 and proceed directly to Section 3 (Embedding Generation).")
    print("   To regenerate the evaluations instead, set SKIP_DATA_PREP = False in this cell and re-run it.")

    SKIP_DATA_PREP = True  # Set to False if you want to regenerate evaluations
else:
    SKIP_DATA_PREP = False

if not SKIP_DATA_PREP:
    # Dataset path - update this to point to your dataset location
    DATASET_PATH = "finevision_combined_images_text"  # or full path like "/path/to/dataset"

    print("Loading dataset...")
    dataset = load_from_disk(DATASET_PATH)

    print(f"Dataset loaded with {len(dataset)} samples")
    print(f"Sample fields: {dataset[0].keys() if len(dataset) > 0 else 'N/A'}")

### 2.2 Configuration

Define the models to route between and a utility decorator for handling API retries.


In [None]:
# Model IDs mapping for the router
MODEL_IDS = {
    "gpt-5-chat": 0,
    "Qwen/Qwen3-VL-8B-Instruct": 1,
    "nvidia/nvidia-nemotron-nano-9b-v2": 2
}

def retry_with_exponential_backoff(
    max_retries=50,
    initial_delay=1,
    exponential_base=2,
    max_delay=3000,
    jitter=True
):
    """
    Decorator to retry a function with exponential backoff.

    The wrapped function is called up to ``max_retries + 1`` times. Any
    ``Exception`` triggers a retry; the exception of the final attempt is
    re-raised unchanged.

    Args:
        max_retries: Maximum number of retry attempts (must be >= 0)
        initial_delay: Initial delay in seconds
        exponential_base: Base for exponential backoff
        max_delay: Maximum delay between retries (upper bound, jitter included)
        jitter: Add random jitter to prevent thundering herd

    Raises:
        ValueError: If ``max_retries`` is negative.
    """
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt == max_retries:
                        print(f"Max retries ({max_retries}) reached for {func.__name__}. Last error: {e}")
                        raise

                    # Exponential backoff: initial_delay * base^attempt
                    current_delay = initial_delay * (exponential_base ** attempt)

                    # Jitter scales the delay by a factor in [0.5, 1.5)
                    if jitter:
                        current_delay = current_delay * (0.5 + random.random())

                    # Cap AFTER jitter so max_delay is a real upper bound
                    current_delay = min(current_delay, max_delay)

                    print(f"Error in {func.__name__} (attempt {attempt + 1}/{max_retries + 1}): {e}")
                    print(f"Retrying in {current_delay:.2f} seconds...")
                    time.sleep(current_delay)

        return wrapper
    return decorator

print(f"Configured {len(MODEL_IDS)} models for routing: {list(MODEL_IDS.keys())}")


### 2.3 API Configuration

Configure API endpoints for the different models. Set these environment variables:
- `AZURE_OPENAI_ENDPOINT`: Azure OpenAI endpoint URL
- `OPENAI_API_KEY`: Azure OpenAI API key
- `VLM_MODEL_BASE_URL`: Base URL for the vision-language model (Qwen)
- `NANO_TEXT_MODEL_BASE_URL`: Base URL for the text-only model (Nemotron Nano)


In [None]:
# Read the Azure OpenAI settings from the environment.
# Note that the Azure key is taken from the OPENAI_API_KEY variable.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Report which settings are present; the values themselves are never printed.
print("API Configuration Status:")
print("\n".join(
    f"  {label}: {'✓ Set' if value else '✗ Not set'}"
    for label, value in (
        ("AZURE_OPENAI_ENDPOINT", AZURE_OPENAI_ENDPOINT),
        ("OPENAI_API_KEY", AZURE_OPENAI_API_KEY),
        ("VLM_MODEL_BASE_URL", os.getenv("VLM_MODEL_BASE_URL")),
        ("NANO_TEXT_MODEL_BASE_URL", os.getenv("NANO_TEXT_MODEL_BASE_URL")),
    )
))


### 2.4 Helper Functions

Utility function to extract prompts and images from dataset samples.


In [None]:
def prepare_prompt_with_images(sample):
    """
    Prepare the prompt by extracting text content and collecting images from the sample.

    The prompt is the 'content' of the first entry of sample['text']. A sample
    whose 'text' is empty yields an empty prompt (instead of raising
    IndexError) so that callers can skip it with their empty-prompt check.
    A sample whose 'images' is None yields an empty image list so callers can
    iterate it unconditionally.

    Args:
        sample: Dataset sample with 'text', 'images', and 'answer' fields

    Returns:
        Dict with 'prompt', 'images', and 'answer' keys
    """
    turns = sample["text"]
    prompt = turns[0]["content"] if turns else ""

    images = sample["images"]
    if images is None:
        images = []

    return {
        "prompt": prompt,
        "images": images,
        "answer": sample["answer"]
    }


### 2.5 Model Response Generators

Functions to generate responses from each model. These handle the different API formats and image encoding requirements for each model type.


In [None]:
@retry_with_exponential_backoff(max_retries=5, initial_delay=1)
def generate_response_with_openai(prompt_data):
    """Ask the Azure OpenAI "gpt-5-chat" deployment to answer a text+image prompt.

    Args:
        prompt_data: Dict with a 'prompt' string and an optional 'images' list
            of objects exposing ``save(buffer, format=...)``; None entries
            are ignored.

    Returns:
        The message content of the first returned choice.
    """
    azure_client = AzureOpenAI(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        api_version="2025-01-01-preview"
    )

    def _png_data_url(picture):
        # Serialize the image as PNG and wrap it into a base64 data URL
        png_buffer = io.BytesIO()
        picture.save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue()).decode('utf-8')
        return f"data:image/png;base64,{encoded}"

    # The text part comes first, followed by one image part per available image
    user_parts = [{"type": "text", "text": prompt_data['prompt']}]
    user_parts.extend(
        {"type": "image_url", "image_url": {"url": _png_data_url(picture)}}
        for picture in prompt_data.get('images', [])
        if picture is not None
    )

    completion = azure_client.chat.completions.create(
        model="gpt-5-chat",
        messages=[{"role": "user", "content": user_parts}]
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
@retry_with_exponential_backoff(max_retries=5, initial_delay=1)
def generate_response_with_nemotron_vlm(prompt_data):
    """Query the Qwen vision-language model served at VLM_MODEL_BASE_URL.

    Args:
        prompt_data: Dict with a 'prompt' string and an optional 'images' list
            of objects exposing ``save(buffer, format=...)``; None entries
            are ignored.

    Returns:
        The message content of the first returned choice.
    """
    vlm_client = OpenAI(
        base_url=os.getenv("VLM_MODEL_BASE_URL"),
        api_key="not-needed"
    )

    message_parts = [{"type": "text", "text": prompt_data['prompt']}]

    for picture in prompt_data.get('images', []):
        if picture is None:
            # Missing image slots contribute nothing to the request
            continue

        # Re-encode the image as PNG and embed it as a base64 data URL
        with io.BytesIO() as png_stream:
            picture.save(png_stream, format="PNG")
            payload = base64.b64encode(png_stream.getvalue()).decode('utf-8')

        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{payload}"},
        }
        message_parts.append(image_part)

    reply = vlm_client.chat.completions.create(
        model="Qwen/Qwen3-VL-8B-Instruct",
        messages=[{"role": "user", "content": message_parts}]
    )
    return reply.choices[0].message.content


In [None]:
def generate_response_with_text(prompt_data):
    """Generate response using Nemotron Nano (text-only model).

    Args:
        prompt_data: Dict with 'prompt' and optional 'images'.
            'prompt' may be a plain string (the form produced by
            prepare_prompt_with_images) or a list of {"role", "content"}
            turns. Each image may be an object exposing
            ``save(buffer, format=...)`` or a dict carrying raw bytes under
            'bytes'; None entries and dicts without bytes are ignored.

    Returns:
        The message content of the first returned choice.
    """
    client = OpenAI(
        base_url=os.getenv("NANO_TEXT_MODEL_BASE_URL"),
        api_key="not-needed"
    )

    conversation = prompt_data['prompt']
    # A plain string prompt is treated as a single user turn; iterating the
    # string itself would yield characters and fail on turn.get().
    if isinstance(conversation, str):
        conversation = [{"role": "user", "content": conversation}]

    # Build messages from conversation
    messages = []
    first_user_turn = True

    # Process each turn in the conversation
    for turn in conversation:
        role = turn.get("role", "user")
        content_text = turn.get("content", "")

        # For the first user message, include images
        if role == "user" and first_user_turn:
            first_user_turn = False
            content = [{"type": "text", "text": content_text}]

            # Add images if available ('images' may be missing or None)
            for image in prompt_data.get('images') or []:
                if image is None:
                    continue
                if isinstance(image, dict):
                    # Raw-bytes form: decode into an image object first
                    img_bytes = image.get('bytes')
                    if img_bytes is None:
                        continue
                    image = Image.open(io.BytesIO(img_bytes))

                # Re-encode as PNG and embed as a base64 data URL
                buffered = io.BytesIO()
                image.save(buffered, format="PNG")
                img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

                content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{img_base64}"
                    }
                })
            messages.append({"role": role, "content": content})
        else:
            messages.append({"role": role, "content": content_text})

    response = client.chat.completions.create(
        model="nvidia/nvidia-nemotron-nano-9b-v2",
        messages=messages
    )
    return response.choices[0].message.content


In [None]:
def generate_model_responses(prompt_data, model_ids):
    """
    Collect one response per requested model for a single prompt.

    Models whose generator raises are reported and left out of the result;
    unrecognised model IDs are silently ignored.

    Args:
        prompt_data: Dict with 'prompt', 'images', etc.
        model_ids: List of model IDs to generate responses from

    Returns:
        Dict mapping model_id to response
    """
    # Text-only model cannot process images, so it gets a fixed response
    text_only_reply = "I cannot answer that question because I am a model that can only answer questions without images."

    # Thunks keep each generator lookup and call lazy, so both happen inside the try below
    producers = {
        "gpt-5-chat": lambda: generate_response_with_openai(prompt_data),
        "Qwen/Qwen3-VL-8B-Instruct": lambda: generate_response_with_nemotron_vlm(prompt_data),
        "nvidia/nvidia-nemotron-nano-9b-v2": lambda: text_only_reply,
    }

    collected = {}
    for model_id in model_ids:
        produce = producers.get(model_id)
        if produce is None:
            continue
        try:
            collected[model_id] = produce()
        except Exception as e:
            print(f"Failed to generate response for {model_id} after all retries. Skipping. Error: {e}")
    return collected


### 2.6 LLM-as-Judge

Use a powerful LLM (GPT-5) to evaluate whether each model's response correctly answers the question. This creates the ground truth labels for router training.


In [None]:
@retry_with_exponential_backoff(max_retries=5, initial_delay=1)
def judge_single_response(prompt_data, model_response, ground_truth_label):
    """
    Use Azure OpenAI as a judge to evaluate if a single model response correctly answers the question.

    API errors are deliberately NOT caught here: they propagate to the retry
    decorator so that transient failures are retried instead of being
    recorded as an "incorrect" verdict.

    Args:
        prompt_data: Dict with prompt and images
        model_response: The model's generated response
        ground_truth_label: The correct answer from the dataset

    Returns:
        True if the judge's reply starts with "yes" (case-insensitive),
        False otherwise (including an empty reply)

    Raises:
        Exception: Whatever the client raised on the final retry attempt.
    """
    client = AzureOpenAI(
        azure_endpoint=AZURE_OPENAI_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        api_version="2025-01-01-preview"
    )

    # Compose the system prompt for the judge
    sys_prompt = (
        "You are an expert AI tasked with evaluating whether a model's response correctly answers a question. "
        "You will be given the original prompt (with images), the ground truth answer, and the model's response. "
        "Your task is to determine whether the model's response correctly answers the question and matches "
        "or logically aligns with the ground truth answer.\n\n"
        "Reply strictly with 'yes' if the response is correct, or 'no' if it is incorrect."
    )

    content = [{"type": "text", "text": "PROMPT: " + prompt_data['prompt']}]

    # Add images if available
    for image in prompt_data.get('images', []):
        if image is not None:
            # Re-encode the image as PNG and embed it as a base64 data URL
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{img_base64}"
                }
            })

    content.append({"type": "text", "text": f"\n\nGROUND TRUTH ANSWER: {ground_truth_label}"})
    content.append({"type": "text", "text": f"\n\nMODEL RESPONSE: {model_response}"})

    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": content}
    ]

    response = client.chat.completions.create(
        model="gpt-5-chat",
        messages=messages
    )
    # A missing reply body is treated as an empty string, i.e. "not yes"
    judge_answer = (response.choices[0].message.content or "").strip().lower()
    return judge_answer.startswith("yes")


### 2.7 Evaluation Pipeline

Main function that orchestrates the entire evaluation process: generating responses from all models and judging each response.


In [None]:
def create_evaluation_data(dataset_split, num_samples=None):
    """
    Evaluate all models on the dataset and record success (1) or failure (0) for each model.

    For every processed sample the responses of all models in MODEL_IDS are
    generated and judged independently. A model with no response, or whose
    judgement raises, is scored 0. Samples with an empty prompt are skipped.

    Args:
        dataset_split: The dataset split to process
        num_samples: Number of samples to process (None = all)

    Returns:
        List of dicts with keys 'idx', 'prompt', 'answer', 'images'
        (base64-encoded PNGs) and 'model_scores' (model_id -> 0/1)
    """
    evaluation_data = []
    model_list = list(MODEL_IDS.keys())

    # Limit number of samples if specified
    num_to_process = len(dataset_split) if num_samples is None else min(num_samples, len(dataset_split))

    print(f"\nProcessing {num_to_process} samples...")

    skipped_samples = 0

    for idx in tqdm(range(num_to_process)):
        sample = dataset_split[idx]

        # Prepare prompt with images
        prompt_data = prepare_prompt_with_images(sample)

        # Skip if prompt is empty
        if not prompt_data["prompt"]:
            print(f"Skipping sample {idx}: Empty prompt")
            skipped_samples += 1
            continue

        responses = generate_model_responses(prompt_data, model_list)

        # Judge each model's response independently
        model_scores = {}

        for model_id in model_list:
            if model_id not in responses:
                print(f"Skipping model {model_id} for sample {idx}: No response generated")
                model_scores[model_id] = 0
                continue

            try:
                # Judge if the response is correct
                is_correct = judge_single_response(prompt_data, responses[model_id], prompt_data["answer"])
                model_scores[model_id] = 1 if is_correct else 0
                print(f"Sample {idx} - {model_id}: {'✓' if is_correct else '✗'}")
            except Exception as e:
                # A judge failure counts as an unsuccessful response for this model
                print(f"Failed to judge {model_id} for sample {idx}: {e}")
                model_scores[model_id] = 0

        # Convert images to base64 for serialization
        images_base64 = []
        for image in prompt_data["images"]:
            if image is not None:
                buffered = io.BytesIO()
                image.save(buffered, format="PNG")
                img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
                images_base64.append(img_base64)

        evaluation_data.append({
            "idx": idx,
            "prompt": prompt_data["prompt"],
            "answer": prompt_data["answer"],
            "images": images_base64,
            "model_scores": model_scores
        })

    # Empty prompts are the only reason a sample is skipped
    print(f"\nSkipped {skipped_samples} samples due to empty prompts")

    return evaluation_data


### 2.8 Run Evaluation

Execute the evaluation pipeline. Adjust `NUM_SAMPLES` to control how many samples to process.

> **Note:** This step can take a long time depending on the number of samples. For a quick test, use a small value like 10-50. For training, use 1000-2000+ samples.


In [None]:
# Configuration: Number of samples to evaluate
# Set to None to process all samples, or a specific number for faster testing
NUM_SAMPLES = 2000  # Adjust as needed

# When the pre-computed evaluation files are used, no dataset was loaded, so
# running the evaluation would fail with NameError on `dataset`.
if SKIP_DATA_PREP:
    print("Skipping model evaluation: SKIP_DATA_PREP is True.")
else:
    # Run the evaluation
    print("="*80)
    print("Starting model evaluation...")
    print("="*80)

    all_evaluations = create_evaluation_data(dataset, num_samples=NUM_SAMPLES)

    print(f"\n✓ Generated {len(all_evaluations)} evaluation samples")


### 2.9 Train/Test Split and Statistics

Split the evaluation data into training and testing sets (80/20 split) and display model success rates.


In [None]:
# Nothing to split when data preparation is skipped: `all_evaluations` only
# exists after the evaluation step has run.
if SKIP_DATA_PREP:
    print("Skipping train/test split: SKIP_DATA_PREP is True.")
else:
    # Split into training and testing sets (80/20 split, in dataset order)
    split_idx = int(len(all_evaluations) * 0.8)
    train_evaluations = all_evaluations[:split_idx]
    test_evaluations = all_evaluations[split_idx:]

    print(f"Train samples: {len(train_evaluations)}")
    print(f"Test samples:  {len(test_evaluations)}")

    # Calculate and print success rates for each model
    print("\n" + "="*80)
    print("Model Success Rates (Training Set):")
    print("="*80)
    for model_id in MODEL_IDS.keys():
        success_count = sum(1 for item in train_evaluations if item["model_scores"].get(model_id, 0) == 1)
        total_count = len(train_evaluations)
        # Guard against an empty training set
        rate = 100 * success_count / total_count if total_count > 0 else 0
        print(f"  {model_id:40s}: {success_count:4d}/{total_count:4d} ({rate:5.1f}%)")


### 2.10 Save Evaluation Data

Save the evaluation data to JSON files for use in the embedding generation step.


In [None]:
# Output paths for the evaluation data
# These files will be used by the embedding generation step
output_train_path = "multimodal_router/src/nat_sfc_router/training/hf_evaluations_train.json"
output_test_path = "multimodal_router/src/nat_sfc_router/training/hf_evaluations_test.json"

# When data preparation is skipped, the existing pre-computed files are kept
# as they are; there is nothing new to write.
if SKIP_DATA_PREP:
    print("Skipping save: SKIP_DATA_PREP is True, existing evaluation files are kept.")
else:
    # Ensure output directory exists
    os.makedirs(os.path.dirname(output_train_path), exist_ok=True)

    # Save training data
    with open(output_train_path, "w") as f:
        json.dump(train_evaluations, f, indent=2)
    print(f"✓ Saved training data to: {output_train_path}")

    # Save test data
    with open(output_test_path, "w") as f:
        json.dump(test_evaluations, f, indent=2)
    print(f"✓ Saved test data to: {output_test_path}")


---

## 3. Embedding Generation (`generate_embeddings.py`)

This section generates CLIP embeddings for the evaluation data:

1. **Load evaluation JSON files** from the previous step
2. **Connect to CLIP server** for embedding generation
3. **Generate multimodal embeddings** (text + image → 1024-dim vector)
4. **Save embeddings as NumPy arrays** for neural network training

The embeddings combine:
- **Text embedding** (512 dimensions from CLIP text encoder)
- **Image embedding** (512 dimensions from CLIP image encoder, or zeros if no image)

First, let's spin up an instance of the CLIP model to use for embedding generation.

In [None]:
# =====================================================
# PREREQUISITE: Deploy CLIP server with Docker (Linux)
# =====================================================
# This cell starts the server for you. Alternatively, run the same command in
# your terminal BEFORE proceeding with the notebook and skip this cell.
# Flags: -d runs the container in the background, --rm removes it when it
# stops, --gpus all exposes the host GPUs, -p publishes port 51000 on the host.

!docker run -d --rm \
    --name clip_server \
    --gpus all \
    -p 51000:51000 \
    jinaai/clip-as-service:latest

In [None]:
# Imported here because this cell runs before the configuration cell that
# imports Client; without it a fresh "Run All" fails with NameError.
from clip_client import Client

# Verify CLIP server is responding
c = Client('grpc://0.0.0.0:51000')
c.profile()

Now, let's proceed with configuring the CLIP client

In [None]:
# Imports for embedding generation
# Note: json, numpy, os, tqdm are already imported from the previous section
from clip_client import Client

# Sizes of the two CLIP embedding halves and of their concatenation
CLIP_TEXT_DIM = CLIP_IMAGE_DIM = 512
COMBINED_DIM = CLIP_TEXT_DIM + CLIP_IMAGE_DIM  # 1024

# Summarize the configuration in a single call
print(
    "CLIP embedding configuration:",
    f"  Text dimension:     {CLIP_TEXT_DIM}",
    f"  Image dimension:    {CLIP_IMAGE_DIM}",
    f"  Combined dimension: {COMBINED_DIM}",
    sep="\n",
)

### 3.1 CLIP Client Setup

Connect to the CLIP embedding server. Set the `CLIP_SERVER` environment variable to your server address in `host:port` form (e.g., `localhost:51000`); if it is not set, the notebook falls back to `0.0.0.0:51000`.


In [None]:
def load_embedding_model():
    """
    Create a CLIP client for the server named by the CLIP_SERVER variable.

    The address defaults to "0.0.0.0:51000" when CLIP_SERVER is not defined.

    Returns:
        CLIP client created for ``grpc://<address>``

    Raises:
        ValueError: If CLIP_SERVER is defined but empty.
    """
    print("Connecting to CLIP server...")
    server_address = os.getenv("CLIP_SERVER", "0.0.0.0:51000")

    if server_address:
        connected_client = Client(f'grpc://{server_address}')
        print(f"✓ Connected to CLIP server at: {server_address}")
        return connected_client

    # Only reachable when the variable is explicitly set to an empty string
    raise ValueError("CLIP_SERVER environment variable not set. Please set it to your CLIP server address.")


### 3.2 Data Loading Functions

Functions to load the evaluation JSON files and prepare samples for embedding generation.


In [None]:
def load_json_data(json_path):
    """
    Load evaluation data from JSON file.

    The file is read as UTF-8 explicitly so that the result does not depend
    on the platform's default text encoding.

    Args:
        json_path: Path to the JSON file

    Returns:
        The parsed JSON content (a list of evaluation samples for the files
        written by this notebook)
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def prepare_prompt_for_embedding(sample):
    """
    Reduce an evaluation sample to the fields used for embedding generation.

    Args:
        sample: Evaluation sample with a 'prompt' field and optional
            'images', 'task' and 'label' fields

    Returns:
        Dict with 'prompt', 'images' (the sample's images without None
        entries), 'task' and 'label' (each "" when absent)
    """
    prompt_text = sample["prompt"]

    # Keep every image entry that is actually present
    usable_images = []
    for encoded_image in sample.get("images", []):
        if encoded_image is not None:
            usable_images.append(encoded_image)

    return dict(
        prompt=prompt_text,
        images=usable_images,
        task=sample.get("task", ""),
        label=sample.get("label", ""),
    )


### 3.3 Embedding Generation Function

Generate CLIP embeddings for text+image pairs. The function concatenates:
- Text embedding (512 dim) from CLIP text encoder
- Image embedding (512 dim) from CLIP image encoder, or zeros if no image


In [None]:
def generate_embeddings_for_dataset(clip_client, dataset_split, max_samples=None):
    """
    Generate embeddings for all prompts in the dataset using CLIP.

    Only the first image of a sample is embedded; further images are ignored.

    Args:
        clip_client: The CLIP client (must provide ``encode(list)``)
        dataset_split: List of evaluation samples to process
        max_samples: Maximum number of samples to process (None = all)

    Returns:
        numpy array of embeddings [num_samples, COMBINED_DIM]
        - If text + image: [text_embedding | image_embedding]
        - If text only: [text_embedding | zeros(CLIP_IMAGE_DIM)]
        - If no samples are processed: an empty array of shape (0, COMBINED_DIM)
    """
    embeddings_list = []

    num_to_process = len(dataset_split) if max_samples is None else min(max_samples, len(dataset_split))

    print(f"Generating embeddings for {num_to_process} samples...")
    print(f"Output embedding dimension: {COMBINED_DIM}")

    for idx in tqdm(range(num_to_process)):
        sample = dataset_split[idx]
        prompt_data = prepare_prompt_for_embedding(sample)

        text = prompt_data['prompt']
        images = prompt_data['images']

        # The text half is needed in both cases, so compute it once
        text_embedding = clip_client.encode([text])

        if len(images) > 0:
            # Image half: embedding of the first image, passed as a data URI
            image_data_uri = f"data:image/png;base64,{images[0]}"
            image_half = clip_client.encode([image_data_uri])[0]
        else:
            # Text only - pad the image half with zeros
            image_half = np.zeros(CLIP_IMAGE_DIM)

        embeddings_list.append(np.concatenate([text_embedding[0], image_half]))

    if embeddings_list:
        # Stack all embeddings
        embeddings = np.stack(embeddings_list, axis=0)
    else:
        # np.stack rejects an empty list; return a well-shaped empty result instead
        embeddings = np.empty((0, COMBINED_DIM))
    print(f"Generated embeddings shape: {embeddings.shape}")

    return embeddings


### 3.4 Load Evaluation Data

Load the evaluation JSON files created in the previous section.


In [None]:
# Locations of the evaluation JSON files consumed by this section
train_json_path = "multimodal_router/src/nat_sfc_router/training/hf_evaluations_train.json"
test_json_path = "multimodal_router/src/nat_sfc_router/training/hf_evaluations_test.json"

# Read both splits, then report their sizes
print("Loading evaluation data from JSON files...")
train_data = load_json_data(train_json_path)
test_data = load_json_data(test_json_path)

print(
    f"✓ Loaded {len(train_data)} train samples from: {train_json_path}",
    f"✓ Loaded {len(test_data)} test samples from: {test_json_path}",
    sep="\n",
)


### 3.5 Connect to CLIP Server

Establish connection to the CLIP embedding server.


In [None]:
# Connect to CLIP server
# load_embedding_model() reads the server address from the CLIP_SERVER
# environment variable; the client is reused for both embedding runs below.
clip_client = load_embedding_model()


### 3.6 Generate Embeddings

Generate CLIP embeddings for both training and test sets.


In [None]:
# Embed every training sample (max_samples=None means no limit)
print("=" * 80, "Generating TRAIN embeddings...", "=" * 80, sep="\n")
train_embeddings = generate_embeddings_for_dataset(
    clip_client,
    train_data,
    max_samples=None,
)
print(f"\n✓ Train embeddings shape: {train_embeddings.shape}")


In [None]:
# Embed every test sample (max_samples=None means no limit)
print("=" * 80, "Generating TEST embeddings...", "=" * 80, sep="\n")
test_embeddings = generate_embeddings_for_dataset(
    clip_client,
    test_data,
    max_samples=None,
)
print(f"\n✓ Test embeddings shape: {test_embeddings.shape}")


### 3.7 Save Embeddings

Save the generated embeddings as NumPy arrays for use in neural network training.


In [None]:
# Destination files for the embedding arrays and their description
train_embeddings_path = "multimodal_router/src/nat_sfc_router/training/hf_train_embeddings.npy"
test_embeddings_path = "multimodal_router/src/nat_sfc_router/training/hf_test_embeddings.npy"
metadata_path = "multimodal_router/src/nat_sfc_router/training/hf_embeddings_metadata.json"

# Write the training array and report where it went
np.save(train_embeddings_path, train_embeddings)
print(
    f"✓ Saved train embeddings to: {train_embeddings_path}",
    f"  Shape: {train_embeddings.shape}",
    sep="\n",
)

# Write the test array and report where it went
np.save(test_embeddings_path, test_embeddings)
print(
    f"✓ Saved test embeddings to: {test_embeddings_path}",
    f"  Shape: {test_embeddings.shape}",
    sep="\n",
)

# Describe the arrays so later steps can check what they load
metadata = dict(
    train_samples=len(train_embeddings),
    test_samples=len(test_embeddings),
    embedding_dim=train_embeddings.shape[1],
    model=f"CLIP (grpc://{os.getenv('CLIP_SERVER', '0.0.0.0:51000')})",
    text_dim=CLIP_TEXT_DIM,
    image_dim=CLIP_IMAGE_DIM,
    combined_dim=COMBINED_DIM,
    embedding_structure="text_embedding(512) | image_embedding_or_zeros(512)",
)

with open(metadata_path, "w") as f:
    f.write(json.dumps(metadata, indent=2))
print(f"✓ Saved metadata to: {metadata_path}")

print(
    "\n" + "=" * 80,
    "Embeddings generated successfully!",
    "You can now use these embeddings for training the neural network router.",
    "=" * 80,
    sep="\n",
)


---

## 4. Neural Network Router Training (`nn_router.py`)

This section trains a neural network router using the pre-computed embeddings:

1. **Load embeddings and labels** from the previous steps
2. **Define neural network architecture** (multi-layer with batch normalization and dropout)
3. **Train with class weighting** to handle imbalanced data
4. **Hyperparameter tuning** via random search
5. **Evaluate router performance** on test set
6. **Test threshold configurations** for cost-quality tradeoffs
7. **Save trained model** for inference

The router predicts which model will correctly answer a given query, enabling intelligent routing to optimize for both accuracy and cost.

In [None]:
# Imports for neural network router
from pathlib import Path
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

### 4.1 Configuration

Set up file paths, random seeds, device selection, and hyperparameter configurations.


In [None]:
# Artifacts written by the data-preparation and embedding sections
TRAIN_JSON = "multimodal_router/src/nat_sfc_router/training/hf_evaluations_train.json"
TEST_JSON = "multimodal_router/src/nat_sfc_router/training/hf_evaluations_test.json"
TRAIN_EMBEDDINGS = "multimodal_router/src/nat_sfc_router/training/hf_train_embeddings.npy"
TEST_EMBEDDINGS = "multimodal_router/src/nat_sfc_router/training/hf_test_embeddings.npy"

# Switches a reader is expected to tune
RANDOM_STATE = 42
TUNE_HYPERPARAMETERS = True  # False: skip the random search and train with default_config
USE_CLASS_WEIGHTS = True  # False: train with an unweighted BCE loss

# Seed every RNG the training code draws from (torch, numpy, stdlib random)
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Prefer the GPU when one is visible to torch
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {DEVICE}")

# Hyperparameters used when tuning is disabled; 'epochs' and 'patience'
# are also reused by every tuning trial
default_config = dict(
    hidden_dims=[512, 256, 128],  # widths of the hidden layers, in order
    dropout=0.3,
    learning_rate=0.001,
    batch_size=64,
    epochs=50,
    weight_decay=1e-5,
    patience=10,  # epochs without validation improvement before stopping
)

# Candidate values sampled by the random search
param_grid = dict(
    hidden_dims=[
        [256, 128],
        [512, 256],
        [512, 256, 128],
        [1024, 512, 256],
        [256, 128, 64],
    ],
    dropout=[0.2, 0.3, 0.4, 0.5],
    learning_rate=[0.0001, 0.0005, 0.001, 0.002],
    batch_size=[32, 64, 128],
    weight_decay=[0, 1e-6, 1e-5, 1e-4],
)

print("Configuration:")
print(f"  TUNE_HYPERPARAMETERS: {TUNE_HYPERPARAMETERS}")
print(f"  USE_CLASS_WEIGHTS: {USE_CLASS_WEIGHTS}")
print(f"  Default hidden dims: {default_config['hidden_dims']}")


### 4.2 Neural Network Architecture

Define the `RouterNetwork` class: a multi-output neural network that predicts, for each candidate model, the probability that it will answer a given query correctly.


In [None]:
class RouterNetwork(nn.Module):
    """
    Multi-output neural network for routing.

    For each input embedding the network emits one independent probability per
    candidate model: the predicted chance that this model answers correctly.

    Architecture:
    - One block per entry of ``hidden_dims``:
      Linear -> BatchNorm1d -> ReLU -> Dropout
    - A final Linear layer with ``output_dim`` units, followed by a sigmoid

    Args:
        input_dim: Size of each input embedding vector.
        output_dim: Number of candidate models (one output per model).
        hidden_dims: Widths of the hidden layers, in order. Any iterable of
            ints is accepted (list or tuple).
        dropout: Dropout probability applied after every hidden block.
    """
    # The default is an immutable tuple so that it cannot be shared and
    # mutated across instances (the usual mutable-default-argument pitfall).
    def __init__(self, input_dim, output_dim, hidden_dims=(512, 256, 128), dropout=0.3):
        super().__init__()

        layers = []
        prev_dim = input_dim

        # Build hidden layers
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim

        # Output layer: raw logits, one per model (sigmoid is applied in forward)
        layers.append(nn.Linear(prev_dim, output_dim))

        # Attribute name and layer order determine the state-dict keys
        # ("network.<idx>.*"), so both are kept stable for saved checkpoints.
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return per-model probabilities in [0, 1] with shape (batch, output_dim)."""
        logits = self.network(x)
        # Sigmoid rather than softmax: the outputs are independent, not mutually exclusive
        return torch.sigmoid(logits)


### 4.3 Data Loading

Function to load embeddings and labels from the files created in previous sections.


In [None]:
def load_data(json_path: str, embeddings_path: str, selected_models=None):
    """
    Load data from JSON file and corresponding embeddings.

    Each JSON record must provide a 'prompt' and a 'model_scores' mapping of
    model name -> score. The model list is taken from the first record; a model
    missing from a later record is scored 0 for that record.

    Args:
        json_path: Path to JSON file with model_scores
        embeddings_path: Path to embeddings .npy file (one row per JSON record)
        selected_models: Optional list of model names to keep. If None, keeps all models.
            Kept models follow the order found in the data, not the order given here.

    Returns:
        embeddings (X, float32), labels (Y, float32, one column per kept model),
        model_names, and prompts.

    Raises:
        ValueError: If the record and embedding counts differ, the JSON file is
            empty, or none of ``selected_models`` is present in the data.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Load embeddings
    embeddings = np.load(embeddings_path)

    # Verify alignment
    if len(data) != len(embeddings):
        raise ValueError(f"Mismatch: {len(data)} JSON records vs {len(embeddings)} embeddings")

    # Extract labels from model_scores
    if not data:
        raise ValueError("Empty JSON file")

    all_model_names = list(data[0]['model_scores'].keys())
    print(f"Found models in data: {all_model_names}")

    # Filter to selected models if specified
    if selected_models is not None:
        wanted = set(selected_models)
        model_names = [m for m in all_model_names if m in wanted]
        # Compare as sets so duplicate entries in selected_models do not
        # trigger a spurious warning with an empty "missing" set
        missing = wanted - set(model_names)
        if missing:
            print(f"Warning: The following selected models were not found in data: {missing}")
        if not model_names:
            # A label matrix with zero columns would only fail later and obscurely
            raise ValueError(
                f"None of the selected models {list(selected_models)} were found in data "
                f"(available: {all_model_names})"
            )
        print(f"Using selected models: {model_names}")
    else:
        model_names = all_model_names

    # Build label matrix
    prompts = [record['prompt'] for record in data]
    labels = [
        [record['model_scores'].get(model, 0) for model in model_names]
        for record in data
    ]

    Y = np.array(labels, dtype=np.float32)
    X = embeddings.astype(np.float32)

    return X, Y, model_names, prompts


### 4.4 Training Utilities

Functions for class weight computation and model training with early stopping.


In [None]:
def compute_class_weights(y_train, model_names, method='inverse'):
    """
    Derive one loss weight per model from how often that model is correct.

    Args:
        y_train: Training labels (n_samples, n_models)
        model_names: List of model names, aligned with the columns of y_train
        method: 'inverse' (1 / positive rate) or 'balanced'
            (n_samples / (2 * n_positives)). Any other value, or a column with
            no positives, yields a raw weight of 1.0.

    Returns:
        torch.Tensor of shape (n_models,) with the weights rescaled so that
        they sum to the number of models.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("Computing class weights to handle data imbalance:")
    print(rule)

    raw_weights = []
    for col, name in enumerate(model_names):
        column = y_train[:, col]
        pos_rate = column.mean()

        # Start from the neutral weight and override it only when the chosen
        # method applies and the column has at least one positive label
        weight = 1.0
        if method == 'inverse':
            if pos_rate > 0:
                weight = 1.0 / pos_rate
        elif method == 'balanced':
            n_pos = column.sum()
            if n_pos > 0:
                weight = len(y_train) / (2 * n_pos)

        raw_weights.append(weight)
        print(f"  {name:40s}: pos_rate={pos_rate:.4f}, weight={weight:.4f}")

    # Rescale so the weights average to 1 across models
    normalized = np.array(raw_weights)
    normalized = normalized * len(normalized) / normalized.sum()

    print(f"\nNormalized weights (sum={normalized.sum():.2f}):")
    for name, value in zip(model_names, normalized):
        print(f"  {name:40s}: {value:.4f}")
    print(rule)

    return torch.FloatTensor(normalized)


def train_model(model, train_loader, val_loader, config, model_names, class_weights=None):
    """
    Train the neural network with early stopping.

    After every epoch the mean validation loss is computed. The weights from the
    epoch with the lowest validation loss are snapshotted and restored before
    returning; training stops early once `config['patience']` consecutive epochs
    fail to improve on that best loss.

    Args:
        model: Network whose forward pass returns probabilities in [0, 1].
            It is trained in place on whatever device its parameters live on.
        train_loader: DataLoader yielding (inputs, targets) training batches.
        val_loader: DataLoader yielding (inputs, targets) validation batches.
        config: Dict with keys 'learning_rate', 'weight_decay', 'epochs', 'patience'.
        model_names: Unused here; kept so existing callers keep working.
        class_weights: Optional tensor of per-output weights. When given, a
            weighted binary cross-entropy replaces nn.BCELoss.

    Returns:
        (model, best_val_loss): the same model object loaded with its best
        weights, and the lowest mean validation loss observed.

    Raises:
        ValueError: If either loader yields no batches.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        # Otherwise the per-epoch averaging below fails with a bare ZeroDivisionError
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    # Follow the model's own device instead of relying on a notebook-level global
    device = next(model.parameters()).device

    # Use weighted BCE loss if class weights are provided
    if class_weights is not None:
        weights = class_weights.to(device)  # moved once, not on every loss call

        def weighted_bce_loss(outputs, targets):
            # The 1e-8 keeps log() finite when a probability saturates at 0 or 1
            bce = -(targets * torch.log(outputs + 1e-8) + (1 - targets) * torch.log(1 - outputs + 1e-8))
            return (bce * weights).mean()
        criterion = weighted_bce_loss
    else:
        criterion = nn.BCELoss()

    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])

    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    for epoch in range(config['epochs']):
        # Training
        model.train()
        train_loss = 0.0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item()
        val_loss /= len(val_loader)

        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"    Epoch {epoch+1}/{config['epochs']}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # state_dict() tensors alias the live parameters, and dict.copy() is
            # shallow, so each tensor must be cloned or later optimizer steps
            # would silently overwrite this "best" snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= config['patience']:
                print(f"    Early stopping at epoch {epoch+1}")
                break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model, best_val_loss


### 4.5 Evaluation Functions

Functions to evaluate router performance on test data.


In [None]:
def evaluate_router(model, X_test, y_test, model_names, model_thresholds=None):
    """
    Evaluate the router on test set with optional thresholds.

    Args:
        model: Trained router; called on the whole test set in one forward pass
            on the notebook-level DEVICE.
        X_test: Test inputs, converted with torch.FloatTensor.
        y_test: Test labels indexed as y_test[:, i], one column per entry of
            model_names (presumably 0/1 correctness labels — confirm against the data).
        model_names: Model names aligned with the columns of y_test and of the
            model's output.
        model_thresholds: Optional dict of model name -> minimum predicted
            probability required to route to that model. Names not present in
            model_names are ignored.

    Returns:
        Dict with keys:
            'metrics': per-model AUC, accuracy and F1 (at a 0.5 cutoff) and positive rate
            'system_accuracy': mean label of the model chosen for each sample
            'model_selection_counts': per-model count, percentage and accuracy when chosen
            'any_correct': fraction of samples where at least one model's label is set
            'always_acc': per-model mean label (always-pick-this-model baseline)
            'proba_test': the predicted probability matrix
    """
    model.eval()
    with torch.no_grad():
        X_test_tensor = torch.FloatTensor(X_test).to(DEVICE)
        proba_test = model(X_test_tensor).cpu().numpy()

    # Evaluate per-model metrics
    metrics = {}
    for i, model_name in enumerate(model_names):
        y_true = y_test[:, i]
        y_score = proba_test[:, i]
        try:
            auc = roc_auc_score(y_true, y_score)
        except ValueError:
            # roc_auc_score raised (e.g. AUC undefined for this column); record NaN instead
            auc = float("nan")
        preds = (y_score >= 0.5).astype(int)
        acc = accuracy_score(y_true, preds)
        f1 = f1_score(y_true, preds, zero_division=0)
        metrics[model_name] = {"auc": auc, "accuracy": acc, "f1": f1, "positive_rate": y_true.mean()}

    # Router evaluation (system accuracy): start from the highest-probability model
    chosen_idx = np.argmax(proba_test, axis=1)

    # Apply model thresholds if specified
    if model_thresholds is not None and len(model_thresholds) > 0:
        # Re-key the thresholds by column index for direct lookup
        threshold_map = {model_names.index(m): t for m, t in model_thresholds.items() if m in model_names}
        for i in range(len(chosen_idx)):
            chosen_model_idx = chosen_idx[i]
            if chosen_model_idx in threshold_map:
                if proba_test[i, chosen_model_idx] < threshold_map[chosen_model_idx]:
                    # The top choice missed its threshold: walk the other models from
                    # highest to lowest probability and take the first that either has
                    # no threshold or meets its own. If none qualifies, the original
                    # argmax choice is left in place.
                    sorted_indices = np.argsort(proba_test[i])[::-1]
                    for candidate_idx in sorted_indices:
                        if candidate_idx == chosen_model_idx:
                            continue
                        if candidate_idx in threshold_map:
                            if proba_test[i, candidate_idx] >= threshold_map[candidate_idx]:
                                chosen_idx[i] = candidate_idx
                                break
                        else:
                            chosen_idx[i] = candidate_idx
                            break

    # Label of the routed model for each sample; its mean is the system accuracy
    chosen_label = np.array([y_test[i, chosen_idx[i]] for i in range(len(chosen_idx))])
    system_accuracy = chosen_label.mean()

    # Model selection counts
    model_selection_counts = {}
    for i, model_name in enumerate(model_names):
        count = np.sum(chosen_idx == i)
        model_selection_counts[model_name] = {
            'count': int(count),
            'percentage': count / len(chosen_idx) * 100,
            # 0.0 (rather than NaN) when the model was never selected
            'accuracy_when_chosen': chosen_label[chosen_idx == i].mean() if count > 0 else 0.0
        }

    # Upper bound: share of samples where the label row sums to at least 1
    any_correct = (y_test.sum(axis=1) >= 1).mean()
    # Baseline: mean label per model, i.e. always routing to that single model
    always_acc = {model_names[i]: y_test[:, i].mean() for i in range(len(model_names))}

    return {
        'metrics': metrics, 'system_accuracy': system_accuracy,
        'model_selection_counts': model_selection_counts, 'any_correct': any_correct,
        'always_acc': always_acc, 'proba_test': proba_test
    }


### 4.6 Hyperparameter Tuning

Random search over the hyperparameter space to find the configuration with the lowest validation loss.


In [None]:
def tune_hyperparameters(X_train, y_train, X_val, y_val, model_names, class_weights=None, n_trials=10):
    """
    Perform random search over hyperparameters.

    Each trial samples one value per entry of the notebook-level `param_grid`,
    takes 'epochs' and 'patience' from `default_config`, trains a fresh
    RouterNetwork with `train_model`, and is scored by its best validation loss.

    Args:
        X_train, y_train: Training inputs and labels.
        X_val, y_val: Validation inputs and labels used to score each trial.
        model_names: Model names, forwarded to `train_model`.
        class_weights: Optional per-model loss weights, forwarded to `train_model`.
        n_trials: Number of random configurations to try (must be >= 1).

    Returns:
        (best_model, best_config): the trained model and the sampled
        configuration of the trial with the lowest validation loss.

    Raises:
        ValueError: If n_trials is less than 1.
        RuntimeError: If no trial improved on an infinite loss (every trial's
            validation loss was NaN or inf).
    """
    if n_trials < 1:
        # With zero trials there is no best config to report or return
        raise ValueError(f"n_trials must be at least 1, got {n_trials}")

    print("\n" + "="*80)
    print("HYPERPARAMETER TUNING (Neural Network)")
    print("="*80)
    print(f"Running {n_trials} random search trials...")

    best_val_loss = float('inf')
    best_config = None
    best_model = None

    input_dim = X_train.shape[1]
    output_dim = y_train.shape[1]

    # The tensors are the same for every trial; only the batch size varies,
    # so build the datasets once and create just the loaders per trial.
    train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
    val_dataset = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val))

    for trial in range(n_trials):
        config = {
            # random.choice here: the entries are lists of differing lengths,
            # which np.random.choice cannot sample from
            'hidden_dims': random.choice(param_grid['hidden_dims']),
            'dropout': float(np.random.choice(param_grid['dropout'])),
            'learning_rate': float(np.random.choice(param_grid['learning_rate'])),
            'batch_size': int(np.random.choice(param_grid['batch_size'])),
            'weight_decay': float(np.random.choice(param_grid['weight_decay'])),
            'epochs': default_config['epochs'],
            'patience': default_config['patience'],
        }

        print(f"\nTrial {trial+1}/{n_trials}")
        print(f"  Config: hidden={config['hidden_dims']}, dropout={config['dropout']:.2f}, "
              f"lr={config['learning_rate']:.4f}, bs={config['batch_size']}")

        model = RouterNetwork(input_dim, output_dim, config['hidden_dims'], config['dropout']).to(DEVICE)

        train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)

        model, val_loss = train_model(model, train_loader, val_loader, config, model_names, class_weights)

        print(f"  Final validation loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_config = config.copy()
            best_model = model
            print("  *** New best model! ***")

    if best_config is None:
        # NaN/inf losses never compare below inf, so nothing was selected
        raise RuntimeError(
            "Hyperparameter search did not produce a usable model: "
            "every trial ended with a NaN or infinite validation loss"
        )

    print("\n" + "="*80)
    print("Best hyperparameters found:")
    print("="*80)
    for key, value in best_config.items():
        print(f"  {key}: {value}")

    return best_model, best_config


### 4.7 Load Training Data

Load embeddings and labels for router training.


In [None]:
# Define models to train on
# Models the router is trained to choose between
selected_models = [
    'gpt-5-chat',
    'nvidia/nvidia-nemotron-nano-9b-v2',
    'Qwen/Qwen3-VL-8B-Instruct'
]

# Training split: embeddings, per-model labels, model order, and prompts
print("Loading training data...")
X_train_full, y_train_full, router_model_names, train_prompts = load_data(
    json_path=TRAIN_JSON,
    embeddings_path=TRAIN_EMBEDDINGS,
    selected_models=selected_models,
)

# Test split: the model-name list is discarded, the training one is used throughout
print("\nLoading test data...")
X_test, y_test, _, test_prompts = load_data(
    json_path=TEST_JSON,
    embeddings_path=TEST_EMBEDDINGS,
    selected_models=selected_models,
)

# Summary of what was loaded
print(f"\n✓ Training set: {X_train_full.shape[0]} samples, {X_train_full.shape[1]} features")
print(f"✓ Test set: {X_test.shape[0]} samples")
print(f"✓ Models: {router_model_names}")

# Mean label per model on the training split
print("\nLabel distribution (train):")
for i, model in enumerate(router_model_names):
    pos_rate = y_train_full[:, i].mean()
    print(f"  {model}: {pos_rate:.3f} positive rate")


### 4.8 Train Router Model

Split data into train/validation sets, compute class weights, and train the neural network.


In [None]:
# Hold out 15% of the training data (random permutation) as a validation set
n_total = len(X_train_full)
val_size = int(0.15 * n_total)
indices = np.random.permutation(n_total)
val_indices, train_indices = indices[:val_size], indices[val_size:]

X_train, y_train = X_train_full[train_indices], y_train_full[train_indices]
X_val, y_val = X_train_full[val_indices], y_train_full[val_indices]

print(f"Split: {len(X_train)} train, {len(X_val)} validation")

# Network dimensions; also stored in the checkpoint by the save step
input_dim, output_dim = X_train.shape[1], y_train.shape[1]

# Per-model loss weights, or None to train with the unweighted loss
class_weights = (
    compute_class_weights(y_train, router_model_names, method='balanced')
    if USE_CLASS_WEIGHTS
    else None
)

# Train either a single model with the defaults or the best of a random search
if not TUNE_HYPERPARAMETERS:
    print("\nTraining with default parameters...")
    config = default_config.copy()
    router_model = RouterNetwork(
        input_dim, output_dim, config['hidden_dims'], config['dropout']
    ).to(DEVICE)

    train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
    val_dataset = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val))
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)

    router_model, val_loss = train_model(
        router_model, train_loader, val_loader, config, router_model_names, class_weights
    )
    best_config = config
else:
    router_model, best_config = tune_hyperparameters(
        X_train, y_train, X_val, y_val, router_model_names, class_weights, n_trials=10
    )

print("\n✓ Training complete!")


### 4.9 Evaluate Router

Evaluate the trained router on the test set and display metrics.


In [None]:
# Horizontal rule reused by every section header in this report
separator = "=" * 80

print(f"{separator}\nEVALUATION ON TEST SET\n{separator}")

# Score the trained router on the held-out test split (no thresholds applied)
results = evaluate_router(router_model, X_test, y_test, router_model_names)

# Per-model classification quality
print(f"\nPer-model metrics (on test set):\n{separator}")
for model_name, m in results['metrics'].items():
    auc, accuracy, f1, positive_rate = m['auc'], m['accuracy'], m['f1'], m['positive_rate']
    print(f"  {model_name:40s}: AUC={auc:.3f}, acc={accuracy:.3f}, "
          f"f1={f1:.3f}, pos_rate={positive_rate:.3f}")

# Accuracy of always routing to one fixed model
print(f"\n{separator}\nBaseline (always choose single model) accuracies:\n{separator}")
for model_name, acc in results['always_acc'].items():
    print(f"  {model_name:40s}: {acc:.3f}")

# How often each model was picked, and how well it did when picked
print(f"\n{separator}\nRouter model selection distribution:\n{separator}")
for model_name, stats in results['model_selection_counts'].items():
    count, percentage = stats['count'], stats['percentage']
    print(f"  {model_name:40s}: selected {count:4d} times "
          f"({percentage:5.1f}%) - accuracy when chosen: {stats['accuracy_when_chosen']:.3f}")

# Headline numbers: router accuracy against the any-model-correct upper bound
print(f"\n{separator}")
print(f"Router system accuracy: {results['system_accuracy']:.3f}")
print(f"Oracle (best possible):  {results['any_correct']:.3f}")
print(separator)


### 4.10 Save Trained Model

Save the trained router model for later use in inference.


In [None]:
# Directory for router artifacts, created on demand
out_dir = Path("multimodal_router/src/nat_sfc_router/training/router_artifacts")
out_dir.mkdir(parents=True, exist_ok=True)
model_path = out_dir / "nn_router.pth"

# Bundle the weights with what is needed to rebuild the network:
# architecture config, output order (model names), and layer dimensions
checkpoint = {
    'model_state_dict': router_model.state_dict(),
    'config': best_config,
    'model_names': router_model_names,
    'input_dim': input_dim,
    'output_dim': output_dim,
}
torch.save(checkpoint, model_path)

print(f"✓ Saved trained router model to: {model_path}")
print("\nModel configuration:")
for key, value in best_config.items():
    print(f"  {key}: {value}")
