# Lumentra Voice AI - H100 Testing Notebook

Test FunctionGemma 270M (router) + Gemma 3 12B / Qwen 32B (main LLM) on H100.

**Runtime Requirements:**
- GPU: A100/H100 (40GB+ VRAM)
- High-RAM runtime recommended for Qwen 32B

**Models Tested:**
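1. FunctionGemma 270M - Fast function calling router (~550MB). The setup cell currently loads `google/functiongemma-2b` as a stand-in.
2. Gemma 3 12B - Main conversation + tool calling. The setup cell currently loads `google/gemma-2-9b-it` as a stand-in.
3. Qwen3 32B - Heavy backup for complex queries. The optional cell loads `Qwen/Qwen2.5-32B-Instruct` as a stand-in.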

In [None]:
# Check GPU availability
!nvidia-smi

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Install dependencies
# Packages used by the cells below: transformers (tokenizer/model loading),
# accelerate (needed for device_map="auto"), and bitsandbytes (only relevant
# to the commented-out 4-bit loading options).
# NOTE(review): vllm is not imported by any cell visible in this notebook —
# confirm it is still needed before paying for the install.
!pip install -q transformers accelerate bitsandbytes
!pip install -q vllm  # For fast inference
!pip install -q huggingface_hub

## 1. FunctionGemma 270M Setup

Ultra-fast function calling router. Runs on CPU or GPU.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import time

# Load the router model and its tokenizer.
# NOTE(review): the banner below says "270M" while the checkpoint id is a 2B
# variant — confirm the id resolves on the Hub and switch to the 270M
# checkpoint once that is the intended target.
print("Loading FunctionGemma 270M...")
fg_model_id = "google/functiongemma-2b"  # Using 2B version; 270M not on HF yet

# Loading options kept in one place: half-precision weights, and let
# accelerate decide device placement.
fg_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}

fg_tokenizer = AutoTokenizer.from_pretrained(fg_model_id)
fg_model = AutoModelForCausalLM.from_pretrained(fg_model_id, **fg_load_options)
print(f"FunctionGemma loaded on {fg_model.device}")

In [None]:
# Tool definitions for FunctionGemma
def _make_tool(name: str, description: str, properties: dict, required: list) -> dict:
    """Assemble one tool entry in the JSON-schema-like shape used by the prompt.

    Key order (name, description, parameters / type, properties, required)
    is kept stable because the list is serialized verbatim into the prompt.
    """
    return {
        "name": name,
        "description": description,
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }


TOOLS = [
    _make_tool(
        "check_availability",
        "Check available appointment slots for a specific date",
        {
            "date": {"type": "string", "description": "Date in YYYY-MM-DD format"},
            "service_type": {"type": "string", "description": "Type of service"},
        },
        ["date"],
    ),
    _make_tool(
        "create_booking",
        "Create a new booking for the customer",
        {
            "customer_name": {"type": "string"},
            "customer_phone": {"type": "string"},
            "date": {"type": "string"},
            "time": {"type": "string"},
        },
        ["customer_name", "customer_phone", "date", "time"],
    ),
    _make_tool(
        "transfer_to_human",
        "Transfer call to human staff",
        {
            "reason": {"type": "string"},
        },
        ["reason"],
    ),
]

import json

def build_fg_prompt(user_message: str, tools: list = None) -> str:
    """Build the routing prompt sent to FunctionGemma.

    Args:
        user_message: The caller's utterance, inserted verbatim.
        tools: Tool definitions to advertise to the model. When omitted
            (None), the module-level ``TOOLS`` list is used, which matches
            the previous behavior. An explicit empty list is respected.

    Returns:
        The full prompt text, ending with ``Response:`` so the model's
        completion is the routing decision.
    """
    # `is None` rather than truthiness so callers can pass [] deliberately.
    active_tools = TOOLS if tools is None else tools
    tools_json = json.dumps(active_tools, indent=2)
    return f"""You are a function calling AI. Given the user message and available tools, decide if a function should be called.

Available tools:
{tools_json}

User message: {user_message}

If a function should be called, respond with:
<start_function_call>function_name(param1="value1", param2="value2")<end_function_call>

If no function is needed, respond with:
<no_function_call>

Response:"""

In [None]:
def _parse_function_call(response: str) -> "tuple[str, dict] | None":
    """Extract ``(function_name, arguments)`` from router output, or None.

    Expects the format requested by the prompt:
    ``<start_function_call>name(key="value", ...)<end_function_call>``.
    Returns None when the markers are missing or the call is malformed.
    """
    import re  # local import: no top-level `import re` is visible in this notebook

    # DOTALL so a call the model wraps across lines is still recognized.
    call = re.search(r'<start_function_call>(.+?)<end_function_call>', response, re.DOTALL)
    if call is None:
        return None

    signature = re.match(r'(\w+)\((.*)\)', call.group(1).strip(), re.DOTALL)
    if signature is None:
        return None

    func_name, args_str = signature.groups()
    # Simple key="value" parsing; `[^"]*` keeps arguments whose value is "".
    args = {key: value for key, value in re.findall(r'(\w+)="([^"]*)"', args_str)}
    return func_name, args


def route_with_functiongemma(user_message: str) -> dict:
    """Route a user message with FunctionGemma.

    Args:
        user_message: The caller's utterance.

    Returns:
        A dict with ``action`` set to ``"function_call"`` (plus ``function``
        and ``arguments``) when the model emitted a well-formed call, or
        ``"llm_required"`` otherwise. Both shapes carry ``latency_ms``
        (prompt build + generation + decode) and ``raw_output`` (the
        generated text only).
    """
    # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
    start = time.perf_counter()

    prompt = build_fg_prompt(user_message)
    inputs = fg_tokenizer(prompt, return_tensors="pt").to(fg_model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = fg_model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.1,
            do_sample=True,
            pad_token_id=fg_tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation; drop the prompt by token count.
    # Slicing the decoded string by len(prompt) is unsafe because decoding is
    # not guaranteed to reproduce the prompt text character for character.
    response = fg_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    latency_ms = (time.perf_counter() - start) * 1000

    parsed = _parse_function_call(response)
    if parsed is not None:
        func_name, args = parsed
        return {
            "action": "function_call",
            "function": func_name,
            "arguments": args,
            "latency_ms": latency_ms,
            "raw_output": response
        }

    # No (well-formed) function call: hand the turn to the conversation model.
    return {
        "action": "llm_required",
        "latency_ms": latency_ms,
        "raw_output": response
    }

# Smoke-test the router on a handful of utterances: one greeting plus
# booking, availability and escalation requests.
test_messages = [
    "Hi there!",
    "I want to book an appointment for tomorrow at 2pm",
    "What times are available on Friday?",
    "I need to speak to a manager about a refund",
    "Can you check availability for next Monday?",
]

print("Testing FunctionGemma routing:")
print("=" * 60)
for msg in test_messages:
    result = route_with_functiongemma(msg)
    took_tool_path = result['action'] == 'function_call'

    # Collect the report lines first, then emit them in one go.
    report = [f"\nInput: {msg}", f"Action: {result['action']}"]
    if took_tool_path:
        report.append(f"Function: {result['function']}")
        report.append(f"Args: {result['arguments']}")
    report.append(f"Latency: {result['latency_ms']:.1f}ms")
    print("\n".join(report))

## 2. Gemma 3 12B Setup

Main conversation model with tool calling capability.

In [None]:
# Load the main conversation model. The section targets Gemma 3 12B, but the
# checkpoint actually loaded here is Gemma 2 9B IT.
print("Loading Gemma model...")

gemma_model_id = "google/gemma-2-9b-it"  # Using Gemma 2 9B IT

# bf16 weights with automatic device placement.
gemma_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    # Use 4-bit quantization to fit in memory
    # "load_in_4bit": True,  # Uncomment if memory constrained
}

gemma_tokenizer = AutoTokenizer.from_pretrained(gemma_model_id)
gemma_model = AutoModelForCausalLM.from_pretrained(gemma_model_id, **gemma_load_options)
print(f"Gemma loaded on {gemma_model.device}")

In [None]:
# Persona and tool instructions for the conversation model. chat_with_gemma
# sends this text as the first *user* turn rather than as a system message.
# NOTE(review): the <tool>...</tool> format requested here is not parsed by
# any code visible in this notebook — tool calls emitted by the LLM would
# currently be returned to the caller as plain text.
SYSTEM_PROMPT = """You are Luna, the AI voice assistant for Sunrise Salon.

You help callers with booking appointments, checking availability, and general inquiries.
Keep responses concise and natural for voice conversation.

Available tools:
- check_availability(date): Check available time slots
- create_booking(customer_name, customer_phone, date, time): Book an appointment
- transfer_to_human(reason): Transfer to staff

When using tools, format as: <tool>tool_name(args)</tool>
After tool use, provide a natural response based on the result."""

def chat_with_gemma(user_message: str, history: list = None) -> dict:
    """Generate one assistant turn with the Gemma conversation model.

    Args:
        user_message: The caller's latest utterance.
        history: Prior turns as ``{"role": ..., "content": ...}`` dicts, in
            order. Not mutated. ``None`` means no prior turns.

    Returns:
        ``{"text": <assistant reply>, "latency_ms": <elapsed milliseconds>}``.
        The latency covers prompt construction, generation and decoding.
    """
    # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
    start = time.perf_counter()

    # The instructions go in as a first user turn, followed by a canned
    # assistant acknowledgement, so the conversation strictly alternates roles.
    messages = [
        {"role": "user", "content": SYSTEM_PROMPT},
        {"role": "assistant", "content": "I understand. I'm Luna, ready to help customers."},
        *(history or []),
        {"role": "user", "content": user_message},
    ]

    # Format for Gemma
    prompt = gemma_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # The rendered template already carries its own special tokens; adding
    # them again here would duplicate the BOS token at the start of the prompt.
    inputs = gemma_tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False
    ).to(gemma_model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = gemma_model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            pad_token_id=gemma_tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "<start_of_turn>model\n" is unreliable: with skip_special_tokens=True
    # the turn markers are removed, so the split would leave the whole prompt
    # (instructions and history) in the reply.
    response = gemma_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    # Defensive: cut at an end-of-turn marker if one survives decoding.
    response = response.split("<end_of_turn>")[0].strip()

    latency_ms = (time.perf_counter() - start) * 1000

    return {
        "text": response,
        "latency_ms": latency_ms
    }

# Drive a short three-turn booking conversation through Gemma, feeding each
# completed exchange back in as history for the next turn.
print("Testing Gemma conversation:")
print("=" * 60)

test_convos = [
    "Hi, I'd like to book a haircut",
    "Do you have anything available tomorrow afternoon?",
    "2pm works for me. My name is John and my number is 555-1234",
]

history = []
for msg in test_convos:
    result = chat_with_gemma(msg, history)
    print(
        "\n".join(
            [
                f"\nUser: {msg}",
                f"Luna: {result['text']}",
                f"Latency: {result['latency_ms']:.1f}ms",
            ]
        )
    )
    # Record the exchange only after the call, so the model never sees the
    # current user message twice.
    history.extend(
        [
            {"role": "user", "content": msg},
            {"role": "assistant", "content": result['text']},
        ]
    )

## 3. Complete Pipeline Test

FunctionGemma routes -> Gemma handles conversation (Qwen is not wired into this pipeline; see the optional section 5)

In [None]:
def _simulate_tool(func_name: str) -> tuple:
    """Return ``(tool_result, spoken_response)`` for a routed tool call.

    Tool execution is stubbed: results are canned and no real backend is
    contacted. Unknown tool names get an empty result and a generic reply.
    The table is rebuilt on each call so callers can mutate the returned
    result without affecting later calls.
    """
    canned = {
        "check_availability": (
            {"available": True, "slots": ["10am", "2pm", "4pm"]},
            "I have availability at 10am, 2pm, and 4pm. Which time works best for you?",
        ),
        "create_booking": (
            {"success": True, "confirmation": "BK12345"},
            "Your appointment is booked. Your confirmation code is BK12345.",
        ),
        "transfer_to_human": (
            {"transferred": True},
            "I'll transfer you to our team now. Please hold.",
        ),
    }
    return canned.get(func_name, ({}, "I'll help you with that."))


def voice_pipeline(user_message: str, history: list = None) -> dict:
    """Run one turn of the complete voice AI pipeline.

    The router is tried first. If it emits a function call, the (simulated)
    tool is executed and a canned reply is returned without touching the
    LLM; otherwise the message goes to the Gemma conversation model.

    Args:
        user_message: The caller's utterance.
        history: Prior conversation turns, forwarded to the LLM path only.

    Returns:
        A dict with the same keys on both paths: ``text``, ``tool_called``,
        ``tool_args``, ``tool_result``, ``router_latency_ms``,
        ``llm_latency_ms``, ``total_latency_ms`` and ``used_llm``. Fields
        that do not apply to the path taken are ``None``.
    """
    # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
    total_start = time.perf_counter()

    # Step 1: Route with FunctionGemma
    route_result = route_with_functiongemma(user_message)

    if route_result["action"] == "function_call":
        # Direct function execution (simulated) — no LLM call on this path.
        func_name = route_result["function"]
        tool_result, response = _simulate_tool(func_name)

        return {
            "text": response,
            "tool_called": func_name,
            "tool_args": route_result["arguments"],
            "tool_result": tool_result,
            "router_latency_ms": route_result["latency_ms"],
            "llm_latency_ms": None,
            "total_latency_ms": (time.perf_counter() - total_start) * 1000,
            "used_llm": False
        }

    # Step 2: Use Gemma for conversation
    llm_result = chat_with_gemma(user_message, history)

    return {
        "text": llm_result["text"],
        "tool_called": None,
        "tool_args": None,
        "tool_result": None,
        "router_latency_ms": route_result["latency_ms"],
        "llm_latency_ms": llm_result["latency_ms"],
        "total_latency_ms": (time.perf_counter() - total_start) * 1000,
        "used_llm": True
    }

# Exercise the full router -> tool / LLM pipeline on a short scripted call
# and summarize the end-to-end latency per turn.
print("Testing Complete Pipeline:")
print("=" * 60)

pipeline_tests = [
    "Hello!",
    "What times do you have available tomorrow?",
    "I'd like to book an appointment for tomorrow at 2pm. My name is Sarah.",
    "Thanks, can you tell me about your pricing?",
    "I want to speak to a manager about my last visit",
]

history = []
latencies = []

for msg in pipeline_tests:
    result = voice_pipeline(msg, history)
    print(
        "\n".join(
            [
                f"\nUser: {msg}",
                f"Response: {result['text']}",
                f"Tool called: {result.get('tool_called', 'None')}",
                f"Used LLM: {result['used_llm']}",
                f"Router: {result['router_latency_ms']:.1f}ms | Total: {result['total_latency_ms']:.1f}ms",
            ]
        )
    )

    latencies.append(result['total_latency_ms'])
    # Both the canned tool replies and the LLM replies are fed back as history.
    history.extend(
        [
            {"role": "user", "content": msg},
            {"role": "assistant", "content": result['text']},
        ]
    )

average_latency = sum(latencies)/len(latencies)
fastest, slowest = min(latencies), max(latencies)

print("\n" + "=" * 60)
print(f"Average latency: {average_latency:.1f}ms")
print(f"Min latency: {fastest:.1f}ms")
print(f"Max latency: {slowest:.1f}ms")

## 4. Benchmark: Throughput Test

In [None]:
import random

# Benchmark workload: ten representative utterances repeated five times and
# shuffled, run one at a time through voice_pipeline with no history.
benchmark_messages = [
    "Hi",
    "Hello there",
    "I want to book an appointment",
    "What times are available?",
    "Do you have anything tomorrow?",
    "Can I book for 2pm?",
    "What are your hours?",
    "How much does a haircut cost?",
    "I need to reschedule",
    "Thanks, bye!",
] * 5  # 50 messages

random.shuffle(benchmark_messages)

print(f"Running benchmark with {len(benchmark_messages)} messages...")
print("=" * 60)


def _print_latency_block(heading: str, samples: list) -> None:
    """Print the heading followed by mean / min / max of the samples (ms)."""
    print(heading)
    print(f"  Mean: {sum(samples)/len(samples):.1f}ms")
    print(f"  Min: {min(samples):.1f}ms")
    print(f"  Max: {max(samples):.1f}ms")


benchmark_start = time.time()

# Keep every per-message result; the metrics are derived after the timed loop.
benchmark_results = []
for done, msg in enumerate(benchmark_messages, start=1):
    benchmark_results.append(voice_pipeline(msg))
    if done % 10 == 0:
        print(f"Processed {done}/{len(benchmark_messages)}")

benchmark_time = time.time() - benchmark_start

router_latencies = [r['router_latency_ms'] for r in benchmark_results]
total_latencies = [r['total_latency_ms'] for r in benchmark_results]
llm_used_count = sum(1 for r in benchmark_results if r['used_llm'])

print("\n" + "=" * 60)
print("BENCHMARK RESULTS")
print("=" * 60)
print(f"Total messages: {len(benchmark_messages)}")
print(f"Total time: {benchmark_time:.1f}s")
print(f"Throughput: {len(benchmark_messages)/benchmark_time:.1f} messages/sec")
_print_latency_block("\nRouter latency:", router_latencies)
_print_latency_block("\nTotal latency:", total_latencies)
print(f"\nLLM usage: {llm_used_count}/{len(benchmark_messages)} ({100*llm_used_count/len(benchmark_messages):.1f}%)")

## 5. Optional: Qwen 32B for Complex Queries

Uncomment to test Qwen 32B (requires more VRAM).

In [None]:
# Uncomment to load Qwen 32B
# Weights alone need roughly 65 GB in bf16 (32B parameters x 2 bytes) and
# roughly 17-20 GB with 4-bit quantization, plus KV-cache/activation overhead

# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

# print("Loading Qwen3 32B...")
# qwen_model_id = "Qwen/Qwen2.5-32B-Instruct"  # Using Qwen 2.5 as Qwen 3 not available yet

# qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_id)
# qwen_model = AutoModelForCausalLM.from_pretrained(
#     qwen_model_id,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
#     load_in_4bit=True,  # Use 4-bit to fit in memory
# )
# print(f"Qwen loaded")

## 6. Memory Usage Summary

In [None]:
import gc

# Snapshot of PyTorch's CUDA memory counters, converted from bytes to GB
# (1e9 bytes). "Max allocated" is the peak since the process started or
# since the peak stats were last reset.
allocated_gb = torch.cuda.memory_allocated() / 1e9
reserved_gb = torch.cuda.memory_reserved() / 1e9
peak_allocated_gb = torch.cuda.max_memory_allocated() / 1e9

print("GPU Memory Usage:")
print("=" * 40)
print(f"Allocated: {allocated_gb:.2f} GB")
print(f"Reserved: {reserved_gb:.2f} GB")
print(f"Max allocated: {peak_allocated_gb:.2f} GB")

# Free memory if needed
# del fg_model, gemma_model
# gc.collect()
# torch.cuda.empty_cache()

## Summary

**Results:**
- FunctionGemma router: ~X ms average latency
- Direct tool calls: ~X ms total (no LLM needed)
- LLM conversation: ~X ms total

**Recommendations:**
1. Use FunctionGemma 270M for routing (very fast)
2. Use Gemma 2 9B for most conversations
3. Reserve Qwen 32B for complex edge cases

**Next Steps:**
- Deploy FunctionGemma on Ollama for production
- Use Groq API for Llama 3.1 8B (free tier)
- Consider RunPod/Modal for Gemma 3 12B hosting