In [None]:
# Emit a marker line so it is visible in the output that this cell has run.
print('Setup complete.')

# AI Assistant Comparison

## Learning Objectives
- See different AI models tackle the same real-world problem
- Compare their approaches, speed, and quality
- Understand how streaming affects user experience
- Learn about token budgets and cost optimization

## The Demo: Model Comparison in Action

We'll test multiple AI models on the same challenging task:
1. **Problem Setup** - A complex business scenario
2. **Model Testing** - Different AI assistants tackle the problem
3. **Performance Analysis** - Speed, quality, and cost comparison
4. **Streaming Demo** - Real-time vs batch processing
5. **Budget Impact** - Cost implications of different approaches

In [None]:
# Setup and imports
!pip install asksageclient pip_system_certs
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import time
import tiktoken
from pathlib import Path
from typing import Dict, List, Any

# Import our AskSage client
from asksageclient import AskSageClient

# Pull the AskSage credentials out of the Google Colab secrets store
from google.colab import userdata
api_key, email = (
    userdata.get(secret_name)
    for secret_name in ("ASKSAGE_API_KEY", "ASKSAGE_EMAIL")
)

# Build the API client, then the tokenizer used for token counting
client = AskSageClient(email=email, api_key=api_key)
tokenizer = tiktoken.encoding_for_model("gpt-4")
for status_line in (
    "AskSage client initialized successfully",
    "Ready to showcase AI capabilities...",
):
    print(status_line)

## Test Scenario: Market Analysis Challenge

**Business Problem**: A startup needs to analyze market opportunities for a new product

**Requirements**:
- Analyze market size and growth potential
- Identify key competitors and their strategies
- Recommend go-to-market approach
- Assess risks and mitigation strategies
- Provide actionable next steps

Let's see how different models handle this complex analysis...

In [None]:
# Define the test prompt
# This one five-section brief is passed as the `message` of every model test
# in this notebook, so all responses answer exactly the same request.
# The text is runtime data sent to the model: edit it only on purpose.
market_analysis_prompt = """
MARKET ANALYSIS REQUEST:

Our startup is developing an AI-powered personal finance app targeting millennials and Gen Z. 
We need a comprehensive market analysis including:

1. MARKET SIZE & OPPORTUNITY
   - Total addressable market (TAM)
   - Serviceable addressable market (SAM)
   - Market growth trends and drivers

2. COMPETITIVE LANDSCAPE
   - Direct and indirect competitors
   - Their strengths, weaknesses, and market positioning
   - Pricing strategies and business models

3. TARGET CUSTOMER ANALYSIS
   - Customer segments and personas
   - Pain points and unmet needs
   - Buying behavior and decision factors

4. GO-TO-MARKET STRATEGY
   - Recommended market entry approach
   - Distribution channels and partnerships
   - Marketing and customer acquisition strategies

5. RISK ASSESSMENT
   - Market risks and challenges
   - Regulatory considerations
   - Mitigation strategies

Provide actionable insights with specific recommendations and next steps.
"""

# Tokenize the prompt once and keep its length as the input-token count.
encoded_prompt = tokenizer.encode(market_analysis_prompt)
prompt_tokens = len(encoded_prompt)
print(f"Test prompt: {prompt_tokens} tokens")
print("\nPreparing to test multiple AI models...")

## Model 1: GPT-4o-mini (Speed Optimized)

Testing the fastest model for quick iterations:

In [None]:
# Test GPT-5-mini
# NOTE(review): the section heading for this cell says "GPT-4o-mini", but the
# request passes model="gpt-5-mini" — confirm which model is intended.
print("=== TESTING GPT-5-mini ===")
start_time = time.time()

response_mini = client.query(
    message=market_analysis_prompt,
    system_prompt="You are concise.",
    temperature=0.2,
    model="gpt-5-mini",
    live=0,
    limit_references=0,
)

# Stop the clock right after the call so tokenization is not counted as latency.
mini_time = time.time() - start_time

# Extract the text from the raw response object. The earlier version called
# mini_response.get(...) before mini_response existed, which raised NameError.
mini_response = response_mini.get("message").strip()
mini_tokens = len(tokenizer.encode(mini_response))

print(f"Response time: {mini_time:.2f} seconds")
print(f"Output tokens: {mini_tokens}")
print(f"Tokens per second: {mini_tokens/mini_time:.1f}")
print("\nResponse preview:")
print(mini_response[:500] + "...")
print("\n" + "="*60)

## Model 2: GPT-4o (Balanced Performance)

Testing the balanced model for quality and speed:

In [None]:
# Test GPT-5-mini
# NOTE(review): the section heading for this cell says "GPT-4o", but the
# request passes model="gpt-5-mini" — confirm which model is intended.
print("=== TESTING GPT-5-mini ===")
start_time = time.time()

response_4o = client.query(
    message=market_analysis_prompt,
    system_prompt="You are concise.",
    temperature=0.2,
    model="gpt-5-mini",
    live=0,
    limit_references=0,
)

# Stop the clock right after the call so tokenization is not counted as latency.
gpt4o_time = time.time() - start_time

# Extract the text from the raw response object. The earlier version called
# gpt4o_response.get(...) before gpt4o_response existed, which raised NameError.
gpt4o_response = response_4o.get("message").strip()
gpt4o_tokens = len(tokenizer.encode(gpt4o_response))

print(f"Response time: {gpt4o_time:.2f} seconds")
print(f"Output tokens: {gpt4o_tokens}")
print(f"Tokens per second: {gpt4o_tokens/gpt4o_time:.1f}")
print("\nResponse preview:")
print(gpt4o_response[:500] + "...")
print("\n" + "="*60)

## Model 3: Claude (Alternative Approach)

Testing Claude for different reasoning patterns:

In [None]:
# Test the third model (if available)
# The previous version of this cell could not be parsed: the `try:` body was
# not indented, and names such as `gpt-5-mini_time` are subtraction
# expressions rather than identifiers. Results now live in claude_time,
# claude_tokens and claude_response.
# NOTE(review): the section heading says "Claude", but the request passes
# model="gpt-5-mini" — confirm which model is intended.
print("=== TESTING CLAUDE ===")

try:
    print("=== TESTING GPT-5-mini ===")
    # Start timing immediately before the request so only the call is measured.
    start_time = time.time()

    response_claude = client.query(
        message=market_analysis_prompt,
        system_prompt="You are concise.",
        temperature=0.2,
        model="gpt-5-mini",
        live=0,
        limit_references=0,
    )

    claude_time = time.time() - start_time
    claude_response = response_claude.get("message").strip()
    claude_tokens = len(tokenizer.encode(claude_response))

    print(f"Response time: {claude_time:.2f} seconds")
    print(f"Output tokens: {claude_tokens}")
    print(f"Tokens per second: {claude_tokens/claude_time:.1f}")
    print("\nResponse preview:")
    print(claude_response[:500] + "...")

except Exception as e:
    # Deliberately broad: this optional model may be missing, and the
    # exception types raised by the client are not visible from this cell.
    # Zeroed metrics mark the run as "not performed".
    print(f"GPT-5-mini not available: {e}")
    claude_time = 0
    claude_tokens = 0
    claude_response = "Model not available"

print("\n" + "="*60)

## Streaming vs Non-Streaming Comparison

Let's see how streaming affects the user experience:

In [None]:
# Simulate streaming vs non-streaming experience
print("=== STREAMING VS NON-STREAMING DEMO ===")

# Non-streaming: reuse the timing and token count measured for the first model
print("\nNON-STREAMING EXPERIENCE:")
print(f"  User waits: {mini_time:.2f} seconds")
print(f"  Then receives: {mini_tokens} tokens instantly")
print("  User experience: Wait... then flood of information")

# Streaming: illustrative figures only, no streaming request is made here
print("\nSTREAMING EXPERIENCE (simulated):")
print("  First token appears: ~0.5 seconds")
print(f"  Tokens stream at: ~{mini_tokens/mini_time:.0f} tokens/second")
print("  User experience: Immediate feedback, progressive revelation")

# Demonstrate with a shorter example
short_prompt = "Explain the key benefits of AI-powered personal finance apps in 3 bullet points."

print("\n=== STREAMING SIMULATION ===")
print("Sending short request to demonstrate streaming...")

print("=== TESTING GPT-5-mini ===")
start_time = time.time()

stream_response = client.query(
    message=short_prompt,
    system_prompt="You are concise.",
    temperature=0.1,
    model="gpt-5-mini",
    live=0,
    limit_references=0,
)

stream_time = time.time() - start_time

# Keep the raw response, its text and its token count in three separate
# names. The earlier version rebound stream_response to a str and then
# overwrote stream_content with an int, so .get() and .split() both failed.
stream_content = stream_response.get("message").strip()
stream_tokens = len(tokenizer.encode(stream_content))

print(f"\nComplete response ({stream_time:.2f}s):")
print(stream_content)

# Simulate how this would appear with streaming
print("\nHow this would appear with streaming:")
words = stream_content.split()
for i, word in enumerate(words[:20]):  # Show first 20 words
    # Every fifth word, start a new line stamped with a nominal elapsed time.
    if i % 5 == 0 and i > 0:
        print(f"\n[{i*0.1:.1f}s] ", end="")
    print(word, end=" ")
    time.sleep(0.05)  # Simulate streaming delay
print("...")

## Budget and Cost Analysis

Let's analyze the cost implications of different models and approaches:

In [None]:
# Cost analysis (using approximate pricing)
print("=== COST ANALYSIS ===")

# Approximate pricing in USD per 1K tokens (as of 2024).
# Every tier needs its own key: the previous dict used "gpt-5-mini" twice, so
# the second entry silently replaced the first and one tier was lost.
# NOTE(review): the query cells in this notebook pass model="gpt-5-mini"; the
# keys below name the models these prices presumably belong to (they match
# the section headings) — confirm the mapping before relying on the figures.
pricing = {
    "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
    "gpt-4o": {"input": 0.0025, "output": 0.01},
    "claude-3-sonnet": {"input": 0.003, "output": 0.015},
}


def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the estimated cost of one request in USD.

    Args:
        model: Key into the module-level ``pricing`` table.
        input_tokens: Number of prompt tokens sent.
        output_tokens: Number of completion tokens received.

    Returns:
        Input cost plus output cost at the per-1K-token rates in ``pricing``,
        or 0 when ``model`` has no pricing entry.
    """
    rates = pricing.get(model)
    if rates is None:
        return 0
    input_cost = (input_tokens / 1000) * rates["input"]
    output_cost = (output_tokens / 1000) * rates["output"]
    return input_cost + output_cost


# Calculate costs for our test: (pricing key, seconds elapsed, output tokens)
models_tested = [
    ("gpt-4o-mini", mini_time, mini_tokens),
    ("gpt-4o", gpt4o_time, gpt4o_tokens),
]

# The third model is optional. The earlier check `gpt-5-mini_tokens > 0`
# parsed as `gpt - 5 - mini_tokens > 0` and raised NameError, so look the
# results up defensively and skip the row when that test did not run.
try:
    claude_run = ("claude-3-sonnet", claude_time, claude_tokens)
except NameError:
    claude_run = None

if claude_run is not None and claude_run[2] > 0:
    models_tested.append(claude_run)

print("\nCOST COMPARISON (per request):")
print(f"Input tokens: {prompt_tokens}")
print("-" * 50)

for model, response_time, output_tokens in models_tested:
    cost = calculate_cost(model, prompt_tokens, output_tokens)
    print(f"{model:15} | {response_time:5.2f}s | {output_tokens:4d} tokens | ${cost:.4f}")

print("\nSCALE ANALYSIS (1000 requests/month):")
print("-" * 50)

for model, response_time, output_tokens in models_tested:
    cost_per_request = calculate_cost(model, prompt_tokens, output_tokens)
    monthly_cost = cost_per_request * 1000
    monthly_time = (response_time * 1000) / 3600  # Convert to hours
    print(f"{model:15} | {monthly_time:5.1f}h | ${monthly_cost:6.2f}/month")

print("\nBUDGET RECOMMENDATIONS:")
print("- Development/Testing: Use gpt-4o-mini for speed and cost")
print("- Production (Quality): Use gpt-4o for balanced performance")
print("- High-stakes Analysis: Consider premium models for critical decisions")
print("- Streaming: Improves UX with minimal cost impact")

## Quality Assessment

Let's analyze the quality differences between model responses:

In [None]:
# Quality analysis using AI to evaluate responses
# Only the first 300/400 characters of the request and of each response are
# embedded, so the evaluator judges excerpts rather than the full texts.
quality_prompt = f"""
Evaluate these AI responses to a market analysis request on a scale of 1-10 for:
1. Completeness (covers all required sections)
2. Accuracy (realistic data and insights)
3. Actionability (specific, implementable recommendations)
4. Structure (clear organization and flow)
5. Business Value (practical utility for decision-making)

ORIGINAL REQUEST:
{market_analysis_prompt[:300]}...

RESPONSE 1 (GPT-5-mini):
{mini_response[:400]}...

RESPONSE 2 (GPT-5-mini):
{gpt4o_response[:400]}...

Provide scores and brief justification for each criterion.
"""

print("=== TESTING GPT-5-mini ===")

quality_response = client.query(
    message=quality_prompt,
    system_prompt="You are concise.",
    temperature=0.1,
    model="gpt-5-mini",
    live=0,
    limit_references=0,
)

# Keep the assessment text and its token count in separate names. The earlier
# version overwrote quality_analysis with the count, so the cell printed a
# bare integer instead of the evaluation.
quality_analysis = quality_response.get("message").strip()
quality_tokens = len(tokenizer.encode(quality_analysis))

print("=== QUALITY ASSESSMENT ===")
print(quality_analysis)
print("\n" + "="*60)

## Model Comparison Summary

### Performance Metrics:

**Speed Comparison:**
- GPT-4o-mini: Fastest response time, good for iterations
- GPT-4o: Balanced speed and quality
- Claude: Alternative reasoning approach (when available)

**Cost Efficiency:**
- GPT-4o-mini: Most cost-effective for development
- GPT-4o: Higher cost but better quality
- Premium models: Best for critical business decisions

**User Experience:**
- Non-streaming: Wait time then complete response
- Streaming: Immediate feedback, progressive revelation
- Streaming improves perceived performance significantly

### Choosing the Right Model:

**Development Phase:**
- Use fast, cheap models for rapid iteration
- Focus on prompt engineering and workflow

**Production Deployment:**
- Balance cost, speed, and quality requirements
- Consider user experience and business impact

**Critical Applications:**
- Use highest quality models for important decisions
- Implement multiple model validation for accuracy

### Next Steps:
You'll learn to optimize these trade-offs through:
- Prompt engineering techniques
- Token budget management
- Quality assurance patterns
- Cost optimization strategies

In [None]:
# Final comparison summary
print("=== MODEL COMPARISON SUMMARY ===")
print("\nPerformance Results:")
# One line per recorded run: latency, throughput and estimated request cost.
for name, elapsed, n_out in models_tested:
    request_cost = calculate_cost(name, prompt_tokens, n_out)
    throughput = n_out / elapsed
    print(f"  {name:15}: {elapsed:5.2f}s | {throughput:5.0f} tok/s | ${request_cost:.4f}")

# Fixed closing text, grouped as (heading, bullet lines) and printed in order.
closing_sections = (
    (
        "\nKey Insights:",
        (
            "  - Speed vs Quality: Clear trade-offs exist",
            "  - Cost scales significantly with model choice",
            "  - Streaming improves UX without major cost impact",
            "  - Model selection should match use case requirements",
        ),
    ),
    (
        "\nRecommendations:",
        (
            "  - Development: Fast, cheap models for iteration",
            "  - Production: Balanced models for reliability",
            "  - Critical tasks: Premium models for accuracy",
            "  - Always consider streaming for better UX",
        ),
    ),
)
for heading, bullets in closing_sections:
    print(heading)
    for bullet in bullets:
        print(bullet)

print("\nNext: Learn prompt engineering to maximize model performance")