In [None]:
# Prints a confirmation message when this cell is run.
print('Setup complete.')

# Lab 2: Tokenization & Costing

## Objectives
- Build a tokenization analyzer for different text types
- Create a cost estimation calculator for various models
- Generate a comparison table of model costs and performance
- **Exit ticket**: Submit table as `results/costing.json`

## Time Allocation: 45-60 minutes (note: the per-task estimates below add up to 75 minutes)

## Overview
Understanding tokenization and costs is crucial for building production LLM applications. In this lab, you'll build tools to analyze token usage and estimate costs across different providers and models.

## Prerequisites
- Completed Lab 1 (Environment Bootstrap)
- Basic understanding of tokenization from the demo

In [None]:
# Install required packages
!pip install tiktoken pandas matplotlib seaborn tabulate asksageclient

In [None]:
import tiktoken
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
from datetime import datetime
from typing import Dict, List, Tuple
from tabulate import tabulate

print("✅ Libraries imported successfully")

## Task 1: Build a Tokenization Analyzer (20 minutes)

Create a comprehensive tokenization analyzer that works with different encodings.

In [None]:
class TokenizationAnalyzer:
    """Inspect how text is split into tokens by one or more encoders.

    ``self.encoders`` maps a model name to its encoder object. An entry whose
    value is ``None`` is treated the same as a model that is not listed.
    """

    def __init__(self):
        # TODO: Initialize different encoders
        self.encoders = {
        }

        # TODO: Initialize the encoders
        for model, encoder in self.encoders.items():
            try:
                # Fill in the initialization
                pass
            except Exception as e:
                # A failed load is reported but does not abort construction.
                print(f"Warning: Could not load encoder for {model}: {e}")

    def analyze_text(self, text: str, model: str = 'gpt-4') -> Dict:
        """Return tokenization statistics for ``text`` under ``model``.

        If ``model`` has no usable encoder, the result is a dict holding only
        an ``'error'`` message. Otherwise it holds the character and token
        counts, the two ratios, a preview of the text, and the first ten
        tokens / token strings.
        """
        encoder = self.encoders.get(model)
        if encoder is None:
            return {'error': f'Model {model} not available'}

        # TODO: Tokenize the text
        tokens = None
        token_strings = None

        # Calculate statistics
        char_count = len(text)
        token_count = len(tokens)

        # TODO: Calculate tokens per character ratio
        tokens_per_char = None

        # TODO: Find average token length
        avg_token_length = None

        # Texts longer than 50 characters are shortened for display.
        if char_count > 50:
            preview = text[:50] + '...'
        else:
            preview = text

        return {
            'model': model,
            'text_preview': preview,
            'char_count': char_count,
            'token_count': token_count,
            'tokens_per_char': round(tokens_per_char, 3),
            'avg_token_length': round(avg_token_length, 2),
            # Only the first 10 entries are kept, for inspection.
            'tokens': tokens[:10],
            'token_strings': token_strings[:10],
        }

    def compare_models(self, text: str) -> pd.DataFrame:
        """Tabulate the statistics of ``text`` for every usable encoder.

        Models without an encoder, and models whose analysis reports an
        error, are left out of the table.
        """
        rows = []

        for name, enc in self.encoders.items():
            if enc is None:
                continue

            report = self.analyze_text(text, name)
            if 'error' in report:
                continue

            rows.append({
                'Model': name,
                'Tokens': report['token_count'],
                'Characters': report['char_count'],
                'Tokens/Char': report['tokens_per_char'],
                'Avg Token Length': report['avg_token_length'],
            })

        return pd.DataFrame(rows)

# TODO: Create an instance of the analyzer
analyzer = None  # placeholder: replace None with an analyzer instance

# NOTE: this message is printed unconditionally, even while `analyzer` is None.
print("🔍 Tokenization analyzer ready!")

## Task 2: Test Different Text Types (15 minutes)

Analyze how different types of content are tokenized.

In [None]:
# Test cases for different content types
# Each key names a content category; each value is the sample text for it.
test_texts = {
    'simple_english': "Hello, how are you today? I hope you're having a great day!",
    'technical_text': "Machine learning algorithms utilize gradient descent optimization to minimize loss functions through backpropagation in neural networks.",
    'code_snippet': '''def calculate_fibonacci(n):
    if n <= 1:
        return n
    return calculate_fibonacci(n-1) + calculate_fibonacci(n-2)''',
    'json_data': '{"name": "John Doe", "age": 30, "city": "New York", "skills": ["Python", "JavaScript", "SQL"]}',
    'multilingual': "Hello, Bonjour, Hola, Guten Tag, こんにちは, 你好, Здравствуйте",
    'numbers_symbols': "Price: $123.45, Temperature: 72°F, Equation: E=mc², Date: 2024-01-15",
    'long_word': "Pneumonoultramicroscopicsilicovolcanoconiosisantidisestablishmentarianism",
    'repeated_text': "test " * 20,
}

print("📊 Analyzing different text types...\n")

# TODO: Analyze each text type and store results
# Successful per-text results are meant to be collected in this list.
analysis_results = []

for text_type, text in test_texts.items():
    print(f"Analyzing: {text_type}")

    # TODO: Use the analyzer to analyze the text
    result = None  # placeholder: while this is None the error branch below runs

    if result and 'error' not in result:
        # TODO: Add text_type to the result and append to analysis_results
        result['text_type'] = text_type
        # NOTE: nothing is appended to analysis_results here yet; until that
        # step is added the list stays empty and the summary below reports
        # that no results were generated.


        print(f"  - Characters: {result['char_count']}, Tokens: {result['token_count']}, Ratio: {result['tokens_per_char']}")
    else:
        print(f"  - Error analyzing {text_type}")

    print()

# TODO: Create a DataFrame from results for easier analysis
# The summary table is only built when at least one result was collected.
if analysis_results:
    df_analysis = pd.DataFrame(analysis_results)
    print("📈 Analysis Summary:")
    print(df_analysis[['text_type', 'char_count', 'token_count', 'tokens_per_char', 'avg_token_length']].to_string(index=False))
else:
    print("⚠️ No analysis results generated. Check your implementation.")

## Task 3: Build a Cost Calculator (15 minutes)

Create a comprehensive cost calculator for different models and providers.

In [None]:
class LLMCostCalculator:
    """Calculate costs for different LLM providers and models.

    Attributes (set in ``__init__``):
        pricing: provider -> model -> ``{"input_per_m", "output_per_m"}``.
            The ``*_per_m`` keys hold prices per 1,000,000 tokens (not per
            1,000 tokens).
        performance: provider -> model -> approximate tokens per second.
    """

    def __init__(self):
        # Prices per 1M tokens, split into input and output rates.
        self.pricing = {
            'asksage': {
                "gpt-5":      {"input_per_m": 1.25, "output_per_m": 10.00},
                "gpt-5-mini": {"input_per_m": 0.25, "output_per_m": 2.00},
            },
        }

        # Performance characteristics (tokens per second, approximate)
        self.performance = {
            'asksage': {
                'gpt-5': 30,
                'gpt-5-mini': 50,
            },
        }

    def calculate_cost(self, provider: str, model: str, input_tokens: int, output_tokens: int) -> Dict:
        """Calculate cost for a specific model and token usage.

        Args:
            provider: Key into ``self.pricing`` (e.g. ``'asksage'``).
            model: Model name under that provider.
            input_tokens: Number of prompt tokens.
            output_tokens: Number of generated tokens.

        Returns:
            ``{'error': ...}`` when no pricing exists for ``provider``/``model``;
            otherwise a dict with the token counts, the rounded input, output
            and total cost, the estimated time in seconds and the average
            cost per token (0 when both token counts are 0).

        Note:
            The ``None`` placeholders below must be replaced before the
            non-error path works; ``round(None, ...)`` raises ``TypeError``.
        """

        if provider not in self.pricing or model not in self.pricing[provider]:
            return {'error': f'Pricing not available for {provider}/{model}'}

        prices = self.pricing[provider][model]

        # TODO: Calculate costs (prices are per 1M tokens -- see the
        # `input_per_m` / `output_per_m` keys in self.pricing)
        input_cost = None
        output_cost = None
        total_cost = None

        # TODO: Estimate processing time
        # Models without a performance entry fall back to 30 tokens/second.
        tokens_per_second = self.performance.get(provider, {}).get(model, 30)
        estimated_time = None

        total_tokens = input_tokens + output_tokens

        return {
            'provider': provider,
            'model': model,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'input_cost': round(input_cost, 6),
            'output_cost': round(output_cost, 6),
            'total_cost': round(total_cost, 6),
            'estimated_time_seconds': round(estimated_time, 2),
            # Guard against division by zero when no tokens are used.
            'cost_per_token': round(total_cost / total_tokens, 8) if total_tokens > 0 else 0
        }

    def compare_models(self, input_tokens: int, output_tokens: int) -> pd.DataFrame:
        """Compare costs across all available models.

        Returns a DataFrame with one row per priced model, sorted by
        ascending total cost. When there are no rows, an empty DataFrame with
        the same columns is returned instead of raising ``KeyError``.
        """
        columns = [
            'Provider',
            'Model',
            'Total Cost ($)',
            'Cost per 1K tokens ($)',
            'Est. Time (s)',
        ]
        results = []

        for provider, models in self.pricing.items():
            for model in models:
                cost_data = self.calculate_cost(provider, model, input_tokens, output_tokens)
                if 'error' not in cost_data:
                    results.append({
                        'Provider': provider.title(),
                        'Model': model,
                        'Total Cost ($)': cost_data['total_cost'],
                        'Cost per 1K tokens ($)': round(cost_data['cost_per_token'] * 1000, 6),
                        'Est. Time (s)': cost_data['estimated_time_seconds']
                    })

        # Passing `columns` keeps the sort key present even with zero rows.
        df = pd.DataFrame(results, columns=columns)
        return df.sort_values('Total Cost ($)')

# TODO: Create an instance of the cost calculator
cost_calc = None  # placeholder: replace None with a calculator instance

# NOTE: this message is printed unconditionally, even while `cost_calc` is None.
print("💰 Cost calculator ready!")

## Task 4: Generate Cost Comparison Table (15 minutes)

Create a comprehensive comparison table for different usage scenarios.

In [None]:
# Define different usage scenarios
# Each scenario gives the input/output token counts and a short description.
scenarios = {
    'quick_query': {'input_tokens': 50, 'output_tokens': 100, 'description': 'Simple question with short answer'},
    'chat_conversation': {'input_tokens': 200, 'output_tokens': 150, 'description': 'Conversational exchange'},
    'code_generation': {'input_tokens': 300, 'output_tokens': 500, 'description': 'Generate code from description'},
    'document_summary': {'input_tokens': 1000, 'output_tokens': 200, 'description': 'Summarize a document'},
    'large_analysis': {'input_tokens': 2000, 'output_tokens': 1000, 'description': 'Complex analysis task'}
}

print("📊 Generating cost comparison tables...\n")

# TODO: Generate comparison tables for each scenario
# Maps scenario name -> {'scenario': ..., 'comparison': ...}; filled in the
# loop below and read later by create_final_analysis().
comparison_results = {}

for scenario_name, scenario_data in scenarios.items():
    print(f"Scenario: {scenario_name.replace('_', ' ').title()}")
    print(f"Description: {scenario_data['description']}")
    print(f"Input tokens: {scenario_data['input_tokens']}, Output tokens: {scenario_data['output_tokens']}")
    print()

    # TODO: Generate comparison table using cost_calc.compare_models()
    comparison_df = None  # placeholder: while None, the "no data" branch below runs

    if comparison_df is not None and not comparison_df.empty:
        print(comparison_df.to_string(index=False))

        # Store for final results
        # The table is stored as a list of per-row dicts.
        comparison_results[scenario_name] = {
            'scenario': scenario_data,
            'comparison': comparison_df.to_dict('records')
        }
    else:
        print("No comparison data available")

    print("\n" + "="*80 + "\n")

## Task 5: Create Final Results and Visualizations (10 minutes)

In [None]:
# TODO: Create a comprehensive analysis combining tokenization and costing data

def create_final_analysis():
    """Create the final analysis for the exit ticket.

    Reads the notebook-level variables ``comparison_results`` and
    ``analysis_results`` if they exist. A variable that is missing (its cell
    was not run) or empty yields an empty section instead of an error.

    Returns:
        dict with the keys ``lab_info``, ``tokenization_analysis``,
        ``cost_comparisons``, ``key_insights`` and ``recommendations``.
    """
    notebook_vars = globals()

    final_results = {
        'lab_info': {
            'lab_name': 'Tokenization & Costing Analysis',
            'completion_time': datetime.now().isoformat(),
            'student_name': 'Your Name Here'
        },
        'tokenization_analysis': {},
        # Looked up defensively, like analysis_results below, so a skipped
        # cost-comparison cell does not raise NameError here.
        'cost_comparisons': notebook_vars.get('comparison_results', {}),
        'key_insights': [],
        'recommendations': []
    }

    # Add tokenization analysis if available
    tokenization_results = notebook_vars.get('analysis_results')
    if tokenization_results:
        # TODO: Process tokenization results for inclusion
        # Efficiency label by tokens-per-character: < 0.8 is 'High',
        # < 1.2 is 'Medium', anything else is 'Low'.
        tokenization_summary = [
            {
                'text_type': result['text_type'],
                'tokens_per_char': result['tokens_per_char'],
                'efficiency': 'High' if result['tokens_per_char'] < 0.8 else 'Medium' if result['tokens_per_char'] < 1.2 else 'Low'
            }
            for result in tokenization_results
        ]

        final_results['tokenization_analysis'] = {
            'text_types_analyzed': len(tokenization_summary),
            'summary': tokenization_summary
        }

    # TODO: Add key insights based on analysis
    insights = [
        # Add your insights here based on the analysis
        "Different text types have varying tokenization efficiency",
        "Code and technical text typically require more tokens per character",
        "Model costs vary significantly across providers",
        # TODO: Add more insights based on your findings
    ]

    final_results['key_insights'] = insights

    # TODO: Add recommendations
    recommendations = [
        "Use GPT-5-mini for cost-sensitive applications",
        "Reserve GPT-5 for complex reasoning tasks",
        # TODO: Add more recommendations
    ]

    final_results['recommendations'] = recommendations

    return final_results

# Local import: pathlib is not among the notebook's top-of-file imports.
from pathlib import Path

# TODO: Generate the final analysis
final_analysis = create_final_analysis()

# TODO: Save to results/costing.json for exit ticket
# Create the output directory first so the save also works when `results/`
# does not exist yet, and write UTF-8 explicitly.
results_path = Path('results') / 'costing.json'
results_path.parent.mkdir(parents=True, exist_ok=True)
with results_path.open('w', encoding='utf-8') as f:
    json.dump(final_analysis, f, indent=2)

print("✅ Final analysis saved to results/costing.json")
print("\n📋 Exit Ticket: Submit the file 'results/costing.json'")

# Display summary
# Counts come from the saved analysis, so the summary reports what was
# actually analyzed/compared rather than the number of defined inputs.
analyzed_count = final_analysis['tokenization_analysis'].get('text_types_analyzed', 0)
compared_count = len(final_analysis['cost_comparisons'])

print("\n🎯 Lab Summary:")
print(f"- Analyzed {analyzed_count} different text types")
print(f"- Compared costs across {compared_count} usage scenarios")
print(f"- Generated {len(final_analysis['key_insights'])} key insights")
print(f"- Provided {len(final_analysis['recommendations'])} recommendations")

## 🎯 Lab Completion Checklist

Mark each item as complete:

- [ ] **Task 1**: Built tokenization analyzer with multiple encoders
- [ ] **Task 2**: Analyzed different text types (8 categories)
- [ ] **Task 3**: Created comprehensive cost calculator
- [ ] **Task 4**: Generated cost comparison tables for 5 scenarios
- [ ] **Task 5**: Created final analysis with insights and recommendations
- [ ] **Exit Ticket**: Saved results to `results/costing.json`

## 📝 Exit Ticket Submission

Submit the file `results/costing.json` that contains your complete analysis.

## 🚀 What You've Learned

- How different content types affect tokenization efficiency
- Cost structures across major LLM providers
- Performance vs. cost trade-offs for different models
- How to estimate costs for production applications

## 🔄 What's Next

In Lab 3, you'll build a cross-provider ping CLI that puts this cost knowledge into practice with real API calls!

## 🆘 Troubleshooting

**Common Issues:**
- **Tokenizer errors**: Make sure tiktoken is properly installed
- **Missing results**: Check that all TODO sections are completed
- **File save errors**: Ensure the `results` directory exists before saving (e.g. run `import os; os.makedirs('results', exist_ok=True)`)
- **Analysis errors**: Verify your tokenization analyzer is properly initialized