In [None]:
# Emit a fixed confirmation message when this cell is executed.
print('Setup complete.')

# Observability & Caching - Demo (AskSage Edition)

**Focus**: prompt hashing, read-through cache, TTLs, cache busting with AskSage's gpt-5-mini

This notebook demonstrates advanced caching strategies for LLM applications to improve performance and reduce costs using AskSage's Python client.

## Learning Objectives
- Understand prompt hashing for cache keys
- Implement read-through caching patterns
- Configure TTL (Time-To-Live) for cache entries
- Implement cache busting strategies
- Use AskSage Python client with gpt-5-mini

In [None]:
# Install required packages
!pip install asksageclient diskcache pandas

import os
import time
import hashlib
import json
from datetime import datetime, timedelta
from typing import Optional, Dict, Any
import diskcache as dc
import pandas as pd
from asksageclient import AskSageClient

# Initialize AskSage client
# Set your AskSage credentials as environment variables:
# export ASKSAGE_API_KEY="your-api-key"
# export ASKSAGE_TENANT="your-tenant"
# export ASKSAGE_USERNAME="your-username"
#
# os.getenv returns None for a variable that is not set, so a missing
# credential reaches the client as None instead of failing on this line.
# NOTE(review): confirm that api_key/tenant/username are the keyword names
# accepted by the installed asksageclient constructor — TODO verify.

ask_sage_client = AskSageClient(
    api_key=os.getenv("ASKSAGE_API_KEY"),
    tenant=os.getenv("ASKSAGE_TENANT"),
    username=os.getenv("ASKSAGE_USERNAME")
)

# The model list is read from the 'response' field of get_models(); only the
# first five entries are printed.
print("✅ All packages installed and AskSage client initialized!")
print(f"Available models: {ask_sage_client.get_models()['response'][:5]}...")  # Show first 5 models

## 1. Prompt Hashing

First, we'll implement a system to generate consistent hash keys from prompts for caching.

In [None]:
class PromptHasher:
    """Generate consistent hash keys from prompts for caching."""

    @staticmethod
    def hash_prompt(prompt: str, model: str = "gpt-5-mini", **kwargs) -> str:
        """Return a short, deterministic cache key for a request.

        Args:
            prompt: The prompt text.
            model: The model name the prompt is sent to.
            **kwargs: Any further request parameters. Every keyword argument
                takes part in the key, so two requests that differ in any
                parameter never share a cache entry.

        Returns:
            The first 16 hex characters of the SHA-256 digest of the
            JSON-serialized request description.
        """
        prompt_data = {
            "model": model,
            "prompt": prompt,
            # All parameters are hashed: leaving one out would let requests
            # that differ only in that parameter collide on the same key.
            "params": dict(kwargs),
        }

        # sort_keys makes the serialization independent of keyword order;
        # default=repr keeps values json cannot encode from raising here.
        prompt_str = json.dumps(prompt_data, sort_keys=True, default=repr)

        # Truncated SHA-256 keeps keys short while remaining deterministic.
        return hashlib.sha256(prompt_str.encode("utf-8")).hexdigest()[:16]

# Exercise the hasher with two identical prompts and one different prompt
hasher = PromptHasher()

# Sample inputs: prompt1 and prompt1_duplicate carry the same text
prompt1, prompt2 = "What is the capital of France?", "What is the capital of Germany?"
prompt1_duplicate = "What is the capital of France?"

# Hash each prompt in the order it was defined above
hash1, hash2, hash1_dup = (
    hasher.hash_prompt(text) for text in (prompt1, prompt2, prompt1_duplicate)
)

# Report every prompt next to its hash
print(f"Prompt 1: '{prompt1}'", f"Hash 1:   {hash1}", sep="\n")
print(f"\nPrompt 2: '{prompt2}'", f"Hash 2:   {hash2}", sep="\n")
print(
    f"\nPrompt 1 (duplicate): '{prompt1_duplicate}'",
    f"Hash 1 (duplicate):   {hash1_dup}",
    sep="\n",
)

# Equal text must give equal keys; different text must give different keys
print(
    f"\n✅ Same prompts produce identical hashes: {hash1 == hash1_dup}",
    f"✅ Different prompts produce different hashes: {hash1 != hash2}",
    sep="\n",
)

## 2. Read-Through Cache Implementation

Implement a read-through cache that automatically fetches and caches LLM responses.

In [None]:
class LLMCache:
    """Read-through cache for LLM responses with TTL support.

    Entries are stored through ``self.cache`` (a ``diskcache.Cache`` opened on
    ``cache_dir``). Hit and miss counters are kept on the instance.
    """

    def __init__(self, cache_dir: str = "/tmp/llm_cache_asksage", default_ttl: int = 3600):
        """Initialize cache with disk storage and default TTL.

        Args:
            cache_dir: Directory handed to ``dc.Cache``.
            default_ttl: TTL in seconds used when ``set`` is called without one.
        """
        self.cache = dc.Cache(cache_dir)
        self.default_ttl = default_ttl
        self.hasher = PromptHasher()

        # Statistics
        self.hits = 0
        self.misses = 0

    def get_cache_key(self, prompt: str, model: str, **kwargs) -> str:
        """Generate cache key for the request by delegating to the hasher."""
        return self.hasher.hash_prompt(prompt, model, **kwargs)

    def get(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Return the cached item for ``cache_key``, or None on a miss.

        A lookup counts as a miss when the key is absent, when the stored
        ``expires_at`` timestamp lies in the past (the entry is then deleted),
        or when the lookup raises. Everything else counts as a hit.
        """
        try:
            cached_item = self.cache.get(cache_key)
            if cached_item is None:
                self.misses += 1
                return None

            # Check the TTL recorded inside the item itself.
            expires_at = cached_item.get('expires_at') if 'expires_at' in cached_item else None
            if expires_at is not None and datetime.now() > datetime.fromisoformat(expires_at):
                self.cache.delete(cache_key)
                self.misses += 1
                return None

            self.hits += 1
            return cached_item
        except Exception as e:
            # Best effort: a failing cache is reported and treated as a miss.
            print(f"Cache get error: {e}")
            self.misses += 1
            return None

    def set(self, cache_key: str, response: str, ttl: Optional[int] = None) -> None:
        """Store ``response`` under ``cache_key`` together with TTL metadata.

        Args:
            cache_key: Key to store under.
            response: Response text to cache.
            ttl: Lifetime in seconds; a falsy value (None or 0) falls back to
                ``default_ttl``.
        """
        ttl = ttl or self.default_ttl
        # Read the clock once so expires_at is exactly cached_at + ttl.
        now = datetime.now()

        cache_item = {
            'response': response,
            'cached_at': now.isoformat(),
            'expires_at': (now + timedelta(seconds=ttl)).isoformat(),
            'ttl': ttl
        }

        try:
            self.cache.set(cache_key, cache_item, expire=ttl)
        except Exception as e:
            # Best effort: a failed write is reported, not raised.
            print(f"Cache set error: {e}")

    def bust_cache(self, pattern: Optional[str] = None) -> int:
        """Clear cache entries and return how many were removed.

        Args:
            pattern: Optional shell-style glob (``*``, ``?``, ``[seq]``) that
                is matched case-sensitively against string keys. When given,
                only matching entries are removed; when omitted or empty, the
                whole cache is cleared.

        Returns:
            The number of entries removed.
        """
        from fnmatch import fnmatchcase

        if not pattern:
            cleared = len(self.cache)
            self.cache.clear()
            return cleared

        cleared = 0
        # Snapshot the keys first so entries can be deleted while walking them.
        for key in list(self.cache):
            if not (isinstance(key, str) and fnmatchcase(key, pattern)):
                continue
            try:
                del self.cache[key]
            except KeyError:
                # Entry vanished between the snapshot and the delete.
                continue
            cleared += 1
        return cleared

    def get_stats(self) -> Dict[str, Any]:
        """Get cache statistics: hits, misses, total, hit rate and entry count."""
        total_requests = self.hits + self.misses
        hit_rate = (self.hits / total_requests * 100) if total_requests > 0 else 0

        return {
            'hits': self.hits,
            'misses': self.misses,
            'total_requests': total_requests,
            'hit_rate': f"{hit_rate:.1f}%",
            'cache_size': len(self.cache)
        }

# Initialize cache
# Module-level cache instance used by the cells below. default_ttl is given in
# seconds (300 s = 5 minutes); cache_dir is left at the class default.
cache = LLMCache(default_ttl=300)  # 5 minute TTL
print("✅ LLM Cache initialized with 5-minute TTL")
print(f"Cache stats: {cache.get_stats()}")

## 3. Cached AskSage LLM Wrapper

Create a wrapper that integrates caching with AskSage LLM calls.

In [None]:
class CachedAskSageLLM:
    """AskSage LLM wrapper with integrated caching.

    Answers a prompt from the supplied ``LLMCache`` when an entry exists and
    otherwise calls the client, stores the answer and returns it.
    """

    def __init__(self, client: AskSageClient, cache: LLMCache, model: str = "gpt-5-mini"):
        """Keep the client, the cache and the model name used for every query."""
        self.client = client
        self.cache = cache
        self.model = model

    @staticmethod
    def _extract_text(response) -> str:
        """Return the answer text found in a client response.

        The fields are tried in order: 'ret', 'message', 'response'. If none
        is present the whole response is stringified.
        """
        # NOTE(review): AskSage query payloads are understood to carry the
        # generated text under 'message' (with 'response' holding a status
        # value) — confirm against the asksageclient documentation.
        for field in ('ret', 'message', 'response'):
            if field in response:
                return response[field]
        return str(response)

    def query(self, prompt: str, **kwargs) -> str:
        """Query AskSage with caching (read-through pattern).

        Args:
            prompt: Prompt text to send.
            **kwargs: Forwarded to the client call and to cache-key generation.

        Returns:
            The cached or freshly fetched response text.

        Raises:
            Exception: Whatever the client call raises is reported and
                re-raised unchanged; nothing is cached in that case.
        """
        # Generate cache key
        cache_key = self.cache.get_cache_key(prompt, self.model, **kwargs)

        print(f"🔍 Cache key: {cache_key}")

        # Try to get from cache first (read-through)
        cached_result = self.cache.get(cache_key)
        if cached_result:
            print(f"💾 Cache HIT! Retrieved from cache (TTL: {cached_result['ttl']}s)")
            return cached_result['response']

        print("🌐 Cache MISS! Calling AskSage...")

        # Cache miss - call the actual LLM
        start_time = time.perf_counter()  # monotonic clock for durations
        try:
            # The prompt is passed positionally so the call does not depend on
            # the keyword name the client uses for it.
            # NOTE(review): assumes the prompt is the first positional
            # parameter of client.query — confirm against asksageclient.
            response = self.client.query(
                prompt,
                model=self.model,
                **kwargs
            )

            response_text = self._extract_text(response)

            call_duration = time.perf_counter() - start_time

            print(f"⚡ AskSage call completed in {call_duration:.2f}s")

            # Store in cache
            self.cache.set(cache_key, response_text)
            print(f"💾 Response cached with key: {cache_key}")

            return response_text

        except Exception as e:
            print(f"❌ AskSage call failed: {e}")
            raise

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get current cache statistics from the underlying cache."""
        return self.cache.get_stats()

    def bust_cache(self) -> int:
        """Clear the whole cache and return the number of removed entries."""
        return self.cache.bust_cache()

# Initialize cached AskSage LLM
# Module-level wrapper used by the cells below; it combines the client and the
# cache instance created in the earlier cells.
cached_llm = CachedAskSageLLM(ask_sage_client, cache, model="gpt-5-mini")

print("✅ Cached AskSage LLM wrapper initialized with gpt-5-mini")

## 4. Testing Read-Through Cache

Demonstrate the cache in action with repeated queries.

In [None]:
# Prompts for the demo; the third repeats the first to trigger a cache hit
questions = [
    "What is the capital of Japan?",
    "Explain quantum computing in one sentence.",
    "What is the capital of Japan?",  # Duplicate to test cache
    "Name three benefits of caching in software systems.",
]

# Banner followed by a 60-character rule
print("🚀 Testing read-through cache behavior with AskSage gpt-5-mini\n", "=" * 60, sep="\n")

for i, question in enumerate(questions, start=1):
    # Header for this question, then a 50-character rule
    print(f"\n📝 Question {i}: {question}", "-" * 50, sep="\n")

    response = cached_llm.query(question)

    # Show at most the first 100 characters, the running stats and a blank line
    print(
        f"🤖 Response: {response[:100]}{'...' if len(response) > 100 else ''}",
        f"📊 Cache Stats: {cached_llm.get_cache_stats()}",
        "",
        sep="\n",
    )

# Closing rule and the counters accumulated over the whole run
print(f"\n{'=' * 60}")
print(f"📈 Final Cache Statistics: {cached_llm.get_cache_stats()}")

## 5. TTL (Time-To-Live) Demonstration

Show how cache entries expire based on TTL settings.

In [None]:
# Create a new cache with very short TTL for demonstration
# A separate cache directory keeps these entries apart from the main cache.
short_ttl_cache = LLMCache(cache_dir="/tmp/llm_cache_short_asksage", default_ttl=5)  # 5 seconds
cached_llm_short = CachedAskSageLLM(ask_sage_client, short_ttl_cache, model="gpt-5-mini")

print("⏰ Testing TTL behavior with AskSage (5-second expiration)\n")

# The same prompt is sent three times so only the cache state changes.
test_question = "What is machine learning?"

print(f"📝 Question: {test_question}")
print("-" * 50)

# First call - should be a cache miss
print("\n🔥 First call:")
response1 = cached_llm_short.query(test_question)
print(f"Stats: {cached_llm_short.get_cache_stats()}")

# Immediate second call - should be a cache hit
print("\n🔄 Immediate second call:")
response2 = cached_llm_short.query(test_question)
print(f"Stats: {cached_llm_short.get_cache_stats()}")

# Sleep one second longer than the 5-second TTL so the entry has expired.
print("\n⏳ Waiting 6 seconds for cache to expire...")
time.sleep(6)

# Third call after TTL expiration - should be a cache miss
print("\n🔥 Third call (after TTL expiration):")
response3 = cached_llm_short.query(test_question)
print(f"Stats: {cached_llm_short.get_cache_stats()}")

print("\n✅ TTL demonstration complete!")

## 6. Cache Busting Strategies

Demonstrate different approaches to invalidating cache entries.

In [None]:
# Prompts used to populate the shared cache before it is cleared
test_questions = [
    "What is artificial intelligence?",
    "How does blockchain work?",
    "Explain cloud computing.",
]

print("💥 Cache Busting Demonstration\n", "📚 Filling cache with multiple entries...", sep="\n")
for question in test_questions:
    cached_llm.query(question)

# Stats after filling, then the heading for the manual-clear strategy
print(
    f"\n📊 Cache after filling: {cached_llm.get_cache_stats()}",
    "\n🗑️  Strategy 1: Manual cache clear",
    sep="\n",
)

# Strategy 1: drop every entry in one call and report how many were removed
cleared_count = cached_llm.bust_cache()
print(
    f"Cleared {cleared_count} cache entries",
    f"📊 Cache after busting: {cached_llm.get_cache_stats()}",
    sep="\n",
)

# Strategy 2: heading for the version-based approach shown next
print("\n🔄 Strategy 2: Version-based cache keys")

class VersionedCache(LLMCache):
    """Cache with version-based keys for easy invalidation.

    Every key is prefixed with the current version label, so changing the
    label makes all keys written under the previous label unreachable.
    """

    def __init__(self, *args, version="v1", **kwargs):
        """Create the cache; all other arguments go to ``LLMCache``.

        Args:
            version: Initial version label used as key prefix.
        """
        super().__init__(*args, **kwargs)
        self.version = version

    def get_cache_key(self, prompt: str, model: str, **kwargs) -> str:
        """Return the base cache key prefixed with ``"<version>:"``."""
        base_key = super().get_cache_key(prompt, model, **kwargs)
        return f"{self.version}:{base_key}"

    def increment_version(self):
        """Increment version to invalidate all existing cache entries.

        A label of the form ``v<number>`` becomes ``v<number + 1>``. Any other
        label (no ``v`` prefix, or a non-numeric suffix such as ``"vbeta"``)
        is treated as version 1 and therefore becomes ``"v2"``.
        """
        try:
            current_num = int(self.version[1:]) if self.version.startswith('v') else 1
        except ValueError:
            # Non-numeric suffix: use the same fallback as for labels that do
            # not start with 'v' instead of crashing.
            current_num = 1
        self.version = f"v{current_num + 1}"
        print(f"🆕 Cache version updated to: {self.version}")

# Test versioned cache
versioned_cache = VersionedCache(cache_dir="/tmp/llm_cache_versioned_asksage", version="v1")
versioned_llm = CachedAskSageLLM(ask_sage_client, versioned_cache, model="gpt-5-mini")

# The cache directory persists on disk between runs, so entries written by an
# earlier run (including its "v2" entry) could still be live. Start from an
# empty cache so the miss expected below really is a miss.
versioned_cache.bust_cache()

# Add entry with v1
test_prompt = "What is Python?"
versioned_llm.query(test_prompt)
print(f"📊 V1 cache: {versioned_llm.get_cache_stats()}")

# Increment version (effectively busts cache)
versioned_cache.increment_version()

# Same query with v2 - should be cache miss
versioned_llm.query(test_prompt)
print(f"📊 V2 cache: {versioned_llm.get_cache_stats()}")

print("\n✅ Cache busting strategies demonstrated!")

## 7. Performance Comparison

Compare performance with and without caching using AskSage.

In [None]:
import statistics

print("🏁 Performance Comparison: Cached vs Uncached AskSage calls\n")

# Test questions (some duplicates to show cache benefits)
performance_questions = [
    "What is the meaning of life?",
    "How does photosynthesis work?",
    "What is the meaning of life?",  # Duplicate
    "Explain the theory of relativity.",
    "How does photosynthesis work?",  # Duplicate
    "What is quantum mechanics?",
    "What is the meaning of life?",  # Another duplicate
]

# Test without cache (regular AskSage client)
print("⚡ Testing WITHOUT cache...")
uncached_times = []
for i, question in enumerate(performance_questions, 1):
    # perf_counter is monotonic, so durations are immune to clock adjustments.
    start_time = time.perf_counter()
    # Prompt passed positionally so the call does not depend on the client's
    # keyword name for it.
    # NOTE(review): assumes the prompt is the first positional parameter of
    # the client's query method — confirm against asksageclient.
    ask_sage_client.query(question, model="gpt-5-mini")
    duration = time.perf_counter() - start_time
    uncached_times.append(duration)
    print(f"  Query {i}: {duration:.2f}s")

print("\n💾 Testing WITH cache...")
# Reset cache for fair comparison
cache.bust_cache()
cached_times = []
for i, question in enumerate(performance_questions, 1):
    start_time = time.perf_counter()
    cached_llm.query(question)
    duration = time.perf_counter() - start_time
    cached_times.append(duration)
    print(f"  Query {i}: {duration:.2f}s")

# Calculate statistics
uncached_total = sum(uncached_times)
cached_total = sum(cached_times)
uncached_avg = statistics.mean(uncached_times)
cached_avg = statistics.mean(cached_times)

print("\n" + "="*50)
print("📊 PERFORMANCE RESULTS")
print("="*50)
print("Without Cache:")
print(f"  Total Time: {uncached_total:.2f}s")
print(f"  Average per query: {uncached_avg:.2f}s")
print("\nWith Cache:")
print(f"  Total Time: {cached_total:.2f}s")
print(f"  Average per query: {cached_avg:.2f}s")
print("\n🚀 Performance Improvement:")
print(f"  Time saved: {uncached_total - cached_total:.2f}s")
# Guard the ratio: a cached run with no measurable duration would otherwise
# raise ZeroDivisionError.
if cached_total > 0:
    print(f"  Speed improvement: {((uncached_total / cached_total - 1) * 100):.1f}%")
else:
    print("  Speed improvement: n/a (cached run took no measurable time)")
print(f"\n📈 Final cache stats: {cached_llm.get_cache_stats()}")

## Summary

In this demo, we covered key caching concepts for LLM applications using AskSage's gpt-5-mini:

### 1. **Prompt Hashing**
- Created consistent hash keys from prompt components
- Included model parameters in hash calculation
- Ensured identical prompts produce identical cache keys

### 2. **Read-Through Cache**
- Implemented automatic cache population on cache misses
- Used disk-based storage for persistence
- Provided transparent caching interface

### 3. **TTL (Time-To-Live)**
- Configured automatic cache expiration
- Prevented stale data from being served
- Balanced performance with freshness

### 4. **Cache Busting**
- Manual cache clearing for immediate invalidation
- Version-based keys that invalidate every existing entry at once (old entries stay on disk until their TTL expires)
- Strategic cache management for different scenarios

### 5. **AskSage Integration**
- Used AskSage Python client with gpt-5-mini model
- Maintained API compatibility while adding caching
- Demonstrated enterprise-grade LLM usage patterns

### Key Benefits Demonstrated:
- **Performance**: Significant reduction in response times for repeated queries
- **Cost Savings**: Reduced API calls to expensive LLM services
- **Scalability**: Better handling of high-frequency, repetitive requests
- **Reliability**: Consistent responses for identical inputs
- **Enterprise Ready**: Integration with AskSage's enterprise platform