In [None]:
# Confirmation banner for the environment-setup cell
print("Setup complete.")

# AI System Monitoring & Observability

## Learning Objectives
- See real-time AI system monitoring in action
- Watch intelligent alerting and anomaly detection
- Understand AI performance metrics and optimization
- Learn about cost monitoring and resource management

## The Demo: Intelligent AI Operations

We'll demonstrate:
1. **Real-time Monitoring** - Track AI system performance (using simulated metrics in this demo)
2. **Intelligent Alerting** - AI-powered anomaly detection
3. **Cost Optimization** - Monitor and optimize AI spending
4. **Performance Analytics** - Understand usage patterns
5. **Predictive Scaling** - AI-driven resource management

In [None]:
# Setup and imports
import os
import json
import time
import random
from datetime import datetime, timedelta
from typing import Dict, List, Any
import pandas as pd

# Import our AskSage client
import sys
sys.path.append('../../../bootcamp_common')
from ask_sage import AskSageClient

# Create the AskSage client that the analysis cells query
client = AskSageClient()
print(
    "AskSage client initialized successfully",
    "Ready to demonstrate AI system monitoring...",
    sep="\n",
)

## Simulated AI System Metrics

Let's create realistic AI system performance data to monitor: 24 hours of samples at 15-minute intervals, with roughly 5% of them injected as anomalies.

In [None]:
# Generate realistic AI system metrics
def generate_ai_metrics(hours=24, interval_minutes=15, seed=None):
    """Generate simulated AI system metrics for the monitoring demo.

    One record is produced per sampling interval, starting ``hours`` before
    the current local time. Traffic, latency and cost use a higher baseline
    while the sample's hour is between 9 and 17 inclusive, and roughly 5% of
    samples are turned into anomalies (distorted traffic, higher latency and
    cost, lower success rate).

    Args:
        hours: Length of the simulated window in hours. Zero or a negative
            value yields an empty list.
        interval_minutes: Minutes between consecutive samples.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the numeric values are reproducible; otherwise the
            module-level ``random`` generator is used.

    Returns:
        A list of dicts with the keys ``timestamp`` (ISO 8601 string),
        ``requests_per_minute``, ``avg_latency_ms``, ``success_rate``,
        ``token_usage``, ``cost_per_request``, ``total_cost_usd``,
        ``is_anomaly`` and ``business_hours``.

    Raises:
        ValueError: If ``interval_minutes`` is not positive.
    """
    if interval_minutes <= 0:
        raise ValueError("interval_minutes must be positive")

    # A private generator keeps seeded runs reproducible without touching the
    # global random state; unseeded calls behave as before.
    rng = random if seed is None else random.Random(seed)

    metrics = []
    base_time = datetime.now() - timedelta(hours=hours)
    sample_count = max(0, int((hours * 60) // interval_minutes))

    for i in range(sample_count):
        timestamp = base_time + timedelta(minutes=i * interval_minutes)

        # Simulate realistic patterns with some anomalies
        hour = timestamp.hour
        is_business_hours = 9 <= hour <= 17  # i.e. 09:00 through 17:59
        is_anomaly = rng.random() < 0.05  # 5% chance of anomaly

        # Base metrics with realistic patterns
        base_requests = 100 if is_business_hours else 20
        base_latency = 200 if is_business_hours else 150
        base_cost = 0.05 if is_business_hours else 0.01

        # Add noise and anomalies
        requests_per_min = base_requests + rng.randint(-20, 20)
        avg_latency_ms = base_latency + rng.randint(-50, 50)
        cost_per_request = base_cost + rng.uniform(-0.01, 0.01)

        if is_anomaly:
            requests_per_min *= rng.uniform(0.1, 3.0)  # Spike or drop
            avg_latency_ms *= rng.uniform(1.5, 5.0)    # Latency spike
            cost_per_request *= rng.uniform(2.0, 4.0)  # Cost spike

        # Settle on the values that are actually reported before deriving
        # anything from them, so every record is internally consistent
        # (total_cost_usd == requests_per_minute * cost_per_request, rounded).
        requests_per_min = max(0, int(requests_per_min))
        cost_per_request = round(cost_per_request, 4)

        # Calculate derived metrics
        success_rate = rng.uniform(0.95, 1.0) if not is_anomaly else rng.uniform(0.7, 0.95)
        token_usage = requests_per_min * rng.randint(50, 500)
        total_cost = requests_per_min * cost_per_request

        metrics.append({
            "timestamp": timestamp.isoformat(),
            "requests_per_minute": requests_per_min,
            "avg_latency_ms": max(50, int(avg_latency_ms)),
            "success_rate": round(success_rate, 3),
            "token_usage": int(token_usage),
            "cost_per_request": cost_per_request,
            "total_cost_usd": round(total_cost, 2),
            "is_anomaly": is_anomaly,
            "business_hours": is_business_hours
        })

    return metrics

# Build the 24-hour metric series that the later cells analyse
ai_metrics = generate_ai_metrics(24)

first_timestamp = ai_metrics[0]['timestamp']
last_timestamp = ai_metrics[-1]['timestamp']
anomaly_count = len([m for m in ai_metrics if m['is_anomaly']])

print(f"Generated {len(ai_metrics)} metric data points")
print(f"Time range: {first_timestamp} to {last_timestamp}")
print(f"Anomalies included: {anomaly_count}")

# Preview the five most recent samples as a table
print("\nSample metrics:")
df_sample = pd.DataFrame(ai_metrics[-5:])
preview_columns = [
    'timestamp',
    'requests_per_minute',
    'avg_latency_ms',
    'success_rate',
    'total_cost_usd',
]
print(df_sample[preview_columns].to_string(index=False))

## AI-Powered Anomaly Detection

Let's have AI analyze the metrics and detect anomalies automatically:

In [None]:
# AI analyzes metrics for anomalies
import re  # imported up front so the parsing step below never runs without it

recent_metrics = ai_metrics[-20:]  # Last 5 hours (20 samples at 15-minute intervals)

anomaly_prompt = f"""
Analyze these AI system metrics and detect anomalies, performance issues, and optimization opportunities.

Metrics Data (last 5 hours):
{json.dumps(recent_metrics, indent=2)}

Provide analysis in JSON format:
{{
  "system_health": {{
    "overall_status": "Healthy|Warning|Critical",
    "health_score": "number 1-10",
    "summary": "string"
  }},
  "anomalies_detected": [
    {{
      "timestamp": "string",
      "metric": "string",
      "anomaly_type": "Spike|Drop|Trend|Outlier",
      "severity": "High|Medium|Low",
      "description": "string",
      "impact": "string"
    }}
  ],
  "performance_insights": [
    {{
      "metric": "string",
      "trend": "Improving|Stable|Degrading",
      "insight": "string"
    }}
  ],
  "cost_analysis": {{
    "total_cost_period": "number",
    "cost_trend": "string",
    "optimization_potential": "string"
  }},
  "alerts": [
    {{
      "priority": "Critical|High|Medium|Low",
      "message": "string",
      "action_required": "string"
    }}
  ]
}}
"""

print("=== AI ANOMALY DETECTION ===")
anomaly_response = client.query({
    "model": "gpt-4o-mini",
    "messages": [{"role": "user", "content": anomaly_prompt}],
    "temperature": 0.1,
    "max_tokens": 1500
})

anomaly_result = anomaly_response['choices'][0]['message']['content']
print(anomaly_result)

# Parse anomaly analysis.
# The greedy pattern takes everything from the first '{' to the last '}', so
# any prose or code fences around the JSON object are skipped. The model's
# output can still be malformed or cut off by max_tokens, so a decode failure
# is reported instead of aborting the cell.
json_match = re.search(r'\{.*\}', anomaly_result, re.DOTALL)
anomaly_data = None
if json_match:
    try:
        anomaly_data = json.loads(json_match.group())
    except json.JSONDecodeError as exc:
        print(f"\n⚠️ Could not parse anomaly analysis as JSON: {exc}")
else:
    print("\n⚠️ No JSON object found in the anomaly analysis response")

if anomaly_data is not None:
    health = anomaly_data.get('system_health', {})
    anomalies = anomaly_data.get('anomalies_detected', [])
    alerts = anomaly_data.get('alerts', [])

    print(f"\n✓ System Health: {health.get('overall_status', 'Unknown')}")
    print(f"✓ Health Score: {health.get('health_score', 0)}/10")
    print(f"✓ Anomalies Detected: {len(anomalies)}")
    print(f"✓ Active Alerts: {len(alerts)}")

    # Show critical alerts
    critical_alerts = [a for a in alerts if a.get('priority') == 'Critical']
    if critical_alerts:
        print(f"\n🚨 CRITICAL ALERTS:")
        for alert in critical_alerts:
            print(f"  - {alert.get('message', 'Unknown alert')}")
            print(f"    Action: {alert.get('action_required', 'No action specified')}")

## Intelligent Cost Optimization

Let's have AI analyze costs and suggest optimizations:

In [None]:
# AI analyzes costs and suggests optimizations
import re  # local import: this cell must not depend on an earlier cell having imported it

# NOTE(review): the "Total ..." figures below add up the per-sample values
# (one sample per 15-minute interval, each derived from a per-minute request
# rate); confirm these sums are the intended 24-hour totals.
cost_optimization_prompt = f"""
Analyze AI system costs and provide optimization recommendations.

24-Hour Metrics Summary:
- Total requests: {sum(m['requests_per_minute'] for m in ai_metrics)}
- Total cost: ${sum(m['total_cost_usd'] for m in ai_metrics):.2f}
- Average latency: {sum(m['avg_latency_ms'] for m in ai_metrics) / len(ai_metrics):.1f}ms
- Average success rate: {sum(m['success_rate'] for m in ai_metrics) / len(ai_metrics):.3f}
- Total tokens: {sum(m['token_usage'] for m in ai_metrics):,}

Business Hours vs Off-Hours:
Business Hours Metrics:
{json.dumps([m for m in ai_metrics if m['business_hours']][-5:], indent=2)}

Off-Hours Metrics:
{json.dumps([m for m in ai_metrics if not m['business_hours']][-5:], indent=2)}

Provide cost optimization analysis:
{{
  "cost_breakdown": {{
    "total_daily_cost": "number",
    "business_hours_cost": "number",
    "off_hours_cost": "number",
    "cost_per_successful_request": "number"
  }},
  "usage_patterns": [
    {{
      "pattern": "string",
      "cost_impact": "High|Medium|Low",
      "description": "string"
    }}
  ],
  "optimization_opportunities": [
    {{
      "strategy": "string",
      "potential_savings": "string",
      "implementation_effort": "Low|Medium|High",
      "risk_level": "Low|Medium|High",
      "description": "string"
    }}
  ],
  "recommendations": [
    {{
      "action": "string",
      "priority": "High|Medium|Low",
      "expected_savings": "string",
      "timeline": "string"
    }}
  ]
}}
"""

print("=== AI COST OPTIMIZATION ===")
cost_response = client.query({
    "model": "gpt-4o-mini",
    "messages": [{"role": "user", "content": cost_optimization_prompt}],
    "temperature": 0.1,
    "max_tokens": 1500
})

cost_result = cost_response['choices'][0]['message']['content']
print(cost_result)

# Parse cost analysis.
# The model's output can be malformed or cut off by max_tokens, so a decode
# failure is reported instead of aborting the cell.
json_match = re.search(r'\{.*\}', cost_result, re.DOTALL)
cost_data = None
if json_match:
    try:
        cost_data = json.loads(json_match.group())
    except json.JSONDecodeError as exc:
        print(f"\n⚠️ Could not parse cost analysis as JSON: {exc}")
else:
    print("\n⚠️ No JSON object found in the cost analysis response")

if cost_data is not None:
    breakdown = cost_data.get('cost_breakdown', {})
    opportunities = cost_data.get('optimization_opportunities', [])
    recommendations = cost_data.get('recommendations', [])

    # The model may return the cost as a string (e.g. "12.50" or "$12.50");
    # applying ':.2f' to a str raises ValueError, so coerce first and fall
    # back to echoing the raw value.
    daily_cost = breakdown.get('total_daily_cost', 0)
    try:
        daily_cost_text = f"${float(daily_cost):.2f}"
    except (TypeError, ValueError):
        daily_cost_text = str(daily_cost)

    print(f"\n✓ Daily Cost Analysis: {daily_cost_text}")
    print(f"✓ Optimization Opportunities: {len(opportunities)}")
    print(f"✓ Actionable Recommendations: {len(recommendations)}")

    # Show high-priority recommendations
    high_priority = [r for r in recommendations if r.get('priority') == 'High']
    if high_priority:
        print(f"\n💰 HIGH-PRIORITY COST OPTIMIZATIONS:")
        for rec in high_priority:
            print(f"  - {rec.get('action', 'Unknown action')}")
            print(f"    Savings: {rec.get('expected_savings', 'Not specified')}")
            print(f"    Timeline: {rec.get('timeline', 'Not specified')}")

## Predictive Scaling Analysis

Let's have AI predict future resource needs and scaling requirements:

In [None]:
# AI predicts scaling needs
import re  # local import: this cell must not depend on an earlier cell having imported it

# The "Recent 6-hour trend" section sends the last 24 samples
# (24 samples at 15-minute intervals).
scaling_prompt = f"""
Based on the AI system metrics, predict future scaling needs and resource requirements.

Current Performance Trends:
- Peak requests/min: {max(m['requests_per_minute'] for m in ai_metrics)}
- Average requests/min: {sum(m['requests_per_minute'] for m in ai_metrics) / len(ai_metrics):.1f}
- Peak latency: {max(m['avg_latency_ms'] for m in ai_metrics)}ms
- Success rate range: {min(m['success_rate'] for m in ai_metrics):.3f} - {max(m['success_rate'] for m in ai_metrics):.3f}

Recent 6-hour trend:
{json.dumps(ai_metrics[-24:], indent=2)}

Provide scaling analysis:
{{
  "current_capacity": {{
    "utilization_level": "Low|Medium|High|Critical",
    "capacity_percentage": "number",
    "bottlenecks": ["list of bottlenecks"]
  }},
  "growth_predictions": {{
    "next_7_days": "string",
    "next_30_days": "string",
    "growth_rate": "string"
  }},
  "scaling_recommendations": [
    {{
      "timeframe": "Immediate|Short-term|Long-term",
      "action": "string",
      "resource_type": "Compute|Memory|Network|Storage",
      "justification": "string",
      "estimated_cost": "string"
    }}
  ],
  "risk_assessment": [
    {{
      "risk": "string",
      "probability": "High|Medium|Low",
      "impact": "High|Medium|Low",
      "mitigation": "string"
    }}
  ]
}}
"""

print("=== AI PREDICTIVE SCALING ===")
scaling_response = client.query({
    "model": "gpt-4o-mini",
    "messages": [{"role": "user", "content": scaling_prompt}],
    "temperature": 0.1,
    "max_tokens": 1500
})

scaling_result = scaling_response['choices'][0]['message']['content']
print(scaling_result)

# Parse scaling analysis.
# The model's output can be malformed or cut off by max_tokens, so a decode
# failure is reported instead of aborting the cell.
json_match = re.search(r'\{.*\}', scaling_result, re.DOTALL)
scaling_data = None
if json_match:
    try:
        scaling_data = json.loads(json_match.group())
    except json.JSONDecodeError as exc:
        print(f"\n⚠️ Could not parse scaling analysis as JSON: {exc}")
else:
    print("\n⚠️ No JSON object found in the scaling analysis response")

if scaling_data is not None:
    capacity = scaling_data.get('current_capacity', {})
    predictions = scaling_data.get('growth_predictions', {})
    scaling_recs = scaling_data.get('scaling_recommendations', [])
    risks = scaling_data.get('risk_assessment', [])

    print(f"\n✓ Current Utilization: {capacity.get('utilization_level', 'Unknown')}")
    print(f"✓ Capacity Usage: {capacity.get('capacity_percentage', 0)}%")
    print(f"✓ Scaling Actions: {len(scaling_recs)}")
    print(f"✓ Risk Factors: {len(risks)}")

    # Show immediate actions
    immediate_actions = [r for r in scaling_recs if r.get('timeframe') == 'Immediate']
    if immediate_actions:
        print(f"\n⚡ IMMEDIATE SCALING ACTIONS:")
        for action in immediate_actions:
            print(f"  - {action.get('action', 'Unknown action')}")
            print(f"    Resource: {action.get('resource_type', 'Not specified')}")
            print(f"    Cost: {action.get('estimated_cost', 'Not specified')}")

    # Show high-risk items (either dimension rated High)
    high_risks = [r for r in risks if r.get('probability') == 'High' or r.get('impact') == 'High']
    if high_risks:
        print(f"\n⚠️ HIGH-RISK FACTORS:")
        for risk in high_risks:
            print(f"  - {risk.get('risk', 'Unknown risk')}")
            print(f"    Probability: {risk.get('probability', 'Unknown')} | Impact: {risk.get('impact', 'Unknown')}")

## Monitoring Dashboard Summary

### What We Demonstrated:

**1. Intelligent Anomaly Detection**
- AI automatically identified performance anomalies
- Classified anomaly types and severity levels
- Generated actionable alerts with context

**2. Cost Intelligence**
- Analyzed spending patterns across business hours and off-hours
- Identified optimization opportunities
- Provided specific cost-saving recommendations

**3. Predictive Scaling**
- Analyzed current capacity utilization
- Predicted future resource needs
- Recommended proactive scaling actions

**4. Risk Assessment**
- Identified potential system risks
- Assessed probability and impact
- Suggested mitigation strategies

### Business Value:

**Proactive Operations**
- Prevent issues before they impact users
- Optimize costs automatically
- Scale resources predictively

**Intelligent Insights**
- Context-aware alerting reduces noise
- Business-impact focused recommendations
- Data-driven decision making

**Operational Efficiency**
- Reduce manual monitoring overhead
- Faster incident response
- Optimized resource utilization

### Traditional vs AI Monitoring:

**Traditional Monitoring:**
- Static thresholds and rules
- Reactive alerting
- Manual analysis required
- Limited context understanding

**AI-Powered Monitoring:**
- Dynamic anomaly detection
- Predictive insights
- Automated root cause analysis
- Business context awareness

This demonstrates how AI transforms system monitoring from reactive alerting to proactive, intelligent operations management.