In [None]:
# Emit a confirmation so the reader can see the setup cell ran.
print("Setup complete.")

# Lab 01: LLM Deployment Fundamentals

## Learning Objectives
- Understand LLM deployment architectures
- Implement basic model serving endpoints
- Handle deployment configurations
- Monitor deployment health

In [None]:
# Import required libraries
import os
import json
import time
import asyncio
from typing import Dict, List, Optional
from dataclasses import dataclass
from datetime import datetime

# Web framework
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn

## Part 1: Deployment Configuration

In [None]:
@dataclass
class ModelConfig:
    """Model deployment configuration.

    Plain data holder describing which model is served. ``model_name`` and
    ``model_version`` are required; the remaining fields have defaults.
    """
    # Name of the deployed model; ModelServer reports it from GET /health.
    model_name: str
    # Version string; ModelServer echoes it in every prediction response.
    model_version: str
    # Defaults to 32. Not read by any code visible in this notebook --
    # presumably an upper bound for request batching (TODO confirm).
    max_batch_size: int = 32
    # Defaults to 30. Not read by any code visible in this notebook --
    # presumably a per-request timeout in seconds (TODO confirm).
    timeout_seconds: int = 30

class PredictionRequest(BaseModel):
    """Request body accepted by the POST /predict endpoint."""
    # Input text; the mock endpoint echoes its first 30 characters.
    text: str
    # Defaults to 100. NOTE(review): not read by the mock predict handler;
    # presumably a generation length limit -- confirm once a real model is wired in.
    max_length: int = 100
    # Defaults to 0.7. NOTE(review): not read by the mock predict handler;
    # presumably a sampling temperature -- confirm once a real model is wired in.
    temperature: float = 0.7

class PredictionResponse(BaseModel):
    """Response body returned by the POST /predict endpoint."""
    # Text produced for the request (a mock string in this lab).
    text: str
    # Copied from the server's ModelConfig.model_version.
    model_version: str
    # Time spent inside the predict handler, in seconds.
    processing_time: float

## Part 2: Model Server Implementation

In [None]:
class ModelServer:
    """LLM model server built on FastAPI.

    Creates a FastAPI application and registers two routes on it:

    * ``GET /health``  - health check reporting the configured model name.
    * ``POST /predict`` - returns a mock response for the submitted text.

    Attributes:
        config: The ``ModelConfig`` passed to the constructor.
        app: The FastAPI application with the routes registered.
    """

    def __init__(self, model_config: ModelConfig):
        """Store the configuration, create the app, and register routes.

        Args:
            model_config: Deployment configuration for the served model.
        """
        self.config = model_config
        self.app = FastAPI(title="LLM Server")
        self.setup_routes()

    def setup_routes(self):
        """Attach the ``/health`` and ``/predict`` handlers to ``self.app``."""

        @self.app.get("/health")
        async def health():
            return {"status": "healthy", "model": self.config.model_name}

        # Declaring the response model makes the output schema explicit in
        # the generated API documentation.
        @self.app.post("/predict", response_model=PredictionResponse)
        async def predict(request: PredictionRequest):
            # perf_counter() is monotonic, so the measured duration cannot
            # go negative if the system wall clock is adjusted mid-request.
            start = time.perf_counter()
            # Mock prediction: echo the first 30 characters of the input.
            result = f"Response for: {request.text[:30]}..."
            return PredictionResponse(
                text=result,
                model_version=self.config.model_version,
                processing_time=time.perf_counter() - start,
            )

## Part 3: Load Balancing

In [None]:
class LoadBalancer:
    """Simple round-robin load balancer.

    Hands out backends in the order given, wrapping around to the first
    one after the last.

    Attributes:
        backends: The list passed to the constructor. It is stored by
            reference, so later changes to that list are seen here.
        current: Number of backends handed out so far; its value modulo
            ``len(backends)`` selects the next backend.
    """

    def __init__(self, backends: List[str]):
        """Create a balancer over ``backends``.

        Args:
            backends: Backend identifiers (for example base URLs).
        """
        self.backends = backends
        self.current = 0

    def get_next_backend(self) -> str:
        """Return the next backend in round-robin order.

        Returns:
            The selected backend string.

        Raises:
            RuntimeError: If no backends are configured. The cursor is
                not advanced in that case.
        """
        # Without this guard an empty list surfaces as an opaque
        # ZeroDivisionError from the modulo below.
        if not self.backends:
            raise RuntimeError("LoadBalancer has no backends configured")
        backend = self.backends[self.current % len(self.backends)]
        self.current += 1
        return backend

## Part 4: Deployment Monitor

In [None]:
class DeploymentMonitor:
    """Accumulate per-request counters and derive summary statistics."""

    def __init__(self):
        # Running totals; updated by record_request().
        self.metrics = dict(requests=0, errors=0, total_latency=0.0)

    def record_request(self, latency: float, error: bool = False):
        """Count one request, add its latency, and note whether it failed.

        Args:
            latency: Latency of the request, added to the running total.
            error: When truthy, the request is also counted as an error.
        """
        counters = self.metrics
        counters["requests"] += 1
        counters["total_latency"] += latency
        if error:
            counters["errors"] += 1

    def get_stats(self) -> Dict:
        """Return request count, error rate, and average latency.

        Returns:
            A dict with keys ``requests``, ``error_rate`` and
            ``avg_latency``. All three are zero when nothing has been
            recorded yet.
        """
        counters = self.metrics
        request_count = counters["requests"]
        if request_count:
            return {
                "requests": request_count,
                "error_rate": counters["errors"] / request_count,
                "avg_latency": counters["total_latency"] / request_count,
            }
        # Nothing recorded: report zeros instead of dividing by zero.
        return {"requests": 0, "error_rate": 0, "avg_latency": 0}

## Part 5: Example Usage

In [None]:
# Describe the demo model being deployed
config = ModelConfig(model_name="llm-v1", model_version="1.0.0")

# Wire up the server, a two-backend balancer, and a metrics monitor
server = ModelServer(config)
balancer = LoadBalancer(["http://localhost:8001", "http://localhost:8002"])
monitor = DeploymentMonitor()

# Drive ten synthetic requests through the balancer
print("Simulating deployment...")
for i in range(10):
    backend = balancer.get_next_backend()
    # Synthetic latency that cycles through three values as i advances
    latency = 0.1 + (i % 3) * 0.05
    monitor.record_request(latency)

# Report the aggregated metrics as pretty-printed JSON
print("\nDeployment Stats:")
print(json.dumps(monitor.get_stats(), indent=2))

## Exercises

1. Implement request batching for efficiency
2. Add circuit breaker pattern for fault tolerance
3. Create a canary deployment strategy

## Summary

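In this lab you learned how to:
- Build a model-serving API with FastAPI
- Distribute requests across backends with round-robin load balancing
- Monitor a deployment by tracking request count, error rate, and average latency
- Recognize key deployment patterns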