In [None]:
# nb32_fastapi_docker_deploy.ipynb
# FastAPI + Docker 部署 - 生產級 LLM API 服務

## 1. 環境初始化 & 共享快取設置

# === Shared Cache Bootstrap ===
import os, pathlib, torch, sys
from typing import Optional, List, Dict, Any
import asyncio
from datetime import datetime

# All Hugging Face / PyTorch caches are placed under one shared root so the
# same downloads can be reused; the root is overridable via AI_CACHE_ROOT.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")

# (environment variable, sub-directory below AI_CACHE_ROOT)
_CACHE_LAYOUT = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)
paths = {env_name: f"{AI_CACHE_ROOT}/{subdir}" for env_name, subdir in _CACHE_LAYOUT}

# Export each location and make sure the directory exists.
for env_name, cache_dir in paths.items():
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

print(f"[Cache] Root: {AI_CACHE_ROOT}")

# Report the accelerator that the rest of the notebook will use.
gpu_ready = torch.cuda.is_available()
print(f"[GPU] Available: {gpu_ready}")
if gpu_ready:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"[GPU] Device: {torch.cuda.get_device_name(0)}")
    print(f"[GPU] Memory: {gpu_props.total_memory / 1e9:.1f}GB")



In [None]:
# Install dependencies

pip install fastapi[all] uvicorn[standard] python-multipart
pip install docker python-dotenv pydantic-settings
pip install prometheus-client structlog
pip install httpx pytest-asyncio

# app/models.py - Pydantic 模型定義
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional
from enum import Enum

class MessageRole(str, Enum):
    """Speaker of a chat message.

    The ``str`` mix-in makes every member compare equal to its plain string
    value (``MessageRole.USER == "user"``).
    """

    SYSTEM = "system"
    USER = "user"
    ASSISTANT = "assistant"

class ChatMessage(BaseModel):
    """One turn of a conversation: who spoke, what was said, optional extras."""

    role: MessageRole
    content: str
    # Free-form per-message annotations; omitted (None) unless a caller sets it.
    metadata: Optional[Dict[str, Any]] = Field(default=None)

class GenerateRequest(BaseModel):
    """Request body for single-prompt text generation.

    Attributes:
        prompt: Text the model should continue (required).
        max_tokens: Upper bound on newly generated tokens, 1..4096.
        temperature: Sampling temperature, 0.0..2.0.
        top_p: Nucleus-sampling probability mass, 0.0..1.0.
        model_id: Model to generate with; may be null.
    """

    # `model_id` starts with the "model_" prefix that pydantic v2 protects by
    # default, which emits a UserWarning when the class is created. Clearing
    # the protected namespaces keeps the public field name and silences it.
    model_config = {"protected_namespaces": ()}

    prompt: str = Field(..., description="Input prompt for generation")
    max_tokens: int = Field(512, ge=1, le=4096)
    temperature: float = Field(0.7, ge=0.0, le=2.0)
    top_p: float = Field(0.9, ge=0.0, le=1.0)
    model_id: Optional[str] = Field("Qwen/Qwen2.5-7B-Instruct", description="Model identifier")

class ChatRequest(BaseModel):
    """Request body for a chat completion over a message history."""

    # Conversation so far, in the order the turns were spoken.
    messages: List[ChatMessage]
    max_tokens: int = Field(default=512, ge=1, le=4096)
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    stream: bool = Field(default=False, description="Enable streaming response")
    use_tools: bool = Field(default=False, description="Enable tool calling")

class RetrieveRequest(BaseModel):
    """Request body for a document search."""

    query: str = Field(default=..., description="Search query")
    # Number of hits to return, limited to 1..20.
    top_k: int = Field(default=5, ge=1, le=20)
    collection_name: Optional[str] = Field(
        default="default", description="Vector collection name"
    )

class GenerateResponse(BaseModel):
    """Result of a text-generation request.

    Attributes:
        text: Generated continuation.
        model_id: Identifier of the model that produced the text.
        tokens_used: Number of newly generated tokens.
        generation_time: Wall-clock generation time in seconds.
        metadata: Extra request details echoed back to the caller.
    """

    # `model_id` starts with the "model_" prefix that pydantic v2 protects by
    # default; clearing the protected namespaces avoids the resulting
    # UserWarning without renaming the public field.
    model_config = {"protected_namespaces": ()}

    text: str
    model_id: str
    tokens_used: int
    generation_time: float
    # A factory makes the "fresh dict per instance" intent explicit instead of
    # writing a mutable literal as the default.
    metadata: Dict[str, Any] = Field(default_factory=dict)

class ChatResponse(BaseModel):
    """Result of a chat-completion request.

    Attributes:
        message: The assistant's reply.
        model_id: Identifier of the model that produced the reply.
        tokens_used: Number of newly generated tokens.
        generation_time: Wall-clock generation time in seconds.
        tool_calls: Tool invocations, if any; None when there are none.
    """

    # `model_id` starts with the "model_" prefix that pydantic v2 protects by
    # default; clearing the protected namespaces avoids the resulting
    # UserWarning without renaming the public field.
    model_config = {"protected_namespaces": ()}

    message: ChatMessage
    model_id: str
    tokens_used: int
    generation_time: float
    tool_calls: Optional[List[Dict[str, Any]]] = None

class RetrieveResponse(BaseModel):
    """Result of a document search; every field is required."""

    # Matching documents, one dict per hit (schema is not fixed here).
    results: List[Dict[str, Any]]
    # The query string the results belong to.
    query: str
    total_found: int
    # Search duration; presumably seconds, like the other *_time fields - TODO confirm.
    search_time: float

class HealthResponse(BaseModel):
    """Payload returned by the health-check endpoint.

    Attributes:
        status: Overall service state as a short string.
        timestamp: Time the check was performed, as a string.
        model_loaded: Whether a model is currently loaded.
        gpu_available: Whether CUDA is available to the process.
        memory_usage: Named memory figures (floats).
    """

    # `model_loaded` starts with the "model_" prefix that pydantic v2 protects
    # by default; clearing the protected namespaces avoids the resulting
    # UserWarning without renaming the public field.
    model_config = {"protected_namespaces": ()}

    status: str
    timestamp: str
    model_loaded: bool
    gpu_available: bool
    memory_usage: Dict[str, float]

In [None]:
# app/core/llm_service.py - LLM 服務管理
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import structlog
from typing import List, Dict, Any, Optional
import time
import psutil
import threading

logger = structlog.get_logger()


class LLMService:
    """Unified LLM service with model management and generation.

    Holds at most one Hugging Face causal language model together with its
    tokenizer. ``load_model`` swaps the active model, ``generate_text`` and
    ``chat_completion`` run inference on it, and ``get_model_info`` reports
    load state and memory figures.

    NOTE(review): the ``async`` methods call blocking library code directly,
    so the event loop is occupied for the whole load / generation.
    """

    def __init__(self):
        """Start with no model loaded and select CUDA when it is available."""
        self.model = None
        self.tokenizer = None
        self.current_model_id = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Serialises the unload/load sequence in load_model().
        self.model_lock = threading.Lock()

    async def load_model(self, model_id: str, force_reload: bool = False) -> bool:
        """Load or switch model with optimized configuration.

        The previously loaded model (if any) is released *before* the new one
        is loaded so its memory can be reused. If loading fails the service is
        left in a consistent "no model loaded" state.

        Args:
            model_id: Hugging Face model identifier to load.
            force_reload: Reload even when ``model_id`` is already active.

        Returns:
            True when the requested model is active afterwards, False when
            loading failed (the error is logged, not raised).
        """
        # Function-scope import: the cache location is read from the
        # environment, and this module has no top-level ``import os``.
        import os

        if self.current_model_id == model_id and not force_reload:
            return True

        try:
            with self.model_lock:
                logger.info("Loading model", model_id=model_id, device=self.device)

                # Release the previous model. The attributes are reset to None
                # rather than deleted so later ``self.model is None`` checks
                # keep working, and the id is cleared so a failed load is not
                # mistaken for an already-loaded model on the next request.
                had_model = self.model is not None
                self.model = None
                self.tokenizer = None
                self.current_model_id = None
                if had_model:
                    torch.cuda.empty_cache()

                # Configure quantization for low VRAM
                use_cuda = torch.cuda.is_available()
                quantization_config = None
                if use_cuda:
                    quantization_config = BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_compute_dtype=torch.float16,
                        bnb_4bit_use_double_quant=True,
                        bnb_4bit_quant_type="nf4",
                    )

                cache_dir = os.environ.get("TRANSFORMERS_CACHE")

                # SECURITY NOTE(review): trust_remote_code=True lets the model
                # repository execute its own Python code; only load model ids
                # from trusted sources.
                tokenizer = AutoTokenizer.from_pretrained(
                    model_id,
                    trust_remote_code=True,
                    cache_dir=cache_dir,
                )

                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    quantization_config=quantization_config,
                    device_map="auto",
                    trust_remote_code=True,
                    # Half precision is only requested on GPU; the CPU
                    # fallback keeps float32 weights.
                    torch_dtype=torch.float16 if use_cuda else torch.float32,
                    cache_dir=cache_dir,
                )

                # Set pad token if missing
                if tokenizer.pad_token is None:
                    tokenizer.pad_token = tokenizer.eos_token

                # Publish the new state only after everything succeeded.
                self.tokenizer = tokenizer
                self.model = model
                self.current_model_id = model_id
                logger.info("Model loaded successfully", model_id=model_id)
                return True

        except Exception as e:
            logger.error("Failed to load model", model_id=model_id, error=str(e))
            return False

    async def generate_text(
        self,
        prompt: str,
        max_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
    ) -> Dict[str, Any]:
        """Generate text with the loaded model.

        Args:
            prompt: Input text; tokenized with truncation at 2048 tokens.
            max_tokens: Maximum number of new tokens to generate.
            temperature: Sampling temperature; ``0`` selects greedy decoding.
            top_p: Nucleus-sampling threshold (used only when sampling).

        Returns:
            Dict with ``text`` (stripped continuation), ``tokens_used``
            (number of newly generated tokens), ``generation_time`` (seconds)
            and ``model_id``.

        Raises:
            ValueError: If no model or tokenizer is loaded.
            Exception: Any error from tokenization or generation is logged
                and re-raised unchanged.
        """
        if self.model is None or self.tokenizer is None:
            raise ValueError("No model loaded")

        start_time = time.time()

        try:
            # Tokenize input
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=2048,
            ).to(self.device)
            prompt_len = inputs.input_ids.shape[1]

            # Sampling parameters are only forwarded when sampling is enabled;
            # with greedy decoding they are unused by the generator.
            do_sample = temperature > 0
            sampling_kwargs = (
                {"temperature": temperature, "top_p": top_p} if do_sample else {}
            )

            # Generate with optimized parameters
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_new_tokens=max_tokens,
                    do_sample=do_sample,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    **sampling_kwargs,
                )

            # Decode only the continuation: the output row starts with the
            # prompt tokens, which are sliced off.
            generated_text = self.tokenizer.decode(
                outputs[0][prompt_len:], skip_special_tokens=True
            )

            generation_time = time.time() - start_time
            # ``outputs`` is (batch, sequence); the sequence length lives on
            # axis 1 of the batch tensor (``outputs[0]`` is 1-D and has no
            # axis 1).
            tokens_used = int(outputs.shape[1] - prompt_len)

            return {
                "text": generated_text.strip(),
                "tokens_used": tokens_used,
                "generation_time": generation_time,
                "model_id": self.current_model_id,
            }

        except Exception as e:
            logger.error("Generation failed", error=str(e))
            raise

    async def chat_completion(
        self,
        messages: List["ChatMessage"],
        max_tokens: int = 512,
        temperature: float = 0.7,
    ) -> Dict[str, Any]:
        """Chat completion with message formatting.

        The history is flattened into a single prompt by
        ``_format_chat_messages`` and passed to ``generate_text``.

        Args:
            messages: Conversation history, oldest first.
            max_tokens: Maximum number of new tokens to generate.
            temperature: Sampling temperature; ``0`` selects greedy decoding.

        Returns:
            Dict with ``message`` (assistant ``ChatMessage``), ``tokens_used``,
            ``generation_time`` and ``model_id``.

        Raises:
            ValueError: If no model or tokenizer is loaded.
        """
        if self.model is None or self.tokenizer is None:
            raise ValueError("No model loaded")

        # Format messages for chat model
        formatted_prompt = self._format_chat_messages(messages)

        result = await self.generate_text(
            formatted_prompt, max_tokens=max_tokens, temperature=temperature
        )

        return {
            "message": ChatMessage(role=MessageRole.ASSISTANT, content=result["text"]),
            "tokens_used": result["tokens_used"],
            "generation_time": result["generation_time"],
            "model_id": result["model_id"],
        }

    def _format_chat_messages(self, messages: List["ChatMessage"]) -> str:
        """Format messages for instruction-tuned models.

        Each message becomes one ``"<Speaker>: <content>"`` line; messages
        with an unrecognised role are skipped. The prompt always ends with
        ``"Assistant: "`` so the model continues as the assistant.

        NOTE(review): this is a generic plain-text layout, not the model's own
        chat template - confirm it suits the deployed model.
        """
        formatted = ""
        for msg in messages:
            if msg.role == MessageRole.SYSTEM:
                formatted += f"System: {msg.content}\n"
            elif msg.role == MessageRole.USER:
                formatted += f"User: {msg.content}\n"
            elif msg.role == MessageRole.ASSISTANT:
                formatted += f"Assistant: {msg.content}\n"

        formatted += "Assistant: "
        return formatted

    def get_model_info(self) -> Dict[str, Any]:
        """Get current model information and system stats.

        Returns:
            Dict with ``model_loaded``, ``current_model``, ``device``,
            ``system_memory_gb`` (total RAM), ``memory_used_gb`` (used RAM)
            and ``gpu_memory_gb`` (memory allocated by torch on device 0, or
            0 without CUDA). Sizes are bytes divided by 1e9.
        """
        memory_info = psutil.virtual_memory()
        gpu_memory = 0

        if torch.cuda.is_available():
            gpu_memory = torch.cuda.memory_allocated(0) / 1e9

        return {
            "model_loaded": self.model is not None,
            "current_model": self.current_model_id,
            "device": self.device,
            "system_memory_gb": memory_info.total / 1e9,
            "memory_used_gb": memory_info.used / 1e9,
            "gpu_memory_gb": gpu_memory,
        }


# Global service instance: one module-level LLMService that other modules
# import, so every request handler works with the same loaded model.
llm_service = LLMService()

In [None]:
# app/main.py - FastAPI 應用主體
import os
import time
from contextlib import asynccontextmanager
from datetime import datetime

import structlog
import torch
from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.responses import Response
from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST

# Import our models and services
from .models import *
from .core.llm_service import llm_service

# Configure structured logging
# Route structlog through the standard-library logging machinery and emit
# every event as one JSON document. The processors form a chain that is
# applied in the order listed, so JSONRenderer stays last: it serialises the
# event after the earlier steps added logger name, level, timestamp and
# exception details.
# NOTE(review): nothing in this cell reads the LOG_LEVEL environment variable
# set in docker-compose.yml; filter_by_level relies on whatever level the
# stdlib logger has - confirm where that is configured.
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer(),
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,
)

# Module-level logger used by the lifespan hook and all request handlers below.
logger = structlog.get_logger()

# Prometheus metrics
# HTTP-level metrics, labelled per request by the metrics middleware.
REQUEST_COUNT = Counter(
    name="api_requests_total",
    documentation="Total API requests",
    labelnames=["method", "endpoint", "status"],
)
REQUEST_DURATION = Histogram(
    name="api_request_duration_seconds",
    documentation="Request duration",
    labelnames=["method", "endpoint"],
)

# Model-level metrics, updated by the generation and chat endpoints.
GENERATION_DURATION = Histogram(
    name="llm_generation_duration_seconds",
    documentation="LLM generation time",
)
TOKENS_GENERATED = Counter(
    name="llm_tokens_generated_total",
    documentation="Total tokens generated",
)


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan management.

    Before the app starts serving, tries to preload the model named by the
    DEFAULT_MODEL environment variable; a failed preload is logged as a
    warning and startup continues. After the app stops, logs the shutdown.
    """
    # --- startup ---
    logger.info("Starting FastAPI LLM service")

    model_to_preload = os.getenv("DEFAULT_MODEL", "Qwen/Qwen2.5-7B-Instruct")
    preloaded = await llm_service.load_model(model_to_preload)
    if not preloaded:
        logger.warning("Failed to preload default model", model=model_to_preload)

    # --- application runs while suspended here ---
    yield

    # --- shutdown ---
    logger.info("Shutting down FastAPI LLM service")


# Create FastAPI app
app = FastAPI(
    title="LLM API Service",
    description="Production-ready LLM API with RAG and Agent capabilities",
    version="1.0.0",
    lifespan=lifespan,
)

# Middleware
# Allowed origins come from CORS_ALLOW_ORIGINS (comma-separated). When the
# variable is unset the API stays open to every origin ("*").
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # Credentialed cross-origin requests are only enabled for an explicit
    # origin list; combining them with the "*" wildcard would let any site
    # send authenticated requests.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Compress responses of at least 1000 bytes.
app.add_middleware(GZipMiddleware, minimum_size=1000)


# Dependency for request timing
async def track_request_metrics(request: "Request"):
    """Yield-style dependency that records how long a request took.

    ``request`` is annotated as the framework's ``Request`` so FastAPI injects
    the incoming request instead of treating the parameter as a query field.
    The observation happens in ``finally`` so requests that end in an error
    are timed as well.

    NOTE(review): ``metrics_middleware`` already observes REQUEST_DURATION for
    every request; attaching this dependency as well would record each
    request twice.
    """
    start_time = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start_time
        REQUEST_DURATION.labels(
            method=request.method, endpoint=request.url.path
        ).observe(duration)


@app.middleware("http")
async def metrics_middleware(request, call_next):
    """Track request metrics.

    Counts every request (labelled by method, path and status code) and
    records its duration. Recording happens in ``finally`` so a request whose
    handler raises is still counted, with status 500, before the exception
    continues to propagate.

    NOTE(review): the raw URL path is used as a label value; paths containing
    variable segments would create one time series per distinct value.
    """
    start_time = time.perf_counter()
    # Reported when the downstream app raises instead of returning a response.
    status_code = 500
    try:
        response = await call_next(request)
        status_code = response.status_code
        return response
    finally:
        duration = time.perf_counter() - start_time
        path = request.url.path

        REQUEST_COUNT.labels(
            method=request.method, endpoint=path, status=status_code
        ).inc()
        REQUEST_DURATION.labels(method=request.method, endpoint=path).observe(
            duration
        )


# Health check endpoint
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint.

    Reports "healthy" when a model is loaded and "degraded" otherwise, along
    with the current time, CUDA availability and memory figures taken from
    the LLM service.
    """
    info = llm_service.get_model_info()
    is_loaded = info["model_loaded"]
    memory_keys = ("system_memory_gb", "memory_used_gb", "gpu_memory_gb")

    return HealthResponse(
        status="healthy" if is_loaded else "degraded",
        timestamp=datetime.now().isoformat(),
        model_loaded=is_loaded,
        gpu_available=torch.cuda.is_available(),
        memory_usage={key: info[key] for key in memory_keys},
    )


# Prometheus metrics endpoint
@app.get("/metrics")
async def metrics():
    """Prometheus metrics endpoint: serves the current metric snapshot."""
    payload = generate_latest()
    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)


# Text generation endpoint
@app.post("/generate", response_model=GenerateResponse)
async def generate_text(request: GenerateRequest):
    """Generate text from prompt.

    Switches to ``request.model_id`` first when it names a model other than
    the active one; a null ``model_id`` means "use the model that is already
    loaded".

    Raises:
        HTTPException: 500 when the requested model cannot be loaded or
            generation fails; 503 when no model is loaded at all.
    """
    logger.info(
        "Generate request", prompt_length=len(request.prompt), model_id=request.model_id
    )

    try:
        # Load model if different from current
        requested_model = request.model_id
        if (
            requested_model is not None
            and llm_service.current_model_id != requested_model
        ):
            # SECURITY NOTE(review): the model id comes straight from the
            # request body and the service loads models with
            # trust_remote_code=True; consider restricting it to an allow-list.
            success = await llm_service.load_model(requested_model)
            if not success:
                raise HTTPException(
                    status_code=500, detail=f"Failed to load model: {requested_model}"
                )

        # Nothing loaded is a service-availability problem, not a server bug.
        if getattr(llm_service, "model", None) is None:
            raise HTTPException(status_code=503, detail="No model loaded")

        # Generate text
        result = await llm_service.generate_text(
            prompt=request.prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
        )

        # Track metrics
        GENERATION_DURATION.observe(result["generation_time"])
        TOKENS_GENERATED.inc(result["tokens_used"])

        return GenerateResponse(
            text=result["text"],
            model_id=result["model_id"],
            tokens_used=result["tokens_used"],
            generation_time=result["generation_time"],
            metadata={
                "prompt_length": len(request.prompt),
                "temperature": request.temperature,
                "top_p": request.top_p,
            },
        )

    except HTTPException:
        # Already carries the intended status and detail; do not re-wrap it.
        raise
    except Exception as e:
        logger.error("Generation failed", error=str(e))
        raise HTTPException(status_code=500, detail=str(e))


# Chat completion endpoint
@app.post("/chat", response_model=ChatResponse)
async def chat_completion(request: ChatRequest):
    """Chat completion with message history.

    Always answers with a single, non-streamed reply from the currently
    loaded model. The ``stream`` and ``use_tools`` request flags are accepted
    but not implemented here; a warning is logged when either is set.

    Raises:
        HTTPException: 503 when no model is loaded; 500 when generation fails.
    """
    logger.info("Chat request", message_count=len(request.messages))

    if request.stream or request.use_tools:
        logger.warning(
            "Unsupported chat options ignored",
            stream=request.stream,
            use_tools=request.use_tools,
        )

    try:
        # Nothing loaded is a service-availability problem, not a server bug.
        if getattr(llm_service, "model", None) is None:
            raise HTTPException(status_code=503, detail="No model loaded")

        # Generate response
        result = await llm_service.chat_completion(
            messages=request.messages,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
        )

        # Track metrics
        GENERATION_DURATION.observe(result["generation_time"])
        TOKENS_GENERATED.inc(result["tokens_used"])

        return ChatResponse(
            message=result["message"],
            model_id=result["model_id"],
            tokens_used=result["tokens_used"],
            generation_time=result["generation_time"],
        )

    except HTTPException:
        # Already carries the intended status and detail; do not re-wrap it.
        raise
    except Exception as e:
        logger.error("Chat completion failed", error=str(e))
        raise HTTPException(status_code=500, detail=str(e))


# Model management endpoints
@app.post("/models/load")
async def load_model(model_id: str):
    """Load a specific model, reloading it even if it is already active.

    Responds with a success payload naming the model, or HTTP 500 when the
    service reports that loading failed.
    """
    logger.info("Model load request", model_id=model_id)

    loaded = await llm_service.load_model(model_id, force_reload=True)
    if loaded:
        return {"status": "success", "model_id": model_id}

    raise HTTPException(status_code=500, detail=f"Failed to load model: {model_id}")


@app.get("/models/current")
async def get_current_model():
    """Get current model information as reported by the LLM service."""
    return llm_service.get_model_info()


# Simple RAG endpoint (placeholder for integration with existing RAG service)
@app.post("/retrieve", response_model=RetrieveResponse)
async def retrieve_documents(request: RetrieveRequest):
    """Document retrieval endpoint (integrate with existing RAG service).

    Placeholder: no search is performed yet. The response echoes the query
    and reports zero results.
    """
    # This would integrate with the RAG service from nb26
    placeholder_hits = []

    return RetrieveResponse(
        results=placeholder_hits,
        query=request.query,
        total_found=len(placeholder_hits),
        search_time=0.0,
    )


if __name__ == "__main__":
    # Development entry point: run the app directly with auto-reload.
    import uvicorn

    server_options = dict(
        host="0.0.0.0",
        port=8000,
        reload=True,
        log_config=None,  # Use our structured logging
    )
    uvicorn.run("app.main:app", **server_options)

In [None]:
## 4. Docker 配置文件

```dockerfile
# Dockerfile
# CUDA image tags carry the full patch version (11.8.0, not 11.8).
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV DEBIAN_FRONTEND=noninteractive
ENV AI_CACHE_ROOT=/app/cache
# The service reads TRANSFORMERS_CACHE when loading models; point the
# Hugging Face caches at the cache directory so downloads land in the
# volume mounted there instead of the container user's home directory.
ENV HF_HOME=/app/cache/hf
ENV TRANSFORMERS_CACHE=/app/cache/hf/transformers

# Install system dependencies
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    python3-venv \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Create app directory
WORKDIR /app

# Create cache directory
RUN mkdir -p /app/cache

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create non-root user
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
USER appuser

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the application (single worker: each worker would load its own model copy)
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
```

```yaml
# docker-compose.yml
version: '3.8'

services:
  llm-api:
    build: .
    ports:
      - "8000:8000"
    environment:
      - AI_CACHE_ROOT=/app/cache
      # The service reads TRANSFORMERS_CACHE when loading models; keep the
      # Hugging Face caches inside the persisted ./cache volume.
      - HF_HOME=/app/cache/hf
      - TRANSFORMERS_CACHE=/app/cache/hf/transformers
      - DEFAULT_MODEL=Qwen/Qwen2.5-7B-Instruct
      - LOG_LEVEL=INFO
    volumes:
      # Mount cache directory to persist models
      - ./cache:/app/cache
      # Mount for development (optional); this shadows the code baked into
      # the image, so remove it for production deployments
      - .:/app
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    restart: unless-stopped

  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    environment:
      # Set GRAFANA_ADMIN_PASSWORD (for example in .env) to override the
      # default; "admin" is only a local-development fallback.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
    volumes:
      - grafana-storage:/var/lib/grafana
      - ./monitoring/dashboards:/etc/grafana/provisioning/dashboards
      - ./monitoring/datasources:/etc/grafana/provisioning/datasources

volumes:
  grafana-storage:
```

```txt
# requirements.txt
fastapi[all]==0.104.1
uvicorn[standard]==0.24.0
transformers==4.36.0
torch==2.1.0
accelerate==0.24.0
bitsandbytes==0.41.0
pydantic==2.5.0
pydantic-settings==2.1.0
python-multipart==0.0.6
python-dotenv==1.0.0
structlog==23.2.0
prometheus-client==0.19.0
psutil==5.9.6
httpx==0.25.0
pytest-asyncio==0.21.0
```

## 5. 監控配置

```yaml
# monitoring/prometheus.yml
# Defaults applied to every scrape job unless the job overrides them.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# No alerting/recording rule files are configured.
rule_files:

scrape_configs:
  # Scrapes the API container by its docker-compose service name and port;
  # /metrics is the endpoint exposed by the FastAPI app.
  - job_name: 'llm-api'
    static_configs:
      - targets: ['llm-api:8000']
    metrics_path: '/metrics'
    # Job-level interval: this target is scraped every 5s instead of the
    # global 15s.
    scrape_interval: 5s
```

```bash
# deployment/deploy.sh
#!/bin/bash
# Build and start the LLM API stack with docker-compose, then wait until the
# API answers its health check.
#
# Optional environment overrides:
#   HEALTH_URL      URL polled for readiness (default http://localhost:8000/health)
#   HEALTH_TIMEOUT  seconds to wait before giving up (default 300)
#   HEALTH_INTERVAL seconds between probes (default 5)

set -e

HEALTH_URL="${HEALTH_URL:-http://localhost:8000/health}"
HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-300}"
HEALTH_INTERVAL="${HEALTH_INTERVAL:-5}"

echo "🚀 Deploying LLM API Service..."

# Check if Docker is running
if ! docker info > /dev/null 2>&1; then
    echo "❌ Docker is not running. Please start Docker first."
    exit 1
fi

# Check for NVIDIA Docker support (if using GPU). Besides the legacy
# nvidia-docker wrapper, accept the container toolkit binary or an NVIDIA
# runtime reported by the Docker daemon.
if command -v nvidia-docker &> /dev/null \
    || command -v nvidia-container-toolkit &> /dev/null \
    || docker info 2> /dev/null | grep -qi nvidia; then
    echo "✅ NVIDIA Docker support detected"
else
    echo "⚠️  NVIDIA Docker not found. GPU acceleration will not be available."
fi

# Create necessary directories
mkdir -p cache monitoring/dashboards monitoring/datasources

# Copy environment file if it doesn't exist. A missing template is reported
# instead of aborting the whole deployment under `set -e`.
if [ ! -f .env ]; then
    if [ -f .env.example ]; then
        cp .env.example .env
        echo "📝 Created .env file. Please update with your settings."
    else
        echo "⚠️  .env.example not found; continuing without a .env file."
    fi
fi

# Build and start services
echo "🔨 Building Docker images..."
docker-compose build

echo "🚀 Starting services..."
docker-compose up -d

# Wait for health check. The API only starts answering after its startup
# model preload finishes, which can take far longer than a fixed pause, so
# poll until the endpoint responds or the timeout expires.
echo "⏳ Waiting for services to be healthy..."
elapsed=0
until curl -f "$HEALTH_URL" > /dev/null 2>&1; do
    if [ "$elapsed" -ge "$HEALTH_TIMEOUT" ]; then
        echo "❌ Service health check failed. Check logs with: docker-compose logs"
        exit 1
    fi
    sleep "$HEALTH_INTERVAL"
    elapsed=$((elapsed + HEALTH_INTERVAL))
done

echo "✅ LLM API service is healthy!"
echo "🌐 API available at: http://localhost:8000"
echo "📊 Prometheus available at: http://localhost:9090"
echo "📈 Grafana available at: http://localhost:3000 (admin/admin)"

echo "🎉 Deployment completed successfully!"
```

## 6. 測試套件