In [None]:
# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Environment variable -> sub-directory (relative to AI_CACHE_ROOT) it points at.
_CACHE_SUBDIRS = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)

for k, v in ((name, f"{AI_CACHE_ROOT}/{sub}") for name, sub in _CACHE_SUBDIRS):
    # Export the location first, then make sure the directory exists on disk.
    os.environ[k] = v
    os.makedirs(v, exist_ok=True)

print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# %%
# Dependencies for monitoring and metrics
import json
import time
import logging
import psutil
from datetime import datetime
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional, Any
from pathlib import Path

# GPU monitoring (optional, graceful fallback).
# GPU_AVAILABLE is only switched on when pynvml both imports AND initialises.
GPU_AVAILABLE = False
try:
    import pynvml

    pynvml.nvmlInit()
except ImportError:
    print("[Monitor] GPU monitoring disabled (install nvidia-ml-py3 for VRAM tracking)")
except Exception as nvml_error:
    # The package can be installed on a machine without a usable NVIDIA
    # driver/GPU; nvmlInit() then raises, and that must not abort the notebook.
    print(f"[Monitor] GPU monitoring disabled (NVML init failed: {nvml_error})")
else:
    GPU_AVAILABLE = True
    print("[Monitor] GPU monitoring enabled via pynvml")

# Ensure output directory exists
Path("outs").mkdir(exist_ok=True)
print("[Setup] Output directory ready: outs/")

In [None]:
# %%
# Structured logging: records are sent both to the notebook output and to a
# UTF-8 log file under outs/.
_console_handler = logging.StreamHandler()
_file_handler = logging.FileHandler("outs/llm_metrics.log", encoding="utf-8")

logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[_console_handler, _file_handler],
)

# Named logger shared by the profiler and the adapter below.
logger = logging.getLogger("LLMMetrics")


@dataclass
class MetricsSnapshot:
    """Metrics recorded for a single generation call.

    Instances are built by PerformanceProfiler.end_measurement() and are
    turned into plain dicts with dataclasses.asdict() before being handed
    back to callers. Field order matters: the first eight fields are
    required, the rest have defaults.
    """

    timestamp: str  # datetime.now().isoformat() taken when the snapshot is built
    prompt_tokens: int  # number of tokens in the prompt
    completion_tokens: int  # number of newly generated tokens
    total_tokens: int  # prompt_tokens + completion_tokens
    latency_ms: float  # elapsed time between start and end of measurement, in ms
    tokens_per_second: float  # completion_tokens divided by elapsed seconds
    cpu_percent: float  # psutil Process.cpu_percent() reading at end of measurement
    memory_mb: float  # process RSS in MB (bytes / 1024**2) at end of measurement
    gpu_memory_mb: Optional[float] = None  # used GPU memory in MB; None when GPU monitoring is off or the query failed
    model_id: str = ""  # model identifier passed by the caller
    backend: str = ""  # backend label passed by the caller
    temperature: float = 0.0  # sampling temperature used for the call
    max_new_tokens: int = 0  # generation length limit used for the call


class PerformanceProfiler:
    """Lightweight profiler for LLM inference.

    Call start_measurement() right before an inference call and
    end_measurement(...) right after it. Every end_measurement() call builds
    a MetricsSnapshot, appends it to ``self.snapshots`` and returns it.
    """

    def __init__(self):
        self.snapshots: List[MetricsSnapshot] = []
        self.process = psutil.Process()
        # Filled in by start_measurement(); None means no measurement was started.
        self.start_time: Optional[float] = None
        self.start_cpu: Optional[float] = None
        self.start_memory: Optional[float] = None
        self.start_gpu: Optional[float] = None
        self._start_perf: Optional[float] = None

    def get_gpu_memory_mb(self, device_index: int = 0) -> Optional[float]:
        """Return the used memory of one GPU in MB.

        Args:
            device_index: NVML device index to query (defaults to the first GPU).

        Returns:
            Used memory in MB, or None when GPU monitoring is disabled or the
            NVML query fails (the failure is logged as a warning).
        """
        if not GPU_AVAILABLE:
            return None
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            # NOTE(review): this looks like device-wide usage rather than
            # per-process usage - confirm against the NVML documentation.
            return info.used / 1024**2  # bytes -> MB
        except Exception as e:
            logger.warning(f"GPU memory query failed: {e}")
            return None

    def start_measurement(self):
        """Record the starting wall-clock time and resource readings."""
        self.start_time = time.time()  # wall-clock timestamp, kept for reference
        # NOTE(review): psutil presumably reports cpu_percent() relative to the
        # previous call, making this the baseline for the reading taken in
        # end_measurement() - confirm against the psutil documentation.
        self.start_cpu = self.process.cpu_percent()
        self.start_memory = self.process.memory_info().rss / 1024**2
        self.start_gpu = self.get_gpu_memory_mb()
        # Taken last so the resource queries above are not billed to the
        # measured call; perf_counter() is monotonic, unlike time.time().
        self._start_perf = time.perf_counter()

    def end_measurement(
        self,
        prompt_tokens: int,
        completion_tokens: int,
        model_id: str,
        backend: str,
        temperature: float,
        max_new_tokens: int,
    ) -> MetricsSnapshot:
        """Close the current measurement and store a snapshot of it.

        Args:
            prompt_tokens: Number of tokens in the prompt.
            completion_tokens: Number of generated tokens.
            model_id: Model identifier recorded in the snapshot.
            backend: Backend label recorded in the snapshot.
            temperature: Sampling temperature recorded in the snapshot.
            max_new_tokens: Generation limit recorded in the snapshot.

        Returns:
            The MetricsSnapshot that was appended to ``self.snapshots``.

        Raises:
            RuntimeError: If start_measurement() has not been called first.
        """
        end_perf = time.perf_counter()
        if self._start_perf is None:
            raise RuntimeError(
                "end_measurement() called before start_measurement()"
            )

        elapsed_s = end_perf - self._start_perf
        latency_ms = elapsed_s * 1000

        # Throughput counts generated tokens only; the 1 ms floor avoids a
        # division by zero on extremely short measurements.
        total_tokens = prompt_tokens + completion_tokens
        tokens_per_second = completion_tokens / max(0.001, elapsed_s)

        snapshot = MetricsSnapshot(
            timestamp=datetime.now().isoformat(),
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            latency_ms=latency_ms,
            tokens_per_second=tokens_per_second,
            cpu_percent=self.process.cpu_percent(),
            memory_mb=self.process.memory_info().rss / 1024**2,
            gpu_memory_mb=self.get_gpu_memory_mb(),
            model_id=model_id,
            backend=backend,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
        )

        self.snapshots.append(snapshot)
        logger.info(
            f"Metrics: {latency_ms:.1f}ms | {tokens_per_second:.1f} tok/s | {total_tokens} tokens"
        )
        return snapshot


# Module-level profiler instance; MetricsLLMAdapter.generate_with_metrics()
# calls its start_measurement()/end_measurement() around every generation.
profiler = PerformanceProfiler()
print("[Profiler] Performance profiler initialized")

In [None]:
# %%
# Enhanced LLMAdapter with metrics integration
from transformers import AutoTokenizer, AutoModelForCausalLM


class MetricsLLMAdapter:
    """Causal-LM wrapper that records performance metrics for every generation.

    Uses the module-level ``profiler`` for measurements and ``logger`` for
    log output.
    """

    def __init__(self, model_id: str, backend: str = "transformers", **kwargs):
        """Load the tokenizer and the model for ``model_id``.

        Args:
            model_id: Model identifier passed to ``from_pretrained``.
            backend: Label stored in every metrics snapshot.
            **kwargs: Extra arguments forwarded to
                ``AutoModelForCausalLM.from_pretrained``. They take precedence
                over the low-memory defaults chosen below.
        """
        self.model_id = model_id
        self.backend = backend

        logger.info(f"Loading model: {model_id}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

        # Add padding token if missing
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Low-VRAM defaults. Half precision is only the default when CUDA is
        # present; CPU-only machines fall back to float32, where float16
        # inference may be unsupported or very slow. Caller kwargs win, which
        # also avoids a duplicate-keyword TypeError when they repeat a default.
        load_kwargs = {
            "device_map": "auto",
            "torch_dtype": (
                torch.float16 if torch.cuda.is_available() else torch.float32
            ),
            "low_cpu_mem_usage": True,
        }
        load_kwargs.update(kwargs)
        self.model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

        logger.info(f"Model loaded on device: {self.model.device}")

    def count_tokens(self, text: str) -> int:
        """Return the number of token ids the tokenizer produces for ``text``."""
        return len(self.tokenizer.encode(text))

    def generate_with_metrics(
        self,
        messages: List[Dict],
        max_new_tokens: int = 256,
        temperature: float = 0.7,
        **kwargs,
    ) -> Dict[str, Any]:
        """Generate a completion and collect metrics for the call.

        Args:
            messages: Chat messages, each a dict with "role" and "content".
            max_new_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature; values <= 0 select greedy decoding.
            **kwargs: Extra arguments forwarded to ``model.generate``; they
                override the defaults built here.

        Returns:
            Dict with "completion" (generated text, or "[Error: ...]" when
            generation failed), "metrics" (the snapshot as a plain dict) and
            "prompt" (the flattened prompt string).
        """
        # Flatten the chat into a simple "role: content" transcript.
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
        # Fallback count; replaced below by the length of the (possibly
        # truncated) input that is really fed to the model.
        prompt_tokens = self.count_tokens(prompt)

        sampling = temperature > 0
        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": sampling,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        if sampling:
            # temperature only applies when sampling; leaving it out of greedy
            # calls keeps a sampling-only argument away from generate().
            generation_kwargs["temperature"] = temperature
        generation_kwargs.update(kwargs)

        profiler.start_measurement()

        try:
            inputs = self.tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=3072
            )
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            input_length = inputs["input_ids"].shape[-1]
            prompt_tokens = input_length

            with torch.no_grad():
                outputs = self.model.generate(**inputs, **generation_kwargs)

            # The output starts with the prompt ids; keep only what follows them.
            new_tokens = outputs[0][input_length:]
            completion = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            completion_tokens = len(new_tokens)

        except Exception as e:
            # Best effort: report the failure in-band so a batch of tests can
            # keep running; the snapshot then records zero generated tokens.
            logger.error(f"Generation failed: {e}")
            completion = f"[Error: {str(e)}]"
            completion_tokens = 0

        snapshot = profiler.end_measurement(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            model_id=self.model_id,
            backend=self.backend,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
        )

        return {"completion": completion, "metrics": asdict(snapshot), "prompt": prompt}


# Try to bring up a real adapter (small default model unless MODEL_ID is set).
# Any failure leaves `adapter` as None so later cells can fall back to mock data.
model_id = os.getenv("MODEL_ID", "microsoft/DialoGPT-small")
try:
    adapter = MetricsLLMAdapter(model_id)
except Exception as load_error:
    print(f"[Warning] Could not load model: {load_error}")
    print("[Info] Continuing with mock adapter for demo")
    adapter = None
else:
    print(f"[Adapter] Loaded model: {model_id}")

In [None]:
# %%
# Enhanced LLMAdapter with metrics integration
from transformers import AutoTokenizer, AutoModelForCausalLM


class MetricsLLMAdapter:
    """Causal-LM wrapper that records performance metrics for every generation.

    Uses the module-level ``profiler`` for measurements and ``logger`` for
    log output.
    """

    def __init__(self, model_id: str, backend: str = "transformers", **kwargs):
        """Load the tokenizer and the model for ``model_id``.

        Args:
            model_id: Model identifier passed to ``from_pretrained``.
            backend: Label stored in every metrics snapshot.
            **kwargs: Extra arguments forwarded to
                ``AutoModelForCausalLM.from_pretrained``. They take precedence
                over the low-memory defaults chosen below.
        """
        self.model_id = model_id
        self.backend = backend

        logger.info(f"Loading model: {model_id}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

        # Add padding token if missing
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Low-VRAM defaults. Half precision is only the default when CUDA is
        # present; CPU-only machines fall back to float32, where float16
        # inference may be unsupported or very slow. Caller kwargs win, which
        # also avoids a duplicate-keyword TypeError when they repeat a default.
        load_kwargs = {
            "device_map": "auto",
            "torch_dtype": (
                torch.float16 if torch.cuda.is_available() else torch.float32
            ),
            "low_cpu_mem_usage": True,
        }
        load_kwargs.update(kwargs)
        self.model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

        logger.info(f"Model loaded on device: {self.model.device}")

    def count_tokens(self, text: str) -> int:
        """Return the number of token ids the tokenizer produces for ``text``."""
        return len(self.tokenizer.encode(text))

    def generate_with_metrics(
        self,
        messages: List[Dict],
        max_new_tokens: int = 256,
        temperature: float = 0.7,
        **kwargs,
    ) -> Dict[str, Any]:
        """Generate a completion and collect metrics for the call.

        Args:
            messages: Chat messages, each a dict with "role" and "content".
            max_new_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature; values <= 0 select greedy decoding.
            **kwargs: Extra arguments forwarded to ``model.generate``; they
                override the defaults built here.

        Returns:
            Dict with "completion" (generated text, or "[Error: ...]" when
            generation failed), "metrics" (the snapshot as a plain dict) and
            "prompt" (the flattened prompt string).
        """
        # Flatten the chat into a simple "role: content" transcript.
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
        # Fallback count; replaced below by the length of the (possibly
        # truncated) input that is really fed to the model.
        prompt_tokens = self.count_tokens(prompt)

        sampling = temperature > 0
        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": sampling,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        if sampling:
            # temperature only applies when sampling; leaving it out of greedy
            # calls keeps a sampling-only argument away from generate().
            generation_kwargs["temperature"] = temperature
        generation_kwargs.update(kwargs)

        profiler.start_measurement()

        try:
            inputs = self.tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=3072
            )
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            input_length = inputs["input_ids"].shape[-1]
            prompt_tokens = input_length

            with torch.no_grad():
                outputs = self.model.generate(**inputs, **generation_kwargs)

            # The output starts with the prompt ids; keep only what follows them.
            new_tokens = outputs[0][input_length:]
            completion = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            completion_tokens = len(new_tokens)

        except Exception as e:
            # Best effort: report the failure in-band so a batch of tests can
            # keep running; the snapshot then records zero generated tokens.
            logger.error(f"Generation failed: {e}")
            completion = f"[Error: {str(e)}]"
            completion_tokens = 0

        snapshot = profiler.end_measurement(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            model_id=self.model_id,
            backend=self.backend,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
        )

        return {"completion": completion, "metrics": asdict(snapshot), "prompt": prompt}


# Test with a small model if available, fallback to demo.
# This cell repeats the loader of the previous cell, so on a normal top-to-bottom
# run the weights would be loaded twice. Reuse an adapter that is already in
# memory for the same model id; run `del adapter` first to force a fresh load.
model_id = os.getenv(
    "MODEL_ID", "microsoft/DialoGPT-small"
)  # Fallback to small model
_loaded_adapter = globals().get("adapter")

if (
    _loaded_adapter is not None
    and getattr(_loaded_adapter, "model_id", None) == model_id
):
    adapter = _loaded_adapter
    print(f"[Adapter] Reusing already loaded model: {model_id}")
else:
    try:
        adapter = MetricsLLMAdapter(model_id)
        print(f"[Adapter] Loaded model: {model_id}")
    except Exception as e:
        # Deliberate best effort: without a model the notebook continues
        # with mock results.
        print(f"[Warning] Could not load model: {e}")
        print("[Info] Continuing with mock adapter for demo")
        adapter = None

In [None]:
# %%
# Baseline test suite: a short English prompt, a Chinese prompt and a
# system+user conversation.
test_messages = [
    [{"role": "user", "content": "Hello, how are you?"}],
    [{"role": "user", "content": "解釋什麼是人工智慧。"}],
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a short poem about technology."},
    ],
]

# Rebuilt from scratch on every run so re-executing the cell does not
# accumulate results.
baseline_results = []

if adapter:
    print("[Baseline] Running performance tests...")

    for i, messages in enumerate(test_messages):
        # The total comes from the suite itself, so the banner stays correct
        # when prompts are added or removed.
        print(f"\n--- Test {i+1}/{len(test_messages)} ---")

        result = adapter.generate_with_metrics(
            messages=messages, max_new_tokens=100, temperature=0.7
        )

        baseline_results.append(result)
        print(f"Output: {result['completion'][:100]}...")
        print(
            f"Metrics: {result['metrics']['latency_ms']:.1f}ms, {result['metrics']['tokens_per_second']:.1f} tok/s"
        )

    print(f"\n[Baseline] Completed {len(baseline_results)} tests")
else:
    # No model available: fabricate one result per prompt with the same
    # structure as a real result so the downstream cells still run.
    print("[Demo] Creating mock baseline results...")
    for i, messages in enumerate(test_messages):
        mock_result = {
            "completion": f"Mock response {i+1}",
            "metrics": {
                "timestamp": datetime.now().isoformat(),
                "prompt_tokens": 20,
                "completion_tokens": 50,
                "total_tokens": 70,
                "latency_ms": 1500.0,
                "tokens_per_second": 33.3,
                "cpu_percent": 25.0,
                "memory_mb": 2048.0,
                "gpu_memory_mb": 1024.0 if GPU_AVAILABLE else None,
                "model_id": "mock_model",
                "backend": "transformers",
                "temperature": 0.7,
                "max_new_tokens": 100,
            },
            "prompt": f"Mock prompt {i+1}",
        }
        baseline_results.append(mock_result)

In [None]:
# %%
# Smoke test - sanity-check the metrics of the first collected result
print("[Smoke Test] Validating metrics collection...")

if not baseline_results:
    print("❌ No baseline results to validate")
else:
    sample = baseline_results[0]["metrics"]

    # A required field counts as missing when it is absent or set to None
    required_fields = ["latency_ms", "tokens_per_second", "total_tokens", "timestamp"]
    missing = [name for name in required_fields if sample.get(name) is None]

    if not missing:
        print("✅ All required metrics present")
        print(f"✅ Sample latency: {sample['latency_ms']:.1f}ms")
        print(f"✅ Sample throughput: {sample['tokens_per_second']:.1f} tok/s")
    else:
        print(f"❌ Missing fields: {missing}")

    # The GPU reading is optional; report it only when monitoring produced a value
    gpu_reading = sample.get("gpu_memory_mb") if GPU_AVAILABLE else None
    if gpu_reading:
        print(f"✅ GPU monitoring: {sample['gpu_memory_mb']:.1f}MB")
    else:
        print("⚠️ GPU monitoring unavailable")

In [None]:
# %%
# Generate standardized baseline report
def calculate_summary_stats(results: List[Dict]) -> Dict[str, Any]:
    """Aggregate per-run metrics into mean/min/max summary figures.

    Args:
        results: Result dicts, each carrying a "metrics" mapping with the keys
            "latency_ms", "tokens_per_second", "total_tokens" and
            "completion_tokens".

    Returns:
        A dict with the sections "test_count", "latency", "throughput" and
        "tokens", or an empty dict when ``results`` is empty.
    """
    if not results:
        return {}

    run_metrics = [entry["metrics"] for entry in results]

    def column(key):
        # Values of one metric across all runs, in run order.
        return [row[key] for row in run_metrics]

    def mean(values):
        return sum(values) / len(values)

    latency_values = column("latency_ms")
    throughput_values = column("tokens_per_second")
    total_token_values = column("total_tokens")

    summary = {"test_count": len(results)}
    summary["latency"] = {
        "mean_ms": mean(latency_values),
        "min_ms": min(latency_values),
        "max_ms": max(latency_values),
    }
    summary["throughput"] = {
        "mean_tok_per_sec": mean(throughput_values),
        "min_tok_per_sec": min(throughput_values),
        "max_tok_per_sec": max(throughput_values),
    }
    summary["tokens"] = {
        "mean_total": mean(total_token_values),
        "total_generated": sum(column("completion_tokens")),
    }
    return summary


# Create comprehensive baseline report: run metadata, aggregated summary,
# the raw per-test results and basic host information.
baseline_report = {
    "meta": {
        "generated_at": datetime.now().isoformat(),
        "model_id": adapter.model_id if adapter else "mock_model",
        # Take the label from the adapter so it matches the per-test snapshots.
        "backend": adapter.backend if adapter else "transformers",
        "device": (
            str(torch.cuda.get_device_name(0)) if torch.cuda.is_available() else "CPU"
        ),
        "gpu_available": torch.cuda.is_available(),
        "gpu_monitoring": GPU_AVAILABLE,
    },
    "summary": calculate_summary_stats(baseline_results),
    "detailed_results": baseline_results,
    "system_info": {
        # Read the interpreter version from `sys` directly rather than through
        # psutil's internal `sys` attribute.
        "python_version": f"{sys.version_info.major}.{sys.version_info.minor}",
        "cpu_count": psutil.cpu_count(),
        "memory_total_gb": psutil.virtual_memory().total / (1024**3),
    },
}

# Save to file (ensure_ascii=False keeps non-ASCII prompts readable)
output_path = "outs/baseline.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(baseline_report, f, indent=2, ensure_ascii=False)

print(f"[Report] Baseline saved to: {output_path}")

# The summary is an empty dict when no results were collected, so every
# lookup into it has to stay behind this guard.
if baseline_report["summary"]:
    print(
        f"[Summary] {baseline_report['summary']['test_count']} tests, "
        f"avg latency: {baseline_report['summary']['latency']['mean_ms']:.1f}ms, "
        f"avg throughput: {baseline_report['summary']['throughput']['mean_tok_per_sec']:.1f} tok/s"
    )

    # Display key metrics
    print("\n=== Baseline Metrics Summary ===")
    print(
        f"平均延遲 (Average Latency): {baseline_report['summary']['latency']['mean_ms']:.1f}ms"
    )
    print(
        f"平均吞吐量 (Average Throughput): {baseline_report['summary']['throughput']['mean_tok_per_sec']:.1f} tokens/sec"
    )
    print(
        f"總生成詞元 (Total Tokens Generated): {baseline_report['summary']['tokens']['total_generated']}"
    )
    print(f"GPU 監控 (GPU Monitoring): {'啟用' if GPU_AVAILABLE else '未啟用'}")
    print("============================")
else:
    print("[Summary] No baseline results collected; nothing to summarize")

print("\n[Complete] nb06 baseline metrics collection finished!")
print(
    "Next: Use these metrics to compare different models/settings in future notebooks."
)

Smoke Test（冒煙測試）

In [None]:
# Final guard: fail loudly if the run produced no usable metrics
assert len(baseline_results) >= 1, "Should have baseline results"
first_result = baseline_results[0]
assert "metrics" in first_result, "Should have metrics in results"
assert first_result["metrics"]["latency_ms"] > 0, "Should have positive latency"
print("✅ Smoke test passed - metrics collection working")