In [None]:
# nb63_perf_latency_tokens_vram.ipynb
# Stage 7: Performance Monitoring & Resource Observability

# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this block into every notebook).
# The root directory can be overridden through the AI_CACHE_ROOT env variable.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

# (environment variable, sub-directory of AI_CACHE_ROOT it is pointed at)
_CACHE_SUBDIRS = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)

for env_name, subdir in _CACHE_SUBDIRS:
    cache_dir = f"{AI_CACHE_ROOT}/{subdir}"
    # Export the location and make sure the directory exists.
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

gpu_available = torch.cuda.is_available()
print("[Cache]", AI_CACHE_ROOT, "| GPU:", gpu_available)

In [None]:
# ============================================================================
# Cell 2: Performance Profiler Class
# ============================================================================

import time
import json
import psutil
import gc
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, asdict
from contextlib import contextmanager


@dataclass
class PerfMetrics:
    """Performance metrics container.

    One instance describes a single profiled operation. Instances are created
    by ``PerfProfiler.profile_operation`` and completed by
    ``PerfProfiler.finalize_metrics``.
    """

    # Caller-supplied label of the profiled operation.
    operation: str
    # Elapsed time of the profiled block, in milliseconds.
    latency_ms: float
    # output_tokens * 1000 / latency_ms; stays 0.0 until finalize_metrics() runs.
    tokens_per_sec: float
    # torch.cuda.max_memory_allocated() in MB (0.0 when CUDA is unavailable).
    vram_peak_mb: float
    # Value returned by psutil's Process.cpu_percent() at the end of the block.
    cpu_percent: float
    # Change in process RSS (end minus start) in MB; may be negative.
    memory_mb: float
    # Caller-supplied count of input units (tokens for LLM/embedding calls).
    input_tokens: int
    # Caller-supplied count of produced units; 0 until finalize_metrics() runs.
    output_tokens: int
    # Caller-supplied backend label (e.g. "transformers", "faiss").
    backend: str
    # Caller-supplied model or index identifier.
    model_id: str
    # Caller-supplied quantization label, or None when not applicable.
    quantization: Optional[str] = None
    # Wall-clock time (time.time()) at which the measurement ended.
    timestamp: float = 0.0


class PerfProfiler:
    """Lightweight performance profiler for LLM operations.

    Typical use::

        with profiler.profile_operation("op", model_id, backend, input_tokens):
            ...  # work to measure
        profiler.finalize_metrics(output_tokens)

    ``profile_operation`` measures latency, peak VRAM, CPU usage and RSS growth
    and stores the record in ``current_metrics``. ``finalize_metrics`` adds the
    output count, derives tokens/sec and appends the record to
    ``metrics_history``.
    """

    def __init__(self):
        self.metrics_history: List[PerfMetrics] = []
        # Record produced by the latest profile_operation() block that has not
        # been finalized yet; None when nothing is pending.
        self.current_metrics: Optional[PerfMetrics] = None
        self.baseline_path = "outs/baseline.json"
        pathlib.Path("outs").mkdir(exist_ok=True)

    def get_vram_usage(self) -> float:
        """Return the currently allocated VRAM in MB (0.0 without CUDA)."""
        if torch.cuda.is_available():
            return torch.cuda.memory_allocated() / 1024 / 1024
        return 0.0

    def get_vram_peak(self) -> float:
        """Return the peak allocated VRAM in MB since the last reset (0.0 without CUDA)."""
        if torch.cuda.is_available():
            return torch.cuda.max_memory_allocated() / 1024 / 1024
        return 0.0

    def reset_vram_peak(self):
        """Reset the CUDA peak-memory counter; no-op without CUDA."""
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

    @contextmanager
    def profile_operation(
        self,
        operation: str,
        model_id: str,
        backend: str,
        input_tokens: int = 0,
        quantization: Optional[str] = None,
    ):
        """Context manager that measures the enclosed block.

        Args:
            operation: Label stored in the resulting record.
            model_id: Model/index identifier stored in the record.
            backend: Backend label stored in the record.
            input_tokens: Number of input units; coerced to ``int``.
            quantization: Optional quantization label.

        Yields:
            This profiler. On exit (also when the block raises) the record is
            stored in ``self.current_metrics`` with ``tokens_per_sec`` and
            ``output_tokens`` still zero; call ``finalize_metrics`` to
            complete and archive it.
        """
        # Setup: start from a clean allocator state so the peak is comparable.
        gc.collect()
        cuda_available = torch.cuda.is_available()
        if cuda_available:
            torch.cuda.empty_cache()
            # Make sure earlier queued kernels are not billed to this block.
            torch.cuda.synchronize()
        self.reset_vram_peak()

        process = psutil.Process()
        # The first cpu_percent() call on a Process object only establishes the
        # reference point and returns 0.0; prime it here so the call on exit
        # reports usage over the profiled block.
        process.cpu_percent(interval=None)
        start_memory = process.memory_info().rss / 1024 / 1024  # MB
        # perf_counter is monotonic, unlike time.time().
        start_time = time.perf_counter()

        try:
            yield self

        finally:
            if cuda_available:
                try:
                    # CUDA kernels run asynchronously; wait for them so the
                    # latency covers the actual GPU work.
                    torch.cuda.synchronize()
                except RuntimeError:
                    # A failing kernel is already propagating from the block;
                    # do not mask that exception with a second one.
                    pass

            # Measurements
            latency_ms = (time.perf_counter() - start_time) * 1000
            end_wall_time = time.time()
            peak_vram = self.get_vram_peak()
            cpu_percent = process.cpu_percent(interval=None)
            end_memory = process.memory_info().rss / 1024 / 1024

            # tokens_per_sec / output_tokens are filled in by finalize_metrics().
            self.current_metrics = PerfMetrics(
                operation=operation,
                latency_ms=latency_ms,
                tokens_per_sec=0.0,
                vram_peak_mb=peak_vram,
                cpu_percent=cpu_percent,
                memory_mb=end_memory - start_memory,
                input_tokens=int(input_tokens),
                output_tokens=0,
                backend=backend,
                model_id=model_id,
                quantization=quantization,
                timestamp=end_wall_time,
            )

    def finalize_metrics(self, output_tokens: int):
        """Complete the pending record and append it to ``metrics_history``.

        Sets ``output_tokens`` and computes ``tokens_per_sec`` as
        ``output_tokens * 1000 / latency_ms`` (left untouched when the latency
        is not positive). Does nothing when no record is pending, so calling it
        twice for one profiled block cannot archive the same record twice.
        """
        metrics = getattr(self, "current_metrics", None)
        if metrics is None:
            return

        metrics.output_tokens = output_tokens
        if metrics.latency_ms > 0:
            metrics.tokens_per_sec = (output_tokens * 1000) / metrics.latency_ms
        self.metrics_history.append(metrics)
        # Mark the record as consumed.
        self.current_metrics = None

    def save_baseline(self):
        """Write GPU info and all recorded metrics to ``self.baseline_path``.

        Returns:
            The dictionary that was serialized (keys ``timestamp``,
            ``gpu_info`` and ``metrics``).
        """
        cuda_available = torch.cuda.is_available()
        baseline_data = {
            "timestamp": time.time(),
            "gpu_info": {
                "available": cuda_available,
                "name": torch.cuda.get_device_name(0) if cuda_available else "CPU",
                "memory_gb": (
                    torch.cuda.get_device_properties(0).total_memory / 1024**3
                    if cuda_available
                    else 0
                ),
            },
            "metrics": [asdict(m) for m in self.metrics_history],
        }

        # baseline_path may have been re-pointed after __init__ created "outs".
        pathlib.Path(self.baseline_path).parent.mkdir(parents=True, exist_ok=True)
        with open(self.baseline_path, "w", encoding="utf-8") as f:
            json.dump(baseline_data, f, indent=2, ensure_ascii=False)

        print(f"✅ Baseline saved to {self.baseline_path}")
        return baseline_data

    def print_summary(self):
        """Print every recorded metric, grouped by backend."""
        if not self.metrics_history:
            print("❌ No metrics recorded")
            return

        print("\n" + "=" * 60)
        print("🚀 PERFORMANCE SUMMARY")
        print("=" * 60)

        # dict.fromkeys keeps first-seen order, so the report is stable between
        # runs (iterating a set of strings is not).
        backends = dict.fromkeys(m.backend for m in self.metrics_history)
        for backend in backends:
            backend_metrics = [m for m in self.metrics_history if m.backend == backend]
            print(f"\n📊 Backend: {backend}")

            for metrics in backend_metrics:
                print(f"  {metrics.operation}:")
                print(f"    ⏱️  Latency: {metrics.latency_ms:.1f}ms")
                print(f"    🔥 Tokens/sec: {metrics.tokens_per_sec:.1f}")
                print(f"    💾 VRAM Peak: {metrics.vram_peak_mb:.1f}MB")
                print(f"    🎯 Tokens: {metrics.input_tokens}→{metrics.output_tokens}")
                if metrics.quantization:
                    print(f"    ⚡ Quant: {metrics.quantization}")


# Initialize the notebook-wide profiler. The tester cells below all receive
# this instance, so its metrics_history accumulates across cells.
profiler = PerfProfiler()

In [None]:
# ============================================================================
# Cell 3: LLM Inference Performance Test
# ============================================================================

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import tiktoken


class LLMPerfTester:
    """Test LLM inference performance across different configurations.

    Every generation call is wrapped in ``profiler.profile_operation`` and
    completed with ``profiler.finalize_metrics``, so results end up in the
    profiler's ``metrics_history``.
    """

    def __init__(self, profiler: PerfProfiler):
        self.profiler = profiler
        self.test_prompts = [
            "請解釋什麼是檢索增強生成(RAG)？",
            "寫一個Python函數來計算斐波那契數列",
            "分析人工智慧在教育領域的應用前景與挑戰",
        ]

    def test_transformers_inference(
        self,
        model_id: str = "Qwen/Qwen2.5-7B-Instruct",
        use_4bit: bool = True,
        max_new_tokens: int = 100,
    ):
        """Profile ``generate`` on every test prompt with the Transformers backend.

        Args:
            model_id: Hugging Face model identifier to load.
            use_4bit: Request 4-bit NF4 quantization; only applied when CUDA is
                available, otherwise the model is loaded unquantized.
            max_new_tokens: Generation budget per prompt.
        """
        print(f"🧪 Testing Transformers: {model_id}")

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Configure quantization
        quant_config = None
        quant_str = None
        if use_4bit and torch.cuda.is_available():
            quant_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_quant_type="nf4",
            )
            quant_str = "4bit"

        # Load model
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            quantization_config=quant_config,
            trust_remote_code=True,
        )

        try:
            for i, prompt in enumerate(self.test_prompts):
                encoded = tokenizer(prompt, return_tensors="pt")
                input_ids = encoded["input_ids"].to(model.device)
                # Pass the attention mask explicitly: pad and eos may share an
                # id, in which case generate() cannot infer it reliably.
                attention_mask = encoded.get("attention_mask")
                if attention_mask is not None:
                    attention_mask = attention_mask.to(model.device)
                input_tokens = input_ids.shape[1]

                with self.profiler.profile_operation(
                    f"transformers_inference_{i+1}",
                    model_id,
                    "transformers",
                    input_tokens,
                    quant_str,
                ):
                    with torch.no_grad():
                        outputs = model.generate(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            max_new_tokens=max_new_tokens,
                            do_sample=True,
                            temperature=0.7,
                            pad_token_id=tokenizer.eos_token_id,
                        )

                # generate() returns prompt + continuation for decoder-only models.
                output_tokens = outputs.shape[1] - input_tokens
                self.profiler.finalize_metrics(output_tokens)

                # Print sample output (tail of the decoded sequence)
                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                print(f"  📝 Sample {i+1}: {response[-100:]}...")
        finally:
            # Release the model even when generation fails, so later cells do
            # not start with the weights still resident.
            del model
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def test_llamacpp_inference(self, model_path: Optional[str] = None):
        """Profile one prompt with the llama.cpp backend, if it is installed.

        The test is skipped (with a message) when ``llama_cpp`` cannot be
        imported or when ``model_path`` is empty. Any other failure is reported
        and swallowed so the rest of the notebook keeps running.

        Args:
            model_path: Path to a GGUF model file.
        """
        try:
            from llama_cpp import Llama
        except ImportError:
            print("⏭️  Skipping llama.cpp test (library not installed)")
            return

        if not model_path:
            print("⏭️  Skipping llama.cpp test (no model path provided)")
            return

        print(f"🧪 Testing llama.cpp: {model_path}")

        llm = None
        try:
            # Load model
            llm = Llama(
                model_path=model_path,
                n_ctx=2048,
                n_gpu_layers=32 if torch.cuda.is_available() else 0,
                verbose=False,
            )

            for i, prompt in enumerate(self.test_prompts[:1]):  # Test one prompt
                # Count with the model's own tokenizer: the prompts are Chinese
                # text without spaces, so a whitespace split sees ~1 "word".
                input_tokens = len(llm.tokenize(prompt.encode("utf-8")))

                with self.profiler.profile_operation(
                    f"llamacpp_inference_{i+1}",
                    model_path,
                    "llama.cpp",
                    input_tokens,
                    "GGUF",
                ):
                    output = llm(prompt, max_tokens=100, temperature=0.7)

                text = output["choices"][0]["text"]
                # Prefer the completion token count reported by llama.cpp;
                # fall back to re-tokenizing the generated text.
                output_tokens = (output.get("usage") or {}).get("completion_tokens")
                if output_tokens is None:
                    output_tokens = len(
                        llm.tokenize(text.encode("utf-8"), add_bos=False)
                    )
                self.profiler.finalize_metrics(int(output_tokens))

                print(f"  📝 Sample: {output['choices'][0]['text'][:100]}...")

        except Exception as e:
            # Deliberate best-effort: this backend is optional.
            print(f"❌ llama.cpp test failed: {e}")
        finally:
            del llm


# Run LLM performance tests; results are recorded in the shared `profiler`.
llm_tester = LLMPerfTester(profiler)

# Test with 4-bit quantization requested (only applied when CUDA is available).
llm_tester.test_transformers_inference(use_4bit=True)

# Optional llama.cpp run: uncomment and point it at a local GGUF file.
# llm_tester.test_llamacpp_inference("/path/to/model.gguf")

In [None]:
# ============================================================================
# Cell 4: RAG Retrieval Performance Test
# ============================================================================

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer


class RAGPerfTester:
    """Test RAG retrieval performance (embedding throughput and FAISS search)."""

    def __init__(self, profiler: PerfProfiler):
        self.profiler = profiler
        self.test_queries = [
            "什麼是機器學習？",
            "如何實作神經網路？",
            "深度學習的應用領域有哪些？",
        ]
        # Model loaded by test_embedding_performance(); reused by
        # test_faiss_performance() so queries and index share one model.
        self.embedding_model = None

    @staticmethod
    def _count_tokens(embedding_model, texts) -> int:
        """Return the number of tokens ``embedding_model`` sees for ``texts``.

        Uses the model's own ``tokenize`` and counts non-padding positions.
        Falls back to a rough one-token-per-character estimate when the model
        does not expose that interface. A whitespace split is not usable here
        because the test corpus is Chinese text without spaces.
        """
        texts = list(texts)
        if not texts:
            return 0
        try:
            features = embedding_model.tokenize(texts)
            return int(features["attention_mask"].sum())
        except (AttributeError, KeyError, TypeError):
            return sum(len(text) for text in texts)

    def setup_test_index(self, n_docs: int = 1000):
        """Create ``n_docs`` synthetic documents.

        Documents cycle through five fixed templates and carry a running
        number, so the same ``n_docs`` always yields the same list.

        Returns:
            List of ``n_docs`` strings.
        """
        print(f"🔧 Setting up test index with {n_docs} documents...")

        # Generate synthetic documents
        doc_templates = [
            "機器學習是人工智慧的重要分支，專注於讓電腦系統自動學習和改進。",
            "神經網路是模仿人腦神經元結構設計的計算模型，用於解決複雜問題。",
            "深度學習使用多層神經網路來學習數據的抽象表示和特徵。",
            "自然語言處理結合語言學和機器學習來理解和生成人類語言。",
            "電腦視覺使用深度學習技術來識別和理解圖像內容。",
        ]

        return [
            f"{doc_templates[i % len(doc_templates)]} 文檔編號: {i+1}"
            for i in range(n_docs)
        ]

    def test_embedding_performance(self, model_name: str = "BAAI/bge-m3"):
        """Profile document and query embedding with a SentenceTransformer.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.

        Returns:
            ``(doc_embeddings, embedding_model)`` for the 500 synthetic
            documents, with normalized embeddings.
        """
        print(f"🧪 Testing Embedding: {model_name}")

        # Load embedding model and remember it for the FAISS test.
        embedding_model = SentenceTransformer(model_name)
        self.embedding_model = embedding_model

        # Setup test documents
        docs = self.setup_test_index(500)
        total_tokens = self._count_tokens(embedding_model, docs)

        # Test document embedding
        with self.profiler.profile_operation(
            "embedding_documents",
            model_name,
            "sentence-transformers",
            total_tokens,
            None,
        ):
            doc_embeddings = embedding_model.encode(
                docs, normalize_embeddings=True, show_progress_bar=False
            )

        # For embeddings "output tokens" means tokens processed, so that
        # tokens_per_sec reports encoding throughput.
        self.profiler.finalize_metrics(total_tokens)

        # Test query embedding
        for i, query in enumerate(self.test_queries):
            query_tokens = self._count_tokens(embedding_model, [query])

            with self.profiler.profile_operation(
                f"embedding_query_{i+1}",
                model_name,
                "sentence-transformers",
                query_tokens,
                None,
            ):
                embedding_model.encode([query], normalize_embeddings=True)

            self.profiler.finalize_metrics(query_tokens)

        return doc_embeddings, embedding_model

    def test_faiss_performance(self, embeddings: np.ndarray, embedding_model=None):
        """Profile building and searching a flat inner-product FAISS index.

        Args:
            embeddings: 2-D array of document vectors (rows are documents).
            embedding_model: Model used to embed the test queries. Defaults to
                the model loaded by ``test_embedding_performance``; only when
                none is available is ``BAAI/bge-m3`` loaded.

        Returns:
            The populated ``faiss.IndexFlatIP``.
        """
        print(f"🧪 Testing FAISS Index: {embeddings.shape}")

        # FAISS expects C-contiguous float32 input.
        vectors = np.ascontiguousarray(embeddings, dtype="float32")

        # Build FAISS index
        with self.profiler.profile_operation(
            "faiss_index_build", f"IndexFlatIP-{embeddings.shape[0]}", "faiss", 0, None
        ):
            index = faiss.IndexFlatIP(vectors.shape[1])
            index.add(vectors)

        self.profiler.finalize_metrics(vectors.shape[0])

        if index.ntotal == 0:
            print("  ⏭️  Skipping search test (index is empty)")
            return index

        # Queries must be embedded by the model that produced `embeddings`;
        # reuse it instead of reloading a hard-coded one.
        if embedding_model is None:
            embedding_model = self.embedding_model
        if embedding_model is None:
            embedding_model = SentenceTransformer("BAAI/bge-m3")

        top_k = min(10, index.ntotal)
        for i, query in enumerate(self.test_queries):
            query_embedding = np.ascontiguousarray(
                embedding_model.encode([query], normalize_embeddings=True),
                dtype="float32",
            )

            with self.profiler.profile_operation(
                f"faiss_search_{i+1}",
                f"IndexFlatIP-{embeddings.shape[0]}",
                "faiss",
                1,
                None,
            ):
                scores, indices = index.search(query_embedding, top_k)

            self.profiler.finalize_metrics(top_k)  # Number of docs retrieved
            print(f"  🎯 Query {i+1}: Top score = {scores[0][0]:.4f}")

        return index


# Run RAG performance tests; results are recorded in the shared `profiler`.
rag_tester = RAGPerfTester(profiler)

# Test embedding performance. The returned embeddings and model are reused
# below and by the end-to-end cell.
doc_embeddings, emb_model = rag_tester.test_embedding_performance()

# Test FAISS performance; the returned index is reused by the end-to-end cell.
faiss_index = rag_tester.test_faiss_performance(doc_embeddings)

In [None]:
# ============================================================================
# Cell 5: End-to-End RAG Performance Test
# ============================================================================


class EndToEndRAGTester:
    """Test complete RAG pipeline performance (embed query, retrieve, generate)."""

    def __init__(self, profiler: PerfProfiler, embedding_model, faiss_index, docs):
        """Store the pipeline components.

        Args:
            profiler: Profiler that receives one record per pipeline step.
            embedding_model: Object whose ``encode`` embeds the query.
            faiss_index: Index whose ``search`` returns document positions.
            docs: Documents; position ``i`` must be the document stored under
                id ``i`` in ``faiss_index``.
        """
        self.profiler = profiler
        self.embedding_model = embedding_model
        self.index = faiss_index
        self.docs = docs
        self.test_questions = ["什麼是機器學習的基本概念？", "神經網路如何進行訓練？"]

    def _count_embedding_tokens(self, text: str) -> int:
        """Count tokens of ``text`` with the embedding model's tokenizer.

        Falls back to a rough one-token-per-character estimate when the model
        does not expose ``tokenize``. A whitespace split is not usable because
        the questions are Chinese text without spaces.
        """
        try:
            features = self.embedding_model.tokenize([text])
            return int(features["attention_mask"].sum())
        except (AttributeError, KeyError, TypeError):
            return len(text)

    def rag_retrieve_and_answer(
        self,
        query: str,
        llm_model,
        tokenizer,
        k: int = 5,
        llm_label: str = "Qwen2.5-7B",
        quantization: Optional[str] = "4bit",
    ):
        """Run the complete RAG pipeline (retrieve + generate) for one query.

        Args:
            query: User question.
            llm_model: Causal LM exposing ``generate`` and ``device``.
            tokenizer: Tokenizer matching ``llm_model``.
            k: Number of documents to retrieve.
            llm_label: Model label recorded for the generation step.
            quantization: Quantization label recorded for the generation step.

        Returns:
            Dict with ``query``, ``retrieved_docs``, ``answer`` and ``scores``
            (one score per retrieved document).
        """
        # Step 1: Query embedding
        query_tokens = self._count_embedding_tokens(query)
        with self.profiler.profile_operation(
            "rag_query_embedding",
            "BAAI/bge-m3",
            "sentence-transformers",
            query_tokens,
            None,
        ):
            query_embedding = self.embedding_model.encode(
                [query], normalize_embeddings=True
            )

        self.profiler.finalize_metrics(query_tokens)

        # Step 2: Retrieval
        with self.profiler.profile_operation(
            "rag_retrieval", f"FAISS-{len(self.docs)}", "faiss", 1, None
        ):
            scores, indices = self.index.search(query_embedding.astype("float32"), k)

        self.profiler.finalize_metrics(k)

        # Step 3: Context preparation
        # FAISS returns -1 for missing neighbours, and an index built from a
        # different document list can return ids beyond len(self.docs); skip
        # both instead of failing (or silently wrapping around for -1).
        hits = [
            (int(idx), float(score))
            for idx, score in zip(indices[0], scores[0])
            if 0 <= idx < len(self.docs)
        ]
        dropped = len(indices[0]) - len(hits)
        if dropped:
            print(
                f"  ⚠️  Ignored {dropped} retrieved ids outside the document list "
                "(index and docs out of sync?)"
            )

        retrieved_docs = [self.docs[idx] for idx, _ in hits]
        context = "\n".join([f"[{i+1}] {doc}" for i, doc in enumerate(retrieved_docs)])

        prompt = f"""基於以下資料回答問題：

資料：
{context}

問題：{query}

回答："""

        # Step 4: Generation
        encoded = tokenizer(prompt, return_tensors="pt")
        input_ids = encoded["input_ids"].to(llm_model.device)
        attention_mask = encoded.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(llm_model.device)
        input_tokens = input_ids.shape[1]

        with self.profiler.profile_operation(
            "rag_generation", llm_label, "transformers", input_tokens, quantization
        ):
            with torch.no_grad():
                outputs = llm_model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=200,
                    do_sample=True,
                    temperature=0.7,
                    pad_token_id=tokenizer.eos_token_id,
                )

        output_tokens = outputs.shape[1] - input_tokens
        self.profiler.finalize_metrics(output_tokens)

        # Decode only the newly generated tokens. Slicing the decoded text by
        # len(prompt) breaks whenever decoding does not reproduce the prompt
        # character for character.
        answer = tokenizer.decode(
            outputs[0][input_tokens:], skip_special_tokens=True
        ).strip()

        return {
            "query": query,
            "retrieved_docs": retrieved_docs,
            "answer": answer,
            "scores": [score for _, score in hits],
        }

    def test_e2e_performance(self):
        """Load the generator LLM and run the pipeline on every test question.

        Returns:
            List with one result dict per question (see
            ``rag_retrieve_and_answer``).
        """
        print("🧪 Testing End-to-End RAG Pipeline")

        # Load LLM for generation
        model_id = "Qwen/Qwen2.5-7B-Instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # 4-bit quantization is only configured when CUDA is available.
        quant_config = (
            BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_quant_type="nf4",
            )
            if torch.cuda.is_available()
            else None
        )
        # Record what was actually loaded rather than a fixed "4bit" label.
        quant_label = "4bit" if quant_config is not None else None

        llm_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            quantization_config=quant_config,
            trust_remote_code=True,
        )

        results = []
        try:
            for i, question in enumerate(self.test_questions):
                print(f"  📋 Question {i+1}: {question}")

                result = self.rag_retrieve_and_answer(
                    question,
                    llm_model,
                    tokenizer,
                    llm_label=model_id,
                    quantization=quant_label,
                )
                results.append(result)

                print(f"  ✅ Answer preview: {result['answer'][:100]}...")
                if result["scores"]:
                    print(f"  📊 Top retrieval score: {result['scores'][0]:.4f}")
        finally:
            # Release the model even when a question fails.
            del llm_model
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        return results


# Create test documents for E2E testing.
# The document list must line up with the FAISS index: the index was built from
# the 500 synthetic documents embedded in test_embedding_performance(), so a
# shorter list makes retrieved ids >= len(test_docs) raise IndexError.
# setup_test_index() is deterministic, so regenerating `ntotal` documents
# reproduces exactly the texts that were indexed.
test_docs = rag_tester.setup_test_index(faiss_index.ntotal)

# Run end-to-end test
e2e_tester = EndToEndRAGTester(profiler, emb_model, faiss_index, test_docs)
e2e_results = e2e_tester.test_e2e_performance()

In [None]:
# ============================================================================
# Cell 6: VRAM Peak Tracking & Memory Optimization
# ============================================================================


class VRAMAnalyzer:
    """Analyze VRAM usage patterns and suggest optimizations."""

    def __init__(self, profiler: PerfProfiler):
        self.profiler = profiler

    def analyze_vram_usage(self):
        """Print the peak VRAM of recorded operations plus a usage verdict.

        Reads ``profiler.metrics_history``, lists the ten operations with the
        highest peak and compares the overall peak against the total memory of
        GPU 0 (thresholds: 90% and 70%). Prints a notice and returns when CUDA
        is unavailable or nothing has been recorded.
        """
        if not torch.cuda.is_available():
            print("⚠️  CUDA not available - skipping VRAM analysis")
            return

        print("\n" + "=" * 50)
        print("💾 VRAM USAGE ANALYSIS")
        print("=" * 50)

        # GPU info
        gpu_name = torch.cuda.get_device_name(0)
        total_vram = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"🖥️  GPU: {gpu_name}")
        print(f"📏 Total VRAM: {total_vram:.1f} GB")

        # Analyze metrics
        vram_metrics = [
            (m.operation, m.vram_peak_mb) for m in self.profiler.metrics_history
        ]

        if not vram_metrics:
            print("❌ No VRAM metrics available")
            return

        # Sort by VRAM usage, largest first
        vram_metrics.sort(key=lambda x: x[1], reverse=True)

        print("\n📊 VRAM Usage by Operation:")
        for op, vram_mb in vram_metrics[:10]:  # Top 10
            vram_gb = vram_mb / 1024
            usage_pct = (vram_gb / total_vram) * 100
            print(f"  {op:<30} {vram_mb:>8.1f} MB ({usage_pct:>5.1f}%)")

        # Recommendations (the list is sorted, so the first entry is the peak)
        max_vram = vram_metrics[0][1]
        max_vram_gb = max_vram / 1024

        print(f"\n🎯 Peak VRAM Usage: {max_vram:.1f} MB ({max_vram_gb:.2f} GB)")

        if max_vram_gb > total_vram * 0.9:
            print("⚠️  HIGH VRAM USAGE - Consider optimizations:")
            print("   • Use smaller models or more aggressive quantization")
            print("   • Reduce batch sizes")
            print("   • Enable gradient checkpointing")
            print("   • Use model sharding or offloading")
        elif max_vram_gb > total_vram * 0.7:
            print("⚡ MODERATE VRAM USAGE - Optimizations available:")
            print("   • 8-bit quantization could reduce usage")
            print("   • Consider larger batch sizes for efficiency")
        else:
            print("✅ EFFICIENT VRAM USAGE - Well optimized!")
            print("   • Consider larger models or batch sizes")

    def memory_optimization_test(self):
        """Exercise peak-VRAM tracking with two stand-in allocations.

        NOTE: no model is loaded and nothing is quantized here. Each
        configuration only allocates a dummy fp16 tensor (2000x2000 for the
        baseline, 1000x1000 plus one scaled copy for "4-bit"), so the printed
        numbers illustrate the tracking mechanism and are not measurements of
        real quantization savings.
        """
        print("\n🧪 Testing Memory Optimizations...")

        if not torch.cuda.is_available():
            print("⏭️  Skipping memory optimization test (no CUDA)")
            return

        # Test different configurations
        configs = [
            {"name": "Baseline FP16", "dtype": torch.float16, "quant": None},
            {"name": "4-bit Quantization", "dtype": torch.float16, "quant": "4bit"},
        ]

        for config in configs:
            print(f"\n  🔧 Testing: {config['name']}")

            dummy_tensor = None
            try:
                # Reset VRAM tracking
                torch.cuda.empty_cache()
                self.profiler.reset_vram_peak()

                # Stand-in for model loading
                if config["quant"] == "4bit":
                    dummy_tensor = torch.randn(
                        1000, 1000, dtype=config["dtype"]
                    ).cuda()
                    # The product is a second tensor, so both count toward the peak.
                    dummy_tensor = dummy_tensor * 0.5
                else:
                    dummy_tensor = torch.randn(
                        2000, 2000, dtype=config["dtype"]
                    ).cuda()

                peak_vram = self.profiler.get_vram_peak()
                print(f"    💾 Peak VRAM: {peak_vram:.1f} MB")

            except Exception as e:
                print(f"    ❌ Failed: {e}")
            finally:
                # Release the allocation even when the configuration failed.
                del dummy_tensor
                torch.cuda.empty_cache()


# Run VRAM analysis on the metrics recorded so far in the shared `profiler`,
# then the dummy-allocation memory test.
vram_analyzer = VRAMAnalyzer(profiler)
vram_analyzer.analyze_vram_usage()
vram_analyzer.memory_optimization_test()

In [None]:
# ============================================================================
# Cell 7: Performance Report Generation
# ============================================================================


def generate_performance_report():
    """Print the full performance report and persist the baseline.

    Prints the profiler summary, saves the baseline JSON, then reports average
    and maximum latency and the average tokens/sec over every record of the
    global ``profiler`` (all backends together; only records with a positive
    rate count toward the throughput average), with advice when the averages
    cross fixed thresholds.

    Returns:
        The baseline dictionary returned by ``profiler.save_baseline()``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("📋 GENERATING PERFORMANCE REPORT")
    print(rule)

    # Summary first, then persist the baseline.
    profiler.print_summary()
    baseline_data = profiler.save_baseline()

    print("\n🎯 PERFORMANCE RECOMMENDATIONS:")

    # --- Latency ---------------------------------------------------------
    latencies = [record.latency_ms for record in profiler.metrics_history]
    if latencies:
        avg_latency = sum(latencies) / len(latencies)

        print(f"   ⏱️  Average Latency: {avg_latency:.1f}ms")
        print(f"   ⏱️  Max Latency: {max(latencies):.1f}ms")

        if avg_latency > 5000:  # more than 5 seconds on average
            for hint in (
                "   ⚠️  High latency detected - consider:",
                "      • Smaller models or more aggressive quantization",
                "      • GPU acceleration if using CPU",
                "      • Batch processing for multiple requests",
            ):
                print(hint)

    # --- Throughput ------------------------------------------------------
    token_rates = []
    for record in profiler.metrics_history:
        if record.tokens_per_sec > 0:
            token_rates.append(record.tokens_per_sec)

    if token_rates:
        avg_tokens_sec = sum(token_rates) / len(token_rates)
        print(f"   🔥 Average Tokens/sec: {avg_tokens_sec:.1f}")

        if avg_tokens_sec < 10:
            for hint in (
                "   ⚠️  Low throughput detected - consider:",
                "      • GPU acceleration or better hardware",
                "      • Model quantization to reduce memory pressure",
                "      • Batch inference for better GPU utilization",
            ):
                print(hint)
        elif avg_tokens_sec > 50:
            print("   ✅ Excellent throughput - well optimized!")

    # --- Hardware --------------------------------------------------------
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        total_vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"   🖥️  Hardware: {gpu_name} ({total_vram_gb:.1f}GB)")

    return baseline_data


# Generate the report; `report_data` holds the baseline dictionary that was
# saved by the profiler.
report_data = generate_performance_report()

In [None]:
# ============================================================================
# Cell 8: Smoke Test - Quick Performance Check
# ============================================================================


def smoke_test_performance():
    """Quick smoke test for the performance monitoring helpers.

    Uses a private ``PerfProfiler`` so the notebook-wide metrics are not
    touched. Checks that a profiled block is recorded with plausible values,
    that peak-VRAM tracking works when CUDA is available, and that the
    baseline can be saved and read back.

    Raises:
        AssertionError: When any of the checks fails.
    """
    print("\n" + "=" * 50)
    print("🔥 SMOKE TEST - Performance Monitoring")
    print("=" * 50)

    # Test basic profiler functionality
    test_profiler = PerfProfiler()

    # Simulate a quick operation
    with test_profiler.profile_operation(
        "smoke_test", "test-model", "test-backend", 50, "test-quant"
    ):
        # Simulate some work
        time.sleep(0.1)
        if torch.cuda.is_available():
            dummy = torch.randn(100, 100).cuda()
            result = torch.matmul(dummy, dummy)
            del dummy, result

    test_profiler.finalize_metrics(25)

    # Check results
    assert len(test_profiler.metrics_history) == 1, "❌ Metrics not recorded"

    metrics = test_profiler.metrics_history[0]
    # The block sleeps 100 ms, so anything at or below 50 ms is a timing bug.
    assert metrics.latency_ms > 50, f"❌ Unexpected latency: {metrics.latency_ms}ms"
    assert (
        metrics.tokens_per_sec > 0
    ), f"❌ Invalid tokens/sec: {metrics.tokens_per_sec}"
    assert metrics.input_tokens == 50, f"❌ Wrong input tokens: {metrics.input_tokens}"
    assert (
        metrics.output_tokens == 25
    ), f"❌ Wrong output tokens: {metrics.output_tokens}"

    print("✅ Basic profiler functionality working")

    # Test VRAM tracking
    if torch.cuda.is_available():
        test_profiler.reset_vram_peak()
        dummy = torch.randn(1000, 1000).cuda()
        peak = test_profiler.get_vram_peak()
        del dummy

        assert peak > 0, f"❌ VRAM tracking failed: {peak}MB"
        print(f"✅ VRAM tracking working: {peak:.1f}MB peak")
    else:
        print("⏭️  VRAM tracking skipped (no CUDA)")

    # Test baseline saving
    test_baseline_path = "outs/smoke_baseline.json"
    test_profiler.baseline_path = test_baseline_path
    baseline_file = pathlib.Path(test_baseline_path)
    try:
        test_profiler.save_baseline()

        assert baseline_file.exists(), "❌ Baseline file not created"

        # The profiler writes UTF-8 with ensure_ascii=False; read it back with
        # the same encoding instead of the platform default.
        with open(test_baseline_path, "r", encoding="utf-8") as f:
            baseline = json.load(f)

        assert "metrics" in baseline, "❌ Invalid baseline format"
        assert len(baseline["metrics"]) == 1, "❌ Wrong number of metrics in baseline"

        print("✅ Baseline saving working")
    finally:
        # Cleanup, also when one of the assertions above failed.
        if baseline_file.exists():
            baseline_file.unlink()

    print("\n🎉 All smoke tests passed!")

# Run smoke test (raises AssertionError if a check fails)
smoke_test_performance()

In [None]:
# ============================================================================
# Cell 9: Summary & Optimization Recommendations
# ============================================================================


def print_stage_summary(baseline_path: str = "outs/baseline.json"):
    """Print the Stage 7 notebook summary and next steps.

    After the static summary, key figures of the saved baseline are printed
    when the baseline file exists.

    Args:
        baseline_path: Location of the baseline JSON written by the profiler.
    """
    print("\n" + "=" * 70)
    print("📋 STAGE 7 - NOTEBOOK 63 SUMMARY")
    print("=" * 70)

    print("\n✅ COMPLETED:")
    print("   • 效能監控框架 (PerfProfiler) 建立")
    print("   • LLM推理效能測試 (Transformers + 量化)")
    print("   • RAG檢索效能測試 (Embedding + FAISS)")
    print("   • 端到端RAG流程效能測試")
    print("   • VRAM使用分析與優化建議")
    print("   • 效能基準報表生成 (baseline.json)")
    print("   • 延遲、吞吐量、記憶體使用追蹤")

    print("\n🎯 CORE CONCEPTS:")
    print("   • Performance Profiling: 使用context manager追蹤延遲與資源")
    print("   • VRAM Monitoring: torch.cuda.memory API追蹤峰值使用")
    print("   • Tokens/sec Calculation: 輸出token數除以延遲時間")
    print("   • Multi-backend Comparison: 對比不同後端效能差異")
    print("   • Memory Optimization: 量化、批次大小、模型分片策略")

    print("\n⚠️  PITFALLS:")
    print("   • VRAM測量不準確: 需在操作前reset_peak_memory_stats()")
    print("   • Token計算估算: 不同tokenizer的token數差異很大")
    print("   • 冷啟動效應: 首次推理較慢，需多次測量取平均")
    print("   • 記憶體洩漏: 測試後要明確del模型並empty_cache()")
    print("   • 並發干擾: 多GPU或其他程序會影響效能測量")

    print("\n🚀 NEXT ACTIONS:")
    print("   • nb64: 整合所有評估指標產生綜合報表")
    print("   • 建立效能回歸檢測CI")
    print("   • 加入更多模型與硬體配置的基準")
    print("   • 實作自動效能優化建議系統")

    print("\n📊 CURRENT BASELINE:")
    if pathlib.Path(baseline_path).exists():
        # The baseline is written as UTF-8 with ensure_ascii=False, so it must
        # be read as UTF-8 rather than with the platform default encoding.
        with open(baseline_path, "r", encoding="utf-8") as f:
            baseline = json.load(f)

        print(f"   📅 Generated: {time.ctime(baseline['timestamp'])}")
        print(f"   🖥️  Hardware: {baseline['gpu_info']['name']}")
        print(f"   📈 Metrics: {len(baseline['metrics'])} operations recorded")

        # Key stats
        latencies = [m["latency_ms"] for m in baseline["metrics"]]
        token_rates = [
            m["tokens_per_sec"] for m in baseline["metrics"] if m["tokens_per_sec"] > 0
        ]

        if latencies:
            print(f"   ⏱️  Avg Latency: {sum(latencies)/len(latencies):.1f}ms")
        if token_rates:
            print(f"   🔥 Avg Tokens/sec: {sum(token_rates)/len(token_rates):.1f}")

    print("\n" + "=" * 70)


# Print final summary (includes the saved baseline's key figures, if present)
print_stage_summary()

# Final cleanup: collect garbage first, then release cached CUDA memory.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("\n🎉 Notebook 63 completed successfully!")

In [None]:
# Run this cell at the very end of nb63 to verify that everything worked.
def quick_verification():
    """Verify the notebook's core functionality (meant to take under 3 minutes)."""
    print("🔥 Quick Verification (3 min)")

    # 1. The baseline file must have been generated.
    baseline_file = pathlib.Path("outs/baseline.json")
    assert baseline_file.exists()
    print("✅ Baseline file created")

    # 2. At least one performance record must exist.
    recorded_count = len(profiler.metrics_history)
    assert recorded_count > 0
    print(f"✅ {recorded_count} metrics recorded")

    # 3. Spot-check the key fields of the first three records.
    first_records = profiler.metrics_history[:3]
    for record in first_records:
        assert record.latency_ms > 0
        assert record.vram_peak_mb >= 0
        print(
            f"✅ {record.operation}: {record.latency_ms:.1f}ms, {record.vram_peak_mb:.1f}MB"
        )

    print("🎉 All verifications passed!")


# Run the verification (raises AssertionError if a check fails).
quick_verification()