In [None]:
# nb19_cost_latency_quality.ipynb - Cost/Latency/Quality Trade-off Analysis
# 效能權衡分析：成本、延遲與品質的三角關係

# ================================
# Cell 1: Environment Setup & Dependencies
# ================================

# Shared cache bootstrap
import os, pathlib, torch

# Root of the shared cache tree; the AI_CACHE_ROOT environment variable overrides the default.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")

# (environment variable, sub-directory below the cache root). Each variable is exported
# into this process and its directory is created when it does not exist yet.
cache_subdirs = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)
for env_name, subdir in cache_subdirs:
    cache_dir = f"{AI_CACHE_ROOT}/{subdir}"
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# Install additional dependencies for performance measurement
import subprocess
import sys


def install_if_missing(package, import_name=None):
    """Install ``package`` with pip only when its module cannot be imported.

    The pip distribution name and the importable module name are not always the
    same (``nvidia-ml-py`` is imported as ``pynvml``, ``rouge-score`` as
    ``rouge_score``). Probing the distribution name directly would fail for such
    packages and trigger a pip run on every execution.

    Args:
        package: Distribution name handed to ``pip install``.
        import_name: Module name to probe. When omitted, a small table of known
            aliases is consulted, falling back to ``package`` with dashes
            replaced by underscores.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a non-zero status.
    """
    import importlib

    # Distributions used by this notebook whose module name differs from the pip name.
    known_module_names = {
        "nvidia-ml-py": "pynvml",
        "rouge-score": "rouge_score",
    }
    module_name = import_name or known_module_names.get(
        package, package.replace("-", "_")
    )

    try:
        importlib.import_module(module_name)
    except ImportError:
        # Use the running interpreter so the package lands in the kernel's environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])


# Extra distributions this notebook needs; each is handed to `install_if_missing`
# in the order listed.
packages = [
    "psutil",  # CPU utilisation sampling
    "nvidia-ml-py",  # NVML bindings (imported as pynvml)
    "matplotlib",  # plotting
    "seaborn",  # plotting
    "rouge-score",  # ROUGE metric
    "sacrebleu",  # BLEU metric
]
for requirement in packages:
    install_if_missing(requirement)

In [None]:
# ================================
# Cell 2: Performance Profiler Classes
# ================================

import time
import psutil
import torch
import json
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# NVML (imported as ``pynvml``) is optional: NVIDIA_ML_AVAILABLE records whether it
# could be imported and initialised.
try:
    import pynvml

    pynvml.nvmlInit()
    NVIDIA_ML_AVAILABLE = True
except Exception:
    # Covers a missing package (ImportError) as well as an NVML initialisation failure,
    # while still letting KeyboardInterrupt / SystemExit propagate.
    NVIDIA_ML_AVAILABLE = False
    print("[Warning] nvidia-ml-py not available, GPU metrics will be limited")


@dataclass
class PerformanceMetrics:
    """Record of one measured generation: run identity, latency, resources, quality.

    The three quality fields are optional and default to ``None``.
    """

    # --- Run identity ---
    model_name: str
    quantization: str
    prompt_length: int
    generation_length: int

    # --- Latency metrics ---
    # TTFT: time taken to produce the first token
    time_to_first_token: float
    # TPS: tokens generated per second
    tokens_per_second: float
    # Total generation time
    total_time: float

    # --- Resource metrics ---
    # Peak GPU memory usage, in MB
    peak_gpu_memory_mb: float
    # Average GPU utilisation
    avg_gpu_utilization: float
    # Average CPU utilisation
    avg_cpu_percent: float

    # --- Quality metrics ---
    perplexity: Optional[float] = None
    rouge_l: Optional[float] = None
    bleu_score: Optional[float] = None


class PerformanceProfiler:
    """Measure latency, throughput and resource usage of a single generation.

    Attributes:
        device: ``cuda`` when CUDA is available, otherwise ``cpu``; tokenized prompts
            are moved here before generation.
        gpu_available: Whether CUDA is available.
        gpu_handle: NVML handle for GPU 0, or ``None`` when NVML or CUDA is unavailable.
    """

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.gpu_available = torch.cuda.is_available()

        # Always defined so later code can test it instead of risking an AttributeError.
        self.gpu_handle = None
        if NVIDIA_ML_AVAILABLE and self.gpu_available:
            self.gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)

    def _synchronize(self):
        """Wait for queued CUDA work so wall-clock timers cover the work they bracket."""
        if self.gpu_available:
            torch.cuda.synchronize()

    def _sample_gpu_utilization(self):
        """Return the current GPU utilisation reported by NVML, or 0 when unavailable."""
        if self.gpu_handle is None:
            return 0
        try:
            return pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handle).gpu
        except Exception:
            # Best effort: a failed NVML query must not abort the measurement.
            return 0

    def measure_inference(
        self,
        model,
        tokenizer,
        prompt: str,
        max_new_tokens: int = 100,
        **generate_kwargs
    ) -> PerformanceMetrics:
        """Measure inference performance for a single generation.

        Two ``generate`` calls are made: a greedy one-token call whose duration is
        reported as time-to-first-token, followed (when ``max_new_tokens > 1``) by the
        full sampled generation. ``total_time`` and ``tokens_per_second`` refer to the
        full generation only, so the one-token probe does not distort throughput.

        Args:
            model: Object exposing ``generate``.
            tokenizer: Callable tokenizer that also provides ``eos_token_id``.
            prompt: Text to generate from.
            max_new_tokens: Upper bound on generated tokens for the full generation.
            **generate_kwargs: Extra arguments for ``generate``. They override the
                defaults used here (sampling on, temperature 0.7, pad token = EOS).

        Returns:
            A ``PerformanceMetrics`` record; the quality fields are left unset.
        """
        # Prepare input
        inputs = tokenizer(prompt, return_tensors="pt").to(self.device)
        prompt_length = inputs.input_ids.shape[1]

        # Reset the peak-memory counter so it reflects this measurement only
        if self.gpu_available:
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

        # cpu_percent(interval=None) reports utilisation since the previous call; this
        # priming call makes the reading taken after generation cover the generation.
        psutil.cpu_percent(interval=None)

        # Caller-supplied kwargs win over the defaults; the one-token probe pins the
        # two settings that define it.
        first_token_kwargs = {
            "pad_token_id": tokenizer.eos_token_id,
            **generate_kwargs,
            "max_new_tokens": 1,
            "do_sample": False,
        }
        full_kwargs = {
            "do_sample": True,
            "temperature": 0.7,
            "pad_token_id": tokenizer.eos_token_id,
            **generate_kwargs,
            "max_new_tokens": max_new_tokens,
        }

        with torch.inference_mode():
            # TTFT measurement (time to first token)
            self._synchronize()
            first_token_start = time.perf_counter()
            outputs = model.generate(**inputs, **first_token_kwargs)
            self._synchronize()
            ttft = time.perf_counter() - first_token_start
            total_time = ttft

            # Full generation, timed separately from the probe above
            if max_new_tokens > 1:
                generation_start = time.perf_counter()
                outputs = model.generate(**inputs, **full_kwargs)
                self._synchronize()
                total_time = time.perf_counter() - generation_start

        # Calculate generation metrics
        generated_tokens = outputs.shape[1] - prompt_length
        tps = generated_tokens / total_time if total_time > 0 else 0

        # Memory metrics
        if self.gpu_available:
            peak_memory_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
        else:
            peak_memory_mb = 0

        # Resource utilisation (simplified: one CPU reading spanning the generation and
        # one GPU sample taken afterwards; real deployments may want continuous monitoring)
        avg_cpu = psutil.cpu_percent(interval=None)
        avg_gpu = self._sample_gpu_utilization()

        return PerformanceMetrics(
            model_name=getattr(model, "name_or_path", "unknown"),
            # NOTE(review): when the attribute exists this may be a config object rather
            # than a string; confirm against the models being profiled.
            quantization=getattr(model, "quantization_config", "fp16"),
            prompt_length=prompt_length,
            generation_length=generated_tokens,
            time_to_first_token=ttft,
            tokens_per_second=tps,
            total_time=total_time,
            peak_gpu_memory_mb=peak_memory_mb,
            avg_gpu_utilization=avg_gpu,
            avg_cpu_percent=avg_cpu,
        )

In [None]:
# ================================
# Cell 3: Quality Evaluator
# ================================

from rouge_score import rouge_scorer
from sacrebleu import BLEU


class QualityEvaluator:
    """Evaluate generation quality using perplexity, ROUGE-L and BLEU."""

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ["rouge1", "rouge2", "rougeL"], use_stemmer=True
        )
        # sacreBLEU recommends effective_order for sentence-level scoring, so short
        # outputs are not penalised for n-gram orders they cannot contain.
        self.bleu = BLEU(effective_order=True)

    def calculate_perplexity(self, model, tokenizer, text: str) -> float:
        """Calculate the perplexity of ``text`` under ``model``.

        Args:
            model: Callable accepting the tokenized inputs plus ``labels`` and
                returning an object with a ``loss`` attribute.
            tokenizer: Callable tokenizer returning a batch with ``input_ids``.
            text: Text to score.

        Returns:
            ``exp(loss)``; ``inf`` when the text tokenizes to fewer than two tokens,
            because the next-token loss is undefined in that case.
        """
        inputs = tokenizer(text, return_tensors="pt")
        if inputs.input_ids.shape[-1] < 2:
            return float("inf")

        # Inputs must sit on the same device as the model, otherwise the forward pass
        # fails whenever the model was loaded onto a GPU.
        device = getattr(model, "device", None)
        if device is None:
            device = next(model.parameters()).device
        inputs = inputs.to(device)

        with torch.inference_mode():
            outputs = model(**inputs, labels=inputs.input_ids)
            perplexity = torch.exp(outputs.loss).item()

        return perplexity

    def calculate_rouge(self, generated: str, reference: str) -> float:
        """Calculate the ROUGE-L F1 score of ``generated`` against ``reference``."""
        scores = self.rouge_scorer.score(reference, generated)
        return scores["rougeL"].fmeasure

    def calculate_bleu(self, generated: str, reference: str) -> float:
        """Calculate the sentence-level BLEU score, rescaled to the 0-1 range."""
        score = self.bleu.sentence_score(generated, [reference])
        return score.score / 100.0  # Convert to 0-1 range

    def evaluate_quality(
        self,
        model,
        tokenizer,
        generated_text: str,
        reference_text: Optional[str] = None,
    ) -> Dict[str, float]:
        """Compute every available quality metric for ``generated_text``.

        Perplexity is always attempted; a failure is reported on stdout and recorded
        as ``inf``. ROUGE-L and BLEU are added only when ``reference_text`` is
        non-empty; a failure there is reported and both are recorded as ``0.0``.

        Returns:
            Dict with ``perplexity`` and, when a reference is given, ``rouge_l`` and
            ``bleu``.
        """
        metrics = {}

        # Perplexity (always calculated)
        try:
            metrics["perplexity"] = self.calculate_perplexity(
                model, tokenizer, generated_text
            )
        except Exception as e:
            print(f"[Warning] Perplexity calculation failed: {e}")
            metrics["perplexity"] = float("inf")

        # Reference-based metrics (if reference provided)
        if reference_text:
            try:
                metrics["rouge_l"] = self.calculate_rouge(
                    generated_text, reference_text
                )
                metrics["bleu"] = self.calculate_bleu(generated_text, reference_text)
            except Exception as e:
                print(f"[Warning] Reference-based metrics failed: {e}")
                metrics["rouge_l"] = 0.0
                metrics["bleu"] = 0.0

        return metrics

In [None]:
# ================================
# Cell 4: Test Dataset & Model Configurations
# ================================

# Standard test prompts with varying complexity
# Each entry pairs a prompt with a reference answer, a complexity category and the
# number of new tokens to request for it.
TEST_PROMPTS = [
    dict(
        prompt="Explain artificial intelligence in simple terms.",
        reference="Artificial intelligence is technology that enables machines to perform tasks that typically require human intelligence, such as learning, reasoning, and problem-solving.",
        category="simple",
        expected_length=50,
    ),
    dict(
        prompt="Write a detailed analysis of climate change impacts on global agriculture, including specific examples and potential solutions.",
        reference="Climate change significantly affects global agriculture through altered precipitation patterns, increased temperatures, and extreme weather events. For example, drought in wheat-growing regions reduces yields, while flooding destroys crops. Solutions include drought-resistant crops, improved irrigation, and sustainable farming practices.",
        category="complex",
        expected_length=150,
    ),
    dict(
        prompt="Translate and explain: 'Machine learning algorithms can identify patterns in large datasets.'",
        reference="Machine learning algorithms are computational methods that can automatically discover patterns, relationships, and trends within large amounts of data without being explicitly programmed to look for specific patterns.",
        category="medium",
        expected_length=100,
    ),
]

# Model configurations to test
# One entry per model: hub id, display name, size bucket and the quantization
# modes to benchmark it with.
MODEL_CONFIGS = [
    dict(
        model_id="microsoft/DialoGPT-small",  # ~117M parameters
        name="DialoGPT-small",
        size_category="small",
        quantization_options=["fp16", "int8"],
    ),
    dict(
        model_id="microsoft/DialoGPT-medium",  # ~345M parameters
        name="DialoGPT-medium",
        size_category="medium",
        quantization_options=["fp16", "int8", "int4"],
    ),
]

# Cost calculation parameters, keyed by GPU type label
COST_PARAMS = dict(
    # USD per hour
    gpu_hourly_cost=dict(RTX_3060=0.15, RTX_4070=0.25, RTX_4090=0.50, A100=2.00),
    # USD per kWh
    electricity_kwh=0.12,
    # Watts
    gpu_power_watts=dict(RTX_3060=170, RTX_4070=200, RTX_4090=450, A100=400),
)


def detect_gpu_type() -> str:
    """Map the name of CUDA device 0 to a GPU type label for cost calculation.

    Returns:
        ``"CPU"`` when CUDA is unavailable; otherwise the label of the first known
        marker found in the lower-cased device name, or ``"RTX_4070"`` when none
        of them matches.
    """
    if not torch.cuda.is_available():
        return "CPU"

    device_name = torch.cuda.get_device_name(0).lower()

    # Checked in this order; the first marker contained in the name wins.
    known_gpus = (
        ("3060", "RTX_3060"),
        ("4070", "RTX_4070"),
        ("4090", "RTX_4090"),
        ("a100", "A100"),
    )
    for marker, label in known_gpus:
        if marker in device_name:
            return label

    return "RTX_4070"  # Default assumption

In [None]:
# ================================
# Cell 5: Automated Performance Testing
# ================================

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import warnings

# Installs a blanket "ignore" filter: from here on every Python warning is suppressed,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")


class PerformanceBenchmark:
    """Automated benchmark suite for cost/latency/quality analysis.

    Combines a ``PerformanceProfiler`` (latency and resources), a ``QualityEvaluator``
    (perplexity, ROUGE-L, BLEU) and a cost model driven by ``COST_PARAMS``. Successful
    runs are accumulated in ``self.results``.
    """

    def __init__(self):
        self.profiler = PerformanceProfiler()
        self.evaluator = QualityEvaluator()
        self.results = []
        self.gpu_type = detect_gpu_type()

    def load_model_with_quantization(self, model_id: str, quantization: str):
        """Load ``model_id`` and its tokenizer using the requested quantization.

        Args:
            model_id: Identifier passed to ``from_pretrained``.
            quantization: ``"int4"`` or ``"int8"`` select a BitsAndBytes config; any
                other value loads the model in float16.

        Returns:
            ``(model, tokenizer)``. The tokenizer's pad token falls back to its EOS
            token when none is defined.
        """
        print(f"Loading {model_id} with {quantization}...")

        if quantization == "int4":
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            )
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                quantization_config=bnb_config,
                device_map="auto",
                torch_dtype=torch.float16,
            )
        elif quantization == "int8":
            bnb_config = BitsAndBytesConfig(load_in_8bit=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_id, quantization_config=bnb_config, device_map="auto"
            )
        else:  # fp16
            model = AutoModelForCausalLM.from_pretrained(
                model_id, torch_dtype=torch.float16, device_map="auto"
            )

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        return model, tokenizer

    def calculate_cost_per_1k_tokens(self, metrics: PerformanceMetrics) -> float:
        """Calculate the cost in USD of generating 1000 tokens at the measured speed.

        The cost is the hourly GPU price plus electricity for the time needed to emit
        1000 tokens. GPU types missing from ``COST_PARAMS`` fall back to 0.25 USD/hour
        and 200 W.

        Returns:
            Cost in USD, or ``inf`` when ``metrics.tokens_per_second`` is not positive.
        """
        if metrics.tokens_per_second <= 0:
            return float("inf")

        # Time to generate 1000 tokens
        time_for_1k = 1000 / metrics.tokens_per_second

        # GPU cost
        gpu_cost_per_hour = COST_PARAMS["gpu_hourly_cost"].get(self.gpu_type, 0.25)
        gpu_cost_1k = gpu_cost_per_hour * (time_for_1k / 3600)

        # Electricity cost
        power_watts = COST_PARAMS["gpu_power_watts"].get(self.gpu_type, 200)
        electricity_cost_1k = (
            (power_watts / 1000) * (time_for_1k / 3600) * COST_PARAMS["electricity_kwh"]
        )

        return gpu_cost_1k + electricity_cost_1k

    def run_single_benchmark(
        self,
        model_config: Dict,
        quantization: str,
        test_prompt: Dict,
        model=None,
        tokenizer=None,
    ) -> Optional[Dict]:
        """Run the benchmark for one model/quantization/prompt combination.

        Args:
            model_config: Entry with at least ``model_id`` and ``name``.
            quantization: Quantization mode for ``load_model_with_quantization``.
            test_prompt: Entry with ``prompt``, ``category``, ``expected_length`` and
                optionally ``reference``.
            model: Already loaded model to reuse. When either ``model`` or
                ``tokenizer`` is omitted, both are loaded here and released before
                returning.
            tokenizer: Tokenizer matching ``model``.

        Returns:
            Result dict (names, performance metrics, quality metrics, cost, generated
            text), or ``None`` when any step raised; the error is printed.
        """
        owns_model = model is None or tokenizer is None
        try:
            # Load model
            if owns_model:
                model, tokenizer = self.load_model_with_quantization(
                    model_config["model_id"], quantization
                )

            # Performance measurement
            perf_metrics = self.profiler.measure_inference(
                model,
                tokenizer,
                test_prompt["prompt"],
                max_new_tokens=test_prompt["expected_length"],
            )

            # Generate text for quality evaluation. The inputs must be on the model's
            # device, otherwise generation fails for GPU-resident models.
            inputs = tokenizer(test_prompt["prompt"], return_tensors="pt").to(
                model.device
            )
            prompt_token_count = inputs["input_ids"].shape[1]
            with torch.inference_mode():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=test_prompt["expected_length"],
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id,
                )

            # Keep only the continuation. Slicing by token count is exact, whereas the
            # decoded text is not guaranteed to reproduce the prompt character for
            # character.
            generated_only = tokenizer.decode(
                outputs[0][prompt_token_count:], skip_special_tokens=True
            ).strip()

            # Quality evaluation
            quality_metrics = self.evaluator.evaluate_quality(
                model, tokenizer, generated_only, test_prompt.get("reference")
            )

            # Cost calculation
            cost_per_1k = self.calculate_cost_per_1k_tokens(perf_metrics)

            # Combine results
            return {
                "model_name": model_config["name"],
                "quantization": quantization,
                "prompt_category": test_prompt["category"],
                "perf_metrics": perf_metrics,
                "quality_metrics": quality_metrics,
                "cost_per_1k_tokens": cost_per_1k,
                "generated_text": generated_only,
            }

        except Exception as e:
            print(f"Benchmark failed for {model_config['name']} + {quantization}: {e}")
            return None

        finally:
            # Cleanup runs on success and failure, but only for a model loaded here.
            if owns_model:
                model = tokenizer = None
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

    def run_full_benchmark(self) -> List[Dict]:
        """Run the benchmark across every entry of ``MODEL_CONFIGS`` and ``TEST_PROMPTS``.

        Each model/quantization pair is loaded once and reused for all prompts. A pair
        that fails to load is reported and skipped.

        Returns:
            ``self.results``, extended with every successful run.
        """
        print("🚀 Starting comprehensive performance benchmark...")
        print(f"GPU Type: {self.gpu_type}")

        for model_config in MODEL_CONFIGS:
            for quantization in model_config["quantization_options"]:
                try:
                    model, tokenizer = self.load_model_with_quantization(
                        model_config["model_id"], quantization
                    )
                except Exception as e:
                    print(
                        f"Benchmark failed for {model_config['name']} + {quantization}: {e}"
                    )
                    continue

                try:
                    for test_prompt in TEST_PROMPTS:
                        print(
                            f"\n📊 Testing: {model_config['name']} | {quantization} | {test_prompt['category']}"
                        )

                        result = self.run_single_benchmark(
                            model_config,
                            quantization,
                            test_prompt,
                            model=model,
                            tokenizer=tokenizer,
                        )
                        if result:
                            self.results.append(result)
                            print(
                                f"✅ TPS: {result['perf_metrics'].tokens_per_second:.2f} | "
                                f"Cost: ${result['cost_per_1k_tokens']:.6f}/1K | "
                                f"VRAM: {result['perf_metrics'].peak_gpu_memory_mb:.0f}MB"
                            )
                finally:
                    # Release this configuration before the next one is loaded.
                    del model, tokenizer
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

        print(
            f"\n🎯 Benchmark completed! Total configurations tested: {len(self.results)}"
        )
        return self.results


# Run the benchmark: build the suite and execute every model/quantization/prompt
# combination. `benchmark` and `results` are module-level names read by later cells.
benchmark = PerformanceBenchmark()
results = benchmark.run_full_benchmark()

In [None]:
# ================================
# Cell 6: Results Analysis & Visualization
# ================================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D


class ResultsAnalyzer:
    """Analyze and visualize benchmark results.

    Attributes:
        results: Result dicts as produced by the benchmark (one per run).
        df: One row per run; has the columns in ``_COLUMNS`` even when ``results``
            is empty.
    """

    # Grouping keys that identify one configuration.
    _GROUP_KEYS = ["model_name", "quantization"]

    # Column order of ``df``; also defines the shape of the empty frame.
    _COLUMNS = [
        "model_name",
        "quantization",
        "prompt_category",
        "tokens_per_second",
        "ttft",
        "peak_memory_mb",
        "cost_per_1k",
        "perplexity",
        "rouge_l",
        "bleu",
        "quality_score",
    ]

    # Columns of the summary table, in output order.
    _SUMMARY_COLUMNS = [
        "model_name",
        "quantization",
        "tokens_per_second_mean",
        "tokens_per_second_std",
        "cost_per_1k_mean",
        "cost_per_1k_std",
        "quality_score_mean",
        "quality_score_std",
        "peak_memory_mb_mean",
        "peak_memory_mb_max",
        "ttft_mean",
        "ttft_std",
        "efficiency_score",
    ]

    def __init__(self, results: List[Dict]):
        self.results = results
        self.df = self._create_dataframe()

    def _create_dataframe(self) -> pd.DataFrame:
        """Convert the result dicts into a DataFrame with one row per run."""
        rows = []
        for result in self.results:
            perf = result["perf_metrics"]
            quality = result["quality_metrics"]

            rows.append(
                {
                    "model_name": result["model_name"],
                    "quantization": result["quantization"],
                    "prompt_category": result["prompt_category"],
                    "tokens_per_second": perf.tokens_per_second,
                    "ttft": perf.time_to_first_token,
                    "peak_memory_mb": perf.peak_gpu_memory_mb,
                    "cost_per_1k": result["cost_per_1k_tokens"],
                    "perplexity": quality.get("perplexity", float("inf")),
                    "rouge_l": quality.get("rouge_l", 0.0),
                    "bleu": quality.get("bleu", 0.0),
                    "quality_score": self._calculate_composite_quality(quality),
                }
            )

        # Explicit columns keep the frame usable (group-by, column access) with no rows.
        return pd.DataFrame(rows, columns=self._COLUMNS)

    def _calculate_composite_quality(self, quality_metrics: Dict) -> float:
        """Calculate the composite quality score (higher is better).

        Weighted sum: 0.4 * 1/(1 + perplexity) + 0.3 * ROUGE-L + 0.3 * BLEU. A missing,
        infinite or NaN perplexity contributes 0.
        """
        import math

        # Inverse of perplexity (lower perplexity = higher quality)
        perp = quality_metrics.get("perplexity", float("inf"))
        perp_score = 1 / (1 + perp) if math.isfinite(perp) else 0

        rouge = quality_metrics.get("rouge_l", 0.0)
        bleu = quality_metrics.get("bleu", 0.0)

        # Weighted average (adjust weights as needed)
        return 0.4 * perp_score + 0.3 * rouge + 0.3 * bleu

    def _aggregate_by_config(self, metric_columns) -> pd.DataFrame:
        """Return the mean of ``metric_columns`` per (model_name, quantization)."""
        metric_columns = list(metric_columns)
        if self.df.empty:
            return pd.DataFrame(columns=self._GROUP_KEYS + metric_columns)
        return (
            self.df.groupby(self._GROUP_KEYS)
            .agg({column: "mean" for column in metric_columns})
            .reset_index()
        )

    @staticmethod
    def _describe_config(row) -> Dict:
        """Build the recommendation entry shared by all 'best of' categories."""
        return {
            "config": f"{row['model_name']} + {row['quantization']}",
            "cost_per_1k": row["cost_per_1k"],
            "quality": row["quality_score"],
            "tps": row["tokens_per_second"],
            "memory_mb": row["peak_memory_mb"],
        }

    def plot_cost_latency_quality_3d(self):
        """Draw a 3D scatter of cost vs throughput vs quality per configuration.

        Points are coloured by quantization and sized by mean peak memory. Nothing is
        drawn when there are no results.
        """
        # Aggregate by model + quantization
        agg_df = self._aggregate_by_config(
            ["cost_per_1k", "tokens_per_second", "quality_score", "peak_memory_mb"]
        )
        if agg_df.empty:
            print("[Warning] No benchmark results to plot")
            return

        fig = plt.figure(figsize=(12, 8))
        ax = fig.add_subplot(111, projection="3d")

        # Color by quantization
        colors = {"fp16": "red", "int8": "green", "int4": "blue"}

        for quant in agg_df["quantization"].unique():
            data = agg_df[agg_df["quantization"] == quant]
            # Floor the marker size so runs reporting 0 MB (CPU) remain visible.
            marker_sizes = (data["peak_memory_mb"] / 10).clip(lower=20)
            ax.scatter(
                data["cost_per_1k"],
                data["tokens_per_second"],
                data["quality_score"],
                c=colors.get(quant, "gray"),
                label=f"{quant}",
                s=marker_sizes,
                alpha=0.7,
            )

        ax.set_xlabel("Cost per 1K tokens (USD)")
        ax.set_ylabel("Tokens per Second")
        ax.set_zlabel("Quality Score")
        ax.set_title(
            "Cost vs Latency vs Quality Trade-off\n(Bubble size = Memory usage)"
        )
        ax.legend()

        plt.tight_layout()
        plt.show()

    def plot_pareto_frontier(self):
        """Draw cost-vs-quality and speed-vs-quality scatter plots side by side.

        Only the per-configuration points are plotted; no frontier line is computed.
        Nothing is drawn when there are no results.
        """
        agg_df = self._aggregate_by_config(
            ["cost_per_1k", "quality_score", "tokens_per_second"]
        )
        if agg_df.empty:
            print("[Warning] No benchmark results to plot")
            return

        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        # Cost vs Quality
        sns.scatterplot(
            data=agg_df,
            x="cost_per_1k",
            y="quality_score",
            hue="quantization",
            style="model_name",
            s=100,
            ax=axes[0],
        )
        axes[0].set_title("Cost vs Quality Pareto Frontier")
        axes[0].set_xlabel("Cost per 1K tokens (USD)")
        axes[0].set_ylabel("Quality Score")

        # Speed vs Quality
        sns.scatterplot(
            data=agg_df,
            x="tokens_per_second",
            y="quality_score",
            hue="quantization",
            style="model_name",
            s=100,
            ax=axes[1],
        )
        axes[1].set_title("Speed vs Quality Trade-off")
        axes[1].set_xlabel("Tokens per Second")
        axes[1].set_ylabel("Quality Score")

        plt.tight_layout()
        plt.show()

    def generate_summary_table(self) -> pd.DataFrame:
        """Generate the per-configuration summary table.

        Returns:
            One row per (model_name, quantization) with mean/std (and max for memory)
            of the key metrics plus ``efficiency_score`` (mean quality divided by mean
            cost), sorted by efficiency, best first. Empty when there are no results.
        """
        if self.df.empty:
            return pd.DataFrame(columns=self._SUMMARY_COLUMNS)

        summary = self.df.groupby(self._GROUP_KEYS).agg(
            {
                "tokens_per_second": ["mean", "std"],
                "cost_per_1k": ["mean", "std"],
                "quality_score": ["mean", "std"],
                "peak_memory_mb": ["mean", "max"],
                "ttft": ["mean", "std"],
            }
        )

        # Flatten column names
        summary.columns = ["_".join(col).strip() for col in summary.columns]
        summary = summary.reset_index()

        # Efficiency (quality per cost) is computed from the unrounded means: costs are
        # typically far below 1e-4 USD, so rounding first would turn them into 0.
        efficiency = summary["quality_score_mean"] / summary["cost_per_1k_mean"]

        # Cost columns keep 8 decimals for the same reason; everything else keeps 4.
        decimals = {
            column: 8 if column.startswith("cost_per_1k") else 4
            for column in summary.columns
            if column not in self._GROUP_KEYS
        }
        summary = summary.round(decimals)
        summary["efficiency_score"] = efficiency

        return summary.sort_values("efficiency_score", ascending=False)

    def find_optimal_configs(self) -> Dict[str, Dict]:
        """Find the best configuration per criterion.

        Returns:
            Dict with ``lowest_cost``, ``highest_quality``, ``fastest`` and
            ``most_efficient`` entries, or an empty dict when there are no results.
        """
        agg_df = self._aggregate_by_config(
            ["cost_per_1k", "quality_score", "tokens_per_second", "peak_memory_mb"]
        )
        if agg_df.empty:
            return {}

        recommendations = {}

        # Best for low cost
        lowest_cost = agg_df.loc[agg_df["cost_per_1k"].idxmin()]
        recommendations["lowest_cost"] = self._describe_config(lowest_cost)

        # Best for highest quality
        highest_quality = agg_df.loc[agg_df["quality_score"].idxmax()]
        recommendations["highest_quality"] = self._describe_config(highest_quality)

        # Best for speed
        fastest = agg_df.loc[agg_df["tokens_per_second"].idxmax()]
        recommendations["fastest"] = self._describe_config(fastest)

        # Best efficiency (quality per cost)
        agg_df["efficiency"] = agg_df["quality_score"] / agg_df["cost_per_1k"]
        most_efficient = agg_df.loc[agg_df["efficiency"].idxmax()]
        recommendations["most_efficient"] = self._describe_config(most_efficient)
        recommendations["most_efficient"]["efficiency"] = most_efficient["efficiency"]

        return recommendations


# Run analysis: build the analyzer from the `results` list produced by the benchmark cell.
print("📈 Analyzing benchmark results...")
analyzer = ResultsAnalyzer(results)

# Display the summary table as plain text, without the index column
print("\n📊 Performance Summary Table:")
summary_table = analyzer.generate_summary_table()
print(summary_table.to_string(index=False))

# Generate visualizations: the 3D trade-off scatter, then the two 2D scatter plots
print("\n🎨 Generating visualizations...")
analyzer.plot_cost_latency_quality_3d()
analyzer.plot_pareto_frontier()

In [None]:
# ================================
# Cell 7: Recommendation Engine
# ================================


class RecommendationEngine:
    """Generate configuration recommendations based on use case requirements.

    Works on the per-run DataFrame of an analyzer (``analyzer.df``), aggregated to
    one row per (model_name, quantization).
    """

    # constraint name -> (aggregated column, True when the value is an upper bound)
    _CONSTRAINT_COLUMNS = {
        "max_cost": ("cost_per_1k", True),
        "min_quality": ("quality_score", False),
        "min_speed": ("tokens_per_second", False),
        "max_memory": ("peak_memory_mb", True),
    }

    def __init__(self, analyzer: "ResultsAnalyzer"):
        self.analyzer = analyzer
        self.df = analyzer.df

    def recommend_for_use_case(
        self,
        use_case: str,
        max_cost: Optional[float] = None,
        min_quality: Optional[float] = None,
        min_speed: Optional[float] = None,
        max_memory: Optional[float] = None,
    ) -> Dict:
        """Recommend the best configuration for a use case.

        Args:
            use_case: ``research``, ``production``, ``demo``, ``batch`` or ``edge``
                (case-insensitive). Any other value uses no preset constraints and the
                balanced priority.
            max_cost: Upper bound on cost per 1K tokens; overrides the preset.
            min_quality: Lower bound on the quality score; overrides the preset.
            min_speed: Lower bound on tokens per second; overrides the preset.
            max_memory: Upper bound on peak memory in MB; overrides the preset.

        Returns:
            Dict with ``use_case``, ``description``, ``recommended_config``, formatted
            ``metrics`` and ``trade_offs``; or ``{"error": ...}`` when no configuration
            satisfies the constraints (including when there are no results at all).
        """
        # Define use case profiles
        use_case_profiles = {
            "research": {
                "priority": "quality",
                "description": "Research & experimentation - prioritize quality over cost",
                "constraints": {"min_quality": 0.3},
            },
            "production": {
                "priority": "balanced",
                "description": "Production deployment - balance quality, cost, and speed",
                "constraints": {"min_speed": 5.0, "max_cost": 0.01},
            },
            "demo": {
                "priority": "speed",
                "description": "Interactive demos - prioritize speed and low latency",
                "constraints": {"min_speed": 10.0},
            },
            "batch": {
                "priority": "cost",
                "description": "Batch processing - minimize cost per token",
                "constraints": {"max_cost": 0.005},
            },
            "edge": {
                "priority": "memory",
                "description": "Edge deployment - minimize memory usage",
                "constraints": {"max_memory": 4000},
            },
        }

        profile = use_case_profiles.get(use_case.lower(), {})
        constraints = dict(profile.get("constraints", {}))

        # Apply user constraints. `is not None` keeps an explicit 0 as a real bound.
        user_constraints = {
            "max_cost": max_cost,
            "min_quality": min_quality,
            "min_speed": min_speed,
            "max_memory": max_memory,
        }
        constraints.update(
            {name: value for name, value in user_constraints.items() if value is not None}
        )

        no_match = {"error": "No configurations meet the specified constraints"}
        if self.df.empty:
            return no_match

        # Aggregate to one candidate row per configuration
        candidates = (
            self.df.groupby(["model_name", "quantization"])
            .agg(
                {
                    "cost_per_1k": "mean",
                    "quality_score": "mean",
                    "tokens_per_second": "mean",
                    "peak_memory_mb": "mean",
                    "ttft": "mean",
                }
            )
            .reset_index()
        )

        # Apply filters
        for constraint, value in constraints.items():
            column, is_upper_bound = self._CONSTRAINT_COLUMNS[constraint]
            if is_upper_bound:
                candidates = candidates[candidates[column] <= value]
            else:
                candidates = candidates[candidates[column] >= value]

        if len(candidates) == 0:
            return no_match

        # Work on an independent frame so adding a column below is well defined.
        candidates = candidates.copy()

        # Select best candidate based on priority
        priority = profile.get("priority", "balanced")

        if priority == "quality":
            best = candidates.loc[candidates["quality_score"].idxmax()]
        elif priority == "speed":
            best = candidates.loc[candidates["tokens_per_second"].idxmax()]
        elif priority == "cost":
            best = candidates.loc[candidates["cost_per_1k"].idxmin()]
        elif priority == "memory":
            best = candidates.loc[candidates["peak_memory_mb"].idxmin()]
        else:  # balanced
            # Cost and speed are normalised against the best candidate so each term
            # lies in [0, 1]. A raw 1/cost would be several orders of magnitude larger
            # than the other terms and decide the ranking alone.
            cost_term = (
                candidates["cost_per_1k"].min() / candidates["cost_per_1k"]
            ).fillna(0.0)
            top_speed = candidates["tokens_per_second"].max()
            if top_speed > 0:
                speed_term = candidates["tokens_per_second"] / top_speed
            else:
                speed_term = 0.0
            candidates["composite_score"] = (
                candidates["quality_score"] * 0.4 + cost_term * 0.3 + speed_term * 0.3
            )
            best = candidates.loc[candidates["composite_score"].idxmax()]

        return {
            "use_case": use_case,
            "description": profile.get("description", "Custom use case"),
            "recommended_config": f"{best['model_name']} + {best['quantization']}",
            "metrics": {
                "cost_per_1k_tokens": f"${best['cost_per_1k']:.6f}",
                "quality_score": f"{best['quality_score']:.3f}",
                "tokens_per_second": f"{best['tokens_per_second']:.1f}",
                "memory_usage_mb": f"{best['peak_memory_mb']:.0f}",
                "time_to_first_token": f"{best['ttft']:.3f}s",
            },
            "trade_offs": self._analyze_trade_offs(best, candidates),
        }

    @staticmethod
    def _ratio(numerator, denominator) -> float:
        """Return numerator/denominator, or 1.0 (no gap) where that is undefined."""
        if denominator == 0 or numerator == denominator:
            return 1.0
        return numerator / denominator

    def _analyze_trade_offs(
        self, selected: pd.Series, all_candidates: pd.DataFrame
    ) -> Dict:
        """Compare ``selected`` with the best candidate in each dimension.

        Returns:
            Signed percentage strings: extra cost over the cheapest candidate, quality
            lost against the best-quality candidate, and speed lost against the
            fastest candidate.
        """
        # Compare to best in each dimension
        best_cost = all_candidates.loc[all_candidates["cost_per_1k"].idxmin()]
        best_quality = all_candidates.loc[all_candidates["quality_score"].idxmax()]
        best_speed = all_candidates.loc[all_candidates["tokens_per_second"].idxmax()]

        cost_penalty = (
            self._ratio(selected["cost_per_1k"], best_cost["cost_per_1k"]) - 1
        ) * 100
        quality_penalty = (
            1 - self._ratio(selected["quality_score"], best_quality["quality_score"])
        ) * 100
        speed_penalty = (
            1
            - self._ratio(
                selected["tokens_per_second"], best_speed["tokens_per_second"]
            )
        ) * 100

        return {
            "cost_vs_cheapest": f"{cost_penalty:+.1f}%",
            "quality_vs_best": f"{quality_penalty:+.1f}%",
            "speed_vs_fastest": f"{speed_penalty:+.1f}%",
        }

    def generate_all_recommendations(self) -> Dict:
        """Generate recommendations for all standard use cases, keyed by use case."""
        use_cases = ["research", "production", "demo", "batch", "edge"]
        return {
            use_case: self.recommend_for_use_case(use_case) for use_case in use_cases
        }


# Generate recommendations for every standard use case and print them.
print("\n🎯 Generating use case recommendations...")
recommender = RecommendationEngine(analyzer)
all_recommendations = recommender.generate_all_recommendations()

for use_case, rec in all_recommendations.items():
    print(f"\n📋 {use_case.upper()} USE CASE:")
    if "error" in rec:
        # Report use cases without a match instead of skipping them silently.
        print(f"   {rec['error']}")
        continue
    print(f"   Description: {rec['description']}")
    print(f"   Recommended: {rec['recommended_config']}")
    print(f"   Cost: {rec['metrics']['cost_per_1k_tokens']}/1K tokens")
    print(f"   Quality: {rec['metrics']['quality_score']}")
    print(f"   Speed: {rec['metrics']['tokens_per_second']} TPS")
    print(f"   Memory: {rec['metrics']['memory_usage_mb']} MB")

In [None]:
# ================================
# Cell 8: Smoke Test & Validation
# ================================


def smoke_test_performance_analysis():
    """Quick sanity check of the performance-analysis components.

    The profiler and quality-evaluator checks always run. The cost, analysis
    and recommendation checks run only when the notebook-level ``results``
    list is non-empty; the cost check also uses the notebook-level
    ``benchmark`` object.

    Raises:
        AssertionError: if any individual check fails.
    """
    print("🧪 Running smoke test for performance analysis...")

    # Test 1: the profiler must resolve to a known device type
    perf_profiler = PerformanceProfiler()
    assert perf_profiler.device.type in ["cuda", "cpu"], "Invalid device type"
    print("✅ Profiler initialization OK")

    # Test 2: quality evaluator, called without a model or tokenizer
    quality_evaluator = QualityEvaluator()
    quality = quality_evaluator.evaluate_quality(
        None, None, "This is a test.", "This is a reference."
    )
    assert "rouge_l" in quality, "Rouge metric missing"
    print("✅ Quality evaluator OK")

    # Tests 3-5 all need at least one real benchmark result
    if len(results) > 0:
        # Test 3: cost calculation on the first recorded result
        first_result = results[0]
        cost_per_1k = benchmark.calculate_cost_per_1k_tokens(
            first_result["perf_metrics"]
        )
        assert cost_per_1k >= 0, "Invalid cost calculation"
        print("✅ Cost calculation OK")

        # Test 4: analysis functions
        results_analyzer = ResultsAnalyzer(results)
        summary_table = results_analyzer.generate_summary_table()
        assert len(summary_table) > 0, "Summary table empty"
        print("✅ Results analysis OK")

        # Test 5: recommendation engine built on the analyzer from test 4
        engine = RecommendationEngine(results_analyzer)
        production_rec = engine.recommend_for_use_case("production")
        assert "recommended_config" in production_rec, "Recommendation failed"
        print("✅ Recommendation engine OK")

    print("🎉 All smoke tests passed!")


# Run smoke test
smoke_test_performance_analysis()

In [None]:
# ================================
# Cell 9: Export Results & Summary
# ================================

import json
from datetime import datetime


def _json_default(value):
    """Fallback serializer for ``json.dump``: unwrap NumPy/torch values.

    ``json`` only serializes built-in Python types, so metric values that
    arrive as NumPy scalars/arrays or torch tensors would otherwise raise
    ``TypeError``. Those types expose ``tolist()`` (and ``item()``), which
    yield plain Python numbers or lists.

    Raises:
        TypeError: if the value offers neither conversion, matching what
            ``json`` itself raises for unsupported objects.
    """
    if hasattr(value, "tolist"):
        return value.tolist()
    if hasattr(value, "item"):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def export_benchmark_results(results: List[Dict], filename: Optional[str] = None):
    """Export benchmark results to a JSON file.

    Args:
        results: Benchmark records. Each record must provide the keys
            "model_name", "quantization", "prompt_category", "perf_metrics"
            (an object exposing the attributes listed in ``perf_fields``
            below), "quality_metrics" and "cost_per_1k_tokens".
        filename: Destination path. When None, a name of the form
            ``performance_benchmark_<YYYYmmdd_HHMMSS>.json`` is generated.

    Returns:
        The path the results were written to.

    Note:
        Reads the notebook-level ``benchmark`` (for ``gpu_type``) and
        ``TEST_PROMPTS`` objects when building the metadata section.
    """
    # Take a single clock reading so the generated filename and the metadata
    # timestamp always refer to the same instant.
    now = datetime.now()
    if filename is None:
        filename = f"performance_benchmark_{now.strftime('%Y%m%d_%H%M%S')}.json"

    # Attributes copied from each PerformanceMetrics object into the export.
    perf_fields = (
        "model_name",
        "quantization",
        "tokens_per_second",
        "time_to_first_token",
        "total_time",
        "peak_gpu_memory_mb",
        "avg_gpu_utilization",
        "avg_cpu_percent",
    )

    export_data = {
        "metadata": {
            "timestamp": now.isoformat(),
            "gpu_type": benchmark.gpu_type,
            "total_configs_tested": len(results),
            "test_prompts": len(TEST_PROMPTS),
            # Sorted so repeated exports of the same results are identical
            # (plain set iteration order is not stable across runs).
            "models_tested": sorted({r["model_name"] for r in results}),
        },
        "results": [],
    }

    for result in results:
        perf = result["perf_metrics"]
        export_data["results"].append(
            {
                "model_name": result["model_name"],
                "quantization": result["quantization"],
                "prompt_category": result["prompt_category"],
                "performance_metrics": {
                    field: getattr(perf, field) for field in perf_fields
                },
                "quality_metrics": result["quality_metrics"],
                "cost_per_1k_tokens": result["cost_per_1k_tokens"],
            }
        )

    # ``default`` keeps the export from failing on NumPy/torch metric values.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(
            export_data, f, indent=2, ensure_ascii=False, default=_json_default
        )

    print(f"📁 Results exported to: {filename}")
    return filename


# Export the results, then print the closing summary report
if len(results) > 0:
    export_file = export_benchmark_results(results)

    # Report banner
    print(
        "\n" + "=" * 60,
        "📊 PERFORMANCE ANALYSIS SUMMARY REPORT",
        "=" * 60,
        sep="\n",
    )

    # Run overview
    print(
        f"🖥️  GPU Type: {benchmark.gpu_type}",
        f"📈 Total Configurations Tested: {len(results)}",
        f"💾 Results exported to: {export_file}",
        sep="\n",
    )

    # Key findings: the best configuration along each axis
    optimal_configs = analyzer.find_optimal_configs()
    print("\n🏆 KEY FINDINGS:")
    print(
        f"   💰 Most Cost-Effective: {optimal_configs['lowest_cost']['config']}",
        f"      └── ${optimal_configs['lowest_cost']['cost_per_1k']:.6f}/1K tokens",
        sep="\n",
    )
    print(
        f"   🎯 Highest Quality: {optimal_configs['highest_quality']['config']}",
        f"      └── Quality Score: {optimal_configs['highest_quality']['quality']:.3f}",
        sep="\n",
    )
    print(
        f"   ⚡ Fastest: {optimal_configs['fastest']['config']}",
        f"      └── {optimal_configs['fastest']['tps']:.1f} tokens/second",
        sep="\n",
    )
    print(
        f"   🎖️  Most Efficient: {optimal_configs['most_efficient']['config']}",
        f"      └── Efficiency: {optimal_configs['most_efficient']['efficiency']:.2f}",
        sep="\n",
    )

# Closing banner (printed whether or not there were results to export)
print(
    "\n" + "=" * 60,
    "✨ Performance analysis completed successfully!",
    "=" * 60,
    sep="\n",
)

In [None]:
# ================================
# Smoke Test - Performance Analysis Validation
# ================================


def comprehensive_smoke_test():
    """Validate the analysis components end to end using mock data only.

    No model is loaded: a hand-built ``PerformanceMetrics`` record is pushed
    through cost calculation, ``ResultsAnalyzer`` and
    ``RecommendationEngine``.

    Returns:
        True when every check passes.

    Raises:
        AssertionError: if a check fails.
    """
    print("🔬 Running comprehensive smoke test...")

    # 1. Environment check. A GPU is optional, so CUDA availability is
    # reported rather than asserted (the previous `... or True` assertion
    # could never fail and therefore checked nothing).
    cuda_available = torch.cuda.is_available()
    print("✅ Environment check passed")
    print(f"   CUDA available: {cuda_available}")

    # 2. Utility classes: constructing them without error is the check.
    PerformanceProfiler()
    QualityEvaluator()
    print("✅ Utility classes initialized")

    # 3. Mock performance metrics
    mock_metrics = PerformanceMetrics(
        model_name="test-model",
        quantization="fp16",
        prompt_length=50,
        generation_length=100,
        time_to_first_token=0.1,
        tokens_per_second=25.0,
        total_time=4.0,
        peak_gpu_memory_mb=2048.0,
        avg_gpu_utilization=75.0,
        avg_cpu_percent=15.0,
    )
    print("✅ Mock metrics created")

    # 4. Cost calculation test
    benchmark_instance = PerformanceBenchmark()
    cost = benchmark_instance.calculate_cost_per_1k_tokens(mock_metrics)
    assert cost > 0, "Cost should be positive"
    print(f"✅ Cost calculation: ${cost:.6f}/1K tokens")

    # 5. Analysis pipeline (with mock data)
    mock_results = [
        {
            "model_name": "test-small",
            "quantization": "fp16",
            "prompt_category": "simple",
            "perf_metrics": mock_metrics,
            "quality_metrics": {"perplexity": 15.0, "rouge_l": 0.65, "bleu": 0.45},
            "cost_per_1k_tokens": cost,
            "generated_text": "This is a test generation.",
        }
    ]

    analyzer = ResultsAnalyzer(mock_results)
    summary = analyzer.generate_summary_table()
    assert len(summary) > 0, "Summary table should not be empty"
    print("✅ Results analysis pipeline")

    # 6. Recommendation engine. Whether the single mock config satisfies the
    # cost constraint is not known here, so accept either a recommendation or
    # an explicit error entry, but reject a result that carries neither.
    recommender = RecommendationEngine(analyzer)
    rec = recommender.recommend_for_use_case("production", max_cost=0.01)
    assert (
        "recommended_config" in rec or "error" in rec
    ), "Recommendation result should contain a config or an error"
    print("✅ Recommendation engine")

    print("🎉 All smoke tests completed successfully!")
    return True


# Run smoke test
comprehensive_smoke_test()


## 📋 6. 本章小結

### ✅ 完成項目
- **效能分析框架**: 建立了全面的 LLM 效能測試系統，支援多維度評估
- **成本計算模組**: 實作了基於硬體類型的精確成本計算，包含 GPU 與電力成本
- **品質評估系統**: 整合了多種評估指標 (Perplexity, ROUGE, BLEU)，提供綜合品質分數
- **視覺化分析**: 建立了 3D 權衡圖表與帕累托前沿分析，直觀展示最佳配置
- **智慧推薦引擎**: 針對不同使用場景 (研究/生產/展示/批次/邊緣) 提供最佳配置建議

### 🧠 核心原理要點
- **三角權衡關係**: 成本、延遲、品質之間存在固有的權衡關係，需要根據使用場景選擇平衡點
- **量化策略影響**: INT4/INT8 量化能顯著降低記憶體使用與成本，但會帶來品質損失
- **TTFT vs TPS**: 首個 Token 時間 (TTFT) 影響使用者體驗，每秒 Token 數 (TPS) 影響吞吐量
- **帕累托最佳化**: 在多目標最佳化中，帕累托前沿上的配置是「非支配解」——無法在不犧牲其他目標的前提下再改進任一目標
- **成本模型**: 總成本包含硬體攤提成本與電力消耗，需要考慮實際使用模式

### ⚠️ 常見陷阱與注意事項
- **測試環境一致性**: 確保所有配置在相同環境下測試，避免外部因素影響結果
- **記憶體洩漏**: 大量模型載入測試時需要適當清理，避免 CUDA OOM 錯誤
- **統計顯著性**: 單次測試結果可能有變異，建議多次測試取平均值
- **成本計算假設**: 成本模型基於特定假設 (電價、硬體價格)，實際部署時需要調整
- **品質指標限制**: 自動化指標無法完全反映人類感知品質，建議輔以人工評估

### 🎯 下一步建議
1. **Part D 微調技術**: 探索 LoRA/QLoRA 微調如何影響效能權衡關係
2. **部署最佳化**: 研究生產環境的推理最佳化技術 (動態批次、KV 快取等)
3. **多模態擴展**: 將效能分析擴展到視覺-語言模型 (VLM)
4. **長文本處理**: 分析不同 context 長度對效能的影響
5. **實時監控**: 建立生產環境的效能監控與自動調優系統

---

**完成 Part C - LLM Applications 階段！** 🎉

已完成 LLM 應用核心的 10 本 notebooks，涵蓋了從基礎文本生成到進階效能分析的完整技術棧。接下來建議進入 **Part D (微調技術)** 或 **Part E (RAG + Agents)**，可依自身需求選擇優先學習的方向。