In [1]:
# nb03_llm_adapter_llamacpp.ipynb
# Stage 1: LLM Adapter - llama.cpp Backend with GGUF

# %% [1] Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# Each cache-related environment variable is pointed at a sub-directory of
# AI_CACHE_ROOT, and that sub-directory is created if it is missing.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

for env_name, subdir in (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
):
    cache_dir = f"{AI_CACHE_ROOT}/{subdir}"
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

# One-line summary: where the caches live and whether CUDA is visible to torch.
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

[Cache] ../ai_warehouse/cache | GPU: True


In [2]:
# %% [2] Install and verify llama-cpp-python
# Fail fast with install instructions when the llama.cpp Python binding is
# missing; every later cell depends on `Llama`.
try:
    from llama_cpp import Llama, LlamaGrammar

    print("✓ llama-cpp-python imported successfully")
except ImportError:
    print("❌ Missing llama-cpp-python. Install with:")
    print("pip install llama-cpp-python[server]")
    # Current llama.cpp builds enable CUDA with GGML_CUDA; the LLAMA_CUBLAS
    # switch only applies to older releases, so both are shown.
    print(
        "For GPU support: CMAKE_ARGS='-DGGML_CUDA=on' pip install llama-cpp-python --force-reinstall --no-cache-dir"
    )
    print(
        "(older llama-cpp-python releases: use CMAKE_ARGS='-DLLAMA_CUBLAS=on' instead)"
    )
    # Bare raise re-raises the active ImportError with its original traceback.
    raise

import json
import time
import warnings
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Union

❌ Missing llama-cpp-python. Install with:
pip install llama-cpp-python[server]
For GPU support: CMAKE_ARGS='-DLLAMA_CUBLAS=on' pip install llama-cpp-python --force-reinstall --no-cache-dir


ModuleNotFoundError: No module named 'llama_cpp'

In [None]:
# %% [3] GGUF Model Download and Path Management
# Setup model cache directory: GGUF files live in their own folder under the
# shared cache root.
GGUF_CACHE = Path(AI_CACHE_ROOT) / "gguf_models"
# parents=True so this also succeeds when AI_CACHE_ROOT itself has not been
# created yet (mkdir would otherwise raise FileNotFoundError).
GGUF_CACHE.mkdir(parents=True, exist_ok=True)

# Default model info (modify as needed)
DEFAULT_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct-GGUF"
DEFAULT_GGUF_FILE = "qwen2.5-7b-instruct-q4_k_m.gguf"


def download_gguf_model(model_id: str, filename: str) -> Path:
    """Return a local path for a GGUF file, fetching it from the Hub if absent.

    Args:
        model_id: Hugging Face repository id that hosts the file.
        filename: Name of the GGUF file inside that repository.

    Returns:
        ``GGUF_CACHE / filename`` when that file already exists; otherwise the
        path returned by ``hf_hub_download``.

    Raises:
        Exception: Any error raised during the download is re-raised after
            manual-download hints have been printed.
    """
    from huggingface_hub import hf_hub_download

    cached_file = GGUF_CACHE / filename
    if cached_file.exists():
        print(f"✓ Found cached GGUF: {cached_file}")
        return cached_file

    print(f"📥 Downloading {model_id}/{filename}...")
    cache_root = str(GGUF_CACHE)
    request = {
        "repo_id": model_id,
        "filename": filename,
        "cache_dir": cache_root,
        "local_dir": cache_root,
        "local_dir_use_symlinks": False,
    }
    try:
        fetched = hf_hub_download(**request)
        print(f"✓ Downloaded to: {fetched}")
        return Path(fetched)
    except Exception as err:
        # Tell the user how to obtain the file by hand, then let the error surface.
        for hint in (
            f"❌ Download failed: {err}",
            "Manual alternatives:",
            f"1. Visit: https://huggingface.co/{model_id}",
            f"2. Download {filename} to {GGUF_CACHE}/",
        ):
            print(hint)
        raise


# Auto-download if needed
try:
    model_path = download_gguf_model(DEFAULT_MODEL_ID, DEFAULT_GGUF_FILE)
except Exception:
    # `Exception` rather than a bare `except:` so that interrupting a long
    # download (KeyboardInterrupt / SystemExit) is not swallowed.
    # Fallback to manual path specification: later cells check
    # `model_path.exists()` before using the model.
    model_path = GGUF_CACHE / DEFAULT_GGUF_FILE
    if not model_path.exists():
        print(f"❌ Model not found at {model_path}")
        print("Please download manually or check model_id/filename")

In [None]:
# %% [4] LlamaCppBackend Implementation
class LlamaCppBackend:
    """Chat-style text generation on top of a GGUF model loaded via llama.cpp.

    Messages are rendered into a ``<|im_start|>role ... <|im_end|>`` prompt and
    sent to the wrapped ``Llama`` object as a plain completion request.
    """

    # Roles that ``format_messages`` renders; any other role is skipped with a warning.
    _SUPPORTED_ROLES = ("system", "user", "assistant")

    def __init__(
        self,
        model_path: str,
        n_ctx: int = 4096,
        n_gpu_layers: int = -1,  # -1 = all layers on GPU if available
        n_threads: Optional[int] = None,
        verbose: bool = False,
        **kwargs,
    ):
        """Initialize llama.cpp backend with GGUF model

        Args:
            model_path: Path to GGUF file
            n_ctx: Context window size
            n_gpu_layers: Number of layers to offload to GPU (-1 = all)
            n_threads: CPU threads (None = auto-detect)
            verbose: Enable llama.cpp logging
            **kwargs: Extra keyword arguments forwarded unchanged to ``Llama``.

        Raises:
            FileNotFoundError: If ``model_path`` does not exist.
        """
        self.model_path = Path(model_path)
        if not self.model_path.exists():
            raise FileNotFoundError(f"GGUF model not found: {model_path}")

        # Auto-detect threads if not specified: the CPU count capped at 8,
        # falling back to 4 when the count cannot be determined.
        if n_threads is None:
            n_threads = min(8, os.cpu_count() or 4)

        print(f"🦙 Loading GGUF: {self.model_path.name}")
        print(f"   Context: {n_ctx}, GPU layers: {n_gpu_layers}, Threads: {n_threads}")

        self.llama = Llama(
            model_path=str(self.model_path),
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            n_threads=n_threads,
            verbose=verbose,
            **kwargs,
        )

        # Generation defaults; per-call keyword arguments to ``generate`` override them.
        self.default_params = {
            "max_tokens": 256,
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "repeat_penalty": 1.1,
            "stop": ["<|im_end|>", "<|endoftext|>"],
        }

        print("✓ llama.cpp model loaded successfully")

    def format_messages(self, messages: List[Dict[str, str]]) -> str:
        """Convert messages to chat template (Qwen2.5 format)

        Args:
            messages: Dicts with ``role`` and ``content`` keys. A missing role
                is treated as ``"user"``; missing or ``None`` content is
                treated as an empty string.

        Returns:
            The rendered prompt, always ending with an opened assistant turn so
            the model continues from there.

        Messages whose role is not system/user/assistant are left out of the
        prompt and reported through ``warnings.warn`` instead of being dropped
        silently.
        """
        parts = []
        for msg in messages:
            role = msg.get("role", "user")
            # A key that is present but None would otherwise be rendered as
            # the literal text "None" inside the prompt.
            content = msg.get("content")
            if content is None:
                content = ""

            if role not in self._SUPPORTED_ROLES:
                warnings.warn(
                    f"Skipping message with unsupported role {role!r}",
                    stacklevel=2,
                )
                continue

            parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

        # Add assistant start token for generation
        parts.append("<|im_start|>assistant\n")
        return "".join(parts)

    def generate(
        self, messages: List[Dict[str, str]], stream: bool = False, **kwargs
    ) -> Union[str, Iterator[str]]:
        """Generate response from messages

        Args:
            messages: Conversation to render with ``format_messages``.
            stream: When true, return an iterator of text pieces instead of
                the complete text.
            **kwargs: Sampling options that override ``default_params``.

        Returns:
            The stripped completion text, or an iterator yielding text chunks
            when ``stream`` is true.
        """
        # Merge params: call-site values win over the stored defaults.
        params = {**self.default_params, **kwargs}

        # Format prompt
        prompt = self.format_messages(messages)

        if stream:
            return self._generate_stream(prompt, **params)
        return self._generate_sync(prompt, **params)

    def _generate_sync(self, prompt: str, **params) -> str:
        """Run one blocking completion and return its text without surrounding whitespace."""
        output = self.llama(prompt, **params)
        return output["choices"][0]["text"].strip()

    def _generate_stream(self, prompt: str, **params) -> Iterator[str]:
        """Yield the non-empty text pieces of a streamed completion as they arrive."""
        for chunk in self.llama(prompt, stream=True, **params):
            choices = chunk.get("choices")
            if not choices:
                continue
            delta = choices[0].get("text", "")
            if delta:
                yield delta

In [None]:
# %% [5] Extended LLMAdapter with llama.cpp Support
class LLMAdapter:
    """Unified LLM interface supporting multiple backends"""

    def __init__(self, model_id: str, backend: str = "transformers", **kwargs):
        """Load a model through the selected backend.

        Args:
            model_id: Model identifier used by the transformers backend.
            backend: ``"transformers"`` or ``"llama_cpp"``.
            **kwargs: Forwarded to the backend-specific initializer.

        Raises:
            ValueError: If ``backend`` is not one of the supported names.
        """
        self.model_id = model_id
        self.backend = backend
        self.model = None

        if backend == "transformers":
            self._init_transformers(**kwargs)
        elif backend == "llama_cpp":
            self._init_llamacpp(**kwargs)
        else:
            raise ValueError(f"Unsupported backend: {backend}")

    def _init_transformers(self, **kwargs):
        """Initialize transformers backend (from nb02)"""
        from transformers import AutoTokenizer, AutoModelForCausalLM

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, use_fast=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id, device_map="auto", torch_dtype="auto", **kwargs
        )
        print(f"✓ Transformers model loaded: {self.model_id}")

    def _init_llamacpp(self, model_path: Optional[str] = None, **kwargs):
        """Initialize llama.cpp backend

        Args:
            model_path: Path to the GGUF file. When omitted, a module-level
                ``model_path`` variable is used if one is defined.
            **kwargs: Forwarded to ``LlamaCppBackend``.

        Raises:
            ValueError: If no path was passed and no module-level
                ``model_path`` is available.
        """
        if model_path is None:
            # Use the globally downloaded model. The lookup has to go through
            # globals(): inside this method the bare name `model_path` is the
            # parameter (None here), so str(model_path) would yield "None".
            fallback = globals().get("model_path")
            if fallback is not None:
                model_path = str(fallback)

        if model_path is None:
            raise ValueError("model_path required for llama_cpp backend")

        self.model = LlamaCppBackend(model_path, **kwargs)
        print(f"✓ llama.cpp model loaded: {model_path}")

    def generate(
        self,
        messages: List[Dict[str, str]],
        max_new_tokens: int = 256,
        temperature: float = 0.7,
        stream: bool = False,
        **kwargs,
    ) -> Union[str, Iterator[str]]:
        """Generate response with unified interface

        Args:
            messages: Conversation as dicts with ``role`` and ``content``.
            max_new_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature.
            stream: Request streamed output (honoured by the llama.cpp backend).
            **kwargs: Extra generation options passed to the backend.

        Returns:
            The generated text, or an iterator of text chunks when the
            llama.cpp backend is used with ``stream=True``.

        Raises:
            ValueError: If ``self.backend`` is not a supported backend name.
        """
        if self.backend == "transformers":
            return self._generate_transformers(
                messages, max_new_tokens, temperature, stream, **kwargs
            )
        if self.backend == "llama_cpp":
            return self._generate_llamacpp(
                messages, max_new_tokens, temperature, stream, **kwargs
            )
        # Only reachable if `backend` was changed after construction; fail
        # explicitly instead of returning None.
        raise ValueError(f"Unsupported backend: {self.backend}")

    def _generate_transformers(
        self, messages, max_new_tokens, temperature, stream, **kwargs
    ):
        """Transformers generation

        ``stream`` is accepted for signature parity but not used here; the
        full decoded text is always returned.
        """
        # Convert messages to prompt (simplified)
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Sampling needs a strictly positive temperature; treat 0 (or None) as
        # a request for greedy decoding instead of passing an invalid value.
        sampling = temperature is not None and temperature > 0
        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": sampling,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        if sampling:
            gen_kwargs["temperature"] = temperature
        # Caller options win, and can no longer collide with the fixed
        # keyword arguments (which raised "multiple values" TypeError).
        gen_kwargs.update(kwargs)

        with torch.inference_mode():
            outputs = self.model.generate(**inputs, **gen_kwargs)

        # Decode only the new tokens
        new_tokens = outputs[0][inputs.input_ids.shape[1] :]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True)

    def _generate_llamacpp(
        self, messages, max_new_tokens, temperature, stream, **kwargs
    ):
        """llama.cpp generation

        Maps the unified argument names onto the llama.cpp ones
        (``max_new_tokens`` -> ``max_tokens``) and delegates to the backend,
        which returns text or, when streaming, a generator of chunks.
        """
        llama_params = {
            "max_tokens": max_new_tokens,
            "temperature": temperature,
            **kwargs,
        }
        return self.model.generate(messages, stream=bool(stream), **llama_params)

In [None]:
# %% [6] Memory Usage Comparison
def measure_memory_usage():
    """Print used CPU and GPU memory before and after loading the llama.cpp model.

    Both numbers are system/device-wide readings (in MB), so other processes
    running at the same time influence them. Nothing is returned.
    """
    import psutil
    import torch.cuda as cuda

    def get_memory_info():
        """Return (used system RAM in MB, used GPU memory in MB)."""
        cpu_mb = psutil.virtual_memory().used / 1024**2
        if cuda.is_available():
            # llama.cpp allocates VRAM outside PyTorch's allocator, so
            # cuda.memory_allocated() would always report +0 for it. Ask the
            # driver for device-wide free/total bytes instead.
            free_bytes, total_bytes = cuda.mem_get_info()
            gpu_mb = (total_bytes - free_bytes) / 1024**2
        else:
            gpu_mb = 0
        return cpu_mb, gpu_mb

    print("=== Memory Usage Comparison ===")

    # Baseline
    cpu_base, gpu_base = get_memory_info()
    print(f"Baseline - CPU: {cpu_base:.0f}MB, GPU: {gpu_base:.0f}MB")

    # Test llama.cpp
    if model_path.exists():
        print("\n🦙 Testing llama.cpp (GGUF)...")
        adapter_cpp = LLMAdapter(
            model_id="",
            backend="llama_cpp",
            model_path=str(model_path),
            n_gpu_layers=10,  # Limit GPU layers for comparison
        )

        cpu_cpp, gpu_cpp = get_memory_info()
        # The "+" format flag prints an explicit sign, so a negative delta
        # reads "-12" rather than "+-12".
        print(
            f"llama.cpp - CPU: {cpu_cpp:.0f}MB ({cpu_cpp-cpu_base:+.0f}), GPU: {gpu_cpp:.0f}MB ({gpu_cpp-gpu_base:+.0f})"
        )

        # Clean up
        del adapter_cpp

    print("\n💡 llama.cpp typically uses less VRAM due to quantization")

In [None]:
# %% [7] Streaming vs Non-Streaming Demo
def demo_streaming_vs_sync():
    """Run the same prompt once blocking and once streamed, timing both.

    Prints each response, its wall-clock duration, and the difference between
    the two durations. Returns early (printing a notice) when the GGUF file at
    the module-level ``model_path`` does not exist.
    """
    if not model_path.exists():
        print("❌ GGUF model not available for demo")
        return

    # Initialize adapter
    adapter = LLMAdapter(
        model_id="",
        backend="llama_cpp",
        model_path=str(model_path),
        n_gpu_layers=-1,  # Use all GPU if available
    )

    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "Explain quantum computing in 2 sentences."},
    ]
    # Identical sampling settings for both runs; only `stream` differs.
    gen_opts = {"max_new_tokens": 100, "temperature": 0.7}

    print("=== Generation Comparison ===")

    # --- blocking run ---
    print("\n🔄 Synchronous generation:")
    t0 = time.time()
    sync_response = adapter.generate(conversation, stream=False, **gen_opts)
    sync_time = time.time() - t0
    print(f"Response: {sync_response}")
    print(f"Time: {sync_time:.2f}s")

    # --- streamed run ---
    print("\n📡 Streaming generation:")
    t0 = time.time()
    collected = ""
    stream_gen = adapter.generate(conversation, stream=True, **gen_opts)

    if not hasattr(stream_gen, "__iter__"):
        # Backend handed back a non-iterable result: show it as-is.
        collected = stream_gen
        print(f"Response: {collected}")
    else:
        # Echo each piece as soon as it arrives, on a single output line.
        print("Response: ", end="", flush=True)
        for piece in stream_gen:
            print(piece, end="", flush=True)
            collected += piece
        print()  # New line

    stream_time = time.time() - t0
    print(f"Time: {stream_time:.2f}s")

    print(f"\n📊 Streaming overhead: {stream_time - sync_time:.2f}s")

In [None]:
# %% [8] Backend Performance Benchmark
def benchmark_backends():
    """Time one short llama.cpp generation and print latency and throughput.

    Returns early (printing a notice) when the GGUF file at the module-level
    ``model_path`` does not exist. Any error during loading or generation is
    caught and printed rather than raised.
    """
    if not model_path.exists():
        print("❌ GGUF model required for benchmark")
        return

    test_messages = [{"role": "user", "content": "What is machine learning?"}]

    print("=== Backend Benchmark ===")

    # llama.cpp benchmark
    print("\n🦙 llama.cpp (GGUF Q4_K_M):")
    try:
        adapter_cpp = LLMAdapter(
            model_id="",
            backend="llama_cpp",
            model_path=str(model_path),
            n_gpu_layers=-1,
        )

        # perf_counter is monotonic and high-resolution, unlike the wall clock.
        start = time.perf_counter()
        response = adapter_cpp.generate(
            test_messages, max_new_tokens=50, temperature=0.7
        )
        latency = time.perf_counter() - start

        # Count real model tokens with the model's own tokenizer. Splitting on
        # whitespace counts words, which misreports sub-word tokens and
        # collapses unsegmented text (e.g. Chinese) into a single "token".
        token_ids = adapter_cpp.model.llama.tokenize(
            response.encode("utf-8"), add_bos=False
        )
        token_count = len(token_ids)
        tokens_per_sec = token_count / latency if latency > 0 else 0

        print(f"  Latency: {latency:.2f}s")
        print(f"  Tokens/sec: {tokens_per_sec:.1f}")
        print(f"  Response: {response[:100]}...")

        del adapter_cpp

    except Exception as e:
        print(f"  ❌ Failed: {e}")

In [None]:
# %% [9] Smoke Test
def smoke_test():
    """Load the GGUF model with a small GPU offload and generate one short reply.

    Returns:
        True when loading and generation both complete; False when the model
        file is missing or any exception is raised along the way.
    """
    print("=== Smoke Test ===")

    if not model_path.exists():
        print("❌ GGUF model not found - downloading may be required")
        return False

    adapter_options = {
        "model_id": "",
        "backend": "llama_cpp",
        "model_path": str(model_path),
        "n_gpu_layers": 5,  # Conservative GPU usage
        "verbose": False,
    }
    prompt = [{"role": "user", "content": "Say hello!"}]

    try:
        adapter = LLMAdapter(**adapter_options)
        reply = adapter.generate(prompt, max_new_tokens=20, temperature=0.7)

        print(f"✓ Generation successful: '{reply[:50]}...'")
        print(f"✓ Backend: {adapter.backend}")
        print(f"✓ Model path: {adapter.model.model_path.name}")
    except Exception as err:
        print(f"❌ Smoke test failed: {err}")
        return False

    return True

In [None]:
# %% [10] Run All Tests
if __name__ == "__main__":
    print("🚀 Running nb03 llama.cpp adapter tests...\n")

    # Demonstrations first; they only print and their results are not used.
    measure_memory_usage()
    demo_streaming_vs_sync()
    benchmark_backends()

    # The smoke test decides the overall status reported below.
    success = smoke_test()
    verdict = "✅ PASSED" if success else "❌ FAILED"

    print("\n" + "=" * 50)
    print("📋 nb03 Status: " + verdict)
    print("📋 Backend: llama.cpp with GGUF quantization")
    if model_path.exists():
        print(f"📋 Model: {model_path.name}")
    else:
        print("📋 Model: Not found")
    print("📋 Ready for: nb04 (Ollama backend)")