In [None]:
#  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook)
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

# (environment variable, sub-directory below AI_CACHE_ROOT) pairs; each
# variable is pointed at its directory and the directory is created if missing.
cache_subdirs = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)
for env_name, subdir in cache_subdirs:
    cache_dir = f"{AI_CACHE_ROOT}/{subdir}"
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# Cell 2: Check Ollama Service & Available Models
import requests
import json
import time
from typing import List, Dict, Optional, Iterator, Union


def check_ollama_service(base_url: str = "http://localhost:11434") -> bool:
    """Report whether an Ollama server answers at ``base_url``.

    Sends a GET to ``{base_url}/api/tags`` with a 5 second timeout.

    Returns:
        True only when the request completes with HTTP 200; False for any
        other status code or when ``requests`` raises a request error.
    """
    tags_endpoint = f"{base_url}/api/tags"
    try:
        reply = requests.get(tags_endpoint, timeout=5)
    except requests.exceptions.RequestException:
        return False
    return reply.status_code == 200


def list_ollama_models(base_url: str = "http://localhost:11434") -> List[str]:
    """Return the names of the models reported by ``{base_url}/api/tags``.

    This is a best-effort query: any failure yields an empty list instead of
    an exception.

    Returns:
        The ``"name"`` of every entry under the ``"models"`` key of the JSON
        reply. An empty list is returned when the request fails, the status is
        not 200, the body is not valid JSON, or the payload does not have the
        expected shape.
    """
    try:
        response = requests.get(f"{base_url}/api/tags", timeout=10)
        if response.status_code != 200:
            return []
        data = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers invalid JSON on requests versions whose JSON
        # decode error is not a RequestException subclass.
        return []

    # Tolerate a missing/null "models" key and malformed entries.
    entries = data.get("models") if isinstance(data, dict) else None
    return [
        entry["name"]
        for entry in entries or []
        if isinstance(entry, dict) and "name" in entry
    ]


# Probe the local Ollama service and, when it is up, list its models.
# NOTE: `models` is only assigned when the service is reachable.
ollama_running = check_ollama_service()
print(f"Ollama service running: {ollama_running}")

if not ollama_running:
    print("Ollama not running. Please start with: ollama serve")
else:
    models = list_ollama_models()
    print(f"Available models: {models}")
    if len(models) == 0:
        print("No models found. Please run: ollama pull qwen2.5:7b")

In [None]:
# Cell 3: OllamaAdapter Implementation
class OllamaAdapter:
    """Ollama HTTP backend adapter with a unified messages interface.

    Chat-style ``messages`` (a list of ``{"role": ..., "content": ...}``
    dicts) are flattened into a single text prompt and sent to the
    ``/api/generate`` endpoint of the Ollama server at ``base_url``.

    The adapter owns a ``requests.Session``; call :meth:`close` (or use the
    adapter as a context manager) to release it.
    """

    def __init__(
        self, model_name: str = "qwen2.5:7b", base_url: str = "http://localhost:11434"
    ):
        """Connect to the service and make sure ``model_name`` is present.

        Args:
            model_name: Ollama model tag to generate with.
            base_url: Root URL of the Ollama HTTP service.

        Raises:
            ConnectionError: If the service does not answer at ``base_url``.
        """
        self.model_name = model_name
        self.base_url = base_url
        self.session = requests.Session()

        # Verify service and model availability
        if not check_ollama_service(base_url):
            self.session.close()
            raise ConnectionError(f"Ollama service not available at {base_url}")

        available_models = list_ollama_models(base_url)
        if not self._is_model_available(model_name, available_models):
            print(f"Warning: Model {model_name} not found in {available_models}")
            print("Attempting to pull model automatically...")
            # Best effort: a failed pull is reported by _pull_model itself and
            # will surface again as an API error on the first generate() call.
            self._pull_model(model_name)

    def __enter__(self) -> "OllamaAdapter":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()

    @staticmethod
    def _is_model_available(model_name: str, available_models: List[str]) -> bool:
        """Return True when ``model_name`` matches an installed model.

        A name without a tag is also matched against ``<name>:latest``,
        because Ollama lists untagged models under their ``latest`` tag.
        """
        candidates = {model_name}
        if ":" not in model_name:
            candidates.add(f"{model_name}:latest")
        return any(name in candidates for name in available_models)

    def _pull_model(self, model_name: str) -> bool:
        """Pull ``model_name`` through ``/api/pull`` and wait for the result.

        The pull endpoint answers with a stream of JSON status lines, so an
        HTTP 200 alone does not mean the model was downloaded. The stream is
        read until a ``{"status": "success"}`` line (success) or an
        ``{"error": ...}`` line (failure) arrives.

        Returns:
            True if the pull completed successfully, False otherwise. Request
            errors are reported with ``print`` rather than raised.
        """
        url = f"{self.base_url}/api/pull"
        payload = {"name": model_name}
        try:
            # timeout applies per read, not to the whole download.
            with self.session.post(
                url, json=payload, timeout=300, stream=True
            ) as response:
                if response.status_code != 200:
                    print(f"Failed to pull model: {response.text}")
                    return False

                for line in response.iter_lines():
                    if not line:
                        continue
                    try:
                        status = json.loads(line)
                    except ValueError:
                        continue  # skip lines that are not valid JSON
                    if not isinstance(status, dict):
                        continue
                    if "error" in status:
                        print(f"Failed to pull model: {status['error']}")
                        return False
                    if status.get("status") == "success":
                        print(f"Successfully pulled model: {model_name}")
                        return True

            print("Failed to pull model: stream ended without a success status")
            return False
        except requests.exceptions.RequestException as e:
            print(f"Error pulling model: {e}")
            return False

    def _messages_to_prompt(self, messages: List[Dict[str, str]]) -> str:
        """Flatten chat messages into a ``Role: content`` text prompt.

        Messages with the roles ``system``, ``user`` and ``assistant`` become
        one line each; messages with any other role are left out. A trailing
        ``Assistant:`` line cues the model to answer.

        Raises:
            KeyError: If a message lacks the ``"role"`` or ``"content"`` key.
        """
        role_labels = {"system": "System", "user": "User", "assistant": "Assistant"}
        prompt_parts = []
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role in role_labels:
                prompt_parts.append(f"{role_labels[role]}: {content}")

        prompt_parts.append("Assistant:")
        return "\n".join(prompt_parts)

    def generate(
        self,
        messages: List[Dict[str, str]],
        max_new_tokens: int = 256,
        temperature: float = 0.7,
        stop: Optional[List[str]] = None,
        stream: bool = False,
        timeout: int = 60,
    ) -> Union[str, Iterator[str]]:
        """Generate a completion for ``messages``.

        Args:
            messages: Chat history as ``{"role", "content"}`` dicts.
            max_new_tokens: Sent to Ollama as the ``num_predict`` option.
            temperature: Sampling temperature option.
            stop: Optional stop sequences; omitted from the request when empty.
            stream: When True, return an iterator of text chunks instead of
                the complete string.
            timeout: ``requests`` timeout in seconds.

        Returns:
            The full response text, or an iterator yielding text chunks when
            ``stream`` is True.

        Raises:
            TimeoutError: If the request times out.
            RuntimeError: On any other request failure or when the server
                reports an error in the response body. In streaming mode
                these may be raised while iterating the returned iterator.
        """
        prompt = self._messages_to_prompt(messages)

        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": stream,
            "options": {
                "temperature": temperature,
                "num_predict": max_new_tokens,
            },
        }

        if stop:
            payload["options"]["stop"] = stop

        response = None
        handed_off = False  # True once the streaming generator owns `response`
        try:
            response = self.session.post(
                f"{self.base_url}/api/generate",
                json=payload,
                timeout=timeout,
                stream=stream,
            )
            response.raise_for_status()

            if stream:
                handed_off = True
                return self._stream_response(response, timeout)
            return self._get_complete_response(response)

        except requests.exceptions.Timeout as e:
            raise TimeoutError(f"Request timed out after {timeout} seconds") from e
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"Ollama API error: {e}") from e
        finally:
            if response is not None and not handed_off:
                response.close()

    @staticmethod
    def _iter_text_chunks(lines) -> Iterator[str]:
        """Yield the ``"response"`` text of each JSON line in ``lines``.

        Empty lines, lines that are not valid JSON and JSON values that are
        not objects are skipped. Iteration stops after a line whose ``"done"``
        flag is true.

        Raises:
            RuntimeError: If a line carries an ``"error"`` field.
        """
        for line in lines:
            if not line:
                continue
            try:
                # json.loads accepts bytes and decodes them itself, so the
                # result does not depend on the charset requests guessed.
                data = json.loads(line)
            except ValueError:
                continue
            if not isinstance(data, dict):
                continue
            if "error" in data:
                raise RuntimeError(f"Ollama API error: {data['error']}")
            if "response" in data:
                yield data["response"]
            if data.get("done", False):
                break

    def _stream_response(
        self, response, timeout: Optional[int] = None
    ) -> Iterator[str]:
        """Yield text chunks from a streaming response, then close it.

        Network failures that occur while the body is being read are
        translated into the same exception types that :meth:`generate` raises.
        """
        try:
            yield from self._iter_text_chunks(response.iter_lines())
        except requests.exceptions.Timeout as e:
            detail = f" after {timeout} seconds" if timeout is not None else ""
            raise TimeoutError(f"Request timed out{detail}") from e
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"Ollama API error: {e}") from e
        finally:
            # Also runs when the consumer abandons the iterator early.
            response.close()

    def _get_complete_response(self, response) -> str:
        """Return the concatenated response text of a non-streaming reply."""
        return "".join(self._iter_text_chunks(response.iter_lines()))

    def chat(self, messages: List[Dict[str, str]], **kwargs) -> str:
        """Simple chat interface (non-streaming); ``kwargs`` go to generate()."""
        return self.generate(messages, stream=False, **kwargs)


print("OllamaAdapter class defined successfully")

In [None]:
# Cell 4: Message Template & Unified Interface
def create_chat_messages(system_prompt: str, user_query: str) -> List[Dict[str, str]]:
    """Build a two-message conversation: a system message, then a user message.

    Returns:
        ``[{"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query}]``
    """
    role_and_text = (("system", system_prompt), ("user", user_query))
    return [{"role": role, "content": text} for role, text in role_and_text]


# Build a sample conversation to show the unified messages format.
system_prompt = "You are a helpful AI assistant. Respond concisely in Traditional Chinese."
user_query = "請用3句話解釋什麼是大語言模型？"

test_messages = create_chat_messages(system_prompt=system_prompt, user_query=user_query)
print("Test messages:", test_messages)

In [None]:
# Cell 5: Streaming Output & Error Handling
def demo_streaming_generation(adapter: OllamaAdapter, messages: List[Dict[str, str]]):
    """Stream a reply to ``messages`` and echo each chunk as it arrives.

    Returns:
        The concatenated chunks, or None when generation raised (the error is
        printed, not propagated).
    """
    print("=== Streaming Generation ===")
    print("Query:", messages[-1]["content"])
    print("Response:", end=" ")

    collected = ""
    try:
        chunk_iter = adapter.generate(
            messages, max_new_tokens=100, temperature=0.7, stream=True
        )
        for piece in chunk_iter:
            print(piece, end="", flush=True)
            collected += piece
        print("\n" + "=" * 50)
    except Exception as err:
        # Timeouts get their own label; everything else is a generic failure.
        label = "Timeout error" if isinstance(err, TimeoutError) else "Generation error"
        print(f"\n{label}: {err}")
        return None
    return collected


def demo_non_streaming(adapter: OllamaAdapter, messages: List[Dict[str, str]]):
    """Request a complete (non-streamed) reply to ``messages`` and print it.

    Returns:
        The generated text, or None when generation raised (the error is
        printed, not propagated).
    """
    print("=== Non-Streaming Generation ===")
    print("Query:", messages[-1]["content"])

    try:
        reply = adapter.generate(
            messages, max_new_tokens=100, temperature=0.7, stream=False
        )
    except Exception as err:
        print(f"Generation error: {err}")
        return None

    print("Response:", reply)
    print("=" * 50)
    return reply

In [None]:
# Cell 6: Smoke Test - Generate Conversation
# Only run if Ollama is available
if ollama_running and models:
    try:
        # Prefer qwen2.5:7b when it is installed; otherwise use the first listed model
        model_to_use = "qwen2.5:7b" if "qwen2.5:7b" in models else models[0]
        print(f"Using model: {model_to_use}")

        adapter = OllamaAdapter(model_name=model_to_use)

        # Test messages
        messages = create_chat_messages(
            "You are a helpful assistant. Be concise.",
            "What are the key benefits of using Ollama?",
        )

        # Test non-streaming
        response1 = demo_non_streaming(adapter, messages)

        # Test streaming
        time.sleep(1)  # Brief pause
        response2 = demo_streaming_generation(adapter, messages)

        # The demo helpers print their own errors and return None on failure,
        # so the outcome has to be checked here instead of assuming success.
        if response1 and response2:
            print(
                f"✅ Smoke test passed! Generated {len(response1)} + {len(response2)} characters"
            )
        else:
            print(
                "❌ Smoke test failed: at least one generation returned no text (see errors above)"
            )

    except Exception as e:
        print(f"❌ Smoke test failed: {e}")
        print("Make sure Ollama is running with: ollama serve")
        print("And pull a model with: ollama pull qwen2.5:7b")

else:
    print("⚠️ Skipping smoke test - Ollama service or models not available")
    print("To run test:")
    print("1. Start Ollama: ollama serve")
    print("2. Pull model: ollama pull qwen2.5:7b")
    print("3. Re-run this notebook")

In [None]:
# Cell 7: Interface Comparison with TransformersAdapter
# Shared contract that both adapters are expected to satisfy.
for intro_line in (
    "=== Interface Comparison ===",
    "Both adapters should support:",
    "1. generate(messages, max_new_tokens, temperature, stream)",
    "2. chat(messages) - simple wrapper",
    "3. Unified messages format: [{'role': 'user', 'content': '...'}]",
    "4. Error handling for timeouts and model issues",
    "",
):
    print(intro_line)

# Hand-written comparison table; the first row is the header.
comparison_data = [
    ["Feature", "TransformersAdapter", "OllamaAdapter"],
    ["Model Loading", "Direct GPU/CPU", "HTTP Service"],
    ["Memory Usage", "High (model in RAM)", "Low (service handles)"],
    ["Latency", "Fast (local)", "Medium (HTTP overhead)"],
    ["Streaming", "✅ Token-by-token", "✅ Chunk-by-chunk"],
    ["Quantization", "bitsandbytes/GPTQ", "Built-in GGUF"],
    ["Model Switch", "Reload required", "API call only"],
    ["Offline Usage", "✅ Full offline", "❌ Needs Ollama service"],
]

# Left-align the columns to widths 15 / 20 / 20.
for feature, transformers_cell, ollama_cell in comparison_data:
    print(
        " | ".join(
            (feature.ljust(15), transformers_cell.ljust(20), ollama_cell.ljust(20))
        )
    )

In [None]:
# Cell 8: Performance Baseline & VRAM Usage
import psutil
import time


def benchmark_ollama_performance(adapter: OllamaAdapter, num_tests: int = 3):
    """Time ``num_tests`` short non-streaming generations and print a summary.

    Each run is timed individually; a run that raises is reported and left
    out of the averages. The "token" count is only an approximation: it is
    the number of whitespace-separated words in the response text.

    Args:
        adapter: Adapter used to generate the responses.
        num_tests: Number of generations to time.
    """
    print("=== Performance Benchmark ===")

    test_messages = create_chat_messages(
        "You are a helpful assistant.", "Explain quantum computing in 2 sentences."
    )

    latencies = []
    token_counts = []

    for i in range(num_tests):
        # perf_counter is monotonic, so system clock adjustments during a run
        # cannot distort (or make negative) the measured latency.
        start_time = time.perf_counter()

        try:
            response = adapter.generate(
                test_messages, max_new_tokens=50, temperature=0.7, stream=False
            )
        except Exception as e:
            print(f"Test {i+1} failed: {e}")
            continue

        latency = time.perf_counter() - start_time
        token_count = len(response.split()) if response else 0

        latencies.append(latency)
        token_counts.append(token_count)

        print(f"Test {i+1}: {latency:.2f}s, ~{token_count} tokens")

    if latencies:
        avg_latency = sum(latencies) / len(latencies)
        avg_tokens = sum(token_counts) / len(token_counts)
        tokens_per_sec = avg_tokens / avg_latency if avg_latency > 0 else 0

        print(f"\nAverage latency: {avg_latency:.2f}s")
        print(f"Average tokens: {avg_tokens:.1f}")
        print(f"Tokens/sec: {tokens_per_sec:.1f}")

    # System resource usage
    memory_usage = psutil.virtual_memory().percent
    print(f"System RAM usage: {memory_usage:.1f}%")

    if torch.cuda.is_available():
        print("GPU memory: N/A (Ollama manages GPU separately)")
    else:
        print("Running on CPU")


# Run benchmark if adapter is available
# `and` short-circuits, so `models` is only read when the service is running.
benchmark_ready = bool(ollama_running and models and "adapter" in locals())

if not benchmark_ready:
    print("⚠️ Skipping benchmark - adapter not initialized")
else:
    try:
        benchmark_ollama_performance(adapter)
    except Exception as err:
        print(f"Benchmark failed: {err}")

In [1]:
# Cell 9: What We Built & Key Takeaways
# (heading, lines) pairs; sections are printed in order, separated by a blank line.
summary_sections = [
    (
        "=== What We Built ===",
        [
            "✅ OllamaAdapter with HTTP backend",
            "✅ Unified messages interface",
            "✅ Streaming and non-streaming support",
            "✅ Error handling and timeouts",
            "✅ Model availability checking",
            "✅ Performance baseline",
        ],
    ),
    (
        "=== Key Parameters ===",
        [
            "• model_name: Ollama model tag (e.g., 'qwen2.5:7b')",
            "• base_url: Ollama service URL (default: http://localhost:11434)",
            "• temperature: 0.1-1.5 (creativity level)",
            "• max_new_tokens: Response length limit",
            "• timeout: Request timeout in seconds",
            "• stream: True for real-time output",
        ],
    ),
    (
        "=== Pitfalls & Solutions ===",
        [
            "• Service not running → Check 'ollama serve'",
            "• Model not found → Auto-pull or manual 'ollama pull'",
            "• Slow responses → Reduce max_new_tokens or use GPU",
            "• Memory issues → Ollama handles quantization automatically",
            "• Network timeouts → Increase timeout parameter",
        ],
    ),
    (
        "=== When to Use This ===",
        [
            "✅ Low VRAM environments (2-4GB)",
            "✅ Quick model switching without reloading",
            "✅ CPU-only inference",
            "✅ Development/prototyping with multiple models",
            "✅ Shared model serving across applications",
            "❌ Offline deployment",
            "❌ Maximum inference speed (use TransformersAdapter)",
            "❌ Custom model modifications",
        ],
    ),
]

for section_index, (heading, bullet_lines) in enumerate(summary_sections):
    if section_index > 0:
        print("")
    print(heading)
    for bullet in bullet_lines:
        print(bullet)

=== What We Built ===
✅ OllamaAdapter with HTTP backend
✅ Unified messages interface
✅ Streaming and non-streaming support
✅ Error handling and timeouts
✅ Model availability checking
✅ Performance baseline

=== Key Parameters ===
• model_name: Ollama model tag (e.g., 'qwen2.5:7b')
• base_url: Ollama service URL (default: http://localhost:11434)
• temperature: 0.1-1.5 (creativity level)
• max_new_tokens: Response length limit
• timeout: Request timeout in seconds
• stream: True for real-time output

=== Pitfalls & Solutions ===
• Service not running → Check 'ollama serve'
• Model not found → Auto-pull or manual 'ollama pull'
• Slow responses → Reduce max_new_tokens or use GPU
• Memory issues → Ollama handles quantization automatically
• Network timeouts → Increase timeout parameter

=== When to Use This ===
✅ Low VRAM environments (2-4GB)
✅ Quick model switching without reloading
✅ CPU-only inference
✅ Development/prototyping with multiple models
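✅ Shared model serving across applications
❌ Offline deployment
❌ Maximum inference speed (use TransformersAdapter)
❌ Custom model modifications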

低RAM優勢

In [None]:
# Ollama 自動處理量化，無需手動設定
# One adapter covers both points: no manual quantization setup is needed
# (GGUF quantization is automatic), and CPU inference is supported without
# any device_map configuration.
adapter = OllamaAdapter("qwen2.5:7b")

Smoke Test測試

In [None]:
# Minimal verification: service connectivity + model response
smoke_prompt = "Hi, test response in 5 words."
messages = [{"role": "user", "content": smoke_prompt}]
response = adapter.generate(messages, max_new_tokens=20)
assert len(response) > 0, "No response generated"
print(f"✅ Success: {response}")