In [None]:
# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Point every cache-related environment variable at its own location under
# AI_CACHE_ROOT, creating the directory (and any parents) when it is missing.
for k, v in (
    ("HF_HOME", f"{AI_CACHE_ROOT}/hf"),
    ("TRANSFORMERS_CACHE", f"{AI_CACHE_ROOT}/hf/transformers"),
    ("HF_DATASETS_CACHE", f"{AI_CACHE_ROOT}/hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", f"{AI_CACHE_ROOT}/hf/hub"),
    ("TORCH_HOME", f"{AI_CACHE_ROOT}/torch"),
):
    os.environ[k] = v
    os.makedirs(v, exist_ok=True)

# One-line status: the cache root in use and whether CUDA is visible to torch.
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# Cell 2: Import & Dependencies
import gradio as gr
import json
import time
from typing import Optional, Dict, Any, List, Tuple
import sys
import os

# Make the directory two levels above the working directory importable
# (inserted at the front of sys.path, only once).
project_root = os.path.abspath("../..")
if project_root not in sys.path:
    sys.path.insert(0, project_root)

try:
    from shared_utils.adapters.llm_adapter import LLMAdapter
except ImportError as import_err:
    print(f"⚠ LLMAdapter import failed: {import_err}")
    print("Creating minimal fallback...")

    class LLMAdapter:
        """Stand-in adapter used when the project adapter cannot be imported.

        It loads nothing: ``generate`` only echoes the last message back so
        the rest of the notebook can still be exercised.
        """

        def __init__(self, model_id="demo", backend="transformers", **kwargs):
            # Extra keyword arguments are accepted and ignored.
            self.model_id = model_id
            self.backend = backend
            self.loaded = False

        def generate(self, messages, max_new_tokens=256, temperature=0.7, **kwargs):
            """Return an echo of (at most 100 chars of) the last message's content."""
            last_content = ""
            if messages:
                last_content = messages[-1].get("content", "")
            return f"[{self.backend}] Echo: {last_content[:100]}..."

        def load_model(self):
            """Mark the adapter as loaded and report success."""
            self.loaded = True
            return True

else:
    print("✓ LLMAdapter imported successfully")


# Model identifier pre-filled in the UI for each backend choice
DEFAULT_MODELS = {
    "transformers": "Qwen/Qwen2.5-7B-Instruct",
    "llamacpp": "qwen2.5-7b-instruct-q4_k_m.gguf",  # local file
    "ollama": "qwen2.5:7b",
}

In [None]:
# Cell 3: UI State Management
class UIState:
    """Mutable holder for the state shared by the UI callbacks."""

    def __init__(self):
        # Current interaction mode: Chat | RAG | Agents | Game
        self.mode = "Chat"
        self.rag_enabled = False
        self.tools_enabled = False
        # No adapter until load_model() stores one here.
        self.current_adapter: Optional[LLMAdapter] = None
        self.chat_history: List[List[str]] = []

    def clear_chat(self):
        """Reset the stored history and return a fresh empty list."""
        self.chat_history = list()
        return list()


# Single module-level instance used by the functions in the following cells
ui_state = UIState()

In [None]:
# Cell 4: Model Loading Functions
def load_model(
    model_id: str, backend: str, quantization: str = "fp16", device_map: str = "auto"
) -> Tuple[bool, str]:
    """Create an LLMAdapter for ``model_id`` and make it the active adapter.

    Args:
        model_id: Model identifier handed to the adapter.
        backend: Backend name handed to the adapter. Quantization keyword
            arguments are only added when this is "transformers".
        quantization: "int4", "int8" or "fp16"; any other value adds no
            extra keyword arguments.
        device_map: Passed through to the adapter as ``device_map``.

    Returns:
        ``(ok, status_message)``. On success the adapter is stored in
        ``ui_state.current_adapter``. Any exception is caught and reported
        as ``(False, "✗ Error: ...")`` with the text cut to 100 characters.
    """
    try:
        load_kwargs = {"device_map": device_map}

        # Low-VRAM quantization flags apply to the transformers backend only.
        if backend == "transformers":
            if quantization == "fp16":
                load_kwargs["torch_dtype"] = torch.float16
            elif quantization == "int8":
                load_kwargs["load_in_8bit"] = True
            elif quantization == "int4":
                load_kwargs["load_in_4bit"] = True
                load_kwargs["bnb_4bit_compute_dtype"] = torch.float16
                load_kwargs["bnb_4bit_use_double_quant"] = True

        adapter = LLMAdapter(model_id=model_id, backend=backend, **load_kwargs)

        # Adapters without a load_model() method are treated as ready to use.
        success = adapter.load_model() if hasattr(adapter, "load_model") else True

        if not success:
            return False, f"✗ Failed to load {model_id}"

        ui_state.current_adapter = adapter
        return True, f"✓ Loaded {model_id} ({backend}, {quantization})"

    except Exception as e:
        return False, f"✗ Error: {str(e)[:100]}"


def get_model_info() -> str:
    """Describe the active adapter, or report that none is loaded."""
    adapter = ui_state.current_adapter
    if not adapter:
        return "No model loaded"
    return f"Model: {adapter.model_id} | Backend: {adapter.backend}"

In [None]:
# Cell 5: Chat Function (MVP)
def chat_fn(
    message: str,
    history: List[List[str]],
    mode: str,
    rag_enabled: bool,
    tools_enabled: bool,
) -> Tuple[List[List[str]], str]:
    """Run one chat turn through the active adapter.

    Args:
        message: The user's new message. Empty, whitespace-only or ``None``
            input is ignored and the history is returned unchanged.
        history: Previous ``[user, assistant]`` pairs; ``None`` is treated
            as an empty history. The list is never mutated.
        mode: "Chat", "RAG", "Agents" or "Game"; selects the system prompt.
            Any other value falls back to a generic prompt.
        rag_enabled: In "RAG" mode, lowers the temperature to 0.3.
        tools_enabled: In "Agents" mode, caps generation at 256 new tokens.

    Returns:
        ``(new_history, "")``. The empty string is intended to clear the
        input box. Generation failures do not raise: they are appended to
        the history as a "⚠ Generation error" reply.
    """
    # Local import keeps this cell self-contained (re is standard library).
    import re

    if history is None:
        history = []

    if not message or not message.strip():
        return history, ""

    if not ui_state.current_adapter:
        return (
            history + [[message, "⚠ No model loaded. Please load a model first."]],
            "",
        )

    # The timing footer appended below is display-only. It is removed again
    # when old replies are replayed so the model never sees it as content.
    timing_footer = re.compile(r"\n\n\*\[Generated in \d+(?:\.\d+)?s\]\*\s*$")

    try:
        system_prompts = {
            "Chat": "You are a helpful AI assistant. Respond in Traditional Chinese when appropriate.",
            "RAG": "You are a RAG-enabled assistant. Use provided context to answer questions with citations.",
            "Agents": "You are part of a multi-agent system. Collaborate to solve complex tasks.",
            "Game": "You are a text adventure game master. Create engaging narratives.",
        }
        system_msg = system_prompts.get(mode, "You are a helpful assistant.")

        messages = [{"role": "system", "content": system_msg}]

        # Replay only the last 5 exchanges to save tokens.
        for h in history[-5:]:
            if len(h) >= 2:
                past_reply = h[1]
                if isinstance(past_reply, str):
                    past_reply = timing_footer.sub("", past_reply)
                messages.append({"role": "user", "content": h[0]})
                messages.append({"role": "assistant", "content": past_reply})

        messages.append({"role": "user", "content": message})

        start_time = time.time()

        generation_kwargs = {"max_new_tokens": 512, "temperature": 0.7}

        if mode == "RAG" and rag_enabled:
            # TODO: Integrate RAG retrieval here
            generation_kwargs["temperature"] = 0.3  # More focused for factual responses

        if mode == "Agents" and tools_enabled:
            # TODO: Add tool calling logic
            generation_kwargs["max_new_tokens"] = 256  # Shorter for tool planning

        response = ui_state.current_adapter.generate(messages, **generation_kwargs)

        if isinstance(response, str):
            # Keep only the text after the last "assistant:" marker. The
            # split is case-insensitive so it agrees with the detection
            # (previously "Assistant:" was detected but never removed).
            pieces = re.split(r"assistant:", response, flags=re.IGNORECASE)
            if len(pieces) > 1:
                response = pieces[-1].strip()
        else:
            # Non-string results would break the concatenation below.
            response = "" if response is None else str(response)

        elapsed = time.time() - start_time

        response_with_timing = response + f"\n\n*[Generated in {elapsed:.1f}s]*"

        return history + [[message, response_with_timing]], ""

    except Exception as e:
        error_msg = f"⚠ Generation error: {str(e)[:200]}"
        return history + [[message, error_msg]], ""

In [None]:
# Cell 6: Control Panel Callbacks
def on_model_change(model_id: str, backend: str, quantization: str) -> str:
    """Load the requested model and return the status message to display.

    The model ID is trimmed before use: surrounding whitespace or a pasted
    trailing newline would otherwise pass the emptiness check and then be
    handed to the loader as part of the identifier.

    Args:
        model_id: Identifier typed by the user; ``None`` or blank input
            yields a prompt instead of a load attempt.
        backend: Backend name forwarded to ``load_model``.
        quantization: Quantization choice forwarded to ``load_model``.

    Returns:
        The status text from ``load_model``, or "Please enter a model ID".
    """
    cleaned_id = model_id.strip() if model_id else ""
    if not cleaned_id:
        return "Please enter a model ID"

    _success, message = load_model(cleaned_id, backend, quantization)
    return message


def on_mode_change(mode: str) -> Dict[str, Any]:
    """Record the selected mode and compute checkbox visibility for it.

    Returns:
        A dict with the string keys "rag_enabled" and "tools_enabled", each
        holding a ``gr.update(visible=...)`` value.
    """
    ui_state.mode = mode

    # (RAG checkbox visible, tools checkbox visible) per mode; modes not
    # listed here (including "Chat") hide both.
    visibility_by_mode = {
        "RAG": (True, False),
        "Agents": (True, True),
        "Game": (True, False),
    }
    show_rag, show_tools = visibility_by_mode.get(mode, (False, False))

    return {
        "rag_enabled": gr.update(visible=show_rag),
        "tools_enabled": gr.update(visible=show_tools),
    }


def clear_chat_history() -> List[List[str]]:
    """Reset the shared UI state's history and return the emptied value."""
    emptied = ui_state.clear_chat()
    return emptied

In [None]:
# Cell 7: Gradio Interface Assembly
def create_gradio_interface():
    """Build the Gradio Blocks app and wire its event handlers.

    Layout: a chat column (chatbot, input box, send/clear buttons) on the
    left and a control panel (model selection, mode selection, advanced
    settings, system info) on the right.

    Returns:
        The assembled ``gr.Blocks`` object; it is not launched here.
    """

    with gr.Blocks(title="ragent-text-lab", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🤖 ragent-text-lab | Multi-Mode AI Assistant")

        with gr.Row():
            # Left Column: Chat Interface
            with gr.Column(scale=2):
                gr.Markdown("## 💬 Chat Interface")

                chatbot = gr.Chatbot(
                    value=[], height=400, show_copy_button=True, bubble_full_width=False
                )

                with gr.Row():
                    msg_input = gr.Textbox(
                        placeholder="輸入您的問題...",
                        scale=4,
                        show_label=False,
                        container=False,
                    )
                    send_btn = gr.Button("發送", variant="primary")
                    clear_btn = gr.Button("清除", variant="secondary")

            # Right Column: Control Panel
            with gr.Column(scale=1):
                gr.Markdown("## ⚙️ Control Panel")

                # Model Selection
                with gr.Group():
                    gr.Markdown("### 🎯 Model Selection")

                    model_id = gr.Textbox(
                        value=DEFAULT_MODELS["transformers"],
                        label="Model ID",
                        placeholder="Qwen/Qwen2.5-7B-Instruct",
                    )

                    backend = gr.Dropdown(
                        choices=["transformers", "llamacpp", "ollama"],
                        value="transformers",
                        label="Backend",
                    )

                    quantization = gr.Dropdown(
                        choices=["fp16", "int8", "int4"],
                        value="int4",  # Default to int4 for low VRAM
                        label="Quantization",
                    )

                    load_model_btn = gr.Button("Load Model", variant="primary")
                    model_status = gr.Textbox(
                        value="No model loaded", label="Status", interactive=False
                    )

                # Mode Selection
                with gr.Group():
                    gr.Markdown("### 🎮 Mode Selection")

                    mode = gr.Radio(
                        choices=["Chat", "RAG", "Agents", "Game"],
                        value="Chat",
                        label="Mode",
                    )

                    # Conditional controls (hidden until a mode reveals them)
                    rag_enabled = gr.Checkbox(
                        label="Enable RAG", value=False, visible=False
                    )

                    tools_enabled = gr.Checkbox(
                        label="Enable Tools", value=False, visible=False
                    )

                # Advanced Settings
                # NOTE: these two sliders are displayed but are not among the
                # inputs of any event handler below, so they currently have
                # no effect on generation.
                with gr.Group():
                    gr.Markdown("### 🔧 Advanced Settings")

                    max_tokens = gr.Slider(
                        minimum=64, maximum=2048, value=512, step=64, label="Max Tokens"
                    )

                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                    )

                # Info Panel
                with gr.Group():
                    gr.Markdown("### 📊 System Info")

                    system_info = gr.Textbox(
                        value=get_model_info(), label="Current Model", interactive=False
                    )

        # Event Handlers

        # Model loading: show the load status, then refresh the info panel
        load_model_btn.click(
            fn=on_model_change,
            inputs=[model_id, backend, quantization],
            outputs=[model_status],
        ).then(fn=get_model_info, outputs=[system_info])

        # Auto-fill model ID when backend changes
        def update_model_id(backend_choice):
            return DEFAULT_MODELS.get(backend_choice, "")

        backend.change(fn=update_model_id, inputs=[backend], outputs=[model_id])

        # Mode change
        def handle_mode_change(selected_mode):
            # on_mode_change() returns a dict keyed by the *strings*
            # "rag_enabled" / "tools_enabled". Gradio only accepts dicts
            # keyed by component objects, so unpack the updates into a tuple
            # ordered like `outputs` below.
            updates = on_mode_change(selected_mode)
            if isinstance(updates, dict):
                return updates["rag_enabled"], updates["tools_enabled"]
            return updates

        mode.change(
            fn=handle_mode_change, inputs=[mode], outputs=[rag_enabled, tools_enabled]
        )

        # Chat functionality
        def handle_send(message, history, current_mode, rag_on, tools_on):
            return chat_fn(message, history, current_mode, rag_on, tools_on)

        # Send message (button click)
        send_btn.click(
            fn=handle_send,
            inputs=[msg_input, chatbot, mode, rag_enabled, tools_enabled],
            outputs=[chatbot, msg_input],
        )

        # Send message (Enter key in the textbox)
        msg_input.submit(
            fn=handle_send,
            inputs=[msg_input, chatbot, mode, rag_enabled, tools_enabled],
            outputs=[chatbot, msg_input],
        )

        # Clear chat
        clear_btn.click(fn=clear_chat_history, outputs=[chatbot])

    return demo


# Create interface
demo = create_gradio_interface()

In [None]:
# Cell 8: Smoke Test
print("🧪 Starting Gradio interface smoke test...")

# Step 1: call load_model() once and print whatever status it reports.
print("\n1. Testing model loading...")
try:
    success, msg = load_model("demo-model", "transformers", "fp16")
except Exception as e:
    print(f"   Load test failed: {e}")
else:
    print(f"   Load test: {msg}")

# Step 2: send a single message through chat_fn() with an empty history.
print("\n2. Testing chat function...")
try:
    test_history = []
    new_history, _ = chat_fn("Hello", test_history, "Chat", False, False)
except Exception as e:
    print(f"   Chat test failed: {e}")
else:
    print(f"   Chat test: Generated {len(new_history)} exchanges")

# Step 3: usage summary.
print(
    "\n".join(
        [
            "\n3. Interface ready! Run demo.launch() to start.",
            "   - Left panel: Chat interface with history",
            "   - Right panel: Model/mode controls",
            "   - Default: int4 quantization for low VRAM",
        ]
    )
)

# To launch (uncomment when ready):
# demo.launch(
#     server_name="0.0.0.0",
#     server_port=7860,
#     share=False,
#     inbrowser=True
# )

In [None]:
# Cell 9: Key Parameters & Low-VRAM Configuration
# Reference notes only: this cell prints static text and configures nothing.
print(
    "\n📋 Key Parameters for Low-VRAM Setup:",
    """
🔧 Quantization Options:
- int4: ~2-3GB VRAM (recommend for 8GB cards)
- int8: ~4-5GB VRAM
- fp16: ~6-8GB VRAM

🎯 Model Recommendations by VRAM:
- 8GB:  Qwen2.5-7B (int4) or smaller models
- 12GB: Qwen2.5-7B (int8/fp16)
- 16GB+: Qwen2.5-14B (int4/int8)

⚙️ Backend Trade-offs:
- transformers: Full features, higher VRAM
- llamacpp: CPU/GPU hybrid, GGUF quantization
- ollama: Easy management, good for inference

🚀 Next Extensions:
- Streaming responses (nb51)
- RAG integration from Stage 2
- Agent orchestrator from Stage 4
- File upload & indexing (nb53)
""",
    sep="\n",
)

In [None]:
# Quick status summary of the interface.
# These lines are printed unconditionally; no checks are executed here.
print(
    "\n".join(
        [
            "🧪 Interface Components Test:",
            "✓ Gradio layout created",
            "✓ Model loading functions defined",
            "✓ Chat pipeline ready",
            "✓ Mode switching prepared",
            "✓ Low-VRAM defaults set (int4)",
        ]
    )
)

# Test the actual launch (comment out for notebook)
# demo.launch(debug=True, share=False)