In [None]:
# nb31_gradio_chat_ui.ipynb
# Gradio 聊天介面 - 整合 LLM + RAG + Function Calling

# === Cell 1: Shared Cache Bootstrap ===
import os, pathlib, torch

# Root directory for all shared model/dataset caches; override it with the
# AI_CACHE_ROOT environment variable, otherwise "/mnt/ai/cache" is used.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")

# Environment variable name -> cache directory placed under AI_CACHE_ROOT.
paths = {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}
# Export every variable into this process and make sure its directory exists.
# Later cells read TRANSFORMERS_CACHE back through os.environ.
for k, v in paths.items():
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)

# Report the cache root and whether CUDA is usable (device 0's name, or "CPU").
print("[Cache] Root:", AI_CACHE_ROOT)
print(
    "[GPU]",
    torch.cuda.is_available(),
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU",
)

In [None]:
# === Cell 2: Dependencies Installation ===
import subprocess
import sys


def install_packages():
    """Pip-install every dependency of the Gradio chat UI, one at a time.

    Each requirement is installed in its own ``<python> -m pip install``
    subprocess, using the interpreter that runs this notebook. A failing
    install is reported and the remaining requirements are still attempted.
    """
    requirements = (
        "gradio>=4.0.0",
        "transformers>=4.35.0",
        "accelerate",
        "bitsandbytes",
        "sentence-transformers",
        "faiss-cpu",
        "PyPDF2",
        "langchain-community",
        "opencc-python-reimplemented",
    )
    pip_install = [sys.executable, "-m", "pip", "install"]

    for requirement in requirements:
        try:
            subprocess.check_call(pip_install + [requirement])
        except subprocess.CalledProcessError:
            print(f"❌ Failed to install: {requirement}")
        else:
            print(f"✅ Installed: {requirement}")


# Uncomment to install packages
# install_packages()

import gradio as gr
import json
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
import threading
import time

In [None]:
# === Cell 3: LLM Adapter (Multi-Backend Support) ===
class LLMAdapter:
    """Unified chat-completion interface over several LLM backends.

    Supported backends are ``"transformers"`` (a local Hugging Face causal LM)
    and ``"ollama"`` (placeholder only). The model is loaded eagerly by the
    constructor.

    Args:
        backend: Backend name, ``"transformers"`` or ``"ollama"``.
        model_id: Model identifier handed to the backend loader.
        **kwargs: Extra loader options. For the transformers backend they
            override the default ``from_pretrained`` arguments.

    Raises:
        ValueError: If ``backend`` is not supported.
        Exception: Whatever the backend loader raises when the model cannot be
            loaded, after the one-shot CPU fallback where it applies.
    """

    # Speaker labels used by the plain-text prompt fallback.
    _ROLE_LABELS = {"system": "System", "user": "User", "assistant": "Assistant"}

    def __init__(
        self,
        backend: str = "transformers",
        model_id: str = "Qwen/Qwen2.5-7B-Instruct",
        **kwargs,
    ):
        self.backend = backend
        self.model_id = model_id
        self.model = None
        self.tokenizer = None
        self._load_model(**kwargs)

    def _load_model(self, **kwargs):
        """Dispatch to the loader that matches ``self.backend``.

        Raises:
            ValueError: If the backend name is unknown.
        """
        if self.backend == "transformers":
            self._load_transformers_model(**kwargs)
        elif self.backend == "ollama":
            self._load_ollama_model(**kwargs)
        else:
            raise ValueError(f"Unsupported backend: {self.backend}")

    def _load_transformers_model(self, **kwargs):
        """Load tokenizer and model with transformers, retrying once on CPU.

        The first attempt uses low-VRAM defaults (``device_map="auto"``, fp16,
        4-bit quantization when CUDA is available), overridden by ``kwargs``.
        If it fails with a CUDA-related error, one more attempt is made on CPU
        without quantization.

        Raises:
            Exception: The loader error, when it is not CUDA-related or the
                CPU retry fails as well. Failures are no longer swallowed, so
                callers can tell a broken adapter from a working one.
        """
        import torch

        load_kwargs = {
            "device_map": "auto",
            "torch_dtype": torch.float16,
            "load_in_4bit": True if torch.cuda.is_available() else False,
            # NOTE(review): trust_remote_code=True executes Python shipped with
            # the model repository; only load models from trusted sources.
            "trust_remote_code": True,
        }
        load_kwargs.update(kwargs)

        try:
            self._load_transformers_weights(load_kwargs)
        except Exception as e:
            print(f"❌ Failed to load model: {e}")
            cuda_related = "cuda" in str(e).lower()
            already_on_cpu = load_kwargs.get("device_map") == "cpu"
            if not cuda_related or already_on_cpu:
                # Nothing left to try: surface the error to the caller.
                raise
            print("🔄 Falling back to CPU...")
            load_kwargs.update({"device_map": "cpu", "load_in_4bit": False})
            # A second failure propagates; there is no further retry, so the
            # fallback can never recurse indefinitely.
            self._load_transformers_weights(load_kwargs)

    def _load_transformers_weights(self, load_kwargs: Dict[str, Any]) -> None:
        """Run one ``from_pretrained`` attempt and publish the result.

        ``self.tokenizer`` and ``self.model`` are assigned only after both
        loads succeed, so a failed attempt never leaves a half-initialized
        adapter behind.
        """
        from transformers import (
            AutoModelForCausalLM,
            AutoTokenizer,
            BitsAndBytesConfig,
        )

        model_kwargs = dict(load_kwargs)
        # Passing load_in_4bit straight to from_pretrained is deprecated;
        # express the same request through a quantization config instead.
        want_4bit = bool(model_kwargs.pop("load_in_4bit", False))
        if want_4bit and "quantization_config" not in model_kwargs:
            model_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True
            )
        # setdefault avoids a duplicate-keyword error when the caller passes
        # its own cache_dir.
        model_kwargs.setdefault("cache_dir", os.environ.get("TRANSFORMERS_CACHE"))

        print(f"Loading {self.model_id} with transformers...")
        tokenizer = AutoTokenizer.from_pretrained(
            self.model_id,
            trust_remote_code=model_kwargs.get("trust_remote_code", True),
            cache_dir=model_kwargs["cache_dir"],
        )
        model = AutoModelForCausalLM.from_pretrained(self.model_id, **model_kwargs)

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        self.tokenizer = tokenizer
        self.model = model
        print(f"✅ Model loaded: {self.model_id}")

    def _load_ollama_model(self, **kwargs):
        """Placeholder loader for the Ollama backend (prints a notice only)."""
        print(f"📝 Ollama backend: {self.model_id} (requires ollama server)")
        # Implementation would connect to local ollama server

    def generate(
        self, messages: List[Dict[str, str]], max_tokens: int = 512, **kwargs
    ) -> str:
        """Generate a reply for a list of chat messages.

        Args:
            messages: Chat history as ``{"role": ..., "content": ...}`` dicts.
            max_tokens: Maximum number of newly generated tokens.
            **kwargs: Backend-specific generation options. For transformers
                they override the default sampling settings.

        Returns:
            The reply text, or an ``"Error: ..."`` / ``"Generation error: ..."``
            string when generation is not possible.
        """
        if self.backend == "transformers":
            return self._generate_transformers(messages, max_tokens, **kwargs)
        elif self.backend == "ollama":
            return self._generate_ollama(messages, max_tokens, **kwargs)
        else:
            return "Error: Unsupported backend"

    def _format_plain_prompt(self, messages: List[Dict[str, str]]) -> str:
        """Render messages as ``Role: text`` lines ending with an open turn.

        Used when the tokenizer has no chat template. Messages whose role is
        not system/user/assistant are skipped.
        """
        lines = [
            f"{self._ROLE_LABELS[msg['role']]}: {msg['content']}\n"
            for msg in messages
            if msg["role"] in self._ROLE_LABELS
        ]
        return "".join(lines) + "Assistant: "

    def _generate_transformers(
        self, messages: List[Dict[str, str]], max_tokens: int, **kwargs
    ) -> str:
        """Generate with the loaded transformers model.

        Any exception is reported as a ``"Generation error: ..."`` string
        instead of being raised.
        """
        try:
            # apply_chat_template exists on every tokenizer, but it fails when
            # no template is configured (e.g. DialoGPT), so test for the
            # template itself before choosing the formatting path.
            has_template = bool(getattr(self.tokenizer, "chat_template", None))
            if has_template and hasattr(self.tokenizer, "apply_chat_template"):
                prompt = self.tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
            else:
                prompt = self._format_plain_prompt(messages)

            # NOTE(review): truncation applies the tokenizer's configured
            # truncation side; if that is "right", a very long history loses
            # its newest turns — confirm against the tokenizer in use.
            inputs = self.tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=2048
            )

            if torch.cuda.is_available() and hasattr(self.model, "device"):
                inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            # Defaults first, caller overrides second: passing e.g.
            # temperature=0.2 must not raise a duplicate-keyword TypeError.
            generation_kwargs = {
                "max_new_tokens": max_tokens,
                "do_sample": True,
                "temperature": 0.7,
                "top_p": 0.9,
                "pad_token_id": self.tokenizer.eos_token_id,
            }
            generation_kwargs.update(kwargs)

            with torch.no_grad():
                outputs = self.model.generate(**inputs, **generation_kwargs)

            # Drop the prompt tokens and decode only the newly generated tail.
            prompt_length = inputs["input_ids"].shape[1]
            response = self.tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            )

            return response.strip()

        except Exception as e:
            return f"Generation error: {str(e)}"

    def _generate_ollama(
        self, messages: List[Dict[str, str]], max_tokens: int, **kwargs
    ) -> str:
        """Placeholder generation for the Ollama backend."""
        return f"Ollama response for {len(messages)} messages (placeholder)"

In [None]:
# === Cell 4: RAG System Core ===
class RAGSystem:
    """Retrieval-augmented generation store: ingest, chunk, embed, search.

    Uploaded documents are split into chunks of at most ``chunk_size``
    characters, embedded with a sentence-transformers model and indexed in a
    FAISS inner-product index over L2-normalized vectors (cosine similarity).

    Args:
        embedding_model: sentence-transformers model name.
        chunk_size: Maximum chunk length in characters.
    """

    def __init__(self, embedding_model: str = "BAAI/bge-m3", chunk_size: int = 512):
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.embedder = None
        self.index = None
        self.documents = []
        self.chunks = []
        self._load_embedder()

    def _load_embedder(self):
        """Load the embedding model; on failure leave ``self.embedder`` None.

        This is deliberately best-effort: without an embedder the rest of the
        class degrades to "no retrieval" instead of raising.
        """
        try:
            from sentence_transformers import SentenceTransformer

            self.embedder = SentenceTransformer(
                self.embedding_model, cache_folder=os.environ.get("TRANSFORMERS_CACHE")
            )
            print(f"✅ Loaded embedder: {self.embedding_model}")
        except Exception as e:
            print(f"❌ Failed to load embedder: {e}")

    @staticmethod
    def _source_path(file) -> str:
        """Return the filesystem path of an upload.

        Accepts plain path strings / ``os.PathLike`` objects as well as
        objects that expose the path through a ``name`` attribute.
        """
        if isinstance(file, (str, os.PathLike)):
            return os.fspath(file)
        return str(getattr(file, "name", "") or "")

    @staticmethod
    def _normalized(vectors):
        """Return ``vectors`` as a contiguous, L2-normalized float32 matrix.

        The conversion and the normalization both happen on the array that is
        returned, so callers index/search exactly the normalized data.
        Zero vectors are left as zeros.
        """
        import numpy as np

        matrix = np.asarray(vectors, dtype=np.float32)
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return np.ascontiguousarray(matrix / norms, dtype=np.float32)

    def add_documents(self, files) -> str:
        """Add uploaded documents to the knowledge base and rebuild the index.

        Args:
            files: Iterable of uploads (path strings or objects with ``name``).

        Returns:
            A human-readable status line.
        """
        if not files:
            return "No files uploaded"

        added_docs = []
        for file in files:
            # Show the file name rather than the temporary upload directory.
            display_name = os.path.basename(self._source_path(file)) or str(file)
            try:
                content = self._extract_text(file)
                if content and content.strip():
                    self.documents.append(
                        {
                            "filename": display_name,
                            "content": content,
                            "timestamp": datetime.now().isoformat(),
                        }
                    )
                    added_docs.append(display_name)
            except Exception as e:
                print(f"Error processing {display_name}: {e}")

        if added_docs:
            self._build_index()
            return f"✅ Added {len(added_docs)} documents: {', '.join(added_docs)}"
        else:
            return "❌ No documents could be processed"

    def _extract_text(self, file) -> str:
        """Extract text from a ``.pdf``, ``.txt`` or ``.md`` upload.

        The extension check is case-insensitive. Unsupported types yield an
        empty string.
        """
        path = self._source_path(file)
        suffix = os.path.splitext(path)[1].lower()

        if suffix == ".pdf":
            return self._extract_pdf_text(file)
        if suffix in (".txt", ".md"):
            if os.path.isfile(path):
                # Path-style uploads have no read(); read the file from disk.
                with open(path, "rb") as handle:
                    raw = handle.read()
            else:
                raw = file.read()
            if isinstance(raw, bytes):
                return raw.decode("utf-8", errors="ignore")
            return raw
        return ""

    def _extract_pdf_text(self, file) -> str:
        """Extract the text of every PDF page; return "" on any failure."""
        try:
            import PyPDF2

            path = self._source_path(file)
            reader = PyPDF2.PdfReader(path if os.path.isfile(path) else file)
            # extract_text() may yield None for pages without a text layer.
            return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
        except Exception as e:
            print(f"PDF extraction error: {e}")
            return ""

    def _chunk_text(self, text: str) -> List[str]:
        """Split text into chunks no longer than ``chunk_size`` characters.

        Chunks are built from whitespace-separated tokens joined by single
        spaces. A token longer than the limit (typical for text without
        spaces, e.g. Chinese) is cut into limit-sized slices first, so no
        chunk ever exceeds the limit.
        """
        limit = max(1, int(self.chunk_size))

        pieces = []
        for word in text.split():
            if len(word) > limit:
                pieces.extend(
                    word[start : start + limit] for start in range(0, len(word), limit)
                )
            else:
                pieces.append(word)

        chunks = []
        current_chunk = []
        current_length = 0  # length of " ".join(current_chunk)
        for piece in pieces:
            added = len(piece) + (1 if current_chunk else 0)  # +1 for the space
            if current_chunk and current_length + added > limit:
                chunks.append(" ".join(current_chunk))
                current_chunk = [piece]
                current_length = len(piece)
            else:
                current_chunk.append(piece)
                current_length += added

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def _build_index(self):
        """Re-chunk and re-embed all documents and rebuild the FAISS index.

        ``self.chunks`` and ``self.index`` are replaced together, and only
        after embedding succeeded, so they can never describe different data.
        Errors are printed and leave the previous index in place.
        """
        if self.embedder is None or not self.documents:
            return

        try:
            import faiss

            all_chunks = [
                {
                    "text": chunk,
                    "source": doc["filename"],
                    "timestamp": doc["timestamp"],
                }
                for doc in self.documents
                for chunk in self._chunk_text(doc["content"])
            ]
            if not all_chunks:
                self.chunks = []
                self.index = None
                return

            texts = [chunk["text"] for chunk in all_chunks]
            # Normalize the very array that is added to the index; inner
            # product on unit vectors is cosine similarity.
            embeddings = self._normalized(
                self.embedder.encode(texts, show_progress_bar=True)
            )

            index = faiss.IndexFlatIP(embeddings.shape[1])
            index.add(embeddings)

            self.chunks = all_chunks
            self.index = index

            print(f"✅ Built index with {len(all_chunks)} chunks")

        except Exception as e:
            print(f"❌ Index building error: {e}")

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` chunks most similar to ``query``.

        Each result is a dict with ``text``, ``source``, ``score`` and
        ``timestamp``. Returns an empty list when nothing is indexed or on
        any error.
        """
        if self.embedder is None or self.index is None or not self.chunks:
            return []
        if top_k <= 0:
            return []

        try:
            query_embedding = self._normalized(self.embedder.encode([query]))
            scores, indices = self.index.search(query_embedding, top_k)

            results = []
            for score, idx in zip(scores[0], indices[0]):
                idx = int(idx)
                # The index pads missing neighbours with -1 when fewer than
                # top_k vectors exist; a negative index must not wrap around
                # to the end of the chunk list.
                if 0 <= idx < len(self.chunks):
                    chunk = self.chunks[idx]
                    results.append(
                        {
                            "text": chunk["text"],
                            "source": chunk["source"],
                            "score": float(score),
                            "timestamp": chunk["timestamp"],
                        }
                    )

            return results

        except Exception as e:
            print(f"❌ Retrieval error: {e}")
            return []

In [None]:
# === Cell 5: Function Calling Tools ===
class ToolManager:
    """Registry, parser and executor for the chatbot's function-calling tools.

    Tool calls are written as ``[TOOL:tool_name(arg1=value1, arg2="value 2")]``.
    """

    # Largest integer power result (in bits) the calculator will compute;
    # keeps inputs such as 9**9**9**9 from hanging the process.
    _MAX_POWER_BITS = 10_000

    def __init__(self):
        self.tools = {
            "calculator": self.calculator,
            "datetime": self.get_datetime,
            "web_search": self.web_search_placeholder,
        }

    @staticmethod
    def _bounded_pow(base, exponent):
        """Compute ``base ** exponent`` but refuse enormous integer results.

        Raises:
            ValueError: If an integer power would exceed ``_MAX_POWER_BITS``.
        """
        if isinstance(base, int) and isinstance(exponent, int) and exponent > 0:
            if base.bit_length() * exponent > ToolManager._MAX_POWER_BITS:
                raise ValueError("Exponent too large")
        return base**exponent

    def _evaluate_node(self, node, binary_ops, unary_ops):
        """Recursively evaluate a parsed arithmetic expression node.

        Only numeric literals, the operators in ``binary_ops`` and the unary
        operators in ``unary_ops`` are accepted.

        Raises:
            ValueError: For any other syntax (calls, names, comparisons, ...).
        """
        import ast

        if isinstance(node, ast.Expression):
            return self._evaluate_node(node.body, binary_ops, unary_ops)
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = self._evaluate_node(node.left, binary_ops, unary_ops)
            right = self._evaluate_node(node.right, binary_ops, unary_ops)
            return binary_ops[type(node.op)](left, right)
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            operand = self._evaluate_node(node.operand, binary_ops, unary_ops)
            return unary_ops[type(node.op)](operand)
        raise ValueError("Unsupported expression")

    def calculator(self, expression: str) -> str:
        """Evaluate a basic arithmetic expression.

        Supports ``+ - * / // % **``, unary signs and parentheses on integer
        and float literals. The expression is parsed and walked explicitly
        rather than passed to ``eval``, and integer powers are size-limited.

        Returns:
            ``"Result: <value>"`` on success, otherwise an error message.
        """
        import ast
        import operator

        try:
            allowed_chars = set("0123456789+-*/.()% ")
            if not all(c in allowed_chars for c in expression):
                return "Error: Invalid characters in expression"

            binary_ops = {
                ast.Add: operator.add,
                ast.Sub: operator.sub,
                ast.Mult: operator.mul,
                ast.Div: operator.truediv,
                ast.FloorDiv: operator.floordiv,
                ast.Mod: operator.mod,
                ast.Pow: self._bounded_pow,
            }
            unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

            # strip(): unlike eval(), ast.parse rejects leading whitespace.
            tree = ast.parse(expression.strip(), mode="eval")
            result = self._evaluate_node(tree, binary_ops, unary_ops)
            return f"Result: {result}"
        except Exception as e:
            return f"Calculation error: {str(e)}"

    def get_datetime(self, format_type: str = "default") -> str:
        """Return the current local date and/or time as a string.

        Args:
            format_type: ``"date"`` for ``YYYY-MM-DD``, ``"time"`` for
                ``HH:MM:SS``; anything else yields both.
        """
        now = datetime.now()
        if format_type == "date":
            return now.strftime("%Y-%m-%d")
        elif format_type == "time":
            return now.strftime("%H:%M:%S")
        else:
            return now.strftime("%Y-%m-%d %H:%M:%S")

    def web_search_placeholder(self, query: str) -> str:
        """Placeholder for web search (could integrate DuckDuckGo)."""
        return f"Web search placeholder for: {query}"

    def parse_tool_call(self, text: str) -> Tuple[str, Optional[Dict[str, Any]]]:
        """Find the first ``[TOOL:name(args)]`` call in ``text``.

        Argument values may be quoted (single or double quotes) or bare.
        Quoted values may contain commas and parentheses, e.g.
        ``expression="(1+2)*3"`` or ``query="cats, dogs"``.

        Returns:
            ``(tool_name, args)`` when a call is found (``args`` may be empty),
            otherwise ``("", None)``.
        """
        import re

        # Non-greedy up to the closing ")]" so that ")" inside the argument
        # list does not end the match early.
        match = re.search(r"\[TOOL:(\w+)\((.*?)\)\]", text, re.DOTALL)
        if not match:
            return "", None

        tool_name = match.group(1)
        args_str = match.group(2)

        # key = "double quoted" | 'single quoted' | bare value up to a comma
        arg_pattern = r"""(\w+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^,]*))"""
        args = {}
        for found in re.finditer(arg_pattern, args_str):
            key, double_quoted, single_quoted, bare = found.groups()
            if double_quoted is not None:
                args[key] = double_quoted
            elif single_quoted is not None:
                args[key] = single_quoted
            else:
                args[key] = bare.strip().strip("\"'")

        return tool_name, args

    def execute_tool(self, tool_name: str, args: Dict[str, Any]) -> str:
        """Run a registered tool with keyword arguments.

        Returns:
            The tool's output, ``"Unknown tool: ..."`` for an unregistered
            name, or ``"Tool execution error: ..."`` if the call raises.
        """
        if tool_name not in self.tools:
            return f"Unknown tool: {tool_name}"

        try:
            tool_func = self.tools[tool_name]
            return tool_func(**args)
        except Exception as e:
            return f"Tool execution error: {str(e)}"

In [None]:
# === Cell 6: Conversation Memory Manager ===
class ConversationMemory:
    """Thread-safe, per-user store of recent chat messages.

    Every user id maps to its own list of timestamped messages; a list is
    trimmed to the newest ``max_history * 2`` entries (user + assistant per
    exchange) whenever it grows beyond that.
    """

    def __init__(self, max_history: int = 10):
        self.max_history = max_history
        self.conversations = {}  # user_id -> conversation history
        self.lock = threading.Lock()

    def add_message(self, user_id: str, role: str, content: str):
        """Append one message to a user's history, trimming old entries."""
        keep = self.max_history * 2  # two messages per exchange
        with self.lock:
            thread = self.conversations.setdefault(user_id, [])
            thread.append(
                {
                    "role": role,
                    "content": content,
                    "timestamp": datetime.now().isoformat(),
                }
            )
            if len(thread) > keep:
                self.conversations[user_id] = thread[-keep:]

    def get_conversation(self, user_id: str) -> List[Dict[str, str]]:
        """Return a user's history as role/content dicts (no timestamps)."""
        with self.lock:
            stored = self.conversations.get(user_id)
            if stored is None:
                return []
            return [
                {"role": entry["role"], "content": entry["content"]}
                for entry in stored
            ]

    def clear_conversation(self, user_id: str):
        """Forget a user's history; unknown users are ignored."""
        with self.lock:
            self.conversations.pop(user_id, None)

In [None]:
# === Cell 7: Gradio Chat UI Core Logic ===
class ChatbotCore:
    """Core chatbot logic tying together the LLM, RAG, tools and memory.

    Attributes are created in ``__init__``: ``llm`` (None until a model is
    loaded), ``rag``, ``tools``, ``memory``, ``current_model`` (a
    ``"backend:model"`` key) and ``system_prompt``.
    """

    # Prefixes the LLM adapter uses for failed generations; such replies are
    # shown to the user but never stored as conversation turns.
    _FAILED_REPLY_PREFIXES = ("Generation error:", "Error: Unsupported backend")

    def __init__(self):
        self.llm = None
        self.rag = RAGSystem()
        self.tools = ToolManager()
        self.memory = ConversationMemory()
        self.current_model = None
        self.system_prompt = """You are a helpful AI assistant. You can use tools by calling them in the format [TOOL:tool_name(arg1=value1, arg2=value2)].

Available tools:
- calculator(expression="math expression") - for calculations
- datetime(format_type="default"|"date"|"time") - get current date/time
- web_search(query="search terms") - search the web

You can also answer questions based on uploaded documents if available."""

    def load_model(self, model_name: str, backend: str = "transformers") -> str:
        """Load or switch the active LLM.

        Args:
            model_name: Model identifier to load.
            backend: Backend name understood by ``LLMAdapter``.

        Returns:
            A status line; it starts with "✅" only when a usable adapter is
            now active, and with "❌" otherwise. The previously active model
            is kept when loading fails.
        """
        try:
            requested = f"{backend}:{model_name}"
            if self.current_model == requested:
                return f"✅ Model already loaded: {model_name}"

            adapter = LLMAdapter(backend=backend, model_id=model_name)
            # An adapter that came back without a model cannot generate;
            # report that instead of claiming success.
            if backend == "transformers" and getattr(adapter, "model", None) is None:
                return f"❌ Failed to load model: {model_name} (see log output)"

            self.llm = adapter
            self.current_model = requested
            return f"✅ Loaded model: {model_name} ({backend})"

        except Exception as e:
            return f"❌ Failed to load model: {str(e)}"

    def _with_rag_context(self, message: str, use_rag: bool) -> str:
        """Return ``message`` with retrieved document snippets appended.

        The message is returned unchanged when RAG is disabled, nothing is
        indexed, or retrieval finds nothing.
        """
        if not (use_rag and self.rag.chunks):
            return message

        relevant_docs = self.rag.retrieve(message, top_k=3)
        if not relevant_docs:
            return message

        rag_context = "\n\nRelevant information from documents:\n"
        for doc in relevant_docs:
            rag_context += f"- {doc['text'][:200]}... (from {doc['source']})\n"
        return message + rag_context

    def process_message(
        self, message: str, user_id: str = "default", use_rag: bool = True
    ) -> str:
        """Answer one user message.

        The prompt is the system prompt, the user's stored history and the
        new message (optionally extended with retrieved snippets). If the
        reply contains a tool call, the tool is executed and its result is
        appended to the reply.

        Args:
            message: The user's text.
            user_id: Key selecting which conversation history to use.
            use_rag: Whether to add retrieved document context.

        Returns:
            The reply text, or an error message. Failed generations are
            returned but not saved to the conversation history.
        """
        if not self.llm:
            return "❌ No model loaded. Please select a model first."

        try:
            messages = [{"role": "system", "content": self.system_prompt}]
            messages.extend(self.memory.get_conversation(user_id))
            messages.append(
                {"role": "user", "content": self._with_rag_context(message, use_rag)}
            )

            response = self.llm.generate(messages, max_tokens=512)

            # Do not let an error string become an "assistant" turn that is
            # replayed to the model on every later request.
            if response.startswith(self._FAILED_REPLY_PREFIXES):
                return response

            tool_name, tool_args = self.tools.parse_tool_call(response)
            if tool_name and tool_args is not None:
                tool_result = self.tools.execute_tool(tool_name, tool_args)
                response += f"\n\nTool result: {tool_result}"

            # History keeps the raw user message, without the RAG snippets.
            self.memory.add_message(user_id, "user", message)
            self.memory.add_message(user_id, "assistant", response)

            return response

        except Exception as e:
            return f"❌ Error processing message: {str(e)}"

    def upload_documents(self, files) -> str:
        """Add uploaded files to the RAG knowledge base; returns a status line."""
        return self.rag.add_documents(files)

    def clear_conversation(self, user_id: str = "default") -> str:
        """Clear one user's conversation history; returns a status line."""
        self.memory.clear_conversation(user_id)
        return "✅ Conversation cleared"

In [None]:
# === Cell 8: Gradio WebUI Setup and Launch ===
def create_gradio_interface():
    """Build the Gradio Blocks UI for the chatbot and return it (not launched).

    The interface owns one ``ChatbotCore``. The loaded model and the document
    knowledge base are shared by everyone using the interface, while
    conversation history is kept per browser session.

    Returns:
        The configured ``gr.Blocks`` instance.
    """

    # Initialize chatbot core
    chatbot = ChatbotCore()

    # Model options
    model_options = [
        "Qwen/Qwen2.5-7B-Instruct",
        "Qwen/Qwen2.5-1.5B-Instruct",
        "microsoft/DialoGPT-medium",
        "HuggingFaceH4/zephyr-7b-beta",
    ]

    def session_user_id(request) -> str:
        """Map a Gradio request to a per-session conversation key.

        Falls back to "default" when no session hash is available.
        """
        return getattr(request, "session_hash", None) or "default"

    def load_model_wrapper(model_name, backend):
        return chatbot.load_model(model_name, backend)

    def chat_response(message, history, use_rag, request: gr.Request):
        # Gradio injects `request` because of its gr.Request annotation; it is
        # not listed among the event inputs.
        history = history or []
        if not message.strip():
            return history, ""

        # Keep each browser session's history separate instead of funnelling
        # every visitor into the single "default" conversation.
        response = chatbot.process_message(
            message, user_id=session_user_id(request), use_rag=use_rag
        )

        # Update history
        history.append([message, response])
        return history, ""

    def upload_files_wrapper(files):
        return chatbot.upload_documents(files)

    def clear_chat_wrapper(request: gr.Request):
        # Clears only the calling session's conversation.
        result = chatbot.clear_conversation(session_user_id(request))
        return [], result

    # Create Gradio interface
    with gr.Blocks(title="LLM Chat Assistant", theme=gr.themes.Soft()) as iface:

        gr.Markdown("# 🤖 LLM Chat Assistant")
        gr.Markdown("Multi-model chatbot with RAG and function calling capabilities")

        with gr.Row():
            with gr.Column(scale=2):
                # Model selection
                with gr.Group():
                    gr.Markdown("### Model Configuration")
                    model_dropdown = gr.Dropdown(
                        choices=model_options,
                        value=model_options[0],
                        label="Select Model",
                        interactive=True,
                    )
                    backend_radio = gr.Radio(
                        choices=["transformers", "ollama"],
                        value="transformers",
                        label="Backend",
                    )
                    load_btn = gr.Button("Load Model", variant="primary")
                    model_status = gr.Textbox(
                        label="Model Status", value="No model loaded", interactive=False
                    )

                # Document upload
                with gr.Group():
                    gr.Markdown("### Knowledge Base")
                    file_upload = gr.Files(
                        label="Upload Documents (PDF, TXT, MD)",
                        file_types=[".pdf", ".txt", ".md"],
                    )
                    upload_btn = gr.Button("Add to Knowledge Base")
                    upload_status = gr.Textbox(label="Upload Status", interactive=False)

                # Settings
                with gr.Group():
                    gr.Markdown("### Settings")
                    use_rag_checkbox = gr.Checkbox(
                        label="Enable RAG (use uploaded documents)", value=True
                    )
                    clear_btn = gr.Button("Clear Conversation", variant="secondary")
                    clear_status = gr.Textbox(label="Action Status", interactive=False)

            with gr.Column(scale=3):
                # Chat interface
                gr.Markdown("### Chat")
                chatbot_display = gr.Chatbot(
                    label="Conversation", height=500, show_copy_button=True
                )

                with gr.Row():
                    message_input = gr.Textbox(
                        label="Message",
                        placeholder="Type your message here...",
                        lines=2,
                        scale=4,
                    )
                    send_btn = gr.Button("Send", variant="primary", scale=1)

        # Examples
        gr.Markdown("### 💡 Example Messages")
        gr.Examples(
            examples=[
                ["Hello! How can you help me?"],
                ["Calculate 15 * 23 + 45"],
                ["What's the current date and time?"],
                ['[TOOL:calculator(expression="2+2*3")]'],
                ["Summarize the uploaded documents"],
            ],
            inputs=message_input,
        )

        # Event handlers
        load_btn.click(
            load_model_wrapper,
            inputs=[model_dropdown, backend_radio],
            outputs=model_status,
        )

        upload_btn.click(
            upload_files_wrapper, inputs=file_upload, outputs=upload_status
        )

        # The Send button and pressing Enter in the textbox do the same thing.
        chat_inputs = [message_input, chatbot_display, use_rag_checkbox]
        chat_outputs = [chatbot_display, message_input]
        send_btn.click(chat_response, inputs=chat_inputs, outputs=chat_outputs)
        message_input.submit(chat_response, inputs=chat_inputs, outputs=chat_outputs)

        clear_btn.click(clear_chat_wrapper, outputs=[chatbot_display, clear_status])

    return iface


# Launch interface
# Builds the UI and starts the Gradio server when this code runs as the main
# module.
if __name__ == "__main__":
    interface = create_gradio_interface()

    # Launch with public link for sharing (optional)
    # NOTE(review): the server listens on all network interfaces and no `auth`
    # argument is passed, so anyone who can reach the port can use the UI and
    # upload documents — confirm this is intended for your network.
    # NOTE(review): debug=True presumably keeps this cell running until it is
    # interrupted, which would hold up the cells below — confirm.
    interface.launch(
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,  # Default Gradio port
        share=False,  # Set to True for public ngrok link
        debug=True,
        show_error=True,
    )

# Printed after the launch block above has run (or been skipped).
print("🚀 Gradio interface ready! Run the cell above to launch.")

In [None]:
# === Cell 9: Smoke Test ===
def test_gradio_components():
    """Smoke-test the adapter, RAG system, tools and memory in turn.

    Every check prints a "✅" line on success; an exception in one check is
    printed as a "❌" line and the remaining checks still run.
    """
    print("🧪 Testing Gradio Chat UI Components...")

    def check_llm_adapter():
        adapter = LLMAdapter(
            backend="transformers", model_id="microsoft/DialoGPT-medium"
        )
        reply = adapter.generate(
            [{"role": "user", "content": "Hello"}], max_tokens=10
        )
        print(f"✅ LLM Adapter: {reply[:50]}...")

    def check_rag_system():
        rag = RAGSystem()
        print(f"✅ RAG System initialized with embedder: {rag.embedding_model}")

    def check_tool_manager():
        tools = ToolManager()
        calc_result = tools.calculator("2+2")
        time_result = tools.get_datetime("time")
        print(f"✅ Tools: Calculator={calc_result}, Time={time_result}")

    def check_memory():
        memory = ConversationMemory()
        memory.add_message("test", "user", "Hello")
        memory.add_message("test", "assistant", "Hi there!")
        stored = memory.get_conversation("test")
        print(f"✅ Memory: {len(stored)} messages stored")

    # (label used in the failure line, check to run) — executed in this order.
    checks = (
        ("LLM Adapter", check_llm_adapter),
        ("RAG System", check_rag_system),
        ("Tool Manager", check_tool_manager),
        ("Memory", check_memory),
    )
    for label, run_check in checks:
        try:
            run_check()
        except Exception as e:
            print(f"❌ {label}: {e}")

    print("🏁 Component tests completed!")


# Run smoke test
# Executes immediately when this cell runs; the first check constructs an
# LLMAdapter for "microsoft/DialoGPT-medium" and the second a RAGSystem.
test_gradio_components()

In [None]:
# === Cell 10: Usage Guide and Deployment Notes ===
# Print the bilingual (Chinese/English) usage guide: setup steps, features,
# tool-call syntax, document upload, model sizing, troubleshooting,
# deployment options, security advice and extension ideas.
print(
    """
📋 使用指南 (Usage Guide)
===========================

🔧 設定步驟 (Setup Steps):
1. 執行環境初始化 Cell (共享快取設定)
2. 安裝依賴套件 (取消註解 install_packages() 並執行)
3. 啟動 Gradio 介面 (執行 Cell 8)
4. 在瀏覽器中開啟顯示的 URL (通常是 http://localhost:7860)

💬 功能特色 (Features):
✅ 多模型支援 - 支援 Qwen, DialoGPT, Zephyr 等
✅ RAG 文件問答 - 上傳 PDF/TXT/MD 檔案進行知識庫問答
✅ 工具調用 - 計算機、日期時間、網路搜尋 (占位符)
✅ 會話記憶 - 保持多輪對話上下文
✅ 低資源友善 - 支援 4bit 量化、CPU fallback

🛠️ 工具使用方式 (Tool Usage):
- 計算: "Calculate 15 * 23" 或 "[TOOL:calculator(expression=\"15*23\")]"
- 日期: "What's the date?" 或 "[TOOL:datetime(format_type=\"date\")]"
- 時間: "Current time?" 或 "[TOOL:datetime(format_type=\"time\")]"

📁 RAG 文件上傳 (Document Upload):
1. 點擊 "Upload Documents" 區域
2. 選擇 PDF、TXT 或 MD 檔案
3. 點擊 "Add to Knowledge Base"
4. 開啟 "Enable RAG" 選項
5. 詢問文件相關問題

⚙️ 模型配置建議 (Model Configuration):
- 8GB VRAM: Qwen2.5-1.5B-Instruct (4bit)
- 12GB VRAM: Qwen2.5-7B-Instruct (4bit)
- 16GB+ VRAM: Qwen2.5-7B-Instruct (fp16)
- CPU only: DialoGPT-medium

🚨 常見問題排解 (Troubleshooting):
- CUDA OOM → 改用更小模型或開啟 4bit 量化
- 載入失敗 → 檢查網路連線與 HuggingFace 可用性
- RAG 無回應 → 確認文件已成功上傳並建立索引
- 工具不執行 → 檢查工具調用格式是否正確

🌐 部署選項 (Deployment Options):
1. 本地部署: 直接執行 notebook，訪問 localhost:7860
2. 區網分享: 設定 server_name="0.0.0.0" 開放區網訪問
3. 公開分享: 設定 share=True 生成 ngrok 公開連結 (開發測試用)
4. 容器化: 可包裝為 Docker 容器進行部署

🔒 安全建議 (Security Recommendations):
- 生產環境請勿開啟 share=True
- 設定適當的網路防火牆規則
- 考慮加入身份驗證機制
- 定期更新依賴套件版本

📈 效能優化 (Performance Optimization):
- 使用 GGUF 量化模型 (llama.cpp backend)
- 啟用 gradient checkpointing
- 調整 chunk_size 與 max_history 參數
- 考慮使用 Redis 做分散式會話儲存

🔄 擴展建議 (Extension Ideas):
- 整合更多工具 (天氣、股價、新聞)
- 加入語音輸入/輸出 (Whisper/TTS)
- 支援圖片上傳與多模態對話
- 加入對話導出/匯入功能
- 實作使用者角色與權限管理
"""
)

In [None]:
# === Minimal acceptance test (within 5 lines) ===
# Builds the UI, then drives a separate ChatbotCore end to end: load
# DialoGPT-medium with the default backend and send one message with RAG
# disabled. Only truncated previews of the status and reply are printed.
# Note: `interface` and `chatbot` are (re)bound at module level here.
interface = create_gradio_interface()
chatbot = ChatbotCore()
status = chatbot.load_model("microsoft/DialoGPT-medium")
response = chatbot.process_message("Hello, test message", use_rag=False)
print(
    f"✅ Interface created, model loaded: {status[:20]}..., response: {response[:30]}..."
)


## 6. 本章小結

### 🎯 完成項目 (Completed Items)

* **整合式聊天介面** - 建立支援多模型切換的 Gradio WebUI
* **RAG 文件問答系統** - PDF/TXT/MD 文件上傳與 FAISS 檢索整合
* **工具調用功能** - 計算機、日期時間工具整合，網路搜尋目前僅為占位符 (placeholder)
* **會話記憶管理** - 多用戶對話歷史保存與上下文維護
* **低資源優化** - 4bit 量化、CPU fallback、記憶體優化策略
* **統一模型介面** - LLMAdapter 支援 transformers/ollama 多後端
* **使用者友善設計** - 響應式介面、範例提示、狀態顯示

### 🧠 核心原理要點 (Core Concepts)

* **模組化架構設計 (Modular Architecture)** - 各組件獨立可測試、可重用
* **統一抽象介面 (Unified Abstraction)** - LLMAdapter 隔離不同後端實作細節
* **記憶體高效管理 (Memory Efficiency)** - 4bit 量化、對話歷史截斷、CPU fallback（gradient checkpointing 與動態卸載屬於訓練／進階情境的延伸做法，本 notebook 尚未實作）
* **RAG 檢索增強 (Retrieval-Augmented Generation)** - 文件向量化、相似度搜尋、上下文注入
* **工具調用協議 (Function Calling Protocol)** - 結構化工具調用與結果處理
* **會話狀態管理 (Session State Management)** - 多用戶隔離、歷史截斷、記憶體控制

### ⚠️ 常見陷阱 (Common Pitfalls)

* **記憶體溢出** - 大模型 + 長對話歷史 → 使用 4bit 量化與歷史截斷
* **工具調用格式錯誤** - LLM 輸出格式不一致 → 改進解析邏輯與提示工程
* **RAG 檢索品質** - 分塊策略影響檢索效果 → 調整 chunk_size 並考慮加入重排器 (reranker)
* **並發安全問題** - 多用戶同時訪問 → 使用 threading.Lock 保護共享狀態
* **模型載入失敗** - 網路或硬體限制 → 提供多層次 fallback 機制

### 🚀 下一步建議 (Next Steps)

**立即可做:**
* 整合更多開源工具（天氣 API、新聞抓取、檔案操作）
* 加入語音輸入/輸出功能（Whisper ASR + TTS）
* 實作對話匯出/匯入與分享功能

**進階擴展:**
* 遷移到 FastAPI 後端以支援更多客戶端
* 加入使用者身份驗證與權限管理
* 實作分散式部署與負載平衡
* 整合向量資料庫（Qdrant/Weaviate）以取代 FAISS

**生產準備:**
* 容器化部署（Docker + docker-compose）
* 監控與日誌收集（Prometheus + Grafana）
* 自動化測試與 CI/CD 流程
* 安全強化與性能調優

---

**🎯 階段 F 總結 - WebUI & API 部署整合**

我們已完成 **Part F** 的第一本重要 notebook，建立了一個功能完整的聊天介面。下一步可以選擇：

1. **nb32_fastapi_docker_deploy.ipynb** - 完成 API 後端與容器化部署
2. **回到 Part D 微調系列** - 實作 LoRA/QLoRA 客製化模型  
3. **回到 Part E 進階應用** - 多代理協作與自動化流程

**建議優先順序：** 先完成 nb32 API 部署以形成完整的部署方案，再回到核心技術深化。