In [None]:
# Cell 1: Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# The cache root can be overridden with the AI_CACHE_ROOT environment variable.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

# (environment variable, cache directory) pairs. Each variable is exported
# and its directory is created when it does not exist yet.
cache_layout = [
    ("HF_HOME", f"{AI_CACHE_ROOT}/hf"),
    ("TRANSFORMERS_CACHE", f"{AI_CACHE_ROOT}/hf/transformers"),
    ("HF_DATASETS_CACHE", f"{AI_CACHE_ROOT}/hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", f"{AI_CACHE_ROOT}/hf/hub"),
    ("TORCH_HOME", f"{AI_CACHE_ROOT}/torch"),
]
for env_name, cache_dir in cache_layout:
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# Cell 2: Dependencies and Imports
import gradio as gr
import tempfile
import json
import numpy as np
import faiss
from pathlib import Path
from typing import List, Tuple, Optional
from datetime import datetime

# RAG dependencies
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import trafilatura
import PyPDF2
import re
import opencc

print("✓ Dependencies loaded")

In [None]:
# Cell 3: Document Processing Utils
class DocumentProcessor:
    """Extract, clean and chunk documents for RAG indexing.

    Plain text, Markdown, HTML and PDF files are supported.

    Args:
        chunk_size: Passed to the text splitter as ``chunk_size``.
        chunk_overlap: Passed to the text splitter as ``chunk_overlap``.
    """

    # Documents whose cleaned text is shorter than this are not chunked.
    MIN_DOC_CHARS = 50

    # Characters removed by clean_text(): everything except word characters,
    # whitespace, CJK ideographs, Chinese punctuation, ASCII quotes, and the
    # "#" / "…" markers that the splitter below uses as separators.
    _DISALLOWED_CHARS = re.compile(
        r"""[^\w\s\u4e00-\u9fff。！？；，、…#"'（）【】《》〈〉]"""
    )

    def __init__(self, chunk_size: int = 800, chunk_overlap: int = 80):
        # Traditional -> Simplified Chinese converter. It is created here but
        # clean_text() does not apply it.
        self.cc = opencc.OpenCC("t2s")
        self.splitter = RecursiveCharacterTextSplitter(
            separators=[
                "\n### ",
                "\n## ",
                "\n# ",
                "。",
                "！",
                "？",
                "；",
                "…",
                "\n\n",
                "\n",
                " ",
            ],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def extract_text(self, file_path: str, file_type: str) -> str:
        """Extract text from a file.

        Args:
            file_path: Path of the file to read.
            file_type: One of ``"txt"``, ``"md"``, ``"html"`` or ``"pdf"``.

        Returns:
            The extracted text. Failures are reported in-band rather than
            raised: an unknown type yields ``"Unsupported format: <type>"``
            and any exception yields ``"Error processing <type>: <message>"``.
        """
        try:
            if file_type in ("txt", "md"):
                with open(file_path, "r", encoding="utf-8") as f:
                    return f.read()

            if file_type == "html":
                with open(file_path, "r", encoding="utf-8") as f:
                    html_content = f.read()
                return trafilatura.extract(html_content) or ""

            if file_type == "pdf":
                with open(file_path, "rb") as f:
                    reader = PyPDF2.PdfReader(f)
                    # A page without extractable text may yield None; treat
                    # it as empty instead of failing the whole document.
                    page_texts = [page.extract_text() or "" for page in reader.pages]
                return "".join(page_text + "\n" for page_text in page_texts)

            return f"Unsupported format: {file_type}"

        except Exception as e:
            return f"Error processing {file_type}: {str(e)}"

    def clean_text(self, text: str) -> str:
        """Normalize text while keeping the structure the splitter relies on.

        Disallowed characters are removed, runs of spaces/tabs collapse to a
        single space, and runs of blank lines collapse to one blank line.
        Line breaks, ``#`` heading markers and ``…`` are preserved because
        they appear in the splitter's separator list.

        Args:
            text: Raw extracted text.

        Returns:
            The cleaned text with leading/trailing whitespace stripped.
        """
        # Strip disallowed characters first so the whitespace pass below also
        # collapses the gaps they leave behind.
        text = self._DISALLOWED_CHARS.sub("", text)

        text = text.replace("\r\n", "\n").replace("\r", "\n")
        text = re.sub(r"[^\S\n]+", " ", text)  # whitespace runs (except \n) -> one space
        text = re.sub(r" ?\n ?", "\n", text)  # no spaces hugging a line break
        text = re.sub(r"\n{3,}", "\n\n", text)  # at most one blank line

        return text.strip()

    def chunk_document(self, text: str, filename: str) -> List[dict]:
        """Split a document into chunk records.

        Args:
            text: Raw document text; it is cleaned with clean_text() first.
            filename: Used as the chunk ``source`` and as the id prefix.

        Returns:
            A list of ``{"id", "text", "meta"}`` dicts, or an empty list when
            the cleaned text is shorter than ``MIN_DOC_CHARS``.
        """
        cleaned = self.clean_text(text)

        if len(cleaned) < self.MIN_DOC_CHARS:  # Skip very short documents
            return []

        documents = self.splitter.create_documents([cleaned])

        return [
            {
                "id": f"{filename}_{i}",
                "text": doc.page_content,
                "meta": {
                    "source": filename,
                    "chunk_id": i,
                    "length": len(doc.page_content),
                },
            }
            for i, doc in enumerate(documents)
        ]

In [None]:
# Cell 4: RAG Index Builder
class RAGIndexBuilder:
    """Build and query an in-memory FAISS index over document chunks.

    Args:
        embedding_model: Model name handed to ``SentenceTransformer``.
    """

    def __init__(self, embedding_model: str = "BAAI/bge-m3"):
        self.embedding_model = embedding_model
        self.encoder = None  # created on first use by load_encoder()
        self.index = None
        self.chunks = []
        self.index_built = False

    def load_encoder(self):
        """Return the embedding model, loading it on first call."""
        if self.encoder is None:
            print(f"Loading embedding model: {self.embedding_model}")
            self.encoder = SentenceTransformer(self.embedding_model)
        return self.encoder

    def build_index(
        self, all_chunks: List[dict], batch_size: int = 32
    ) -> Tuple[bool, str]:
        """Embed the chunks and build a fresh FAISS index from them.

        Args:
            all_chunks: Chunk dicts; each must have a ``"text"`` entry.
            batch_size: Batch size passed to the encoder.

        Returns:
            ``(success, message)``. Exceptions are caught and reported through
            the message; on failure the previously built index (if any) is
            left untouched.
        """
        try:
            if not all_chunks:
                return False, "No chunks to index"

            encoder = self.load_encoder()

            texts = [chunk["text"] for chunk in all_chunks]
            print(f"Encoding {len(texts)} chunks...")

            embeddings = encoder.encode(
                texts,
                normalize_embeddings=True,
                batch_size=batch_size,
                show_progress_bar=True,
            ).astype("float32")

            # Inner product on normalized vectors.
            dim = embeddings.shape[1]
            index = faiss.IndexFlatIP(dim)
            index.add(embeddings)

            # Commit only after every step succeeded so index and chunks can
            # never disagree with each other.
            self.index = index
            self.chunks = list(all_chunks)
            self.index_built = True

            return True, f"✓ Index built: {len(texts)} chunks, {dim}D vectors"

        except Exception as e:
            return False, f"Error building index: {str(e)}"

    def search(self, query: str, top_k: int = 5) -> List[Tuple[str, dict, float]]:
        """Return the chunks most similar to ``query``.

        Args:
            query: Query text to embed.
            top_k: Maximum number of results; coerced to ``int``.

        Returns:
            ``(text, meta, score)`` tuples in the order returned by the index.
            An empty list is returned when no index has been built, when
            ``top_k`` is not positive, or when the search raises (the error
            is printed).
        """
        if not self.index_built:
            return []

        try:
            # UI sliders may deliver the value as a float.
            top_k = int(top_k)
            if top_k <= 0:
                return []

            query_vec = self.encoder.encode([query], normalize_embeddings=True).astype(
                "float32"
            )

            scores, indices = self.index.search(query_vec, top_k)

            results = []
            for score, idx in zip(scores[0], indices[0]):
                # FAISS fills missing neighbours with label -1; without the
                # lower bound that would silently select the last chunk.
                if 0 <= idx < len(self.chunks):
                    chunk = self.chunks[idx]
                    results.append((chunk["text"], chunk["meta"], float(score)))

            return results

        except Exception as e:
            print(f"Search error: {e}")
            return []

In [None]:
# Cell 5: Gradio Interface with Upload and Index
def create_upload_index_interface():
    """Create the Gradio interface for document upload, indexing and querying.

    One DocumentProcessor and one RAGIndexBuilder are created per interface
    and shared by the two event handlers defined below.

    Returns:
        The ``gr.Blocks`` app (not yet launched).
    """

    processor = DocumentProcessor()
    index_builder = RAGIndexBuilder()

    def process_files(files) -> Tuple[str, str]:
        """Extract and chunk the uploaded files, then build the index.

        Returns:
            ``(processing_log, index_status)`` for the two output textboxes.
        """
        if not files:
            return "No files uploaded", ""

        all_chunks = []
        processing_log = []
        indexed_files = 0  # files that actually contributed chunks

        for file in files:
            # The upload may be a plain path string or a temp-file wrapper
            # exposing the path as `.name`. TODO confirm which form the
            # installed Gradio version passes.
            file_path = str(getattr(file, "name", file))
            # Bound before the try block so the error handler below can
            # always name the file it failed on.
            filename = Path(file_path).name

            try:
                file_ext = Path(file_path).suffix.lower().lstrip(".")

                processing_log.append(f"📄 Processing: {filename}")

                text = processor.extract_text(file_path, file_ext)

                # extract_text reports failures in-band; match its exact
                # message prefixes so real documents that merely start with
                # "Error" or "Unsupported" are not rejected.
                if text.startswith(f"Error processing {file_ext}:") or text.startswith(
                    f"Unsupported format: {file_ext}"
                ):
                    processing_log.append(f"❌ {text}")
                    continue

                chunks = processor.chunk_document(text, filename)
                if not chunks:
                    processing_log.append(
                        f"⚠️ {filename}: no indexable text (empty or too short)"
                    )
                    continue

                all_chunks.extend(chunks)
                indexed_files += 1

                processing_log.append(f"✓ {filename}: {len(chunks)} chunks")

            except Exception as e:
                processing_log.append(f"❌ Error processing {filename}: {str(e)}")

        if all_chunks:
            processing_log.append(
                f"\n🔧 Building index for {len(all_chunks)} total chunks..."
            )
            success, message = index_builder.build_index(all_chunks)
            processing_log.append(message)

            if success:
                index_status = (
                    f"✅ Index ready: {len(all_chunks)} chunks from {indexed_files} files"
                )
            else:
                index_status = f"❌ Index failed: {message}"
        else:
            index_status = "❌ No valid chunks to index"

        return "\n".join(processing_log), index_status

    def query_index(query: str, top_k: int = 3) -> str:
        """Search the built index and format the hits as plain text."""
        if not query or not query.strip():
            return "Please enter a query"

        if not index_builder.index_built:
            return "❌ No index available. Please upload and process documents first."

        try:
            # The slider value may arrive as a float.
            results = index_builder.search(query, int(top_k))

            if not results:
                return "No relevant results found"

            response = f"🔍 Query: {query}\n\n"

            for i, (text, meta, score) in enumerate(results, 1):
                response += f"[{i}] Score: {score:.3f} | Source: {meta['source']}\n"
                response += f"Text: {text[:200]}{'...' if len(text) > 200 else ''}\n\n"

            return response

        except Exception as e:
            return f"❌ Search error: {str(e)}"

    # Build interface
    with gr.Blocks(title="Document Upload & RAG Index") as demo:
        gr.Markdown("## 📚 Document Upload & RAG Index Builder")
        gr.Markdown("Upload documents (PDF/TXT/MD/HTML) → Build index → Query")

        with gr.Row():
            with gr.Column(scale=1):
                file_upload = gr.File(
                    label="Upload Documents",
                    file_count="multiple",
                    file_types=[".pdf", ".txt", ".md", ".html"],
                )

                process_btn = gr.Button("🔧 Process & Build Index", variant="primary")

                index_status = gr.Textbox(
                    label="Index Status", value="No index built yet", interactive=False
                )

            with gr.Column(scale=1):
                query_input = gr.Textbox(
                    label="Query", placeholder="Enter your question...", lines=2
                )

                top_k_slider = gr.Slider(
                    minimum=1, maximum=10, value=3, step=1, label="Number of results"
                )

                search_btn = gr.Button("🔍 Search", variant="secondary")

        processing_output = gr.Textbox(
            label="Processing Log", lines=8, interactive=False
        )

        search_output = gr.Textbox(label="Search Results", lines=10, interactive=False)

        # Event handlers
        process_btn.click(
            process_files,
            inputs=[file_upload],
            outputs=[processing_output, index_status],
        )

        search_btn.click(
            query_index, inputs=[query_input, top_k_slider], outputs=[search_output]
        )

        # Allow Enter key for search
        query_input.submit(
            query_index, inputs=[query_input, top_k_slider], outputs=[search_output]
        )

    return demo

In [None]:
# Cell 6: Launch Interface
if __name__ == "__main__":
    # Keyword arguments forwarded unchanged to launch().
    launch_options = {
        "server_name": "127.0.0.1",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo = create_upload_index_interface()
    demo.launch(**launch_options)

In [None]:
# Cell 7: Smoke Test
def smoke_test():
    """Exercise chunking, index building and search on a small document.

    Each stage prints a ✓ line on success. The first stage that fails prints
    a ❌ line and ends the test, so a failure is never reported as a success.
    """
    print("🧪 Running smoke test...")

    # Create test document
    test_content = """
    # Test Document

    這是一個測試文檔。我們要測試中文RAG系統的文檔處理能力。

    ## Section 1
    RAG (Retrieval-Augmented Generation) 是一種結合檢索和生成的技術。

    ## Section 2
    FAISS 是一個高效的向量搜索庫，支援大規模相似性搜索。
    """

    # Test processor
    processor = DocumentProcessor()
    chunks = processor.chunk_document(test_content, "test.md")

    if not chunks:
        print("❌ Document chunking: 0 chunks")
        print("❌ Smoke test failed")
        return

    print(f"✓ Document chunking: {len(chunks)} chunks")

    # Test index builder
    builder = RAGIndexBuilder()
    success, message = builder.build_index(chunks)

    if not success:
        print(f"❌ Index building: {message}")
        print("❌ Smoke test failed")
        return

    print(f"✓ Index building: {message}")

    # Test search
    results = builder.search("什麼是RAG？", top_k=2)

    if not results:
        print("❌ Search test: 0 results")
        print("❌ Smoke test failed")
        return

    print(f"✓ Search test: {len(results)} results")
    print(f"  Top result score: {results[0][2]:.3f}")

    print("🎉 Smoke test completed!")


# Run smoke test
smoke_test()

In [None]:
# Cell 8: Key Parameters & Tips
# Reference notes shown to the reader: tunable parameters, low-VRAM
# alternatives and known limitations of this notebook.
key_parameters_tips = """
🔧 Key Parameters:
- chunk_size=800, chunk_overlap=80 (中文友好)
- embedding_model="BAAI/bge-m3" (多語言支援)
- FAISS IndexFlatIP (normalized vectors)
- Support: PDF, TXT, MD, HTML

💡 Low-VRAM Options:
- Use bge-small-zh-v1.5 for lighter embedding
- Process documents in smaller batches
- Set batch_size=16 for encoding

⚠️ Important Notes:
- Large files may take time to process
- Index is stored in memory (not persisted)
- Check file encoding for proper Chinese display
- PDF extraction quality depends on source
"""
print(key_parameters_tips)

# Cell 9: When to Use This
# Reference notes shown to the reader: typical use cases and the places
# where this notebook connects to the rest of the series.
when_to_use_notes = """
📋 When to use this notebook:
- Need real-time document indexing in UI
- Want to test RAG with custom documents
- Building document Q&A applications
- Prototyping knowledge base systems

🔄 Integration points:
- Combine with chat interface (nb52)
- Add persistent storage for indices
- Integrate with agent systems (Stage 4)
- Scale with multi-domain routing (nb19)
"""
print(when_to_use_notes)

In [None]:
# Quick verification that upload and indexing work
def quick_smoke_test():
    """Minimal end-to-end check: chunk one short text, index it, query it.

    Prints a success line only when the index was built and the query
    returned at least one result; otherwise prints the failure reason.
    """
    processor = DocumentProcessor()
    # Kept above 50 characters: chunk_document() returns no chunks for
    # shorter documents, which would make this check fail unconditionally.
    test_text = (
        "RAG（檢索增強生成）是一種結合文件檢索與文字生成的技術，"
        "先從知識庫找出相關段落，再交給語言模型產生回答。"
        "FAISS是一個高效的向量搜索庫，常用來為RAG建立檢索索引。"
    )
    chunks = processor.chunk_document(test_text, "test.txt")

    builder = RAGIndexBuilder()
    success, msg = builder.build_index(chunks)

    if not success:
        print(f"❌ 測試失敗: {msg}")
        return

    results = builder.search("什麼是RAG", top_k=1)
    if results:
        print(f"✅ Upload & Index 功能正常: {len(results)} 個結果")
    else:
        print("❌ 測試失敗: search returned no results")


quick_smoke_test()