In [None]:
# nb14_query_and_citations.ipynb
# RAG 查詢與引用系統實作

# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook)
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Each entry maps a cache-related environment variable to its sub-directory
# beneath AI_CACHE_ROOT; the variable is exported and the directory created.
for k, v in (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
):
    v = f"{AI_CACHE_ROOT}/{v}"
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# ========== Cell 2: Dependencies & Setup ==========
import json
import time
import numpy as np
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass

# Core ML libraries
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import faiss
import tiktoken

# Text processing
from langchain.text_splitter import RecursiveCharacterTextSplitter

print("Dependencies loaded successfully")

In [None]:
# ========== Cell 3: Load Previous Modules & Data ==========
# Sample Chinese text data (simulating processed documents).
# Each record is (source_id, title, page, text); it is expanded into the
# {"text": ..., "meta": {...}} shape consumed by the retrieval code.
SAMPLE_DOCS = [
    {"text": text, "meta": {"source_id": source_id, "title": title, "page": page}}
    for source_id, title, page, text in (
        (
            "doc_001",
            "RAG技術概述",
            1,
            "檢索增強生成（RAG）是一種結合檢索和生成的方法，它可以從大型文檔庫中找到相關信息，然後基於這些信息生成準確的回答。RAG 技術在問答系統中表現出色。",
        ),
        (
            "doc_002",
            "嵌入模型介紹",
            1,
            "向量嵌入是將文本轉換為數值向量的技術。bge-m3 是一個優秀的中文嵌入模型，支持多語言和多功能。它在語義相似度任務上表現優異。",
        ),
        (
            "doc_003",
            "FAISS索引技術",
            2,
            "FAISS（Facebook AI Similarity Search）是一個高效的向量相似度搜索庫。它支持大規模向量檢索，並提供多種索引類型，如 IndexFlatIP 和 IndexIVF。",
        ),
        (
            "doc_004",
            "中文分段策略",
            1,
            "中文文本分段需要考慮標點符號和語義完整性。常用的分段策略包括按段落、按句號等標點符號，以及基於語義的智能分段。",
        ),
        (
            "doc_005",
            "提示詞工程",
            3,
            "提示詞工程是優化 LLM 輸出的重要技術。好的提示詞應該清晰、具體，並包含適當的上下文信息。在 RAG 系統中，提示詞需要整合檢索到的相關文檔。",
        ),
    )
]

print(f"Loaded {len(SAMPLE_DOCS)} sample documents")

In [None]:
# ========== Cell 4: Initialize Embedding Model & Build Index ==========
# Initialize embedding model (low-VRAM friendly).
# SentenceTransformer expects a concrete torch device string ("cuda", "cpu", ...)
# or None; "auto" is not a valid torch device and makes model loading fail,
# so the device is resolved explicitly here.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading bge-m3 embedding model...")
embedding_model = SentenceTransformer("BAAI/bge-m3", device=embedding_device)

# Extract texts and encode
texts = [doc["text"] for doc in SAMPLE_DOCS]
print(f"Encoding {len(texts)} documents...")

# Encode with L2 normalization so that inner product equals cosine similarity.
# FAISS requires float32 input, hence the explicit cast.
embeddings = embedding_model.encode(
    texts,
    normalize_embeddings=True,  # Important for cosine similarity
    batch_size=8,
    show_progress_bar=True,
).astype("float32")

print(f"Embeddings shape: {embeddings.shape}")

# Build FAISS index (Inner Product for normalized vectors = cosine similarity)
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product index
index.add(embeddings)

# Row i of the index must correspond to SAMPLE_DOCS[i]; retrieval relies on it.
assert index.ntotal == len(SAMPLE_DOCS), "index size does not match document count"

print(f"FAISS index built with {index.ntotal} vectors")

In [None]:
# ========== Cell 5: RAG Citation System Core ==========
@dataclass
class RetrievalResult:
    """Single retrieval result with citation info.

    Attributes:
        text: Document text that was retrieved.
        score: Similarity score reported by the index search.
        source_id: Value of ``meta["source_id"]`` of the document.
        title: Value of ``meta["title"]`` of the document.
        page: Value of ``meta["page"]`` of the document.
        citation_id: 1-based number used for ``[n]`` markers in the context.
    """

    text: str
    score: float
    source_id: str
    title: str
    page: int
    citation_id: int


class RAGCitationSystem:
    """RAG system with built-in citation tracking.

    Wraps an embedding model, a vector index and the list of documents the
    index was built from (row ``i`` of the index must be ``documents[i]``).
    """

    def __init__(
        self, embedding_model, index, documents, tokenizer_name="gpt-3.5-turbo"
    ):
        """Store the collaborators and load the tiktoken encoding used for budgeting.

        Args:
            embedding_model: Object exposing ``encode(list_of_str, normalize_embeddings=...)``.
            index: Object exposing ``search(query_vectors, k)`` returning ``(scores, indices)``.
            documents: Sequence of dicts with ``"text"`` and ``"meta"`` keys.
            tokenizer_name: Model name passed to ``tiktoken.encoding_for_model``.
        """
        self.embedding_model = embedding_model
        self.index = index
        self.documents = documents
        self.tokenizer = tiktoken.encoding_for_model(tokenizer_name)

    def retrieve(self, query: str, top_k: int = 5) -> List[RetrievalResult]:
        """Retrieve relevant documents with citation tracking.

        Args:
            query: Natural-language query to embed and search for.
            top_k: Maximum number of hits to request from the index.

        Returns:
            Results in index-rank order, numbered consecutively from 1.
            Empty when ``top_k`` is not positive or nothing valid was found.
        """
        if top_k <= 0:
            return []

        # Encode query
        query_embedding = self.embedding_model.encode(
            [query], normalize_embeddings=True
        ).astype("float32")

        # Search in FAISS index
        scores, indices = self.index.search(query_embedding, top_k)

        num_docs = len(self.documents)
        results: List[RetrievalResult] = []
        for score, doc_idx in zip(scores[0], indices[0]):
            doc_idx = int(doc_idx)
            # FAISS pads the result with -1 when fewer than top_k neighbours
            # exist. A bare `doc_idx < num_docs` check would accept -1 and
            # silently return the LAST document, so bound it on both sides.
            if not 0 <= doc_idx < num_docs:
                continue

            doc = self.documents[doc_idx]
            meta = doc["meta"]
            results.append(
                RetrievalResult(
                    text=doc["text"],
                    score=float(score),
                    source_id=meta["source_id"],
                    title=meta["title"],
                    page=meta["page"],
                    # Number by accepted position so ids stay gap-free even
                    # when an invalid hit was skipped above.
                    citation_id=len(results) + 1,
                )
            )

        return results

    def build_context(
        self, results: List[RetrievalResult], max_tokens: int = 2000
    ) -> Tuple[str, str]:
        """Build context string with token budget control.

        Chunks are added in the given order until the next one would exceed
        ``max_tokens``. The blank-line separator between chunks is charged
        against the budget as well. The count is an estimate based on
        ``self.tokenizer``, which may differ from the generating model's tokenizer.

        Args:
            results: Retrieval results, typically from :meth:`retrieve`.
            max_tokens: Token budget for the assembled context.

        Returns:
            ``(context, citation_list)``: the context chunks joined by blank
            lines, and one ``"[n] source_id - title (p.page)"`` line per
            included chunk joined by newlines. Both are empty strings when
            nothing fits.
        """
        separator = "\n\n"
        separator_tokens = len(self.tokenizer.encode(separator))

        context_parts = []
        citations = []
        current_tokens = 0

        for result in results:
            # Format: [citation_id] text
            chunk_text = f"[{result.citation_id}] {result.text}"
            chunk_tokens = len(self.tokenizer.encode(chunk_text))
            if context_parts:
                # Every chunk after the first is preceded by the separator.
                chunk_tokens += separator_tokens

            if current_tokens + chunk_tokens > max_tokens:
                break

            context_parts.append(chunk_text)
            citations.append(
                f"[{result.citation_id}] {result.source_id} - {result.title} (p.{result.page})"
            )
            current_tokens += chunk_tokens

        context = separator.join(context_parts)
        citation_list = "\n".join(citations)

        return context, citation_list

    def format_prompt(self, query: str, context: str) -> str:
        """Format RAG prompt with citation requirements.

        Args:
            query: The user question.
            context: Context text, normally the first element returned by
                :meth:`build_context`.

        Returns:
            The full prompt instructing the model to answer from the context
            and to cite with ``[n]`` markers.
        """
        prompt = f"""你是一個專業的問答助手。請根據提供的上下文信息回答問題。

重要要求：
1. 回答必須基於提供的上下文信息
2. 在回答中使用引用標註，格式為 [1], [2] 等
3. 如果上下文中沒有相關信息，請明確說明
4. 回答要準確、簡潔、有條理

上下文信息：
{context}

問題：{query}

請提供詳細回答（包含適當的引用標註）："""

        return prompt


# Wire the embedding model, FAISS index and sample corpus into one RAG system
# (arguments are given in the constructor's positional order).
rag_system = RAGCitationSystem(embedding_model, index, SAMPLE_DOCS)

print("RAG Citation System initialized")

In [None]:
# ========== Cell 6: LLM Integration for Complete RAG ==========
# Initialize a small LLM for demonstration (adjust based on available VRAM)
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"  # Can switch to smaller models if needed
LOAD_IN_8BIT = True  # 8-bit quantization for VRAM efficiency; set False to disable

print(f"Loading LLM: {MODEL_ID}")
print("Note: This may take a few minutes on first run...")

# Pre-bind the names so later cells can reference them even if loading fails.
tokenizer = None
model = None
llm_available = False

try:
    # Passing `load_in_8bit=True` straight to `from_pretrained` is deprecated;
    # quantization is configured through a BitsAndBytesConfig object instead.
    from transformers import BitsAndBytesConfig

    quantization_config = (
        BitsAndBytesConfig(load_in_8bit=True) if LOAD_IN_8BIT else None
    )

    # Load with memory optimization
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
        # NOTE(review): trust_remote_code executes code shipped with the model
        # repository; keep it only for model ids you trust.
        trust_remote_code=True,
    )

    print(f"Model loaded successfully on device: {model.device}")
    llm_available = True

except Exception as e:
    # Deliberate best-effort: the notebook falls back to a retrieval-only demo.
    print(f"LLM loading failed: {e}")
    print("Continuing with retrieval-only demo...")
    llm_available = False

In [None]:
# ========== Cell 7: Complete RAG Function ==========
def complete_rag_query(
    query: str, top_k: int = 3, max_tokens: int = 256, context_budget: int = 1500
) -> Dict:
    """Complete RAG pipeline: retrieve → generate → format citations.

    Args:
        query: User question.
        top_k: Number of documents to retrieve.
        max_tokens: Maximum number of NEW tokens the LLM may generate.
        context_budget: Token budget handed to ``rag_system.build_context``.

    Returns:
        Dict with the same keys on every path: ``query``, ``answer``,
        ``citations`` (list of citation lines, possibly empty),
        ``retrieval_results`` (number of retrieved documents),
        ``retrieval_time``, ``generation_time`` and ``total_time`` (seconds).
        When the LLM is not loaded, ``answer`` contains the raw context.
        Generation errors are reported inside ``answer`` rather than raised.
    """
    start_time = time.time()

    # Step 1: Retrieve relevant documents
    retrieval_results = rag_system.retrieve(query, top_k)

    if not retrieval_results:
        elapsed = time.time() - start_time
        # Same key set as the normal path so callers can read every metric.
        return {
            "query": query,
            "answer": "抱歉，我在知識庫中找不到相關信息來回答您的問題。",
            "citations": [],
            "retrieval_results": 0,
            "retrieval_time": elapsed,
            "generation_time": 0,
            "total_time": elapsed,
        }

    # Step 2: Build context with token budget
    context, citation_list = rag_system.build_context(
        retrieval_results, max_tokens=context_budget
    )

    # Step 3: Format prompt
    prompt = rag_system.format_prompt(query, context)

    generation_start = time.time()

    # Step 4: Generate answer (if LLM available)
    if llm_available:
        try:
            # Tokenize input
            inputs = tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=2048
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            prompt_token_count = inputs["input_ids"].shape[1]

            # Generate with controlled parameters
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    repetition_penalty=1.1,
                    pad_token_id=tokenizer.eos_token_id,
                )

            # Decode only the newly generated tokens. Cutting the decoded
            # string at len(prompt) is unreliable: the decoded prompt need not
            # match the original text character-for-character (truncation,
            # special tokens, normalization).
            generated_ids = outputs[0][prompt_token_count:]
            answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        except Exception as e:
            answer = f"生成過程中出現錯誤：{str(e)}"
    else:
        # Fallback: return formatted context
        answer = f"基於檢索到的信息：\n\n{context}\n\n（注意：由於 LLM 未載入，這裡顯示的是原始檢索結果）"

    generation_time = time.time() - generation_start
    total_time = time.time() - start_time

    return {
        "query": query,
        "answer": answer,
        # "".split("\n") would yield [""], i.e. one phantom citation.
        "citations": citation_list.split("\n") if citation_list else [],
        "retrieval_results": len(retrieval_results),
        "retrieval_time": generation_start - start_time,
        "generation_time": generation_time,
        "total_time": total_time,
    }


print("Complete RAG function ready")

In [None]:
# ========== Cell 8: Smoke Test - Chinese Q&A with Citations ==========
print("=== RAG 問答系統測試 ===\n")

# Questions for the end-to-end smoke test
test_queries = [
    "什麼是 RAG 技術？它有什麼優點？",
    "bge-m3 模型有什麼特點？",
    "FAISS 支援哪些索引類型？",
    "中文文本應該如何分段？",
]

for i, query in enumerate(test_queries, start=1):
    print(f"問題 {i}：{query}")
    print("-" * 50)

    # Run the full pipeline with a short generation budget
    result = complete_rag_query(query, top_k=3, max_tokens=200)

    print(f"回答：\n{result['answer']}\n")

    print("引用來源：")
    for citation in result["citations"]:
        print(f"  {citation}")

    # Timing / retrieval metrics reported by the pipeline
    print("\n效能指標：")
    print(f"  檢索到 {result['retrieval_results']} 個相關文檔")
    for label, key in (
        ("檢索時間", "retrieval_time"),
        ("生成時間", "generation_time"),
        ("總時間", "total_time"),
    ):
        print(f"  {label}：{result[key]:.3f}s")

    print(f"\n{'=' * 80}\n")

In [None]:
# ========== Cell 9: Citation Quality Analysis ==========
def analyze_citation_quality(query: str, answer: str, citations: List[str]) -> Dict:
    """Simple citation quality analysis.

    Scans ``answer`` for ``[n]`` markers and compares them with the citation
    list. Citation ``n`` is considered valid when ``1 <= n <=`` the number of
    non-empty entries in ``citations``.

    Args:
        query: The question that was asked (kept for interface symmetry; unused).
        answer: Generated answer text to scan for markers.
        citations: Citation lines offered to the model; blank entries are ignored.

    Returns:
        Dict with:
          - ``total_citations_available``: number of non-empty citation lines.
          - ``citations_used_in_answer``: distinct valid markers found.
          - ``citation_markers_found``: sorted distinct marker numbers, valid or not.
          - ``citation_coverage``: used / available (0 when nothing is available).
          - ``proper_format``: True when every marker refers to an available citation.
    """
    import re  # local import: `re` is not imported at notebook level

    markers = re.findall(r"\[(\d+)\]", answer)
    # Sorted for a deterministic, readable report (set order is arbitrary).
    citation_markers = sorted({int(m) for m in markers})

    # Ignore blank entries such as the [""] produced by splitting an empty string.
    available_citations = sum(1 for c in citations if c and c.strip())

    # Only markers that point at an existing citation count as "used";
    # a hallucinated [9] must not inflate coverage.
    valid_markers = [m for m in citation_markers if 1 <= m <= available_citations]
    used_citations = len(valid_markers)

    return {
        "total_citations_available": available_citations,
        "citations_used_in_answer": used_citations,
        "citation_markers_found": citation_markers,
        "citation_coverage": used_citations / max(1, available_citations),
        "proper_format": len(valid_markers) == len(citation_markers),
    }


# Run one query end-to-end and report how the answer uses its citations
test_query = "RAG 技術的核心組件有哪些？"
result = complete_rag_query(test_query, top_k=4)

print("=== 引用品質分析 ===", f"測試問題：{test_query}\n", sep="\n")

quality = analyze_citation_quality(test_query, result["answer"], result["citations"])

# One "key: value" line per metric
print("引用使用分析：")
for key in quality:
    value = quality[key]
    print(f"  {key}: {value}")

print(f"\n回答內容：\n{result['answer']}")

In [None]:
# ========== Cell 10: Token Budget Control Demo ==========
def test_token_budget_control(budgets=(500, 1000, 2000), query=None, top_k=5):
    """Demonstrate context length management.

    Retrieves once, then assembles the context under each token budget and
    prints how many tokens and documents ended up in it.

    Args:
        budgets: Iterable of token budgets to try. Pass small values to see
            chunks being dropped.
        query: Query to retrieve for; defaults to the built-in multi-topic query.
        top_k: Number of documents to retrieve.
    """
    print("=== Token 預算控制測試 ===\n")

    long_query = query or "請詳細說明 RAG、FAISS、嵌入模型和提示詞工程的相關技術"

    # Retrieval does not depend on the budget, so do it once outside the loop.
    results = rag_system.retrieve(long_query, top_k=top_k)

    # Test different budget limits
    for budget in budgets:
        print(f"Token 預算限制：{budget}")
        print("-" * 30)

        context, citations = rag_system.build_context(results, max_tokens=budget)

        # Count actual tokens
        actual_tokens = len(rag_system.tokenizer.encode(context))

        # An empty citation string means zero documents; "".split("\n")
        # would otherwise be counted as one.
        included_docs = sum(1 for line in citations.split("\n") if line)

        print(f"實際使用 tokens：{actual_tokens}")
        print(f"包含文檔數量：{included_docs}")
        print(f"Context 預覽：{context[:100]}...")
        print()


test_token_budget_control()

In [None]:
# ========== Cell 11: Extensions & Next Steps ==========
print("=== 本筆記本完成 ===")

# Closing summary: each section is printed as a blank line, its heading,
# then one line per bullet.
for heading, bullets in (
    (
        "What we built:",
        (
            "✓ 完整的 RAG 檢索管道（查詢→嵌入→FAISS檢索→排序）",
            "✓ 引用追蹤系統（[1], [2] 格式，含來源信息）",
            "✓ Token 預算控制（避免超出 context window）",
            "✓ 中文問答測試與引用品質分析",
            "✓ 效能監控（檢索/生成時間分離）",
        ),
    ),
    (
        "Key concepts:",
        (
            "• Citation tracking: 每個檢索結果分配唯一編號",
            "• Context assembly: 根據 token 預算組合上下文",
            "• Prompt engineering: 明確要求模型使用引用",
            "• Quality analysis: 檢查引用使用率與格式正確性",
        ),
    ),
    (
        "Pitfalls to avoid:",
        (
            "• 引用編號錯位：去重/重排後要重新編號",
            "• Token 估算不準：不同 tokenizer 差異大",
            "• 過度檢索：top_k 太大導致 context 稀釋",
            "• 缺少回退：檢索失敗時要有適當回應",
        ),
    ),
    (
        "Next steps (nb15):",
        (
            "• 加入重排器（bge-reranker）提升檢索精度",
            "• 實現 MMR（最大邊際相關性）去重",
            "• 測試 Hybrid 檢索（關鍵詞+向量）",
            "• 建立評估指標（Recall@k, NDCG）",
        ),
    ),
    (
        "Repro tips:",
        (
            "• 調整 MODEL_ID 和 load_in_8bit 參數適應硬體",
            "• 增加更多樣本文檔測試不同領域",
            "• 修改 max_tokens 參數平衡速度與品質",
            "• 使用 temperature=0.3 獲得更穩定的引用格式",
        ),
    ),
):
    print(f"\n{heading}")
    for bullet in bullets:
        print(bullet)

# RAG 查詢與引用實作：nb14_query_and_citations.ipynb

## Goals（目標）

- 整合前期模組（chunking + embeddings + FAISS index）建立完整 RAG 檢索流程
- 實現**引用標註**系統：查詢→檢索→組合 context→生成答案＋引用
- 測試中文問答準確性與引用格式（`[1]`, `[2]` 格式）
- 驗證 token budget 控制與 context window 管理
- 準備重排器（reranker）的接入點

## Notebook Outline（筆記本大綱）

1. **Shared Cache Bootstrap**（標準第一格）
2. **載入前期模組**：chunking, embeddings, FAISS index
3. **RAG Pipeline 核心**：檢索函數 + context 組裝
4. **Citation System**：引用編號與來源追蹤
5. **Query Template**：提示詞模板（含引用要求）
6. **Complete RAG Function**：端到端問答
7. **Smoke Test**：中文問答＋引用驗證
8. **Token Budget 控制**：context 長度管理
9. **Pitfalls & Extensions**

## Core Code Blocks（核心程式碼）


## Key Parameters（關鍵參數）

| 參數 | 預設值 | 低 VRAM 選項 | 說明 |
|------|--------|--------------|------|
| `MODEL_ID` | Qwen2.5-7B-Instruct | Qwen2.5-1.5B-Instruct | LLM 模型選擇 |
| `load_in_8bit` | True | True | 8bit 量化節省 VRAM |
| `top_k` | 3-5 | 3 | 檢索文檔數量 |
| `max_tokens` | 256 | 128 | 生成長度控制 |
| `context_budget` | 1500 | 1000 | Context token 預算 |


## When to Use This（使用時機）

- 需要**可驗證答案**的問答系統（學術、法律、醫療）
- 要求**透明度**與**可追溯性**的 AI 應用
- **多文檔**知識庫檢索與摘要
- 在**有限 VRAM**環境中部署 RAG 系統
- 準備整合**重排器**前的基礎管道測試


In [None]:
### Core classes of the RAG citation system
# Summary copy of the RetrievalResult dataclass defined in Cell 5: one
# retrieved document plus the citation number assigned to it.
@dataclass
class RetrievalResult:
    text: str  # retrieved document text
    score: float  # similarity score returned by the index search
    source_id: str  # taken from the document's meta["source_id"]
    title: str  # taken from the document's meta["title"]
    page: int  # taken from the document's meta["page"]
    citation_id: int  # 1-based number used for [n] markers

# Interface outline of the RAGCitationSystem class implemented in Cell 5.
# NOTE(review): every method body below is only a comment, so this snippet is
# a summary of the interface and is not runnable Python as written.
class RAGCitationSystem:
    def retrieve(self, query: str, top_k: int = 5) -> List[RetrievalResult]:
        # Query embedding + FAISS retrieval + citation-number assignment

    def build_context(self, results: List[RetrievalResult], max_tokens: int = 2000):
        # Token budget control + context assembly

    def format_prompt(self, query: str, context: str) -> str:
        # Prompt template that includes the citation requirements

In [None]:
### Complete RAG query flow
# Outline of the complete_rag_query function implemented in Cell 7.
# NOTE(review): this outline has no return statement, so calling it yields
# None; if executed it rebinds the name complete_rag_query to this stub.
def complete_rag_query(query: str, top_k: int = 3, max_tokens: int = 256):
    # Retrieve → assemble → generate → format citations
    retrieval_results = rag_system.retrieve(query, top_k)
    # Uses build_context's default token budget (no max_tokens passed here)
    context, citation_list = rag_system.build_context(retrieval_results)
    prompt = rag_system.format_prompt(query, context)
    # LLM generation + timing statistics

In [None]:
## Smoke Test Cell
import re

# Test Chinese Q&A + citations
test_query = "什麼是 RAG 技術？"
result = complete_rag_query(test_query, top_k=3)
print(f"答案：{result['answer']}")
print(f"引用：{result['citations']}")

# Generation is sampled, so the model may cite any source, not necessarily [1]:
# accept any [n] marker. The pipeline's no-result fallback message contains
# "找不到相關信息"; "檢索失敗" is kept for backward compatibility.
smoke_answer = result["answer"]
has_citation_marker = re.search(r"\[\d+\]", smoke_answer) is not None
is_fallback_answer = "找不到相關信息" in smoke_answer or "檢索失敗" in smoke_answer
assert has_citation_marker or is_fallback_answer, (
    "answer contains neither a [n] citation marker nor a known fallback message"
)
print("✓ RAG 查詢與引用系統測試通過")