In [None]:
# nb19_multi_domain_indices.ipynb
# Multi-Domain RAG Indices with Smart Routing

# Cell 1: Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# Each Hugging Face / PyTorch cache environment variable is pointed at a
# sub-directory of AI_CACHE_ROOT, and that directory is created if missing.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

_cache_env_dirs = {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}
for _env_name, _cache_dir in _cache_env_dirs.items():
    os.environ[_env_name] = _cache_dir
    pathlib.Path(_cache_dir).mkdir(parents=True, exist_ok=True)

# Report the cache root in use and whether CUDA is visible to torch.
_gpu_available = torch.cuda.is_available()
print("[Cache]", AI_CACHE_ROOT, "| GPU:", _gpu_available)

In [None]:
# Cell 2: Dependencies and Domain Sample Data
import json
import re
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import faiss
import numpy as np
import yaml
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Sample documents for different domains, five short English passages each.
_TECH_DOCS = [
    "Large Language Models (LLMs) like GPT and BERT use transformer architecture for natural language processing.",
    "RAG (Retrieval-Augmented Generation) combines vector databases with language models for grounded responses.",
    "FAISS (Facebook AI Similarity Search) provides efficient similarity search and clustering of dense vectors.",
    "Gradient descent optimization algorithms like Adam and SGD are fundamental to training neural networks.",
    "Docker containers enable consistent deployment environments across development and production systems.",
]

_EDU_DOCS = [
    "Constructivist learning theory emphasizes that students build knowledge through active engagement with materials.",
    "Bloom's taxonomy categorizes learning objectives into six levels from remembering to creating.",
    "Differentiated instruction adapts teaching methods to accommodate diverse learning styles and abilities.",
    "Assessment rubrics provide clear criteria and standards for evaluating student performance.",
    "Project-based learning engages students in real-world problems to develop critical thinking skills.",
]

_LEGAL_DOCS = [
    "Contract law governs the formation, performance, and enforcement of agreements between parties.",
    "Due process ensures fair treatment through the normal judicial system, especially as a citizen's entitlement.",
    "Intellectual property rights protect creators' ownership of their innovations, writings, and artistic works.",
    "Tort law addresses civil wrongs that cause harm or loss, resulting in legal liability for the actor.",
    "Constitutional law establishes the framework of government and fundamental rights of citizens.",
]

_GENERAL_DOCS = [
    "Climate change refers to long-term shifts in global temperatures and weather patterns.",
    "The scientific method involves observation, hypothesis formation, experimentation, and conclusion.",
    "Renewable energy sources like solar and wind power help reduce carbon emissions.",
    "Biodiversity conservation protects ecosystems and maintains ecological balance.",
    "Sustainable development meets present needs without compromising future generations' ability.",
]

# Insertion order (tech, edu, legal, general) is the order used when building indices.
DOMAIN_SAMPLES = {
    "tech": _TECH_DOCS,
    "edu": _EDU_DOCS,
    "legal": _LEGAL_DOCS,
    "general": _GENERAL_DOCS,
}

print("✓ Domain sample data prepared")
for _domain_name in DOMAIN_SAMPLES:
    print(f"  {_domain_name}: {len(DOMAIN_SAMPLES[_domain_name])} documents")

In [None]:
# Cell 3: Domain Index Builder
class DomainIndexBuilder:
    """Split, embed and index documents, producing one FAISS index per domain."""

    def __init__(self, model_name="BAAI/bge-m3"):
        """Load the embedding model and configure the text splitter.

        Args:
            model_name: SentenceTransformer model identifier used for embeddings.
        """
        self.model = SentenceTransformer(model_name)
        # Separator list handed to the splitter: CJK sentence punctuation,
        # then paragraph breaks, line breaks and spaces.
        self.splitter = RecursiveCharacterTextSplitter(
            separators=["。", "！", "？", "；", "…", "\n\n", "\n", " "],
            chunk_size=400,  # Smaller chunks for domain-specific content
            chunk_overlap=40,
        )

    def build_domain_index(
        self, domain: str, documents: List[str], index_dir: str = "indices"
    ) -> Tuple["faiss.Index", List[Dict]]:
        """Build and persist a FAISS inner-product index for one domain.

        Writes ``<index_dir>/<domain>.faiss`` and
        ``<index_dir>/<domain>_chunks.jsonl`` (one JSON object per chunk, in
        the same order as the vectors added to the index).

        Args:
            domain: Domain name; used in chunk ids and in the output file names.
            documents: Raw document strings to split and embed.
            index_dir: Output directory; created together with any missing
                parent directories.

        Returns:
            ``(index, chunks_data)`` where ``chunks_data`` is a list of chunk
            metadata dicts with keys ``id``, ``text``, ``domain``, ``doc_id``
            and ``chunk_id``.

        Raises:
            ValueError: If ``documents`` yields no chunks, i.e. there is
                nothing to index.
        """
        # Create chunks with domain metadata
        chunks_data = []
        for doc_id, doc in enumerate(documents):
            for chunk_id, chunk in enumerate(self.splitter.create_documents([doc])):
                chunks_data.append(
                    {
                        "id": f"{domain}_{doc_id}_{chunk_id}",
                        "text": chunk.page_content,
                        "domain": domain,
                        "doc_id": doc_id,
                        "chunk_id": chunk_id,
                    }
                )

        # With nothing to embed there is no embedding dimension to size the
        # index with, so fail early with a clear message.
        if not chunks_data:
            raise ValueError(f"No text chunks to index for domain '{domain}'")

        all_texts = [chunk_meta["text"] for chunk_meta in chunks_data]

        # Generate embeddings (L2-normalized, so inner product == cosine similarity)
        print(f"Embedding {len(all_texts)} chunks for domain '{domain}'...")
        embeddings = self.model.encode(
            all_texts, normalize_embeddings=True, batch_size=16, show_progress_bar=True
        ).astype("float32")

        # Build FAISS index
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)

        # Save index and metadata; parents=True allows nested output directories.
        out_dir = Path(index_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        faiss.write_index(index, str(out_dir / f"{domain}.faiss"))

        with open(out_dir / f"{domain}_chunks.jsonl", "w", encoding="utf-8") as f:
            for chunk_meta in chunks_data:
                f.write(json.dumps(chunk_meta, ensure_ascii=False) + "\n")

        print(f"✓ Built index for '{domain}': {index.ntotal} vectors")
        return index, chunks_data


# Build one index per sample domain and keep the in-memory index and chunk
# metadata under the domain's name.
builder = DomainIndexBuilder()
domain_indices, domain_chunks = {}, {}

for _domain_name, _domain_docs in DOMAIN_SAMPLES.items():
    _built_index, _built_chunks = builder.build_domain_index(_domain_name, _domain_docs)
    domain_indices[_domain_name] = _built_index
    domain_chunks[_domain_name] = _built_chunks

print(f"\n✓ Built {len(domain_indices)} domain indices")

In [None]:
# Cell 4: Smart Query Router
class QueryRouter:
    """Decide which domains a query should be searched in.

    Three strategies are offered:

    * ``route_semantic`` - similarity between the query embedding and the
      embedding of a short description of each domain.
    * ``route_rule_based`` - keyword matching against per-domain keyword lists.
    * ``route_hybrid`` - weighted sum of the two.
    """

    def __init__(self, model_name="BAAI/bge-m3"):
        """Load the embedding model and pre-compute domain description embeddings.

        Args:
            model_name: SentenceTransformer model identifier.
        """
        self.model = SentenceTransformer(model_name)

        # Domain descriptions for semantic routing
        self.domain_descriptions = {
            "tech": "artificial intelligence, machine learning, programming, software engineering, computer science, technology, algorithms, data science",
            "edu": "education, teaching, learning, pedagogy, instruction, assessment, curriculum, students, classroom management",
            "legal": "law, legal, court, contract, rights, constitution, tort, legislation, justice, attorney, litigation",
            "general": "science, environment, climate, nature, research, sustainability, general knowledge, news, current events",
        }

        # Pre-compute domain description embeddings
        self.domain_embeddings = {}
        for domain, desc in self.domain_descriptions.items():
            emb = self.model.encode([desc], normalize_embeddings=True)[0]
            self.domain_embeddings[domain] = emb.astype("float32")

        # Rule-based keywords
        self.domain_keywords = {
            "tech": [
                "LLM",
                "AI",
                "機器學習",
                "程式",
                "演算法",
                "程式碼",
                "神經網路",
                "transformer",
                "API",
                "資料庫",
            ],
            "edu": [
                "教學",
                "學習",
                "教育",
                "學生",
                "課程",
                "評量",
                "教材",
                "pedagogy",
                "assessment",
                "curriculum",
            ],
            "legal": [
                "法律",
                "合約",
                "憲法",
                "權利",
                "法院",
                "訴訟",
                "contract",
                "tort",
                "legislation",
                "constitutional",
            ],
            "general": [
                "氣候",
                "環境",
                "科學",
                "研究",
                "永續",
                "climate",
                "environment",
                "research",
                "sustainable",
            ],
        }

    @staticmethod
    def _keyword_in_query(keyword: str, query_lower: str) -> bool:
        """Return True if ``keyword`` occurs in the already lower-cased query.

        ASCII keywords must appear as a whole word (an optional plural
        ``s``/``es`` is tolerated) so that short keywords such as "AI" or
        "tort" do not fire inside "explain" or "tortoise". The boundary check
        only looks at ASCII letters/digits, so a keyword directly adjacent to
        CJK text (e.g. "什麼是AI模型") still matches. Non-ASCII keywords keep
        plain substring matching because CJK text has no word separators.
        """
        kw = keyword.lower()
        if not kw.isascii():
            return kw in query_lower
        pattern = r"(?<![a-z0-9])" + re.escape(kw) + r"(?:e?s)?(?![a-z0-9])"
        return re.search(pattern, query_lower) is not None

    def _keyword_scores(self, query: str) -> List[Tuple[str, float]]:
        """Score domains by keyword hits, without any fallback.

        Returns:
            ``(domain, score)`` pairs for domains with at least one matching
            keyword, scores divided by the highest hit count, sorted
            descending. Empty list when no keyword matches.
        """
        query_lower = query.lower()
        hit_counts = {
            domain: float(
                sum(self._keyword_in_query(kw, query_lower) for kw in keywords)
            )
            for domain, keywords in self.domain_keywords.items()
        }

        # default=0.0 keeps an empty keyword table from raising in max().
        best = max(hit_counts.values(), default=0.0)
        if best <= 0:
            return []

        matched = [(d, n / best) for d, n in hit_counts.items() if n > 0]
        matched.sort(key=lambda x: x[1], reverse=True)
        return matched

    def route_semantic(self, query: str, top_k: int = 2) -> List[Tuple[str, float]]:
        """Semantic routing based on query-domain similarity.

        Args:
            query: User query text.
            top_k: Maximum number of domains to return.

        Returns:
            Up to ``top_k`` ``(domain, similarity)`` pairs, best first. The
            similarity is the dot product of the normalized query embedding
            with the stored domain description embedding.
        """
        query_emb = self.model.encode([query], normalize_embeddings=True)[0]

        similarities = [
            (domain, float(np.dot(query_emb, domain_emb)))
            for domain, domain_emb in self.domain_embeddings.items()
        ]

        # Sort by similarity and return top_k
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]

    def route_rule_based(self, query: str) -> List[Tuple[str, float]]:
        """Rule-based routing using keyword matching.

        Returns:
            ``(domain, score)`` pairs for domains with matching keywords,
            scores normalized so the best domain gets 1.0, best first. When
            nothing matches, the single fallback ``[("general", 1.0)]``.
        """
        return self._keyword_scores(query) or [("general", 1.0)]

    def route_hybrid(
        self, query: str, semantic_weight: float = 0.7
    ) -> List[Tuple[str, float]]:
        """Hybrid routing combining semantic and rule-based approaches.

        Each domain's score is ``semantic_weight * similarity`` plus
        ``(1 - semantic_weight) * keyword_score``.

        Args:
            query: User query text.
            semantic_weight: Weight of the semantic similarity; the keyword
                score gets the remainder.

        Returns:
            ``(domain, combined_score)`` pairs for every scored domain, best
            first.
        """
        rule_weight = 1 - semantic_weight

        # Score every known domain semantically rather than a fixed count.
        combined_scores = {
            domain: semantic_weight * score
            for domain, score in self.route_semantic(
                query, top_k=len(self.domain_embeddings)
            )
        }

        # Only real keyword hits contribute. The "general" fallback of
        # route_rule_based is deliberately not used here: treating "no keyword
        # matched" as a full-strength vote for "general" would let it outrank
        # the semantically best domain.
        for domain, score in self._keyword_scores(query):
            combined_scores[domain] = combined_scores.get(domain, 0.0) + rule_weight * score

        # Sort and return
        return sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)


# Create the router used for the demo below.
router = QueryRouter()

# One sample query per domain.
test_queries = [
    "What is transformer architecture in machine learning?",
    "How to implement project-based learning in classroom?",
    "What are the key principles of contract law?",
    "How does climate change affect biodiversity?",
]

print("Query Routing Test Results:")
print("=" * 60)

# Show the top-2 routes of each strategy side by side for every query.
for _sample_query in test_queries:
    print(f"\nQuery: {_sample_query}")

    _semantic_routes = router.route_semantic(_sample_query, top_k=2)
    _keyword_routes = router.route_rule_based(_sample_query)
    _hybrid_routes = router.route_hybrid(_sample_query)

    print(f"  Semantic: {_semantic_routes}")
    print(f"  Rule-based: {_keyword_routes[:2]}")
    print(f"  Hybrid: {_hybrid_routes[:2]}")



In [None]:
# Cell 5: Multi-Domain Index Manager
class MultiDomainIndexManager:
    """Load per-domain FAISS indices from disk and search them via a query router."""

    # Domains whose ``<domain>.faiss`` / ``<domain>_chunks.jsonl`` pair is looked up.
    KNOWN_DOMAINS = ("tech", "edu", "legal", "general")

    def __init__(self, model_name="BAAI/bge-m3", index_dir="indices"):
        """Create the router, load every available index and set routing defaults.

        Args:
            model_name: SentenceTransformer model identifier.
            index_dir: Directory containing the index and chunk files.
        """
        self.router = QueryRouter(model_name)
        # Reuse the router's encoder: it is the same model, and loading it a
        # second time would only duplicate its memory footprint.
        self.model = self.router.model
        self.index_dir = Path(index_dir)

        # Load indices and chunks
        self.indices = {}
        self.chunks = {}
        self.load_all_indices()

        # Routing configuration
        self.config = {
            "routing_strategy": "hybrid",  # semantic, rule, hybrid
            "semantic_weight": 0.7,
            "max_domains": 2,
            "fallback_domain": "general",
            "similarity_threshold": 0.1,
        }

    def load_all_indices(self):
        """Load all available domain indices.

        A domain is loaded only when both its ``.faiss`` file and its
        ``_chunks.jsonl`` file exist in ``self.index_dir``; otherwise it is
        skipped silently.
        """
        for domain in self.KNOWN_DOMAINS:
            index_path = self.index_dir / f"{domain}.faiss"
            chunks_path = self.index_dir / f"{domain}_chunks.jsonl"

            if not (index_path.exists() and chunks_path.exists()):
                continue

            # Load FAISS index
            index = faiss.read_index(str(index_path))
            self.indices[domain] = index

            # Load chunks metadata; blank lines (e.g. a trailing newline left
            # by an editor) are skipped instead of crashing json.loads.
            with open(chunks_path, "r", encoding="utf-8") as f:
                self.chunks[domain] = [json.loads(line) for line in f if line.strip()]

            print(f"✓ Loaded {domain} index: {index.ntotal} vectors")

    def route_query(self, query: str) -> List[Tuple[str, float]]:
        """Route query to appropriate domains.

        Uses ``config["routing_strategy"]`` ("semantic", "rule", anything else
        means hybrid), keeps at most ``config["max_domains"]`` routes and drops
        those scoring below ``config["similarity_threshold"]``.

        Returns:
            ``(domain, score)`` pairs; if none survives the threshold, the
            single route ``(config["fallback_domain"], 1.0)``.
        """
        strategy = self.config["routing_strategy"]
        max_domains = self.config["max_domains"]

        if strategy == "semantic":
            routes = self.router.route_semantic(query, max_domains)
        elif strategy == "rule":
            routes = self.router.route_rule_based(query)[:max_domains]
        else:  # hybrid
            routes = self.router.route_hybrid(query, self.config["semantic_weight"])[
                :max_domains
            ]

        # Filter by threshold
        threshold = self.config["similarity_threshold"]
        filtered = [(d, s) for d, s in routes if s >= threshold]

        # Fallback if no routes meet threshold
        if not filtered:
            filtered = [(self.config["fallback_domain"], 1.0)]

        return filtered

    def _encode_query(self, query: str):
        """Embed one query as a normalized float32 array with a single row."""
        return self.model.encode([query], normalize_embeddings=True).astype("float32")

    def _search_index(self, domain: str, query_emb, k: int) -> List[Tuple[Dict, float]]:
        """Return up to ``k`` ``(chunk, score)`` hits from one loaded domain.

        ``k`` is capped at the index size, and nothing is searched when the
        cap is not positive (empty index or ``k <= 0``).
        """
        index = self.indices[domain]
        chunks = self.chunks[domain]

        k = min(k, index.ntotal)
        if k <= 0:
            return []

        D, I = index.search(query_emb, k)
        return [
            (chunks[idx], float(score))
            for idx, score in zip(I[0], D[0])
            # Skip negative ids and ids with no chunk metadata row.
            if 0 <= idx < len(chunks)
        ]

    def search_multi_domain(
        self, query: str, k: int = 5
    ) -> List[Tuple[str, Dict, float]]:
        """Search across multiple domains with routing.

        Each routed domain that has a loaded index is searched for up to
        ``2 * k`` candidates; every hit's score is multiplied by the route
        score of its domain, and the ``k`` best weighted hits are returned.

        Args:
            query: User query text.
            k: Number of results to return.

        Returns:
            ``(text, chunk_metadata, weighted_score)`` tuples, best first.
        """
        start_time = time.perf_counter()

        # Route query to domains
        domain_routes = self.route_query(query)
        print(f"Routed to domains: {domain_routes}")

        # Generate query embedding
        query_emb = self._encode_query(query)

        # Search each routed domain, over-fetching to account for domain weighting
        all_results = []
        for domain, weight in domain_routes:
            if domain not in self.indices:
                continue
            for chunk, score in self._search_index(domain, query_emb, k * 2):
                all_results.append((chunk["text"], chunk, score * weight))

        # Sort by weighted score and take top k
        all_results.sort(key=lambda x: x[2], reverse=True)
        final_results = all_results[:k]

        search_time = (time.perf_counter() - start_time) * 1000
        print(f"Multi-domain search completed in {search_time:.1f}ms")

        return final_results

    def search_single_domain(
        self, query: str, domain: str, k: int = 5
    ) -> List[Tuple[str, Dict, float]]:
        """Search within a specific domain.

        Args:
            query: User query text.
            domain: Name of a loaded domain.
            k: Maximum number of results.

        Returns:
            ``(text, chunk_metadata, score)`` tuples in the order returned by
            the index search.

        Raises:
            ValueError: If ``domain`` has no loaded index.
        """
        if domain not in self.indices:
            raise ValueError(f"Domain '{domain}' not available")

        query_emb = self._encode_query(query)
        return [
            (chunk["text"], chunk, score)
            for chunk, score in self._search_index(domain, query_emb, k)
        ]


# Create the manager; its constructor loads whatever domain indices exist on disk.
manager = MultiDomainIndexManager()

_loaded_domain_count = len(manager.indices)
print(f"✓ Loaded {_loaded_domain_count} domain indices")
print("Available domains:", list(manager.indices.keys()))

In [None]:
# Cell 6: Multi-Domain Retrieval Comparison
def test_retrieval_comparison():
    """Compare single vs multi-domain retrieval performance"""

    test_cases = [
        {
            "query": "What is RAG in machine learning?",
            "expected_domain": "tech",
            "description": "Technical AI query",
        },
        {
            "query": "How to assess student learning outcomes?",
            "expected_domain": "edu",
            "description": "Educational assessment query",
        },
        {
            "query": "What are intellectual property rights?",
            "expected_domain": "legal",
            "description": "Legal concepts query",
        },
        {
            "query": "How does renewable energy help environment?",
            "expected_domain": "general",
            "description": "General science query",
        },
    ]

    print("Retrieval Comparison Results:")
    print("=" * 80)

    for i, test_case in enumerate(test_cases, 1):
        query = test_case["query"]
        expected = test_case["expected_domain"]
        desc = test_case["description"]

        print(f"\n{i}. {desc}")
        print(f"Query: {query}")
        print(f"Expected domain: {expected}")
        print("-" * 40)

        # Multi-domain search
        start = time.time()
        multi_results = manager.search_multi_domain(query, k=3)
        multi_time = (time.time() - start) * 1000

        print(f"Multi-domain results ({multi_time:.1f}ms):")
        for j, (text, meta, score) in enumerate(multi_results, 1):
            domain = meta.get("domain", "unknown")
            print(f"  {j}. [{domain}] (score: {score:.3f})")
            print(f"     {text[:80]}...")

        # Single expected domain search
        if expected in manager.indices:
            start = time.time()
            single_results = manager.search_single_domain(query, expected, k=3)
            single_time = (time.time() - start) * 1000

            print(f"\nSingle-domain '{expected}' results ({single_time:.1f}ms):")
            for j, (text, meta, score) in enumerate(single_results, 1):
                print(f"  {j}. (score: {score:.3f})")
                print(f"     {text[:80]}...")

        print("\n" + "=" * 40)


# Run the comparison; the function reads the module-level `manager` created in Cell 5.
test_retrieval_comparison()

In [None]:
# Cell 7: Smoke Test - End-to-End Multi-Domain RAG
def smoke_test_multidomain_rag():
    """Run four sample queries through the module-level ``manager``.

    A query counts as passed when ``search_multi_domain`` returns at least one
    result; an empty result list or any exception counts as a failure and is
    reported on stdout.

    Returns:
        bool: True only when every sample query passed.
    """
    print("🔥 Multi-Domain RAG Smoke Test")
    print("=" * 50)

    test_queries = [
        "Explain transformer architecture",
        "Best practices for classroom assessment",
        "What is due process in law?",
        "Impact of climate change",
    ]
    total = len(test_queries)

    success_count = 0
    for number in range(1, total + 1):
        query = test_queries[number - 1]
        try:
            print(f"\n{number}. Testing: {query}")

            # Multi-domain retrieval
            results = manager.search_multi_domain(query, k=2)
            if not results:
                print("   ✗ No results found")
                continue

            print(f"   ✓ Found {len(results)} results")
            best_result = results[0]
            domain = best_result[1].get("domain", "unknown")
            score = best_result[2]
            print(f"   ✓ Best match from '{domain}' domain (score: {score:.3f})")
            print(f"   ✓ Content: {best_result[0][:60]}...")
            success_count += 1

        except Exception as e:
            # Keep going so one failing query does not hide the others.
            print(f"   ✗ Error: {e}")

    print(f"\n🎯 Smoke Test Results: {success_count}/{total} passed")

    all_passed = success_count == total
    if all_passed:
        print("✅ All tests passed! Multi-domain RAG system is working correctly.")
    else:
        print("⚠️  Some tests failed. Check the implementation.")

    return all_passed


# Run smoke test (reads the module-level `manager`).
smoke_test_result = smoke_test_multidomain_rag()

# Performance summary, read from the live manager state.
print("\n📊 Multi-Domain Index Summary:")
print(f"Total domains: {len(manager.indices)}")
print(f"Total vectors: {sum(idx.ntotal for idx in manager.indices.values())}")
print(f"Router strategy: {manager.config['routing_strategy']}")
print(f"Semantic weight: {manager.config['semantic_weight']}")
print(f"Max domains per query: {manager.config['max_domains']}")

# What we built
print("\n🔧 What We Built:")
print("✓ Multi-domain vector indices (tech/edu/legal/general)")
print("✓ Smart query router (semantic + rule-based + hybrid)")
print("✓ Index manager with fallback strategies")
print("✓ Performance comparison tools")
print("✓ End-to-end multi-domain RAG pipeline")

# The splitter in Cell 3 is configured with chunk_size=400 and no custom
# length function, so the limit is measured in characters, not tokens.
print("\n📝 Key Parameters (Low-VRAM options):")
print("✓ Model: BAAI/bge-m3 (supports both Chinese/English)")
print("✓ Chunk size: 400 characters (domain-optimized)")
print("✓ Batch size: 16 (memory efficient)")
print("✓ Normalize embeddings: True (for IP similarity)")
print("✓ Index type: FlatIP (simple, no training needed)")

print("\n⚠️  Pitfalls to Avoid:")
print("• Domain routing accuracy depends on good descriptions")
print("• Need sufficient domain-specific training data")
print("• Balance semantic vs rule-based weights carefully")
print("• Monitor cross-domain contamination in results")
print("• Consider index maintenance for incremental updates")

print("\n🎯 When to Use Multi-Domain Indices:")
print("• Large knowledge bases with distinct topics")
print("• Need domain-specific relevance optimization")
print("• Want to control/audit which domains are searched")
print("• Have computational constraints (search subset)")
print("• Building specialized expert systems")