In [None]:
# nb13_faiss_index_build.ipynb
# Stage 2: 中文 RAG 基礎 - FAISS 索引建立與管理

# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook)
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Each cache-related environment variable is pointed at a folder below the
# shared root; the folder is created right after the variable is exported.
cache_env = dict(
    HF_HOME=f"{AI_CACHE_ROOT}/hf",
    TRANSFORMERS_CACHE=f"{AI_CACHE_ROOT}/hf/transformers",
    HF_DATASETS_CACHE=f"{AI_CACHE_ROOT}/hf/datasets",
    HUGGINGFACE_HUB_CACHE=f"{AI_CACHE_ROOT}/hf/hub",
    TORCH_HOME=f"{AI_CACHE_ROOT}/torch",
)
for k, v in cache_env.items():
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)

gpu_available = torch.cuda.is_available()
print("[Cache]", AI_CACHE_ROOT, "| GPU:", gpu_available)

In [None]:
# === Cell 2: Prerequisites & Imports ===
import numpy as np
import faiss
import json
import pickle
from pathlib import Path
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Tuple, Optional
import time

# Report which FAISS build is importable and whether it exposes the GPU API
print("FAISS version: {}".format(faiss.__version__))
has_gpu_api = hasattr(faiss, "StandardGpuResources")
print("FAISS has GPU support: {}".format(has_gpu_api))

# Make sure the output folders written to by later cells exist
for out_dir in ("indices", "outs"):
    Path(out_dir).mkdir(exist_ok=True)

In [None]:
# === Cell 3: Load Demo Data (If Previous Notebooks Not Available) ===
def create_demo_chunks_and_embeddings():
    """Build a small demo corpus for use when nb11/nb12 outputs are missing.

    Despite the name, this function returns chunks only; no embedding is
    computed here.

    Returns:
        list[dict]: one dict per demo sentence with keys ``id`` (position in
        the list), ``text`` and ``meta`` (``source_id``, ``section``, ``page``
        derived arithmetically from the position).
    """
    sample_sentences = (
        "人工智能（AI）是計算機科學的一個分支，旨在創建能夠執行通常需要人類智能的任務的機器。",
        "機器學習是人工智能的一個子集，它使用統計技術使計算機能夠從數據中學習。",
        "深度學習是機器學習的一個分支，它使用多層神經網絡來建模和理解複雜的模式。",
        "自然語言處理（NLP）是人工智能的一個領域，專注於計算機與人類語言之間的交互。",
        "計算機視覺是一個跨學科科學領域，涉及如何使計算機從數字圖像或視頻中獲得高級理解。",
        "檢索增強生成（RAG）是一種結合信息檢索和文本生成的技術。",
        "向量數據庫用於存儲和檢索高維向量，常用於相似性搜索。",
        "FAISS是Facebook開發的高效相似性搜索和聚類庫。",
    )

    # Three consecutive sentences share a source document; four share a page.
    return [
        {
            "id": position,
            "text": sentence,
            "meta": {
                "source_id": f"demo_doc_{position // 3}",
                "section": f"section_{position % 3}",
                "page": position // 4 + 1,
            },
        }
        for position, sentence in enumerate(sample_sentences)
    ]


# Prefer the chunk file written by earlier notebooks; if it does not exist,
# fall back to the built-in demo corpus.
try:
    chunks_data = json.loads(
        Path("outs/chunks_with_embeddings.json").read_text(encoding="utf-8")
    )
except FileNotFoundError:
    print("⚠ Previous notebook outputs not found, creating demo data...")
    chunks_data = create_demo_chunks_and_embeddings()
else:
    print(f"✓ Loaded {len(chunks_data)} chunks from previous notebooks")

print("Working with {} text chunks".format(len(chunks_data)))

In [None]:
# === Cell 4: Load Embedding Model ===
print("Loading bge-m3 embedding model...")
embedding_model = SentenceTransformer("BAAI/bge-m3")
embedding_model.max_seq_length = 512  # Optimize for memory

# Fail early with a clear message instead of an IndexError further down.
if not chunks_data:
    raise ValueError("chunks_data is empty; there is nothing to embed or index")

# Look at every chunk, not just the first one: a partially embedded file would
# otherwise slip through here and break the matrix construction in Cell 5.
chunks_missing_embedding = [
    chunk for chunk in chunks_data if "embedding" not in chunk
]

if chunks_missing_embedding:
    print("Generating embeddings for chunks...")
    texts = [chunk["text"] for chunk in chunks_missing_embedding]

    # encode() batches internally; a small batch keeps peak memory low and a
    # single call yields a single progress bar.
    batch_size = 8
    embeddings = embedding_model.encode(
        texts,
        batch_size=batch_size,
        normalize_embeddings=True,
        show_progress_bar=True,
    )

    # Attach each vector to its chunk as a plain list so it is JSON-serialisable.
    for chunk, vector in zip(chunks_missing_embedding, embeddings):
        chunk["embedding"] = vector.tolist()

    # Save updated chunks so later runs can skip the embedding step
    with open("outs/chunks_with_embeddings.json", "w", encoding="utf-8") as f:
        json.dump(chunks_data, f, ensure_ascii=False, indent=2)

print(f"✓ Embeddings ready, dimension: {len(chunks_data[0]['embedding'])}")

In [None]:
# === Cell 5: FAISS Index Creation ===
class FAISSIndexManager:
    """Manages FAISS index creation, saving, loading, and searching.

    The manager owns a flat FAISS index plus a dict that maps each vector's
    row position inside the index to the caller-supplied chunk id.
    """

    def __init__(self, dimension: int, metric: str = "ip"):
        """Store the configuration; no index exists until create/load is called.

        Args:
            dimension: Length of every vector handled by this manager.
            metric: ``"ip"`` (inner product) or ``"l2"``; case-insensitive.
                It is validated when ``create_index`` runs.
        """
        self.dimension = dimension
        self.metric = metric.lower()
        # None until create_index() or load_index() succeeds.
        self.index = None
        # Row position in the FAISS index -> chunk id.
        self.id_to_chunk = {}

    @staticmethod
    def _as_float32_matrix(vectors) -> np.ndarray:
        """Return ``vectors`` as a C-contiguous float32 array with at least 2 dims."""
        return np.ascontiguousarray(np.atleast_2d(vectors), dtype=np.float32)

    def _check_vectors(self, matrix: np.ndarray, what: str) -> None:
        """Raise ValueError unless ``matrix`` is 2-D with ``self.dimension`` columns."""
        if matrix.ndim != 2 or matrix.shape[1] != self.dimension:
            raise ValueError(
                f"{what} must have shape (n, {self.dimension}), got {matrix.shape}"
            )

    def _check_batch(self, matrix: np.ndarray, chunk_ids: List[int]) -> None:
        """Raise ValueError unless there is exactly one chunk id per vector row."""
        self._check_vectors(matrix, "embeddings")
        if matrix.shape[0] != len(chunk_ids):
            raise ValueError(
                f"Got {matrix.shape[0]} vectors but {len(chunk_ids)} chunk ids"
            )

    def create_index(self, embeddings: np.ndarray, chunk_ids: List[int]):
        """Create a fresh FAISS index from embeddings, replacing any previous one.

        Args:
            embeddings: Array of shape ``(n, dimension)``; converted to float32.
            chunk_ids: One id per row of ``embeddings``, in the same order.

        Raises:
            ValueError: Unsupported metric, wrong vector shape, or a
                vector/id count mismatch.
        """
        if self.metric not in ("ip", "l2"):
            raise ValueError(f"Unsupported metric: {self.metric}")

        matrix = self._as_float32_matrix(embeddings)
        # Validate before touching any state so a bad call leaves the manager intact.
        self._check_batch(matrix, chunk_ids)

        if self.metric == "ip":
            # Inner Product (for normalized embeddings)
            index = faiss.IndexFlatIP(self.dimension)
        else:
            # L2 distance
            index = faiss.IndexFlatL2(self.dimension)
        index.add(matrix)

        self.index = index
        # Rebuild the mapping from scratch so no entry of an earlier index survives.
        self.id_to_chunk = dict(enumerate(chunk_ids))
        print(f"✓ Created FAISS index with {self.index.ntotal} vectors")

    def save_index(self, index_path: str, mapping_path: str):
        """Save index and ID mapping to disk.

        Raises:
            ValueError: No index has been created or loaded yet.
        """
        if self.index is None:
            raise ValueError("Index not created or loaded")

        faiss.write_index(self.index, index_path)
        with open(mapping_path, "wb") as f:
            pickle.dump(self.id_to_chunk, f)
        print(f"✓ Saved index to {index_path} and mapping to {mapping_path}")

    def load_index(self, index_path: str, mapping_path: str):
        """Load index and ID mapping from disk and adopt the index's settings.

        ``self.dimension`` (and ``self.metric`` when recognisable) are taken
        from the loaded index so they cannot disagree with it.
        """
        index = faiss.read_index(index_path)
        # SECURITY: pickle.load can execute arbitrary code. Only load mapping
        # files that this notebook (or another trusted process) wrote.
        with open(mapping_path, "rb") as f:
            mapping = pickle.load(f)

        self.index = index
        self.id_to_chunk = mapping
        self.dimension = index.d
        self.metric = {
            faiss.METRIC_INNER_PRODUCT: "ip",
            faiss.METRIC_L2: "l2",
        }.get(index.metric_type, self.metric)

        if len(mapping) != index.ntotal:
            print(
                f"⚠ ID mapping has {len(mapping)} entries but index has "
                f"{index.ntotal} vectors"
            )
        print(f"✓ Loaded index with {self.index.ntotal} vectors")

    def search(
        self, query_embeddings: np.ndarray, k: int = 5
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Search for similar vectors.

        Args:
            query_embeddings: Shape ``(n_queries, dimension)``; a single 1-D
                vector is treated as one query.
            k: Number of neighbours to return per query (must be >= 1).

        Returns:
            ``(distances, indices)`` exactly as returned by the FAISS index.

        Raises:
            ValueError: No index available, ``k`` < 1, or wrong query shape.
        """
        if self.index is None:
            raise ValueError("Index not created or loaded")
        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")

        queries = self._as_float32_matrix(query_embeddings)
        self._check_vectors(queries, "query_embeddings")

        distances, indices = self.index.search(queries, k)
        return distances, indices

    def add_vectors(self, new_embeddings: np.ndarray, new_chunk_ids: List[int]):
        """Add new vectors to existing index (incremental update).

        The change is in-memory only; call ``save_index`` to persist it.

        Raises:
            ValueError: No index available, wrong vector shape, or a
                vector/id count mismatch.
        """
        if self.index is None:
            raise ValueError("Index not created or loaded")

        matrix = self._as_float32_matrix(new_embeddings)
        # Check first: adding vectors without matching ids would shift every
        # later position-to-id lookup.
        self._check_batch(matrix, new_chunk_ids)

        start_idx = self.index.ntotal
        self.index.add(matrix)

        # New rows are appended, so their positions continue from the old total.
        for offset, chunk_id in enumerate(new_chunk_ids):
            self.id_to_chunk[start_idx + offset] = chunk_id

        print(f"✓ Added {len(new_chunk_ids)} vectors, total: {self.index.ntotal}")


# Stack the stored per-chunk vectors into one matrix and collect the chunk ids
# in the same order, so row i of the matrix belongs to chunk_ids[i].
embeddings_matrix = np.array([item["embedding"] for item in chunks_data])
chunk_ids = [item["id"] for item in chunks_data]

print("Embeddings matrix shape: {}".format(embeddings_matrix.shape))
embedding_dim = embeddings_matrix.shape[1]
print("Embedding dimension: {}".format(embedding_dim))

# Build the inner-product index that the rest of the notebook persists
index_manager = FAISSIndexManager(dimension=embedding_dim, metric="ip")
index_manager.create_index(embeddings_matrix, chunk_ids)

In [None]:
# === Cell 6: Save Index and Chunks Metadata ===
# Locations of the three persisted artefacts (also used by later cells)
index_path = "indices/general.faiss"
mapping_path = "indices/id_mapping.pkl"
chunks_path = "indices/chunks.jsonl"

index_manager.save_index(index_path, mapping_path)

# One JSON object per line; the "embedding" key is left out because the
# vectors already live in the FAISS index file.
with open(chunks_path, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(
            {key: value for key, value in chunk.items() if key != "embedding"},
            ensure_ascii=False,
        )
        + "\n"
        for chunk in chunks_data
    )

print("✓ Saved {} chunks metadata to {}".format(len(chunks_data), chunks_path))

In [None]:
# === Cell 7: Load and Validate Index ===
def load_chunks_from_jsonl(jsonl_path: str) -> Dict[int, Dict]:
    """Load chunks from a JSONL file into a dict keyed by chunk id.

    Args:
        jsonl_path: Path to a UTF-8 file holding one JSON object per line;
            every object must contain an ``"id"`` key.

    Returns:
        Mapping of ``chunk["id"]`` to the full chunk dict. If an id occurs
        more than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: A non-blank line is not valid JSON.
        KeyError: A record has no ``"id"`` key.
    """
    chunks = {}
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            record = line.strip()
            # Blank or whitespace-only lines (e.g. a trailing newline added by
            # an editor) carry no record and must not abort the load.
            if not record:
                continue
            chunk = json.loads(record)
            chunks[chunk["id"]] = chunk
    return chunks


# Round-trip check: a second manager reads back what Cell 6 just wrote
test_index_manager = FAISSIndexManager(dimension=embeddings_matrix.shape[1])
test_index_manager.load_index(index_path, mapping_path)

# Chunk metadata keyed by chunk id, used to turn search hits into text
chunks_lookup = load_chunks_from_jsonl(chunks_path)
print("✓ Loaded index with {} vectors".format(test_index_manager.index.ntotal))
print("✓ Loaded {} chunks metadata".format(len(chunks_lookup)))

In [None]:
# === Cell 8: Search Functionality ===
def search_similar_chunks(query: str, k: int = 3) -> List[Dict]:
    """Return up to ``k`` chunks most similar to a text query.

    The query is embedded with the notebook-level ``embedding_model`` and
    searched in ``test_index_manager``; hits are resolved to chunk records
    through ``chunks_lookup``.

    Returns:
        A list of dicts with keys ``rank`` (1-based), ``score``, ``chunk_id``,
        ``text`` and ``meta``, in the order returned by the index.
    """
    query_vector = embedding_model.encode([query], normalize_embeddings=True)
    scores, positions = test_index_manager.search(query_vector, k)

    hits = []
    # Only one query was submitted, so row 0 holds all of its results.
    for rank, (score, position) in enumerate(zip(scores[0], positions[0]), start=1):
        if position == -1:
            # -1 is treated as "no more results"; stop at the first one.
            break

        matched_id = test_index_manager.id_to_chunk[position]
        record = chunks_lookup[matched_id]
        hits.append(
            dict(
                rank=rank,
                score=float(score),
                chunk_id=matched_id,
                text=record["text"],
                meta=record["meta"],
            )
        )

    return hits


# Try one query against the reloaded index and show the top hits
test_query = "什麼是機器學習？"
results = search_similar_chunks(test_query, k=3)

print("Query: {}".format(test_query))
print(50 * "-")
for result in results:
    print("Rank {} (Score: {:.4f})".format(result["rank"], result["score"]))
    # Only the first 100 characters of the chunk text are shown
    print("Text: {}...".format(result["text"][:100]))
    print("Source: {}".format(result["meta"]["source_id"]))
    print()

In [None]:
# === Cell 9: Incremental Update Demo ===
def add_new_chunks_to_index(new_texts: List[str], start_id: Optional[int] = None):
    """Embed new texts and add them to the index, the lookup and the files on disk.

    Uses the notebook-level ``embedding_model``, ``test_index_manager``,
    ``chunks_lookup`` and the ``index_path`` / ``mapping_path`` /
    ``chunks_path`` locations.

    Args:
        new_texts: Texts to add; an empty list is a no-op.
        start_id: Id for the first new chunk. Defaults to one past the largest
            id in ``chunks_lookup`` (0 when the lookup is empty).

    Returns:
        The list of newly created chunk dicts (empty if nothing was added).

    Raises:
        ValueError: One of the new ids already exists in ``chunks_lookup``.
    """
    # Nothing to do: avoid encoding an empty batch and touching any file.
    if not new_texts:
        return []

    if start_id is None:
        start_id = max(chunks_lookup.keys(), default=-1) + 1

    # Create new chunks
    new_chunks = [
        {
            "id": start_id + offset,
            "text": text,
            "meta": {"source_id": f"new_doc_{offset}", "section": "intro", "page": 1},
        }
        for offset, text in enumerate(new_texts)
    ]
    new_chunk_ids = [chunk["id"] for chunk in new_chunks]

    # Reusing an id would overwrite the lookup entry while the old vector stays
    # in the index, so refuse before anything is modified.
    clashing = [chunk_id for chunk_id in new_chunk_ids if chunk_id in chunks_lookup]
    if clashing:
        raise ValueError(f"Chunk ids already in use: {clashing}")

    # Generate embeddings for new chunks
    new_embeddings = embedding_model.encode(new_texts, normalize_embeddings=True)

    # Add to index
    test_index_manager.add_vectors(new_embeddings, new_chunk_ids)

    # Update chunks lookup
    for chunk in new_chunks:
        chunks_lookup[chunk["id"]] = chunk

    # Append to JSONL file
    with open(chunks_path, "a", encoding="utf-8") as f:
        for chunk in new_chunks:
            f.write(json.dumps(chunk, ensure_ascii=False) + "\n")

    # Re-save index and mapping so the files on disk stay in step with the
    # JSONL file that was just extended.
    test_index_manager.save_index(index_path, mapping_path)

    return new_chunks


# Two extra sentences to append to the already-built index
new_texts = [
    "大語言模型（LLM）是使用深度學習技術訓練的大規模神經網絡模型。",
    "RAG 系統結合了檢索和生成，能夠提供更準確的答案。",
]

new_chunks = add_new_chunks_to_index(new_texts)
print("✓ Added {} new chunks to index".format(len(new_chunks)))

# Query for a topic covered only by the new sentences
results = search_similar_chunks("大語言模型是什麼？", k=3)
print("\nSearch results after incremental update:")
for result in results:
    preview = result["text"][:80]
    print(
        "Rank {}: {}... (Score: {:.4f})".format(
            result["rank"], preview, result["score"]
        )
    )

In [None]:
# === Cell 10: Performance Analysis ===
def analyze_index_performance():
    """Report index file size, process memory and query latency.

    Latency is measured around ``search_similar_chunks``, so it covers both
    embedding the query and the FAISS lookup.

    Returns:
        dict with ``index_size_mb``, ``memory_mb``, ``avg_latency_ms`` and
        ``p95_latency_ms``.
    """
    # psutil is used only by this function, so it is imported here.
    import psutil
    import os

    # Index file size
    index_size_mb = os.path.getsize(index_path) / (1024 * 1024)

    # Resident memory of the current process
    process = psutil.Process()
    memory_mb = process.memory_info().rss / (1024 * 1024)

    print("=== Index Performance Analysis ===")
    print(f"Index file size: {index_size_mb:.2f} MB")
    print(f"Current memory usage: {memory_mb:.2f} MB")
    print(f"Vectors in index: {test_index_manager.index.ntotal}")
    print(f"Vector dimension: {test_index_manager.dimension}")

    # Search latency test
    test_queries = [
        "人工智能的應用",
        "機器學習算法",
        "深度學習網絡",
        "自然語言處理",
        "向量數據庫",
    ]

    latencies = []
    for query in test_queries:
        # perf_counter is monotonic and high-resolution; time.time() can be too
        # coarse for millisecond intervals and may jump with clock adjustments.
        started = time.perf_counter()
        search_similar_chunks(query, k=5)
        latencies.append((time.perf_counter() - started) * 1000)  # ms

    avg_latency = float(np.mean(latencies))
    p95_latency = float(np.percentile(latencies, 95))
    # Guard the division in case every measurement rounds to zero.
    queries_per_second = 1000 / avg_latency if avg_latency > 0 else float("inf")

    print("\nSearch Performance:")
    print(f"Average latency: {avg_latency:.2f} ms")
    print(f"P95 latency: {p95_latency:.2f} ms")
    print(f"Queries per second: {queries_per_second:.1f}")

    return {
        "index_size_mb": index_size_mb,
        "memory_mb": memory_mb,
        "avg_latency_ms": avg_latency,
        "p95_latency_ms": p95_latency,
    }


# Run the analysis once and keep the returned metrics dict; the summary cell
# reads "index_size_mb" and "avg_latency_ms" from it.
perf_metrics = analyze_index_performance()

In [None]:
# === Cell 11: Advanced Index Configuration ===
def create_optimized_index(embeddings: np.ndarray, index_type: str = "flat"):
    """Create different types of FAISS indexes for comparison.

    All variants use the inner-product metric so their scores are comparable
    with each other.

    Args:
        embeddings: Array of shape ``(n, dimension)``; converted to float32.
        index_type: ``"flat"`` (exact), ``"ivf"`` (clustered, approximate) or
            ``"hnsw"`` (graph-based, approximate).

    Returns:
        The populated FAISS index.

    Raises:
        ValueError: Unknown ``index_type``, or ``"ivf"`` requested for an
            empty embedding matrix (it cannot be trained).
    """
    # Convert once and reuse for both training and adding.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    n_vectors, dimension = vectors.shape

    if index_type == "flat":
        # Standard flat index (exact search)
        index = faiss.IndexFlatIP(dimension)
    elif index_type == "ivf":
        # IVF index for faster approximate search
        if n_vectors == 0:
            raise ValueError("Cannot train an IVF index on an empty embedding matrix")
        # At least one cluster: n // 10 is 0 for fewer than ten vectors.
        nlist = max(1, min(100, n_vectors // 10))
        quantizer = faiss.IndexFlatIP(dimension)
        # The metric must be passed explicitly; the constructor default is L2,
        # which would not match the inner-product quantizer.
        index = faiss.IndexIVFFlat(
            quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT
        )

        # Train the index
        index.train(vectors)
        index.nprobe = min(10, nlist)  # Search at most 10 clusters
    elif index_type == "hnsw":
        # HNSW index for very fast approximate search
        M = 16  # Number of connections
        # Same reason as above: request inner product instead of the L2 default.
        index = faiss.IndexHNSWFlat(dimension, M, faiss.METRIC_INNER_PRODUCT)
        index.hnsw.efConstruction = 200
        index.hnsw.efSearch = 100
    else:
        raise ValueError(f"Unsupported index type: {index_type}")

    index.add(vectors)
    return index


# Compare build time and single-query search time of different index types
print("=== Index Type Comparison ===")
for idx_type in ["flat", "ivf"]:  # Skip HNSW for small datasets
    try:
        # perf_counter is monotonic and high-resolution, which matters for the
        # sub-millisecond search measured below.
        start_time = time.perf_counter()
        test_index = create_optimized_index(embeddings_matrix, idx_type)
        creation_time = time.perf_counter() - start_time

        # Test search speed (query embedding is excluded from the timing)
        query_emb = embedding_model.encode(["測試查詢"], normalize_embeddings=True)
        start_time = time.perf_counter()
        # The result is discarded on purpose; it is not bound to `_` because
        # assigning to `_` in IPython stops it from tracking the last output.
        test_index.search(query_emb.astype(np.float32), 5)
        search_time = (time.perf_counter() - start_time) * 1000

        print(
            f"{idx_type.upper()}: Creation={creation_time:.3f}s, Search={search_time:.2f}ms"
        )
    except Exception as e:
        # Best-effort comparison: report the failure and continue with the next type.
        print(f"{idx_type.upper()}: Failed - {e}")

In [None]:
# === Cell 12: Smoke Test ===
def smoke_test():
    """Comprehensive smoke test for FAISS index functionality.

    Tests 1-4 exercise ``FAISSIndexManager`` on random 1024-dimensional
    vectors (create, save/load, search, incremental add). Tests 5-6 exercise
    the notebook's real index through ``search_similar_chunks`` and
    ``index_path``. Every test prints a PASSED or FAILED line and a failure
    never aborts the run. The scratch index files written by test 2 are
    removed before returning.

    Returns:
        bool: True only when every test passed.
    """
    print("=== FAISS Index Smoke Test ===")

    dim = 1024
    # Seeded generator so repeated runs use the same dummy vectors.
    rng = np.random.default_rng(0)
    test_path = "outs/test_index.faiss"
    test_mapping = "outs/test_mapping.pkl"
    # Objects handed from one test to the next ("built" / "loaded" managers).
    shared = {}

    def _need(key):
        """Return a shared object, or explain that an earlier test did not provide it."""
        if key not in shared:
            raise RuntimeError(
                f"prerequisite '{key}' manager missing because an earlier test failed"
            )
        return shared[key]

    def _index_creation():
        """Build an index from 10 random vectors."""
        mgr = FAISSIndexManager(dimension=dim)
        mgr.create_index(rng.random((10, dim), dtype=np.float32), list(range(10)))
        assert mgr.index.ntotal == 10, "expected 10 vectors after creation"
        shared["built"] = mgr

    def _save_load():
        """Write the built index to disk and read it back."""
        _need("built").save_index(test_path, test_mapping)
        mgr = FAISSIndexManager(dimension=dim)
        mgr.load_index(test_path, test_mapping)
        assert mgr.index.ntotal == 10, "expected 10 vectors after reload"
        shared["loaded"] = mgr

    def _search():
        """Query the reloaded index for three neighbours."""
        query = rng.random((1, dim), dtype=np.float32)
        distances, indices = _need("loaded").search(query, k=3)
        assert len(distances[0]) == 3, "expected 3 distances"
        assert len(indices[0]) == 3, "expected 3 indices"

    def _incremental_update():
        """Append three vectors to the reloaded index."""
        mgr = _need("loaded")
        mgr.add_vectors(rng.random((3, dim), dtype=np.float32), [10, 11, 12])
        assert mgr.index.ntotal == 13, "expected 13 vectors after the update"

    def _real_embedding_search():
        """Run a text query through the notebook's real search path."""
        results = search_similar_chunks("測試查詢", k=2)
        assert len(results) >= 1, "expected at least one result"
        assert "text" in results[0], "result lacks 'text'"
        assert "score" in results[0], "result lacks 'score'"

    def _index_file_validation():
        """Check that the persisted index file exists and is non-empty."""
        assert os.path.exists(index_path), "index file does not exist"
        assert faiss.read_index(index_path).ntotal > 0, "index file holds no vectors"

    checks = [
        ("Index creation", _index_creation),
        ("Save/Load", _save_load),
        ("Search", _search),
        ("Incremental update", _incremental_update),
        ("Real embedding search", _real_embedding_search),
        ("Index file validation", _index_file_validation),
    ]
    total_tests = len(checks)
    tests_passed = 0

    try:
        for number, (label, check) in enumerate(checks, start=1):
            try:
                check()
            except Exception as e:
                print(f"✗ Test {number}: {label} - FAILED ({e})")
            else:
                print(f"✓ Test {number}: {label} - PASSED")
                tests_passed += 1
    finally:
        # Do not leave scratch artefacts next to the notebook's real outputs.
        for scratch_file in (test_path, test_mapping):
            if os.path.exists(scratch_file):
                os.remove(scratch_file)

    print(f"\n=== Smoke Test Results: {tests_passed}/{total_tests} PASSED ===")

    if tests_passed == total_tests:
        print("🎉 All tests passed! FAISS index is working correctly.")
    else:
        print("⚠ Some tests failed. Check the implementation.")

    return tests_passed == total_tests


# Run smoke test and keep its boolean result (True only when every test
# passed); the summary cell uses it to decide which closing message to print.
smoke_test_passed = smoke_test()

In [None]:
# === Cell 13: Summary and Next Steps ===
# Static sections are printed as one block each; lines that read notebook
# state are printed individually right after their section header.
print("=== nb13 Summary: FAISS Index Build ===")
print(
    "\n✅ Completed:",
    "- FAISS IndexFlatIP creation with normalized embeddings",
    "- Index persistence (save/load) with ID mapping",
    "- Incremental vector addition capability",
    "- Search functionality with similarity scoring",
    "- Performance analysis and optimization options",
    "- Comprehensive smoke testing",
    sep="\n",
)

print("\n📊 Key Metrics:")
print("- Index size: {:.2f} MB".format(perf_metrics["index_size_mb"]))
print("- Average search latency: {:.2f} ms".format(perf_metrics["avg_latency_ms"]))
print("- Total vectors: {}".format(test_index_manager.index.ntotal))

print(
    "\n🔧 Key Parameters (Low-VRAM friendly):",
    "- batch_size=8 for embedding generation",
    "- normalize_embeddings=True for IP metric",
    "- max_seq_length=512 for memory efficiency",
    sep="\n",
)

print(
    "\n⚠ Pitfalls:",
    "- Always normalize embeddings when using Inner Product",
    "- Index ID mapping is separate from chunk metadata",
    "- Incremental updates require careful ID management",
    "- Large indexes may need IVF/HNSW for speed",
    sep="\n",
)

print(
    "\n🚀 Next Steps:",
    "- nb14: Query processing and citation formatting",
    "- nb15: BGE reranker integration",
    "- nb16: Context optimization and token budgeting",
    sep="\n",
)

print("\n📁 Generated Files:")
print("- {} (FAISS index)".format(index_path))
print("- {} (ID mapping)".format(mapping_path))
print("- {} (chunks metadata)".format(chunks_path))
print("- outs/chunks_with_embeddings.json (full chunks)")

# Final validation: the closing message depends on the smoke-test outcome
closing_message = (
    "\n🎯 Ready for next notebook: nb14_query_and_citations.ipynb"
    if smoke_test_passed
    else "\n❌ Please fix failing tests before proceeding"
)
print(closing_message)

Smoke Test

In [None]:
# Quick end-to-end check: one real query must return a positively scored hit
# and the index must hold at least as many vectors as there are chunks.
query_test = "人工智能技術"
results = search_similar_chunks(query_test, k=2)

assert len(results) >= 1, "Search should return results"
best_score = results[0]["score"]
assert best_score > 0, "Similarity score should be positive"
indexed_vectors = test_index_manager.index.ntotal
assert indexed_vectors >= len(chunks_data), "Index should contain all chunks"

print("✅ Smoke test passed! Found {} results for '{}'".format(len(results), query_test))
print("📊 Index contains {} vectors".format(test_index_manager.index.ntotal))