In [None]:
# nb17_incremental_update.ipynb
# Goals: Incremental document embedding and index updates

# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# The cache root is taken from the AI_CACHE_ROOT environment variable and
# falls back to a relative default when that variable is not set.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Each (environment variable, sub-path) pair is exported and its directory is
# created, in the order listed.
for env_name, sub_path in (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
):
    cache_dir = f"{AI_CACHE_ROOT}/{sub_path}"
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# Cell 2: Import Dependencies and Reuse RAG Modules
import json
import time
import shutil
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Tuple
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Make sure the working folders used by this notebook exist
# (indices/ for FAISS files, outs/ for result files, data/).
for folder in ("indices", "outs", "data"):
    Path(folder).mkdir(exist_ok=True)

print("✓ Dependencies loaded successfully")

In [None]:
# Cell 3: Build Initial Index (Simulate Existing Knowledge Base)
def create_initial_index():
    """Build the baseline FAISS index from a handful of sample documents.

    Simulates an existing knowledge base: the sample texts are wrapped in
    ``Document`` objects, split into chunks, embedded with
    ``BAAI/bge-small-zh-v1.5`` and added to an inner-product flat index.
    The index and a JSONL file of chunk records are written to ``indices/``
    under a timestamped name, and the ``indices/current.faiss`` and
    ``indices/current_chunks.jsonl`` symlinks are pointed at them.

    Returns:
        Tuple ``(embedding_model, index, chunks_data, timestamp)`` where
        ``chunks_data`` is a list of dicts with ``id``, ``text`` and
        ``metadata`` keys and ``timestamp`` is the ``%Y%m%d_%H%M%S`` string
        used in the file names.
    """

    # Sample Chinese documents (simulate existing knowledge base)
    initial_docs = [
        "人工智慧是電腦科學的一個分支，致力於創建能夠執行通常需要人類智慧的任務的系統。",
        "機器學習是人工智慧的一個子領域，專注於開發能夠從資料中學習的演算法。",
        "深度學習使用人工神經網路來模擬人腦的運作方式，在影像識別和自然語言處理方面表現出色。",
        "自然語言處理（NLP）是人工智慧的一個分支，專門處理電腦與人類語言之間的互動。",
        "RAG（檢索增強生成）結合了資訊檢索和語言生成，提供更準確的問答系統。",
    ]

    # Text splitter for Chinese: prefer sentence-ending punctuation as
    # split points before falling back to newlines and spaces.
    splitter = RecursiveCharacterTextSplitter(
        separators=["。", "！", "？", "；", "…", "\n\n", "\n", " "],
        chunk_size=200,
        chunk_overlap=20,
    )

    # Create documents with metadata
    documents = [
        Document(
            page_content=text,
            metadata={
                "source_id": f"initial_doc_{i:03d}",
                "domain": "ai_basics",
                "created_at": "2024-01-01T00:00:00",
                "version": 1,
            },
        )
        for i, text in enumerate(initial_docs)
    ]

    # Initialize embedding model (low VRAM)
    embedding_model = SentenceTransformer("BAAI/bge-small-zh-v1.5")

    # Split documents (chunks come back in document order)
    all_chunks = splitter.split_documents(documents)

    # Generate embeddings; they are L2-normalised so the inner product used
    # by the index below equals cosine similarity.
    texts = [chunk.page_content for chunk in all_chunks]
    vectors = embedding_model.encode(
        texts, normalize_embeddings=True, batch_size=16
    ).astype(np.float32)

    # Create FAISS index
    dimension = vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(vectors)

    # Chunk records: the ``id`` is the chunk's position, which is also the
    # position of its vector in the index.
    chunks_data = [
        {"id": i, "text": chunk.page_content, "metadata": chunk.metadata}
        for i, chunk in enumerate(all_chunks)
    ]

    # Save to files
    Path("indices").mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    faiss.write_index(index, f"indices/knowledge_base_{timestamp}.faiss")

    with open(f"indices/chunks_{timestamp}.jsonl", "w", encoding="utf-8") as f:
        for chunk_data in chunks_data:
            f.write(json.dumps(chunk_data, ensure_ascii=False) + "\n")

    # Repoint the "current" symlinks at the files just written.
    for link_name, target_name in (
        ("indices/current.faiss", f"knowledge_base_{timestamp}.faiss"),
        ("indices/current_chunks.jsonl", f"chunks_{timestamp}.jsonl"),
    ):
        link = Path(link_name)
        # exists() follows symlinks and is False for a dangling link, which
        # would make symlink_to() fail with FileExistsError; test the link
        # itself as well.
        if link.is_symlink() or link.exists():
            link.unlink()
        link.symlink_to(target_name)

    print(f"✓ Initial index created with {len(chunks_data)} chunks")
    print(f"  Index dimension: {dimension}")
    print(f"  Index total: {index.ntotal}")
    print(f"  Files: knowledge_base_{timestamp}.faiss, chunks_{timestamp}.jsonl")

    return embedding_model, index, chunks_data, timestamp


# Create the initial index and keep its results as notebook globals.
# `embedding_model` is reused by the updater, validation and benchmark cells.
embedding_model, current_index, current_chunks, base_timestamp = create_initial_index()

In [None]:
# Cell 4: Incremental Update Functions
class IncrementalRAGUpdater:
    """Appends documents to, and searches, the index behind ``indices/current.*``.

    Every operation reloads ``indices/current.faiss`` and
    ``indices/current_chunks.jsonl`` from disk. Chunk records are matched to
    FAISS vectors purely by position: the record at line ``i`` of the JSONL
    file describes vector ``i`` of the index.
    """

    # Locations of the "current" symlinks that all operations go through.
    CURRENT_INDEX_PATH = "indices/current.faiss"
    CURRENT_CHUNKS_PATH = "indices/current_chunks.jsonl"

    def __init__(self, embedding_model: SentenceTransformer):
        """Store the embedding model and build the Chinese-aware text splitter."""
        self.embedding_model = embedding_model
        self.splitter = RecursiveCharacterTextSplitter(
            separators=["。", "！", "？", "；", "…", "\n\n", "\n", " "],
            chunk_size=200,
            chunk_overlap=20,
        )

    def _load_current(self) -> Tuple[faiss.Index, List[Dict]]:
        """Read the current FAISS index and its chunk records from disk."""
        index = faiss.read_index(self.CURRENT_INDEX_PATH)
        with open(self.CURRENT_CHUNKS_PATH, "r", encoding="utf-8") as f:
            # Blank lines (e.g. a stray trailing newline) are skipped rather
            # than handed to json.loads, which would raise on them.
            chunks = [json.loads(line) for line in f if line.strip()]
        return index, chunks

    def _point_current_to(self, timestamp: str) -> None:
        """Repoint both ``current`` symlinks at the files of *timestamp*."""
        for link_name, target_name in (
            (self.CURRENT_INDEX_PATH, f"knowledge_base_{timestamp}.faiss"),
            (self.CURRENT_CHUNKS_PATH, f"chunks_{timestamp}.jsonl"),
        ):
            link = Path(link_name)
            # is_symlink() also catches a dangling link, for which exists()
            # reports False.
            if link.is_symlink() or link.exists():
                link.unlink()
            link.symlink_to(target_name)

    def add_documents(
        self, new_docs: List[Dict[str, Any]], backup: bool = True
    ) -> Tuple[faiss.Index, List[Dict], str]:
        """
        Add new documents to existing index

        Args:
            new_docs: List of {"text": str, "metadata": dict}; "metadata" is
                optional and defaults to an empty dict.
            backup: Whether to backup current index before update

        Returns:
            Updated index, updated chunks list, new timestamp. When the
            documents produce no chunks, the unchanged index and chunks are
            returned with an empty-string timestamp and nothing is written.

        Raises:
            ValueError: If the on-disk index and chunk file disagree on the
                number of entries; appending would assign chunk IDs that no
                longer line up with vector positions.
            KeyError: If a document has no "text" key.
        """
        print(f"📥 Processing {len(new_docs)} new documents...")

        # Load current index and chunks
        current_index, current_chunks = self._load_current()

        print(
            f"📊 Current index stats: {current_index.ntotal} vectors, {len(current_chunks)} chunks"
        )

        # Refuse to build on top of an already inconsistent pair of files.
        if current_index.ntotal != len(current_chunks):
            raise ValueError(
                f"Index/chunks mismatch: {current_index.ntotal} vectors vs "
                f"{len(current_chunks)} chunk records; refusing to append"
            )

        # Backup if requested (shutil.copy follows the symlinks, so the
        # backup files are real copies of the current data).
        if backup:
            backup_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + "_backup"
            shutil.copy(
                self.CURRENT_INDEX_PATH,
                f"indices/knowledge_base_{backup_timestamp}.faiss",
            )
            shutil.copy(
                self.CURRENT_CHUNKS_PATH,
                f"indices/chunks_{backup_timestamp}.jsonl",
            )
            print(f"💾 Backup created: {backup_timestamp}")

        # Process new documents
        new_chunks = []
        for doc in new_docs:
            doc_metadata = doc.get("metadata", {})
            # Caller metadata is kept; "added_at" is stamped now and
            # "version" defaults to 1 when the caller did not supply one.
            document = Document(
                page_content=doc["text"],
                metadata={
                    **doc_metadata,
                    "added_at": datetime.now().isoformat(),
                    "version": doc_metadata.get("version", 1),
                },
            )

            # Split into chunks
            new_chunks.extend(self.splitter.split_documents([document]))

        if not new_chunks:
            print("⚠️ No new chunks generated from documents")
            return current_index, current_chunks, ""

        # Generate embeddings for new chunks
        new_texts = [chunk.page_content for chunk in new_chunks]
        print(f"🔢 Generating embeddings for {len(new_texts)} new chunks...")

        new_vectors = self.embedding_model.encode(
            new_texts, normalize_embeddings=True, batch_size=16, show_progress_bar=True
        ).astype(np.float32)

        # Add to index; new IDs continue from the existing chunk count so
        # that ID == vector position keeps holding.
        start_id = len(current_chunks)
        current_index.add(new_vectors)

        # Update chunks metadata
        for i, chunk in enumerate(new_chunks):
            current_chunks.append(
                {
                    "id": start_id + i,
                    "text": chunk.page_content,
                    "metadata": chunk.metadata,
                }
            )

        # Save updated index
        new_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        new_index_path = f"indices/knowledge_base_{new_timestamp}.faiss"
        new_chunks_path = f"indices/chunks_{new_timestamp}.jsonl"

        faiss.write_index(current_index, new_index_path)

        with open(new_chunks_path, "w", encoding="utf-8") as f:
            for chunk_data in current_chunks:
                f.write(json.dumps(chunk_data, ensure_ascii=False) + "\n")

        # Update current symlinks
        self._point_current_to(new_timestamp)

        print(f"✅ Index updated successfully!")
        print(
            f"  New total: {current_index.ntotal} vectors, {len(current_chunks)} chunks"
        )
        print(f"  Added: {len(new_chunks)} chunks from {len(new_docs)} documents")
        print(f"  Files: {new_index_path}, {new_chunks_path}")

        return current_index, current_chunks, new_timestamp

    def search(self, query: str, k: int = 5) -> List[Tuple[str, Dict, float]]:
        """Search the current index for the chunks most similar to *query*.

        Args:
            query: Query text; it is embedded with the same model and
                normalisation as the indexed chunks.
            k: Maximum number of hits to return.

        Returns:
            List of ``(text, metadata, score)`` tuples in the order FAISS
            returned them. The list is empty when the index is empty or
            ``k`` is not positive, and may be shorter than ``k``.
        """
        # Load current index and chunks
        index, chunks = self._load_current()

        if k <= 0 or index.ntotal == 0:
            return []

        # Encode query
        query_vector = self.embedding_model.encode(
            [query], normalize_embeddings=True
        ).astype(np.float32)

        # Search; asking for more neighbours than there are vectors only
        # produces padding entries, so cap k at the index size.
        scores, indices = index.search(query_vector, min(k, index.ntotal))

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS marks missing hits with label -1. Without the lower
            # bound, -1 would index the last chunk and return a bogus hit.
            if 0 <= idx < len(chunks):
                chunk = chunks[idx]
                results.append((chunk["text"], chunk["metadata"], float(score)))

        return results


# Initialize the updater with the embedding model returned by
# create_initial_index(), so new chunks are embedded by the same model as the
# existing index. `updater` is used by the benchmark and smoke-test cells.
updater = IncrementalRAGUpdater(embedding_model)

In [None]:
# Cell 5: Version Management and Backup Strategy
def list_index_versions():
    """Enumerate the index snapshots stored under ``indices/``.

    A snapshot is any ``knowledge_base_<timestamp>.faiss`` file. For each one
    the result holds a dict with:

    - ``timestamp``: the part of the file name after ``knowledge_base_``
    - ``size_mb``: size of the ``.faiss`` file in MiB, rounded to 2 decimals
    - ``chunk_count``: number of lines in the matching
      ``chunks_<timestamp>.jsonl`` (0 when that file is absent)
    - ``is_current``: whether ``indices/current.faiss`` resolves to this file

    Returns:
        List of those dicts, sorted by ``timestamp`` string in descending
        order.
    """
    root = Path("indices")
    current_link = Path("indices/current.faiss")

    def _line_count(path):
        """Return the number of lines in *path*, or 0 if it does not exist."""
        if not path.exists():
            return 0
        with open(path, "r", encoding="utf-8") as handle:
            return sum(1 for _ in handle)

    catalog = []
    for index_file in root.glob("knowledge_base_*.faiss"):
        stamp = index_file.stem.replace("knowledge_base_", "")
        n_bytes = index_file.stat().st_size
        record = {
            "timestamp": stamp,
            "size_mb": round(n_bytes / 1024 / 1024, 2),
            "chunk_count": _line_count(root / f"chunks_{stamp}.jsonl"),
            "is_current": current_link.resolve().name == index_file.name,
        }
        catalog.append(record)

    # Descending order on the timestamp string
    return sorted(catalog, key=lambda rec: rec["timestamp"], reverse=True)


def cleanup_old_versions(keep_latest: int = 5):
    """Delete old index versions, keeping the ``keep_latest`` newest ones.

    Versions come from ``list_index_versions()`` (sorted by timestamp string,
    descending). Everything after the first ``keep_latest`` entries is
    deleted, except the version ``indices/current.faiss`` points at, which
    is never removed even when it falls outside the kept range.

    Args:
        keep_latest: Number of newest versions to keep; must be >= 0.

    Raises:
        ValueError: If ``keep_latest`` is negative. A negative value would
            slice from the end of the list and delete an arbitrary subset.
    """
    if keep_latest < 0:
        raise ValueError(f"keep_latest must be >= 0, got {keep_latest}")

    versions = list_index_versions()

    if len(versions) <= keep_latest:
        print(f"📁 Only {len(versions)} versions found, no cleanup needed")
        return

    # Candidates are everything beyond the newest N; the current version is
    # filtered out up front so the reported count matches what is deleted.
    to_delete = [v for v in versions[keep_latest:] if not v["is_current"]]

    print(f"🗑️ Cleaning up {len(to_delete)} old versions...")
    for version in to_delete:
        timestamp = version["timestamp"]

        # Delete files
        faiss_file = Path(f"indices/knowledge_base_{timestamp}.faiss")
        chunks_file = Path(f"indices/chunks_{timestamp}.jsonl")

        if faiss_file.exists():
            faiss_file.unlink()
            print(f"  Deleted: {faiss_file.name}")
        if chunks_file.exists():
            chunks_file.unlink()
            print(f"  Deleted: {chunks_file.name}")


def rollback_to_version(timestamp: str):
    """Point the ``current`` symlinks at an existing index version.

    Args:
        timestamp: Version identifier as it appears in the file names
            ``indices/knowledge_base_<timestamp>.faiss`` and
            ``indices/chunks_<timestamp>.jsonl``.

    Returns:
        ``True`` when the symlinks were repointed, ``False`` when either of
        the two version files is missing (nothing is changed in that case).
    """
    target_faiss = Path(f"indices/knowledge_base_{timestamp}.faiss")
    target_chunks = Path(f"indices/chunks_{timestamp}.jsonl")

    if not target_faiss.exists() or not target_chunks.exists():
        print(f"❌ Version {timestamp} not found")
        return False

    # Update symlinks. A missing link must not abort the rollback: rollback
    # is the recovery path, so it also has to work when a "current" link
    # was deleted or never created.
    for link_name, target_name in (
        ("indices/current.faiss", f"knowledge_base_{timestamp}.faiss"),
        ("indices/current_chunks.jsonl", f"chunks_{timestamp}.jsonl"),
    ):
        link = Path(link_name)
        # is_symlink() covers dangling links, for which exists() is False.
        if link.is_symlink() or link.exists():
            link.unlink()
        link.symlink_to(target_name)

    print(f"⏪ Rolled back to version {timestamp}")
    return True


# Print one line per stored index version, flagging the active one
print("📋 Current index versions:")
versions = list_index_versions()
for entry in versions:
    suffix = " (CURRENT)" if entry["is_current"] else ""
    summary = "  {timestamp}: {chunk_count} chunks, {size_mb} MB".format(**entry)
    print(summary + suffix)

In [None]:
# Cell 6: Consistency Check and Validation
def validate_index_consistency():
    """Validate that the current index and chunk file are consistent.

    Checks, in order:

    1. both ``indices/current.faiss`` and ``indices/current_chunks.jsonl``
       can be loaded;
    2. the number of vectors equals the number of chunk records;
    3. every record has an ``id``, the IDs are exactly ``0..n-1``, and each
       record sits at the position given by its ID (search maps FAISS
       positions to records by list position);
    4. re-embedding the first few chunks with the notebook-level
       ``embedding_model`` gives vectors whose average inner product with
       the stored vectors is at least 0.95.

    Returns:
        ``True`` when all checks pass, ``False`` otherwise. The reason for a
        failure is printed.
    """
    print("🔍 Validating index consistency...")

    # Load index and chunks
    try:
        index = faiss.read_index("indices/current.faiss")
        with open("indices/current_chunks.jsonl", "r", encoding="utf-8") as f:
            # Skip blank lines instead of failing on them in json.loads.
            chunks = [json.loads(line) for line in f if line.strip()]
    except Exception as e:
        print(f"❌ Error loading files: {e}")
        return False

    # Check counts match
    if index.ntotal != len(chunks):
        print(
            f"❌ Count mismatch: index has {index.ntotal} vectors, chunks file has {len(chunks)} entries"
        )
        return False

    # Every record needs an ID before the ID checks can run.
    for position, chunk in enumerate(chunks):
        if "id" not in chunk:
            print(f"❌ Chunk at position {position} has no 'id' field")
            return False

    # Check chunk IDs are sequential
    actual_id_list = [chunk["id"] for chunk in chunks]
    expected_ids = set(range(len(chunks)))
    actual_ids = set(actual_id_list)
    if expected_ids != actual_ids:
        missing = expected_ids - actual_ids
        extra = actual_ids - expected_ids
        if missing:
            print(f"❌ Missing chunk IDs: {sorted(missing)[:10]}...")
        if extra:
            print(f"❌ Extra chunk IDs: {sorted(extra)[:10]}...")
        return False

    # The right set of IDs is not enough: a reordered file would pair texts
    # with the wrong vectors, so each ID must equal its position.
    misplaced = [pos for pos, cid in enumerate(actual_id_list) if cid != pos]
    if misplaced:
        print(f"❌ Chunk IDs out of order at positions: {misplaced[:10]}...")
        return False

    # Nothing to sample from an empty index; averaging zero similarities
    # would give NaN, which compares False against the threshold.
    if not chunks:
        print("⚠️ Index is empty; vector similarity check skipped")
        return True

    # Sample validation: re-embed a few chunks and check similarity
    sample_size = min(5, len(chunks))
    sample_texts = [chunk["text"] for chunk in chunks[:sample_size]]

    # Re-generate embeddings
    sample_vectors = embedding_model.encode(
        sample_texts, normalize_embeddings=True
    ).astype(np.float32)

    # Get original vectors from index
    original_vectors = index.reconstruct_batch(list(range(sample_size)))

    # Check similarity (should be very close to 1.0): row-wise inner
    # product of the normalised re-embedded and stored vectors.
    similarities = np.sum(sample_vectors * original_vectors, axis=1)

    avg_similarity = float(np.mean(similarities))
    if avg_similarity < 0.95:
        print(f"❌ Vector similarity too low: {avg_similarity:.4f}")
        return False

    print(f"✅ Index validation passed!")
    print(f"  Vectors: {index.ntotal}")
    print(f"  Chunks: {len(chunks)}")
    print(f"  Vector similarity: {avg_similarity:.4f}")
    return True


# Run validation against the files behind indices/current.*; the boolean
# outcome is kept in `is_consistent`.
is_consistent = validate_index_consistency()

In [None]:
# Cell 7: Performance Comparison (Incremental vs Full Rebuild)
def benchmark_update_methods(new_docs: List[Dict[str, Any]]):
    """Compare incremental update vs full rebuild performance.

    The incremental path is a real update: it calls
    ``updater.add_documents(new_docs, backup=False)``, so the documents are
    appended to the on-disk index and that time includes loading and writing
    the index files. The "full rebuild" path is only a simulation: it times
    re-embedding of every existing chunk text plus the raw new document
    texts, without splitting the new documents or building or writing an
    index. The two timings are therefore indicative rather than like-for-like.

    Args:
        new_docs: List of {"text": str, "metadata": dict} documents.

    Returns:
        Dict with ``incremental_time`` and ``rebuild_time`` (seconds),
        ``speedup`` (rebuild / incremental), ``docs_added``,
        ``original_count`` and ``final_count`` (chunk counts before and
        after the incremental update).
    """
    print("⚡ Benchmarking update methods...")

    # Snapshot the chunk records as they are before the update
    with open("indices/current_chunks.jsonl", "r", encoding="utf-8") as f:
        current_chunks_backup = [json.loads(line) for line in f if line.strip()]

    original_count = len(current_chunks_backup)

    # Method 1: Incremental Update
    print("\n🔄 Testing incremental update...")
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    start_time = time.perf_counter()

    _, updated_chunks, _ = updater.add_documents(new_docs, backup=False)

    incremental_time = time.perf_counter() - start_time
    incremental_count = len(updated_chunks)

    print(f"  Time: {incremental_time:.2f}s")
    print(f"  Final count: {incremental_count}")

    # Method 2: Full Rebuild (simulation)
    print("\n🔄 Simulating full rebuild...")
    start_time = time.perf_counter()

    # Combine all documents
    all_texts = [chunk["text"] for chunk in current_chunks_backup]
    all_texts.extend(doc["text"] for doc in new_docs)

    # Re-embed everything (simulation - don't actually rebuild); the
    # vectors are discarded, only the elapsed time matters.
    embedding_model.encode(
        all_texts, normalize_embeddings=True, batch_size=16, show_progress_bar=False
    ).astype(np.float32)

    rebuild_time = time.perf_counter() - start_time

    print(f"  Time: {rebuild_time:.2f}s")
    print(f"  Final count: {len(all_texts)}")

    # Computed once and reused; the floor keeps a zero-length measurement
    # from raising ZeroDivisionError.
    speedup = rebuild_time / max(incremental_time, 1e-9)

    # Results
    print(f"\n📊 Performance Comparison:")
    print(f"  Incremental: {incremental_time:.2f}s")
    print(f"  Full rebuild: {rebuild_time:.2f}s")
    print(f"  Speedup: {speedup:.1f}x faster")
    print(f"  Added documents: {len(new_docs)}")
    print(f"  Original chunks: {original_count}")
    print(f"  New total: {incremental_count}")

    return {
        "incremental_time": incremental_time,
        "rebuild_time": rebuild_time,
        "speedup": speedup,
        "docs_added": len(new_docs),
        "original_count": original_count,
        "final_count": incremental_count,
    }


# Prepare test documents.
# Each entry has the {"text": ..., "metadata": ...} shape that
# IncrementalRAGUpdater.add_documents reads. The source_id values all start
# with "new_doc_", which is the prefix the smoke-test cell looks for when it
# counts hits on newly added content.
test_new_docs = [
    {
        "text": "大型語言模型（LLM）如GPT、BERT等，在自然語言理解和生成方面展現了驚人的能力。這些模型通過在大規模文本資料上進行預訓練，學習到了豐富的語言知識。",
        "metadata": {
            "source_id": "new_doc_001",
            "domain": "llm_research",
            "author": "AI研究員",
        },
    },
    {
        "text": "向量資料庫是現代AI應用的重要基礎設施，它能夠高效地儲存和檢索高維向量資料。FAISS、Pinecone、Weaviate等是常用的向量資料庫解決方案。",
        "metadata": {
            "source_id": "new_doc_002",
            "domain": "vector_db",
            "author": "資料工程師",
        },
    },
    {
        "text": "提示工程（Prompt Engineering）是優化AI模型輸出的關鍵技術。透過精心設計的提示詞，我們可以引導模型產生更準確、更有用的回應。",
        "metadata": {
            "source_id": "new_doc_003",
            "domain": "prompt_engineering",
            "author": "AI應用專家",
        },
    },
]

# Run benchmark. This call goes through updater.add_documents, so the test
# documents are actually appended to the on-disk index; re-running the cell
# appends them again.
benchmark_results = benchmark_update_methods(test_new_docs)

In [None]:
# Cell 8: Smoke Test - Verify New Content is Retrievable
print("🧪 Smoke Test: Verifying new content is retrievable...")

# Test queries that should hit new content
test_queries = [
    "什麼是大型語言模型？",
    "向量資料庫有哪些？",
    "提示工程的作用",
    "FAISS向量檢索",
]

# Hits are tallied in the same pass that prints them. Each updater.search
# call reloads the index from disk and re-embeds the query, so running every
# query a second time just to count would double that work.
new_retrievals = 0
total_retrievals = 0

print("\n🔍 Testing retrieval of new content:")
for query in test_queries:
    print(f"\nQuery: {query}")
    results = updater.search(query, k=3)

    for i, (text, metadata, score) in enumerate(results):
        source_id = metadata.get("source_id", "unknown")
        # Documents added by the benchmark cell carry a "new_doc_" source_id
        is_new = source_id.startswith("new_doc_")
        new_flag = " 🆕" if is_new else ""
        print(f"  {i+1}. [{source_id}]{new_flag} (score: {score:.3f})")
        print(f"     {text[:100]}...")

        total_retrievals += 1
        if is_new:
            new_retrievals += 1

# Share of all returned hits that came from newly added documents
new_content_ratio = new_retrievals / total_retrievals if total_retrievals > 0 else 0

print(f"\n📈 Smoke Test Results:")
print(f"  Total retrievals: {total_retrievals}")
print(f"  New content hits: {new_retrievals}")
print(f"  New content ratio: {new_content_ratio:.1%}")

if new_content_ratio > 0:
    print("✅ PASS: New content is being retrieved successfully")
else:
    print("❌ FAIL: New content not being retrieved - check embedding/indexing")

In [None]:
# Cell 9: Production Recommendations and Pitfalls
# This statement only prints a static block of guidance text; it performs no
# computation and does not touch the index.
print(
    """
🏭 Production Environment Recommendations:

✅ Best Practices:
1. **Backup Strategy**: Always backup before updates (set backup=True)
2. **Version Management**: Keep 3-5 recent versions for rollback
3. **Batch Updates**: Group multiple documents for efficiency
4. **Consistency Checks**: Validate after each update
5. **Monitoring**: Track index size, update times, retrieval quality

⚠️ Common Pitfalls:
1. **ID Conflicts**: Ensure chunk IDs remain sequential and unique
2. **Memory Issues**: Large batch updates may cause OOM
3. **Index Corruption**: Always validate after updates
4. **Symlink Issues**: Check file permissions in production
5. **Vector Drift**: Recompute embeddings if model changes

🔧 Scaling Considerations:
1. **Large Indices**: Consider IVF/HNSW for >100K vectors
2. **Distributed Updates**: Use sharded indices for very large datasets
3. **Async Processing**: Queue updates for high-frequency scenarios
4. **Monitoring**: Set up alerts for failed updates

📊 Key Metrics to Track:
- Update latency vs batch size
- Index size growth rate
- Retrieval quality degradation
- Memory usage during updates
- Backup storage requirements
"""
)

# Save benchmark results: the benchmark timings plus the smoke-test tallies
# from the previous cells, together with the configuration they were run with.
benchmark_summary = {
    "timestamp": datetime.now().isoformat(),
    "test_config": {
        "embedding_model": "BAAI/bge-small-zh-v1.5",
        "chunk_size": 200,
        "chunk_overlap": 20,
        "new_docs_count": len(test_new_docs),
    },
    "results": benchmark_results,
    "smoke_test": {
        "total_retrievals": total_retrievals,
        "new_content_hits": new_retrievals,
        "success_ratio": new_content_ratio,
    },
}

with open("outs/nb17_incremental_update_results.json", "w", encoding="utf-8") as f:
    # json.dump serialises into the open file. json.dumps only returns a
    # string and takes no file argument, so passing `f` to it raises
    # TypeError and leaves the results file empty.
    json.dump(benchmark_summary, f, ensure_ascii=False, indent=2)

print(f"\n💾 Results saved to: outs/nb17_incremental_update_results.json")
print(f"📁 Index versions available: {len(list_index_versions())}")
print(
    f"🎯 Final index size: {faiss.read_index('indices/current.faiss').ntotal} vectors"
)

Smoke Test

In [None]:
# End-to-end check: index one small document, then confirm a related query
# brings it back among the top hits.
test_doc = dict(
    text="測試文檔：RAG增量更新功能驗證",
    metadata={"source_id": "test_001"},
)
updater.add_documents([test_doc])

# Each search hit is a (text, metadata, score) tuple; look for the test
# document's source_id in the metadata of any hit.
results = updater.search("RAG增量更新", k=3)
assert any(
    "test_001" in hit_metadata.get("source_id", "")
    for _, hit_metadata, _ in results
), "New content not retrievable"
print("✅ Smoke test passed: New content successfully indexed and retrievable")