In [None]:
# nb12_embeddings_bge_m3.ipynb
# Stage 2 - 嵌入模型 BGE-M3 實作

# Cell 1: Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# The root can be overridden through the AI_CACHE_ROOT environment variable.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Cache-related environment variables and the directory each one is pointed at.
cache_env = {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}
for env_name in cache_env:
    env_path = cache_env[env_name]
    # Export the variable, then make sure the directory it names exists.
    os.environ[env_name] = env_path
    pathlib.Path(env_path).mkdir(parents=True, exist_ok=True)
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
## Cell 2: BGE-M3 模型載入與設定
from sentence_transformers import SentenceTransformer
import numpy as np
import json
from typing import List, Union, Optional
import time
import hashlib


class BGEEmbedder:
    """Load a SentenceTransformer embedding model (BGE-M3 by default) and keep its settings.

    Attributes set by the constructor:
        model_name: Identifier passed to SentenceTransformer.
        normalize: Flag that callers forward as ``normalize_embeddings`` when encoding.
        max_seq_length: Value written to ``model.max_seq_length`` when that attribute exists.
        cache_dir: Directory for embedding caches; created on construction.
        model: The loaded SentenceTransformer instance.
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-m3",
        device: str = "auto",
        normalize: bool = True,
        max_seq_length: int = 512,
        cache_dir: Optional[str] = None,
    ):
        """Load the model and print a short summary of the configuration.

        Args:
            model_name: Model identifier to load.
            device: Device string handed to SentenceTransformer; the special
                value "auto" is translated to ``None``.
            normalize: Whether embeddings should be normalized by callers.
            max_seq_length: Maximum sequence length to set on the model.
            cache_dir: Embedding cache directory; a falsy value falls back to
                ``{AI_CACHE_ROOT}/embeddings``.
        """
        self.model_name = model_name
        self.normalize = normalize
        self.max_seq_length = max_seq_length

        # Fall back to the shared cache location when no directory was given.
        if cache_dir:
            self.cache_dir = cache_dir
        else:
            self.cache_dir = f"{AI_CACHE_ROOT}/embeddings"
        pathlib.Path(self.cache_dir).mkdir(parents=True, exist_ok=True)

        print(f"[BGE] Loading model: {model_name}")
        resolved_device = None if device == "auto" else device
        hf_cache_folder = f"{AI_CACHE_ROOT}/hf/sentence-transformers"
        self.model = SentenceTransformer(
            model_name,
            device=resolved_device,
            cache_folder=hf_cache_folder,
        )

        # Cap the sequence length for efficiency, if the model exposes the knob.
        if hasattr(self.model, "max_seq_length"):
            self.model.max_seq_length = max_seq_length

        print(f"[BGE] Model loaded on device: {self.model.device}")
        print(f"[BGE] Max sequence length: {max_seq_length}")
        print(f"[BGE] Normalize embeddings: {normalize}")

# Build the notebook-wide BGE-M3 embedder instance
embedder = BGEEmbedder(
    model_name="BAAI/bge-m3",
    normalize=True,
    max_seq_length=512,
)

In [None]:
## Cell 3: 文本嵌入基礎功能（單一文本）
def encode_single_text(text: str, show_info: bool = True) -> np.ndarray:
    """Encode one text with the global ``embedder`` and return its vector.

    Args:
        text: The text to embed.
        show_info: When True, print text length, embedding shape/dtype,
            elapsed time and (if ``embedder.normalize`` is set) the L2 norm.

    Returns:
        The embedding produced by ``embedder.model.encode`` as a NumPy array.
    """
    t0 = time.time()
    vector = embedder.model.encode(
        text,
        normalize_embeddings=embedder.normalize,
        convert_to_numpy=True,
    )
    duration = time.time() - t0

    # Quiet mode: nothing to report.
    if not show_info:
        return vector

    print(f"[Encode] Text length: {len(text)} chars")
    print(f"[Encode] Embedding shape: {vector.shape}")
    print(f"[Encode] Embedding dtype: {vector.dtype}")
    print(f"[Encode] Time elapsed: {duration:.3f}s")

    if embedder.normalize:
        l2 = np.linalg.norm(vector)
        print(f"[Encode] L2 norm (should be ~1.0): {l2:.6f}")

    return vector


# Sample sentences: one Chinese, one English
zh_text = "檢索增強生成（RAG）是一種結合了檢索與生成的自然語言處理技術。"
en_text = "Retrieval-Augmented Generation (RAG) combines retrieval and generation for NLP tasks."

# Encode each sentence (prints encoding details for both)
zh_embedding = encode_single_text(text=zh_text)
en_embedding = encode_single_text(text=en_text)

In [None]:
## Cell 4: 批次嵌入與正規化
def encode_batch_texts(
    texts: List[str], batch_size: int = 32, show_progress: bool = True
) -> np.ndarray:
    """Encode a list of texts into an embedding matrix with the global ``embedder``.

    Args:
        texts: Texts to embed.
        batch_size: Number of texts per forward pass; must be >= 1.
        show_progress: Forwarded as ``show_progress_bar`` to the model.

    Returns:
        The array returned by ``embedder.model.encode`` (one row per text).
        For empty input a float32 array of shape ``(0, dim)`` is returned, where
        ``dim`` is the model's reported embedding dimension (0 if unknown), so
        callers can rely on a 2-D result.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 for non-empty input.
    """
    if not texts:
        # Keep the result 2-D so that `.shape[1]` and stacking still work.
        dim_getter = getattr(embedder.model, "get_sentence_embedding_dimension", None)
        dim = (dim_getter() if callable(dim_getter) else None) or 0
        return np.empty((0, dim), dtype=np.float32)

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total_texts = len(texts)
    print(f"[Batch] Encoding {total_texts} texts with batch_size={batch_size}")

    start_time = time.time()

    # Delegate batching to sentence-transformers.
    embeddings = embedder.model.encode(
        texts,
        batch_size=batch_size,
        normalize_embeddings=embedder.normalize,
        convert_to_numpy=True,
        show_progress_bar=show_progress,
    )

    elapsed = time.time() - start_time

    print(f"[Batch] Output shape: {embeddings.shape}")
    print(f"[Batch] Output dtype: {embeddings.dtype}")
    print(f"[Batch] Total time: {elapsed:.3f}s")
    print(f"[Batch] Average time per text: {elapsed/total_texts:.4f}s")

    return embeddings


# Mixed Chinese/English sample corpus used for the batch-encoding demo
test_texts = [
    "人工智慧的發展正在改變世界。",
    "Artificial intelligence is transforming the world.",
    "機器學習是人工智慧的重要分支。",
    "Machine learning is a crucial branch of AI.",
    "深度學習使用神經網路來學習複雜的模式。",
    "Deep learning uses neural networks to learn complex patterns.",
    "自然語言處理讓電腦理解人類語言。",
    "Natural language processing helps computers understand human language.",
]

# Encode the whole corpus four texts at a time
batch_embeddings = encode_batch_texts(texts=test_texts, batch_size=4)

In [None]:
## Cell 5: 嵌入快取系統
class EmbeddingCache:
    """Two-layer (memory + JSONL file) embedding cache keyed by an MD5 of the text.

    The disk file ``embedding_cache.jsonl`` is parsed at most once per instance,
    on the first lookup that misses the memory layer; its entries are then
    served from memory. Malformed or blank lines are skipped instead of hiding
    the valid entries that follow them. Entries appended to the file by another
    process after that load are not seen by this instance.

    Args:
        cache_dir: Directory holding the cache file; created if missing.
        namespace: Optional string mixed into every cache key (for example the
            model name) so vectors from different models do not collide. The
            default ``""`` produces plain MD5-of-text keys.
    """

    def __init__(self, cache_dir: str, namespace: str = ""):
        self.cache_dir = pathlib.Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.cache_file = self.cache_dir / "embedding_cache.jsonl"
        self.namespace = namespace
        self.memory_cache = {}
        # Set once the JSONL file has been merged into memory_cache.
        self._disk_loaded = False

    def _get_text_hash(self, text: str) -> str:
        """Return the MD5 hex digest used as cache key for ``text``."""
        # A NUL separator keeps "ns" + "text" unambiguous; no namespace -> hash the text alone.
        payload = f"{self.namespace}\x00{text}" if self.namespace else text
        return hashlib.md5(payload.encode("utf-8")).hexdigest()

    def _load_disk_cache(self) -> None:
        """Parse the JSONL file once and merge its entries into the memory cache."""
        self._disk_loaded = True  # even on failure: do not re-scan on every miss
        if not self.cache_file.exists():
            return

        disk_entries = {}
        skipped = 0
        try:
            with open(self.cache_file, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                        text_hash = data["hash"]
                        embedding = np.array(data["embedding"], dtype=np.float32)
                    except (ValueError, KeyError, TypeError):
                        # Corrupt/partial line: ignore it, keep reading the rest.
                        skipped += 1
                        continue
                    # Later lines win, matching the overwrite semantics of set().
                    disk_entries[text_hash] = embedding
        except (OSError, UnicodeError) as e:
            print(f"[Cache] Error reading cache: {e}")

        if skipped:
            print(f"[Cache] Skipped {skipped} malformed cache line(s)")

        # Values stored during this session take precedence over disk contents.
        for text_hash, embedding in disk_entries.items():
            self.memory_cache.setdefault(text_hash, embedding)

    def get(self, text: str) -> Optional[np.ndarray]:
        """Return the cached embedding for ``text``, or None when it is not cached."""
        text_hash = self._get_text_hash(text)

        # Memory layer first.
        if text_hash in self.memory_cache:
            return self.memory_cache[text_hash]

        # Disk layer: loaded lazily, exactly once.
        if not self._disk_loaded:
            self._load_disk_cache()

        return self.memory_cache.get(text_hash)

    def set(self, text: str, embedding: np.ndarray):
        """Store ``embedding`` for ``text`` in memory and append it to the disk file.

        A failure to write the file is reported with a message and otherwise
        ignored; the in-memory entry is kept.
        """
        text_hash = self._get_text_hash(text)

        # Store in memory cache
        self.memory_cache[text_hash] = embedding

        # Append to disk cache
        try:
            cache_entry = {
                "hash": text_hash,
                "text_preview": text[:100],  # First 100 chars for debugging
                "embedding": embedding.tolist(),
                "timestamp": time.time(),
            }

            with open(self.cache_file, "a", encoding="utf-8") as f:
                f.write(json.dumps(cache_entry, ensure_ascii=False) + "\n")

        except Exception as e:
            print(f"[Cache] Error writing cache: {e}")

    def encode_with_cache(self, text: str) -> np.ndarray:
        """Return the embedding for ``text``, encoding and caching it on a miss.

        Misses are encoded with the module-level ``encode_single_text``.
        """
        cached_embedding = self.get(text)
        if cached_embedding is not None:
            print(f"[Cache] Hit for text: {text[:50]}...")
            return cached_embedding

        print(f"[Cache] Miss, encoding: {text[:50]}...")
        embedding = encode_single_text(text, show_info=False)
        self.set(text, embedding)
        return embedding


# Create the notebook-wide embedding cache under the shared cache root
cache = EmbeddingCache(cache_dir=f"{AI_CACHE_ROOT}/embeddings")

# Encode the same text twice to exercise the cache lookup path
test_text = "這是一個測試文本，用來驗證快取功能是否正常運作。"

print("First encoding (should miss cache):")
emb1 = cache.encode_with_cache(text=test_text)

print("\nSecond encoding (should hit cache):")
emb2 = cache.encode_with_cache(text=test_text)

# Both calls must yield numerically matching vectors
print(f"\nEmbeddings are identical: {np.allclose(emb1, emb2)}")

In [None]:
## Cell 6: 向量相似度計算與檢索
def compute_similarity(
    query_embedding: np.ndarray, doc_embeddings: np.ndarray, metric: str = "cosine"
) -> np.ndarray:
    """Score document vectors against a query vector.

    Args:
        query_embedding: Query vector, 1-D ``(d,)`` or 2-D ``(q, d)``.
        doc_embeddings: Document vectors ``(n, d)``; a single 1-D vector is
            treated as one document.
        metric: ``"cosine"`` or ``"euclidean"``.

    Returns:
        1-D array of scores, higher meaning more similar. For "cosine" the
        vectors are L2-normalised here, so the result is a true cosine
        similarity whether or not the inputs were normalised; all-zero vectors
        score 0. For "euclidean" the distance ``dist`` is mapped to
        ``1 / (1 + dist)``. An empty ``doc_embeddings`` yields an empty array.

    Raises:
        ValueError: If ``metric`` is not supported.
    """
    if metric not in ("cosine", "euclidean"):
        raise ValueError(f"Unsupported metric: {metric}")

    docs = np.asarray(doc_embeddings)
    if docs.size == 0:
        return np.empty(0, dtype=np.float32)
    docs = np.atleast_2d(docs)

    if metric == "cosine":
        query = np.atleast_2d(np.asarray(query_embedding))

        # Divide by the L2 norms; a zero norm is replaced by 1 to avoid 0/0.
        query_norm = np.linalg.norm(query, axis=1, keepdims=True)
        docs_norm = np.linalg.norm(docs, axis=1, keepdims=True)
        query_unit = query / np.where(query_norm == 0, 1, query_norm)
        docs_unit = docs / np.where(docs_norm == 0, 1, docs_norm)

        return np.dot(docs_unit, query_unit.T).flatten()

    # Euclidean distance (lower is closer), converted to a similarity score.
    distances = np.linalg.norm(docs - np.asarray(query_embedding), axis=1)
    return 1 / (1 + distances)


def find_most_similar(
    query: str, documents: List[str], top_k: int = 3, metric: str = "cosine"
) -> List[tuple]:
    """Rank ``documents`` by similarity to ``query`` and return the best matches.

    Query and documents are embedded through the module-level ``cache`` and
    scored with ``compute_similarity``.

    Args:
        query: Query text.
        documents: Candidate texts.
        top_k: Maximum number of results; values <= 0 yield no results.
        metric: Metric name forwarded to ``compute_similarity``.

    Returns:
        Up to ``top_k`` tuples ``(index, document, score)`` in descending score
        order, where ``index`` is a plain ``int`` position in ``documents`` and
        ``score`` a plain ``float``. Empty when ``documents`` is empty.
    """
    print(f"[Search] Query: {query}")
    print(f"[Search] Searching in {len(documents)} documents")

    # Nothing to rank: avoid encoding work and shape errors on empty input,
    # and avoid the "all but the last k" slice a negative top_k would produce.
    if not documents or top_k <= 0:
        return []

    # Encode query and documents (cached where possible)
    query_emb = cache.encode_with_cache(query)
    doc_embs = np.array([cache.encode_with_cache(doc) for doc in documents])

    # Compute similarities
    similarities = compute_similarity(query_emb, doc_embs, metric)

    # Indices of the top-k scores, best first
    top_indices = np.argsort(similarities)[::-1][:top_k]

    results = []
    for rank, np_idx in enumerate(top_indices, start=1):
        idx = int(np_idx)  # plain int so results are JSON-serialisable
        score = float(similarities[idx])
        results.append((idx, documents[idx], score))
        print(f"[Result {rank}] Score: {score:.4f} | Doc: {documents[idx][:100]}...")

    return results


# Rank the sample corpus against a Chinese query and keep the three best hits
query_text = "人工智慧的應用"
search_results = find_most_similar(
    query=query_text,
    documents=test_texts,
    top_k=3,
)

In [None]:
## Cell 7: 中英文混合文本測試
def test_multilingual_embeddings():
    """Exercise the embedding helpers on mixed Chinese/English texts.

    Batch-encodes six mixed-language snippets, scores a Chinese query against
    its English counterpart, and searches the snippets with both queries.

    Returns:
        Tuple ``(cross_lang_sim, zh_results, en_results)``: the ZH-EN query
        similarity and the top-2 search results for each query.
    """
    mixed_corpus = [
        "AI人工智慧 artificial intelligence",
        "機器學習 machine learning algorithms",
        "Natural language processing 自然語言處理",
        "Deep neural networks 深度神經網路",
        "Computer vision 電腦視覺技術",
        "語音識別 speech recognition systems",
    ]
    zh_query = "人工智慧技術"
    en_query = "artificial intelligence technology"

    print("[Multilingual] Testing mixed language embeddings...")

    # Batch-encode the corpus; the matrix itself is not used below, the call
    # runs the batch path and prints its statistics.
    encode_batch_texts(mixed_corpus, batch_size=6, show_progress=False)

    # Embed both queries through the cache.
    zh_vec = cache.encode_with_cache(zh_query)
    en_vec = cache.encode_with_cache(en_query)

    # Score the Chinese query against the English one (treated as a single doc).
    en_as_doc = en_vec.reshape(1, -1)
    cross_lang_sim = compute_similarity(zh_vec, en_as_doc)[0]
    print(f"[Multilingual] ZH-EN cross-language similarity: {cross_lang_sim:.4f}")

    # Retrieve the two closest corpus entries for each query.
    print(f"\n[Multilingual] Similar docs for '{zh_query}':")
    zh_results = find_most_similar(zh_query, mixed_corpus, top_k=2)

    print(f"\n[Multilingual] Similar docs for '{en_query}':")
    en_results = find_most_similar(en_query, mixed_corpus, top_k=2)

    return cross_lang_sim, zh_results, en_results


# Run the multilingual check; keeps its (cross_lang_sim, zh_results, en_results) tuple
multilingual_test_results = test_multilingual_embeddings()

In [None]:
## Cell 8: Smoke Test - 完整嵌入流水線
def smoke_test_embeddings():
    """Smoke-test the embedding pipeline end to end.

    Checks, in order: batch encoding output shape, cache consistency on a
    repeated lookup, top-k similarity search, and unit L2 norm of a single
    embedding. Any failed check raises AssertionError.

    Returns:
        True when every check passes.
    """
    print("=== BGE-M3 Embedding Pipeline Smoke Test ===")

    corpus = [
        "檢索增強生成是結合檢索與生成的技術",
        "RAG combines retrieval and generation",
        "向量資料庫用於儲存文檔嵌入",
        "Vector databases store document embeddings",
        "語義搜索基於向量相似度匹配",
    ]

    # 1. Batch encoding: one non-empty row per document.
    print("\n1. Testing batch encoding...")
    matrix = encode_batch_texts(corpus, batch_size=3, show_progress=False)
    n_rows, n_dims = matrix.shape[0], matrix.shape[1]
    assert n_rows == len(corpus), "Batch size mismatch"
    assert n_dims > 0, "Empty embeddings"
    print("✓ Batch encoding passed")

    # 2. Cache: the second lookup of the same text must return matching values.
    print("\n2. Testing cache functionality...")
    first_doc = corpus[0]
    fresh = cache.encode_with_cache(first_doc)
    repeated = cache.encode_with_cache(first_doc)
    assert np.allclose(fresh, repeated), "Cache inconsistency"
    print("✓ Cache functionality passed")

    # 3. Similarity search: exactly top_k results, all with positive scores.
    print("\n3. Testing similarity search...")
    hits = find_most_similar("向量檢索技術", corpus, top_k=2)
    assert len(hits) == 2, "Incorrect number of results"
    assert all(hit[2] > 0 for hit in hits), "Invalid similarity scores"
    print("✓ Similarity search passed")

    # 4. Normalization: a single embedding must have L2 norm ~1.
    print("\n4. Testing embedding normalization...")
    norm = np.linalg.norm(encode_single_text("測試正規化", show_info=False))
    assert abs(norm - 1.0) < 1e-5, f"Embedding not normalized: {norm}"
    print("✓ Normalization passed")

    print("\n=== All tests passed! ===")
    return True


# Run the smoke test; it returns True once every assertion has passed
smoke_test_result = smoke_test_embeddings()

In [None]:
## Cell 9: 效能測試與記憶體監控
def performance_benchmark():
    """Time batch encoding and similarity search for 10, 50 and 100 documents.

    For each size it prints encoding time, search time, the size of the
    embedding matrix in MB and the encoding throughput. When CUDA is available
    it finally prints allocated and reserved GPU memory. Returns nothing.
    """
    print("=== Performance Benchmark ===")

    base_text = "這是一個用於測試嵌入模型效能的範例文本。"
    query = "測試查詢文本"
    mib = 1024 * 1024

    for size in (10, 50, 100):
        print(f"\n--- Testing {size} documents ---")

        # Synthetic corpus: the base sentence plus a running document number.
        docs = []
        for i in range(size):
            docs.append(f"{base_text} 文檔編號 {i}")

        # Wall-clock time of batch encoding.
        t_encode = time.time()
        embeddings = encode_batch_texts(docs, batch_size=16, show_progress=False)
        encoding_time = time.time() - t_encode

        # Wall-clock time of a top-5 search over the same corpus.
        t_search = time.time()
        find_most_similar(query, docs, top_k=5)
        search_time = time.time() - t_search

        # Approximate memory footprint of the embedding matrix.
        emb_memory_mb = embeddings.nbytes / mib

        per_doc = encoding_time / size
        print(f"Encoding time: {encoding_time:.3f}s ({per_doc:.4f}s per doc)")
        print(f"Search time: {search_time:.3f}s")
        print(f"Memory usage: {emb_memory_mb:.2f} MB")
        print(f"Throughput: {size/encoding_time:.1f} docs/sec")

    # Report GPU memory only when a CUDA device is present.
    if not torch.cuda.is_available():
        return
    gib = 1024**3
    allocated = torch.cuda.memory_allocated() / gib
    cached = torch.cuda.memory_reserved() / gib
    print(f"\nGPU Memory - Allocated: {allocated:.2f}GB, Cached: {cached:.2f}GB")


# Run the benchmark; results are printed and nothing is returned
performance_benchmark()

In [None]:
## Cell 10: Key Parameters & When to Use
# Closing reference card: recommended settings, usage guidance and next steps.
nb12_summary = """
=== Key Parameters for BGE-M3 Embeddings ===

Model Configuration:
- model_name: "BAAI/bge-m3" (multilingual, recommended)
- normalize: True (enables cosine similarity via dot product)
- max_seq_length: 512 (balance between quality and speed)
- batch_size: 16-32 (adjust based on GPU memory)

Low-VRAM Options:
- Use CPU: device="cpu"
- Smaller batch: batch_size=8
- Alternative model: "BAAI/bge-small-zh-v1.5" (lighter)

Cache Settings:
- Enable caching for repeated texts
- Cache directory: AI_CACHE_ROOT/embeddings
- Memory + disk dual-layer cache

Performance Tips:
- Batch encoding is much faster than individual encoding
- Normalized embeddings allow fast cosine similarity via dot product
- Cache frequently accessed embeddings
- Monitor GPU memory usage

=== When to Use BGE-M3 ===

✓ Good for:
- Multilingual text (Chinese + English)
- Semantic search and similarity matching
- RAG retrieval systems
- Document clustering and classification
- Cross-language information retrieval

✗ Avoid for:
- Very long documents (>512 tokens) without chunking
- Real-time applications requiring <50ms latency
- Environments with <4GB VRAM without CPU fallback

=== Next Steps ===
- Use these embeddings in nb13 for FAISS index building
- Integrate with chunked documents from nb11
- Add reranking in nb15 for better retrieval quality
"""
print(nb12_summary)

# Completion banner (plain strings: there is nothing to interpolate).
print("\n=== Notebook nb12 Completed Successfully ===")
print("BGE-M3 embedder ready for Stage 2 RAG pipeline!")