In [None]:
# nb11_zh_chunking_strategies.ipynb
# 中文分段策略：階層式切分與參數優化

# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache bootstrap (copy this cell into every notebook).
# Every model/dataset cache lives under a single root directory, which can be
# overridden with the AI_CACHE_ROOT environment variable.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# (environment variable, sub-directory below AI_CACHE_ROOT)
cache_layout = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)
for env_name, subdir in cache_layout:
    cache_dir = f"{AI_CACHE_ROOT}/{subdir}"
    # Export the location, then make sure the directory actually exists.
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)
print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
## Cell 2: Dependencies & Sample Data Preparation
import json
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import tiktoken
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# In-memory sample documents (Traditional Chinese, Markdown-style) used as
# chunking fixtures throughout this notebook. Keys are short document names,
# values are the raw text.
#   - "tech_article": `#`/`##`/`###` headings, prose paragraphs and a `-` list.
#   - "education_content": a "第一章" title, nested headings, numbered and `-`
#     lists, and runs of single-line items separated only by "\n".
SAMPLE_ZH_DOCS = {
    "tech_article": """# 大型語言模型的檢索增強生成技術

## 1. 概述

檢索增強生成（Retrieval-Augmented Generation, RAG）是一種結合外部知識檢索與語言模型生成的技術。它能夠有效解決大型語言模型在處理特定領域知識時的局限性。

RAG 系統主要包含三個核心組件：文檔編碼器、檢索器和生成器。當用戶提出問題時，系統首先從知識庫中檢索相關文檔，然後將這些文檔與原始問題一起輸入生成器，產生最終答案。

## 2. 技術架構

### 2.1 文檔處理流程

文檔處理包括以下步驟：文本清理、分段處理、向量化編碼。其中，分段處理是關鍵環節，需要在保持語義完整性的同時，確保片段長度適合檢索模型。

對於中文文本，分段策略需要考慮以下因素：
- 句號、問號、感嘆號等強停頓標點
- 分號、冒號等中等停頓標點
- 段落結構和章節標題
- 語義單元的完整性

### 2.2 檢索機制

傳統的關鍵詞檢索方法在處理語義相似但詞彙不同的查詢時效果有限。向量檢索通過將文本映射到高維語義空間，能夠捕捉更深層的語義關係。

混合檢索結合了關鍵詞檢索的精確性和向量檢索的語義理解能力，通常能獲得更好的檢索效果。

## 3. 實際應用

RAG 技術已經在多個領域得到應用，包括智能客服、知識問答、文檔分析等。在實際部署中，需要根據具體業務場景調整檢索策略和生成參數。

性能優化方面，可以考慮使用快取機制、批量處理、模型量化等技術來提升系統響應速度和降低資源消耗。""",
    "education_content": """# 第一章 學習方法論

## 1.1 主動學習策略

主動學習是一種以學習者為中心的教學方法。它強調學習者在學習過程中的主動參與和思考。

有效的主動學習包括以下要素：
1. 明確的學習目標
2. 積極的參與態度
3. 反思與總結

### 提問技巧

提問是主動學習的重要工具。好的問題能夠：
- 引發深度思考
- 促進知識連結
- 檢驗理解程度

## 1.2 記憶技巧

### 間隔重複法

間隔重複是基於遺忘曲線理論的記憶方法。通過在特定時間間隔內重複學習，可以有效提高長期記憶效果。

具體實施方法：
第一次複習：1天後
第二次複習：3天後
第三次複習：1週後
第四次複習：2週後
第五次複習：1個月後

### 聯想記憶法

聯想記憶通過建立新舊知識間的連結來幫助記憶。常用技巧包括：
- 圖像聯想
- 故事串聯
- 邏輯歸納

## 1.3 批判性思維

批判性思維是現代教育的核心能力之一。它包括：

分析能力：能夠拆解複雜問題，識別關鍵要素和內在邏輯。
評估能力：對信息來源的可靠性、論證的有效性進行判斷。
推理能力：基於已知信息得出合理結論。
反思能力：檢視自己的思維過程和結論。

培養批判性思維需要持續的練習和指導。""",
}

# Write each sample document to data/<name>_zh.txt as UTF-8 text.
data_dir = pathlib.Path("data")
data_dir.mkdir(parents=True, exist_ok=True)
for stem, body in SAMPLE_ZH_DOCS.items():
    (data_dir / f"{stem}_zh.txt").write_text(body, encoding="utf-8")

print("✅ Sample Chinese documents prepared")

In [None]:
## Cell 3: Chinese-Friendly Splitter Implementation
@dataclass
class ChunkingConfig:
    """Settings for hierarchical chunking of Chinese text.

    Attributes:
        chunk_size: Maximum chunk length, in whatever unit the consuming
            splitter's length function returns (``ChineseTextSplitter`` in this
            notebook counts estimated tokens, not characters).
        chunk_overlap: Length shared between consecutive chunks, same unit.
        separators: Split points ordered from coarsest to finest. ``None``
            selects the built-in Chinese hierarchy assembled in
            ``__post_init__``.
        add_start_index: Forwarded to the splitter unchanged.
        strip_whitespace: Forwarded to the splitter unchanged.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            negative or larger than ``chunk_size``.
    """

    chunk_size: int = 800  # Notebook default for Chinese text
    chunk_overlap: int = 80  # 10% of the default chunk_size
    separators: Optional[List[str]] = None
    add_start_index: bool = True
    strip_whitespace: bool = True

    def __post_init__(self):
        # Fail early with a clear message instead of deep inside the splitter.
        if self.chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")
        if not 0 <= self.chunk_overlap <= self.chunk_size:
            raise ValueError(
                f"chunk_overlap must be within [0, chunk_size], "
                f"got {self.chunk_overlap} (chunk_size={self.chunk_size})"
            )

        if self.separators is None:
            # A recursive splitter tries separators in list order, so coarser
            # boundaries must come first: headings -> chapter/section markers
            # -> paragraphs -> sentence/clause punctuation -> line breaks ->
            # spaces. The paragraph break therefore precedes the punctuation
            # marks; otherwise paragraphs would be cut into sentences before
            # paragraph boundaries were ever considered.
            self.separators = [
                "\n### ",  # H3 headers
                "\n## ",  # H2 headers
                "\n# ",  # H1 headers
                # The next two are regular-expression patterns; they only act
                # as patterns if the splitter treats separators as regexes.
                "\n第.*?章",  # Chapter markers
                "\n第.*?節",  # Section markers
                "\n\n",  # Double newline (paragraph)
                "。",  # Period (strongest sentence boundary)
                "！",  # Exclamation
                "？",  # Question mark
                "；",  # Semicolon
                "：",  # Colon
                "…",  # Ellipsis
                "\n",  # Single newline
                " ",  # Space (weakest)
            ]


class ChineseTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive splitter for Chinese text whose chunk size is token-based.

    Chunk length is measured with ``_chinese_length_function`` (tiktoken
    ``cl100k_base`` tokens, or a character-count estimate when the tokenizer
    cannot be used) instead of raw character count.

    Args:
        config: Chunk size/overlap, separator hierarchy and whitespace options
            forwarded to ``RecursiveCharacterTextSplitter``.
    """

    # Heading at the very start of a chunk: Markdown "#" style or 第…章 / 第…節.
    _TITLE_PATTERN = re.compile(r"^#+\s+|^第.*?[章節]")
    # Sentence- and clause-level punctuation counted for the density metric.
    _PUNCTUATION_PATTERN = re.compile(r"[。！？；：]")
    # CJK Unified Ideographs block, used by the fallback token estimate.
    _CJK_PATTERN = re.compile(r"[\u4e00-\u9fff]")

    def __init__(self, config: ChunkingConfig):
        super().__init__(
            separators=config.separators,
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            add_start_index=config.add_start_index,
            strip_whitespace=config.strip_whitespace,
            length_function=self._chinese_length_function,
            # The default separators include patterns such as "\n第.*?章".
            # Without this flag the base class escapes every separator, so
            # those patterns would be matched literally and never fire. A
            # config may opt out by exposing is_separator_regex=False.
            is_separator_regex=getattr(config, "is_separator_regex", True),
        )
        self.config = config
        # Resolve the tokenizer once; the length function runs for every
        # candidate split, so per-call lookup (and a per-call retry when the
        # encoding is unavailable) is wasted work.
        self._encoding = self._load_encoding()

    @staticmethod
    def _load_encoding():
        """Return the ``cl100k_base`` encoding, or ``None`` if it cannot load."""
        try:
            return tiktoken.get_encoding("cl100k_base")
        except Exception:
            # Deliberate best effort: any failure to obtain the encoding
            # switches the splitter to the character-based estimate.
            return None

    def _chinese_length_function(self, text: str) -> int:
        """Return the estimated token count of ``text``.

        Uses tiktoken when available; otherwise estimates 1.5 tokens per CJK
        ideograph and 0.8 tokens per other character.
        """
        encoding = getattr(self, "_encoding", None)
        if encoding is not None:
            try:
                # disallowed_special=() makes special-token look-alikes in the
                # text encode as ordinary text instead of raising.
                return len(encoding.encode(text, disallowed_special=()))
            except Exception:
                # Fall through to the heuristic rather than abort chunking.
                pass

        chinese_chars = len(self._CJK_PATTERN.findall(text))
        other_chars = len(text) - chinese_chars
        return int(chinese_chars * 1.5 + other_chars * 0.8)

    def create_documents_with_metadata(
        self, texts: List[str], metadatas: Optional[List[Dict]] = None
    ) -> List[Document]:
        """Split ``texts`` and wrap every chunk in a ``Document`` with metrics.

        Each chunk's metadata is the source text's metadata plus: ``chunk_id``
        (``"<text index>_<chunk index>"``), ``chunk_index``, ``total_chunks``,
        ``char_count``, ``est_tokens``, ``has_title`` (chunk starts with a
        heading) and ``punctuation_density`` (punctuation marks per character).

        NOTE: chunks come from ``split_text``, so no ``start_index`` entry is
        added here regardless of ``add_start_index``.

        Args:
            texts: Raw texts to split.
            metadatas: Optional per-text metadata, aligned with ``texts``.

        Returns:
            All chunks of all texts, in input order.

        Raises:
            ValueError: If ``metadatas`` has fewer entries than ``texts``.
        """
        if metadatas is None:
            # One independent dict per text (no shared instance).
            metadatas = [{} for _ in texts]
        elif len(metadatas) < len(texts):
            raise ValueError(
                f"Got {len(metadatas)} metadata entries for {len(texts)} texts"
            )

        documents = []
        for i, (text, base_meta) in enumerate(zip(texts, metadatas)):
            chunks = self.split_text(text)

            for j, chunk in enumerate(chunks):
                chunk_meta = {
                    **base_meta,
                    "chunk_id": f"{i}_{j}",
                    "chunk_index": j,
                    "total_chunks": len(chunks),
                    "char_count": len(chunk),
                    "est_tokens": self._chinese_length_function(chunk),
                    "has_title": bool(self._TITLE_PATTERN.search(chunk.strip())),
                    "punctuation_density": len(
                        self._PUNCTUATION_PATTERN.findall(chunk)
                    )
                    / max(1, len(chunk)),
                }

                documents.append(Document(page_content=chunk, metadata=chunk_meta))

        return documents


# Three strategies to compare; each uses an overlap of 10% of its chunk size.
configs = {
    label: ChunkingConfig(chunk_size=size, chunk_overlap=overlap)
    for label, size, overlap in (
        ("conservative", 600, 60),
        ("balanced", 800, 80),
        ("aggressive", 1200, 120),
    )
}

# One splitter per strategy, keyed by the strategy name.
splitters = {label: ChineseTextSplitter(cfg) for label, cfg in configs.items()}

print("✅ Chinese text splitters initialized")
print(f"Available strategies: {list(splitters)}")

In [None]:
## Cell 4: Chunking Strategy Comparison
def analyze_chunking_results(documents: List[Document], strategy_name: str) -> Dict:
    """Summarise per-chunk metadata into aggregate statistics.

    Args:
        documents: Chunks whose ``metadata`` may carry ``char_count``,
            ``est_tokens``, ``punctuation_density`` and ``has_title``; missing
            numeric entries count as 0 and a missing ``has_title`` as False.
        strategy_name: Label stored under the ``"strategy"`` key.

    Returns:
        An empty dict when ``documents`` is empty; otherwise the chunk count,
        mean character/token/punctuation values, min and max character length
        and the number of chunks flagged as starting with a title.
    """
    if not documents:
        return {}

    def column(key):
        # Pull one numeric metadata field from every chunk.
        return [doc.metadata.get(key, 0) for doc in documents]

    def mean(values):
        return sum(values) / len(values)

    char_counts = column("char_count")
    titled = [doc for doc in documents if doc.metadata.get("has_title", False)]

    summary = {"strategy": strategy_name, "total_chunks": len(documents)}
    summary["avg_char_length"] = mean(char_counts)
    summary["avg_token_count"] = mean(column("est_tokens"))
    summary["avg_punct_density"] = mean(column("punctuation_density"))
    summary["min_length"] = min(char_counts)
    summary["max_length"] = max(char_counts)
    summary["chunks_with_titles"] = len(titled)
    return summary


# Run every strategy over every sample document. The chunks and their summary
# statistics are stored under results[doc_name][strategy_name].
results = {}

for doc_name, content in SAMPLE_ZH_DOCS.items():
    print(f"\n📄 Processing: {doc_name}")
    print(f"Original length: {len(content)} characters")

    per_strategy = results[doc_name] = {}

    # Document-level metadata copied onto every chunk of this document.
    content_type = "educational" if "education" in doc_name else "technical"
    metadata = {"source": doc_name, "language": "zh", "content_type": content_type}

    for strategy_name, splitter in splitters.items():
        documents = splitter.create_documents_with_metadata([content], [metadata])
        analysis = analyze_chunking_results(documents, strategy_name)
        per_strategy[strategy_name] = {"analysis": analysis, "documents": documents}

        chunk_total = analysis["total_chunks"]
        mean_chars = analysis["avg_char_length"]
        mean_tokens = analysis["avg_token_count"]
        print(
            f"  {strategy_name:>12}: {chunk_total:2d} chunks, "
            f"avg {mean_chars:.0f} chars, "
            f"avg {mean_tokens:.0f} tokens"
        )

print("\n✅ Chunking analysis completed")

In [None]:
## Cell 5: Chunk Quality Assessment
def assess_chunk_quality(chunk: str) -> Dict[str, float]:
    """Score one chunk with simple, regex-based quality heuristics.

    Args:
        chunk: Chunk text; an empty string is accepted and scores 0.

    Returns:
        A dict with:
        - ``sentence_completeness``: share of ``。！？`` marks that are followed
          by whitespace or the end of the chunk.
        - ``has_heading`` (bool): the stripped chunk starts with a Markdown
          heading or a ``第…章`` / ``第…節`` marker.
        - ``has_structure`` (bool): some line starts with a numbered, ``-`` or
          ``•`` list marker.
        - ``transition_density``: transition words found per 1000 characters,
          capped at 1.0.
        - ``content_density``: share of characters that are CJK ideographs.
        - ``overall_quality``: weighted mean of completeness (1), content
          density (1) and transition density (0.5), always within [0, 1].
    """
    # Every ratio below is per character; treat an empty chunk as length 1 so
    # it scores 0 instead of raising ZeroDivisionError.
    chunk_len = max(1, len(chunk))

    # Sentence completeness
    complete_sentences = len(re.findall(r"[。！？](?:\s|$)", chunk))
    total_possible_sentences = len(re.findall(r"[。！？]", chunk))
    sentence_completeness = complete_sentences / max(1, total_possible_sentences)

    # Structural elements
    has_heading = bool(re.search(r"^#+\s+|^第.*?[章節]", chunk.strip()))
    has_list_structure = bool(re.search(r"^\d+\.|^-\s+|^•\s+", chunk, re.MULTILINE))

    # Coherence indicator (simplified): distinct transition words present.
    transition_words = (
        "因此",
        "所以",
        "但是",
        "然而",
        "此外",
        "另外",
        "首先",
        "其次",
        "最後",
    )
    transition_hits = sum(1 for word in transition_words if word in chunk)
    # Clamp before use: the raw per-1000-characters value is unbounded for
    # short chunks and would otherwise swamp the other two components of
    # overall_quality.
    transition_density = min(1.0, transition_hits / chunk_len * 1000)

    # Content density (CJK ideographs vs. all characters)
    chinese_chars = len(re.findall(r"[\u4e00-\u9fff]", chunk))
    content_density = chinese_chars / chunk_len

    # Weights 1 + 1 + 0.5 sum to 2.5, which normalises the score to [0, 1].
    overall_quality = (
        sentence_completeness + content_density + transition_density / 2
    ) / 2.5

    return {
        "sentence_completeness": sentence_completeness,
        "has_heading": has_heading,
        "has_structure": has_list_structure,
        "transition_density": transition_density,
        "content_density": content_density,
        "overall_quality": overall_quality,
    }


# Average the per-chunk quality metrics for every (document, strategy) pair
# and keep them under results[doc_name][strategy_name]["quality"].
print("🎯 Chunk Quality Assessment\n")

for doc_name in SAMPLE_ZH_DOCS:
    print(f"📄 Document: {doc_name}")

    for strategy_name in splitters:
        entry = results[doc_name][strategy_name]
        quality_scores = [
            assess_chunk_quality(doc.page_content) for doc in entry["documents"]
        ]
        chunk_count = len(quality_scores)

        # Mean of every metric across the chunks (booleans average as 0/1).
        avg_quality = {}
        for metric in quality_scores[0]:
            avg_quality[metric] = (
                sum(score[metric] for score in quality_scores) / chunk_count
            )

        overall = avg_quality["overall_quality"]
        completeness = avg_quality["sentence_completeness"]
        density = avg_quality["content_density"]
        print(
            f"  {strategy_name:>12}: "
            f"Quality={overall:.2f}, "
            f"Completeness={completeness:.2f}, "
            f"Density={density:.2f}"
        )

        entry["quality"] = avg_quality

    print()

In [None]:
## Cell 6: Optimal Strategy Selection & Recommendations
def recommend_strategy(
    document_type: str, content_characteristics: Dict
) -> Tuple[str, str]:
    """Pick a chunking strategy name and the reason for choosing it.

    Rules are checked in order and the first match wins:
    1. truthy ``has_many_headings`` flag  -> ``"conservative"``
    2. truthy ``high_punctuation_density`` flag -> ``"balanced"``
    3. ``document_type == "technical"`` -> ``"aggressive"``
    4. anything else -> ``"balanced"``

    Args:
        document_type: Document category label.
        content_characteristics: Flags describing the content; missing flags
            are treated as False.

    Returns:
        ``(strategy_name, reason)``; the reason text is in Chinese.
    """
    flag_rules = (
        ("has_many_headings", ("conservative", "文檔結構豐富，使用保守策略保持標題完整性")),
        ("high_punctuation_density", ("balanced", "標點密度高，平衡策略可保持語義完整")),
    )
    for flag, verdict in flag_rules:
        if content_characteristics.get(flag, False):
            return verdict

    if document_type != "technical":
        return "balanced", "一般文檔推薦使用平衡策略"
    return "aggressive", "技術文檔通常邏輯性強，可使用較大塊"


# For each document, pick the strategy with the highest averaged
# overall_quality, preview its first chunk, then export the winning chunks.
print("💡 Chunking Strategy Recommendations\n")

best_strategies = {}
for doc_name in SAMPLE_ZH_DOCS:
    print(f"📄 {doc_name}:")

    # max() keeps the first strategy on ties, i.e. insertion order decides.
    best_strategy, best_entry = max(
        results[doc_name].items(),
        key=lambda item: item[1]["quality"]["overall_quality"],
    )
    best_quality = best_entry["quality"]["overall_quality"]
    best_strategies[doc_name] = best_strategy

    print(f"  🏆 Best strategy: {best_strategy} (quality: {best_quality:.3f})")

    # Preview: first 200 characters of the first chunk.
    sample_chunk = best_entry["documents"][0].page_content[:200]
    print(f"  📝 Sample chunk: {sample_chunk}...")
    print()

# Export the winning chunks for the next notebook: one JSON object per line.
print("💾 Saving chunked documents for RAG indexing...")

os.makedirs("data/chunks", exist_ok=True)

for doc_name, best_strategy in best_strategies.items():
    documents = results[doc_name][best_strategy]["documents"]

    output_file = f"data/chunks/{doc_name}_chunked.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(
                {"text": doc.page_content, "metadata": doc.metadata},
                ensure_ascii=False,
            )
            + "\n"
            for doc in documents
        )

    print(f"  ✅ Saved {len(documents)} chunks to {output_file}")

print("\n🎉 Chunking strategies analysis completed!")

In [None]:
## Cell 7: Smoke Test - Quick Validation
# Smoke test: the balanced splitter must turn a small document into one or
# more non-empty chunks.
print("🧪 Smoke Test: Chinese Chunking")

test_text = """# 測試文檔

這是一個測試段落。它包含多個句子；用來驗證分段效果！

## 小節標題

另一個段落的內容。包含：列表項目、技術詞彙。最後一句話。"""

# Test default balanced strategy
splitter = splitters["balanced"]
chunks = splitter.split_text(test_text)

# Validate before reporting: the average below divides by len(chunks), so an
# empty result must fail here with the intended message rather than with a
# ZeroDivisionError.
assert len(chunks) > 0, "Should produce at least one chunk"
assert all(len(chunk.strip()) > 0 for chunk in chunks), "All chunks should have content"

print(f"✅ Input: {len(test_text)} chars")
print(f"✅ Output: {len(chunks)} chunks")
print(f"✅ Avg chunk size: {sum(len(c) for c in chunks) / len(chunks):.0f} chars")

# Show the first 50 characters of each chunk for a visual sanity check
for i, chunk in enumerate(chunks):
    print(f"✅ Chunk {i+1}: {chunk.strip()[:50]}...")

print("\n🎯 Smoke test passed! Chunking strategy working correctly.")

In [None]:
## Cell 8: When to Use This & Next Steps
# Reader-facing wrap-up: when to use this notebook, the key parameters, common
# pitfalls, follow-up notebooks and the files generated here.
closing_notes = """
🎯 When to Use This Notebook:

1. **文檔預處理階段** - 在建立 RAG 系統之前，優化文本分段策略
2. **多語言處理** - 處理中文文檔時，需要考慮語言特性
3. **檢索品質優化** - 當檢索結果不理想時，重新評估分段策略
4. **系統性能調優** - 平衡檢索精度與計算效率

🔧 Key Parameters to Remember:
- chunk_size: 600-1200 (中文建議 800)
- chunk_overlap: 10-15% of chunk_size
- separators: 階層式標點符號優先級
- length_function: 考慮中文字符密度

⚠️ Common Pitfalls:
- 忽略中文標點符號的語義邊界
- chunk_size 設置過大導致語義混雜
- overlap 太小失去上下文，太大浪費資源
- 未考慮文檔結構（標題、列表）

🚀 Next Steps:
- nb12: 使用 bge-m3 對分段文本進行向量化
- nb13: 建立 FAISS 索引進行高效檢索
- nb14: 實現查詢與引用功能

📁 Generated Files:
- data/chunks/*.jsonl - 分段結果，供後續 notebook 使用
"""
print(closing_notes)

# nb11_zh_chunking_strategies.ipynb 實作計畫

## 目標（Goals）

1. **中文分段策略**：實作適合中文文本的分段方法，處理無空格分詞特性
2. **多層級分段**：章節標題 → 段落 → 標點符號的階層式切分
3. **參數優化**：針對中文調整 chunk_size/overlap，考慮標點密度與語義完整性
4. **分段品質評估**：比較不同策略的語義保持度與檢索效果
5. **實際應用**：為後續 RAG 流程提供高品質的文本片段

## Notebook 大綱（Cells & Purpose）

## 核心代碼要點（Core Code Blocks）

### 1. 中文特化分段器
```python
class ChineseTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self, config: ChunkingConfig):
        # 階層式分隔符：標題 → 章節 → 強標點 → 弱標點
        separators = ["\n### ", "\n## ", "\n# ", "。", "！", "？", "；", "："]
        super().__init__(separators=separators, chunk_size=800, chunk_overlap=80)
```

### 2. 中文長度計算函數
```python
def _chinese_length_function(self, text: str) -> int:
    # 中文字符密度較高，需要調整 token 估算
    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
    return int(chinese_chars * 1.5 + (len(text) - chinese_chars) * 0.8)
```

### 3. 品質評估指標
```python
def assess_chunk_quality(chunk: str) -> Dict[str, float]:
    # 句子完整性、結構性、連貫性、內容密度
    complete_sentences = len(re.findall(r'[。！？](?:\s|$)', chunk))
    total_sentences = len(re.findall(r'[。！？]', chunk))
    completeness = complete_sentences / max(1, total_sentences)
    density = len(re.findall(r'[\u4e00-\u9fff]', chunk)) / max(1, len(chunk))
    return {"overall_quality": (completeness + density) / 2}
```

## Smoke Test（煙霧測試）

```python
# 快速驗證分段功能
test_text = """# 測試\n這是第一段。包含多句話！\n## 小節\n另一段內容；結束。"""
chunks = splitter.split_text(test_text)
assert len(chunks) > 0 and all(len(c.strip()) > 0 for c in chunks)
print(f"✅ 生成 {len(chunks)} 個語義完整的片段")
```



## Stage 2 進度總結

### Completed（已完成）
- ✅ **nb10**: 文檔載入與清理（HTML/PDF → 純文字）
- ✅ **nb11**: 中文分段策略（階層式切分 + 品質評估）

### Core Concepts（核心概念）
- **階層式分段**：標題 → 段落 → 標點符號的優先級
- **中文特化處理**：考慮字符密度與標點語義邊界
- **品質評估**：句子完整性、結構性、內容密度指標
- **參數調優**：chunk_size/overlap 針對中文優化

### Pitfalls（常見陷阱）
- ⚠️ chunk_size 設置需考慮中文 token 密度（建議 600-1200）
- ⚠️ 分隔符順序影響語義邊界保持
- ⚠️ overlap 太小失去上下文，太大浪費計算資源
- ⚠️ 忽略文檔結構（標題、列表）的完整性

### Next Actions（下一步）
1. **nb12**: bge-m3 嵌入模型對分段文本向量化
2. **nb13**: FAISS 索引建立與存儲
3. **nb14**: 查詢檢索與引用格式化
4. **nb15**: bge-reranker 重排器提升檢索精度

分段是 RAG 系統的基礎，高品質的分段直接影響後續檢索與生成效果。nb11 為我們建立了中文友好的分段基線，接下來進入向量化階段！