In [None]:
# Stage 2 - 中文 RAG 基礎
# nb10_loaders_and_cleaning.ipynb
# Goals: PDF/MD/HTML 讀取、trafilatura 抽取、繁簡轉換、文字正規化

# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# The root can be overridden through the AI_CACHE_ROOT environment variable.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

# (environment variable, sub-directory below the cache root)
_CACHE_LAYOUT = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)

for env_name, subdir in _CACHE_LAYOUT:
    cache_dir = f"{AI_CACHE_ROOT}/{subdir}"
    # Export the location first, then make sure the directory exists.
    os.environ[env_name] = cache_dir
    os.makedirs(cache_dir, exist_ok=True)

print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# ============================================================================
# Cell 2: Import Dependencies
# ============================================================================

import re
import json
import hashlib
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass

# Third-party dependencies.
# Each package is imported if present; otherwise it is installed into the
# interpreter that runs this kernel and the import is retried once.
import subprocess
import sys


def _pip_install(requirement: str) -> None:
    """Install ``requirement`` with the pip of the running interpreter.

    The requirement is passed as a single argv element, so version
    specifiers such as ``>=1.6.0`` are never interpreted by a shell
    (``pip install pkg>=1.6.0`` through a shell redirects output to a file
    named ``=1.6.0`` and silently drops the version bound).

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])


# Core text processing
try:
    from trafilatura import extract, bare_extraction

    print("✓ trafilatura available")
except ImportError:
    print("⚠ Installing trafilatura...")
    _pip_install("trafilatura>=1.6.0")
    from trafilatura import extract, bare_extraction

try:
    from opencc import OpenCC

    print("✓ opencc available")
except ImportError:
    print("⚠ Installing opencc...")
    _pip_install("opencc>=1.1.0")
    from opencc import OpenCC

try:
    from rapidfuzz import fuzz

    print("✓ rapidfuzz available")
except ImportError:
    print("⚠ Installing rapidfuzz...")
    _pip_install("rapidfuzz>=2.13.0")
    from rapidfuzz import fuzz

# File handling
try:
    import PyPDF2

    print("✓ PyPDF2 available")
except ImportError:
    print("⚠ Installing PyPDF2...")
    _pip_install("PyPDF2>=3.0.0")
    import PyPDF2

In [None]:
# ============================================================================
# Cell 3: Data Structures
# ============================================================================


@dataclass
class DocumentMeta:
    """Provenance record attached to a cleaned document.

    Only ``source_id`` is required; every other field defaults to ``None``
    except ``content_type``, which defaults to ``"text"``.
    """

    source_id: str
    uri: Optional[str] = None
    title: Optional[str] = None
    page: Optional[int] = None
    section: Optional[str] = None
    lang: Optional[str] = None
    content_type: str = "text"

    def to_dict(self) -> Dict:
        """Return the fields as a plain dict, omitting those that are ``None``.

        Falsy-but-not-None values (``""``, ``0``) are kept.
        """
        populated = {}
        for field_name, value in vars(self).items():
            if value is None:
                continue
            populated[field_name] = value
        return populated


@dataclass
class CleanDocument:
    """Normalized text bundled with its metadata and two size counters."""

    text: str
    meta: DocumentMeta
    # Both counters are recomputed from ``text`` in __post_init__, so any
    # value passed to the constructor is overwritten.
    word_count: int = 0
    char_count: int = 0

    def __post_init__(self):
        # str.split() with no argument splits on runs of whitespace, so
        # word_count is the number of whitespace-delimited chunks.
        chunks = self.text.split()
        self.word_count, self.char_count = len(chunks), len(self.text)

    def to_dict(self) -> Dict:
        """Return a JSON-serializable dict; ``meta`` is flattened via its own ``to_dict``."""
        payload = dict(text=self.text, meta=self.meta.to_dict())
        payload["word_count"] = self.word_count
        payload["char_count"] = self.char_count
        return payload

In [None]:
# ============================================================================
# Cell 4: Text Normalizer
# ============================================================================

class ChineseTextNormalizer:
    """Whitespace, punctuation and script normalization for Chinese text.

    Args:
        t2s: Convert Traditional to Simplified Chinese with OpenCC.
        s2t: Convert Simplified to Traditional Chinese with OpenCC.
            Ignored when ``t2s`` is also true (``t2s`` takes precedence).

    With both flags false no script conversion is performed.
    """

    # CJK Unified Ideographs plus Extension A, as a regex character-class body.
    _CJK = r"\u3400-\u4dbf\u4e00-\u9fff"
    # Half-width marks and their full-width counterparts.
    _HALF_TO_FULL = str.maketrans(
        {",": "，", "!": "！", "?": "？", ";": "；", ":": "："}
    )
    # A run of half-width marks that directly follows a CJK character.
    _MARKS_AFTER_CJK = re.compile(rf"(?<=[{_CJK}])[,!?;:]+")
    # A sentence-ending "." after a CJK character. The look-ahead keeps file
    # extensions ("報告.pdf"), decimals and "..." untouched.
    _STOP_AFTER_CJK = re.compile(rf"(?<=[{_CJK}])\.(?![A-Za-z0-9.])")
    # A non-nested "(...)" pair on a single line.
    _PAREN_PAIR = re.compile(r"\(([^()\n]*)\)")
    _HAS_CJK = re.compile(rf"[{_CJK}]")

    def __init__(self, t2s: bool = False, s2t: bool = False):
        self.converter = None
        if t2s:
            self.converter = OpenCC('t2s')  # Traditional to Simplified
            print("Initialized T2S converter")
        elif s2t:
            self.converter = OpenCC('s2t')  # Simplified to Traditional
            print("Initialized S2T converter")

    def normalize_whitespace(self, text: str) -> str:
        """Collapse redundant spaces and blank lines.

        Steps, in order: unify line endings, strip spaces/tabs around every
        newline, collapse runs of spaces, then limit consecutive newlines to
        two. Stripping happens *before* the newline collapse so that lines
        containing only spaces count as blank lines.

        Returns:
            The normalized text with leading/trailing whitespace removed.
        """
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        text = re.sub(r"[ \t]*\n[ \t]*", "\n", text)
        text = re.sub(r" +", " ", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    def _widen_parens(self, match) -> str:
        """Return the matched "(...)" with full-width parentheses if it encloses CJK text."""
        inner = match.group(1)
        if self._HAS_CJK.search(inner):
            return f"（{inner}）"
        return match.group(0)

    def normalize_punctuation(self, text: str) -> str:
        """Convert half-width punctuation in Chinese context to full-width.

        Conversion is context-aware so Latin text, numbers, URLs and file
        names are left alone:

        * ``, ! ? ; :`` are converted when they directly follow a CJK
          character (including the rest of such a run, e.g. ``!!``).
        * ``.`` becomes ``。`` when it follows a CJK character and is not
          followed by an ASCII letter, digit or another ``.``.
        * ``( )`` are converted as a pair when the enclosed text contains a
          CJK character.

        Quotation marks are not converted because ASCII quotes carry no
        open/close information. Afterwards, repeated full-width marks are
        reduced to a single mark (the last one of the run).
        """
        text = self._MARKS_AFTER_CJK.sub(
            lambda m: m.group().translate(self._HALF_TO_FULL), text
        )
        text = self._STOP_AFTER_CJK.sub("。", text)
        text = self._PAREN_PAIR.sub(self._widen_parens, text)

        # Remove duplicate punctuation
        text = re.sub(r'([。！？；]){2,}', r'\1', text)
        text = re.sub(r'([，：]){2,}', r'\1', text)

        return text

    def full_normalize(self, text: str) -> str:
        """Run whitespace, punctuation and (if configured) script normalization.

        Returns:
            The normalized text, or ``""`` for ``None``/empty/whitespace-only input.
        """
        if not text or not text.strip():
            return ""

        # Basic cleanup
        text = self.normalize_whitespace(text)
        text = self.normalize_punctuation(text)

        # Traditional/Simplified conversion
        if self.converter:
            text = self.converter.convert(text)

        return text

In [None]:
# ============================================================================
# Cell 5: Document Loaders
# ============================================================================


class DocumentLoader:
    """Multi-format document loader with Chinese support.

    Every ``load_*`` method runs the extracted text through the configured
    normalizer and wraps it in a ``CleanDocument`` with source metadata.

    Args:
        normalizer: Normalizer applied to all extracted text. A default
            ``ChineseTextNormalizer()`` (no script conversion) is used when
            omitted.
    """

    def __init__(self, normalizer: Optional[ChineseTextNormalizer] = None):
        self.normalizer = normalizer or ChineseTextNormalizer()

    @staticmethod
    def _fallback_id(prefix: str, content: str) -> str:
        """Build ``<prefix>_<8 hex chars>`` from a hash of the whole content.

        The full content is hashed (not just a leading slice) so documents
        that share a common header or boilerplate still get distinct IDs.
        MD5 is used only as a fingerprint, not for security.
        """
        digest = hashlib.md5(content.encode()).hexdigest()[:8]
        return f"{prefix}_{digest}"

    @staticmethod
    def _extraction_field(result, name: str):
        """Read ``name`` from a ``bare_extraction`` result.

        Accepts both a dict and an attribute-style result object, since the
        return type differs between trafilatura releases. Returns ``""``
        when the result is ``None`` or lacks the field.
        """
        if result is None:
            return ""
        if isinstance(result, dict):
            return result.get(name, "")
        return getattr(result, name, "")

    def load_html(
        self, content: str, uri: str = "", source_id: str = ""
    ) -> Optional[CleanDocument]:
        """Extract the main text of an HTML page with trafilatura.

        Args:
            content: Raw HTML markup.
            uri: Where the page came from; stored in the metadata.
            source_id: Identifier for the document; a content hash is used
                when empty.

        Returns:
            A ``CleanDocument`` with ``content_type="html"``, or ``None`` if
            extraction fails or yields fewer than 10 non-blank characters.
        """
        try:
            # Basic extraction
            text = extract(content, include_links=False, include_images=False)

            # One structured extraction serves both as the text fallback
            # and as the source of the page title.
            details = bare_extraction(content, include_links=False)
            if not text:
                text = self._extraction_field(details, "text")

            if not text or len(text.strip()) < 10:
                return None

            # Normalize text
            clean_text = self.normalizer.full_normalize(text)

            meta = DocumentMeta(
                source_id=source_id or self._fallback_id("html", content),
                uri=uri,
                title=self._extraction_field(details, "title"),
                content_type="html",
            )

            return CleanDocument(text=clean_text, meta=meta)

        except Exception as e:
            # Best-effort loader: report and skip the page instead of aborting a batch.
            print(f"HTML extraction failed: {e}")
            return None

    def load_pdf(self, file_path: str, source_id: str = "") -> List[CleanDocument]:
        """Extract text from a PDF, one ``CleanDocument`` per page.

        Pages whose extracted text has fewer than 10 non-blank characters
        are skipped. All pages share the same ``source_id``; the 1-based
        page number is stored in ``meta.page``.

        Returns:
            The documents extracted so far. On an error the message is
            printed and the pages collected before the failure are returned.
        """
        documents = []

        try:
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)

                for page_num, page in enumerate(reader.pages):
                    text = page.extract_text()
                    if not text or len(text.strip()) < 10:
                        continue

                    clean_text = self.normalizer.full_normalize(text)

                    meta = DocumentMeta(
                        source_id=source_id or f"pdf_{Path(file_path).stem}",
                        uri=f"file://{file_path}",
                        page=page_num + 1,
                        content_type="pdf",
                    )

                    documents.append(CleanDocument(text=clean_text, meta=meta))

        except Exception as e:
            print(f"PDF extraction failed: {e}")

        return documents

    def load_markdown(
        self, content: str, source_id: str = "", uri: str = ""
    ) -> CleanDocument:
        """Strip common Markdown syntax and normalize the remaining text.

        Fenced code blocks are removed entirely; headers, bold, italic,
        inline code, images and links are reduced to their visible text.
        """
        # Fenced code blocks must go first: otherwise the inline-code rule
        # eats the backticks of the fence and the block is never removed.
        text = re.sub(r"```.*?```", "", content, flags=re.DOTALL)  # Code blocks
        text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)  # Headers
        text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)  # Bold
        text = re.sub(r"\*(.*?)\*", r"\1", text)  # Italic
        text = re.sub(r"`(.*?)`", r"\1", text)  # Inline code
        # Images before links so no stray "!" is left in front of the alt text.
        text = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"\1", text)  # Images
        text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)  # Links

        clean_text = self.normalizer.full_normalize(text)

        meta = DocumentMeta(
            source_id=source_id or self._fallback_id("md", content),
            uri=uri,
            content_type="markdown",
        )

        return CleanDocument(text=clean_text, meta=meta)

    def load_text(
        self, content: str, source_id: str = "", uri: str = ""
    ) -> CleanDocument:
        """Normalize plain text and wrap it in a ``CleanDocument``."""
        clean_text = self.normalizer.full_normalize(content)

        meta = DocumentMeta(
            source_id=source_id or self._fallback_id("txt", content),
            uri=uri,
            content_type="text",
        )

        return CleanDocument(text=clean_text, meta=meta)

In [None]:
# ============================================================================
# Cell 6: Deduplication
# ============================================================================


class DocumentDeduplicator:
    """Drops exact duplicates and documents that are nearly identical.

    Args:
        similarity_threshold: Similarity in the range 0..1 at or above which
            a document is treated as a duplicate of one already kept.
    """

    def __init__(self, similarity_threshold: float = 0.95):
        self.threshold = similarity_threshold

    def get_text_hash(self, text: str) -> str:
        """Fingerprint ``text`` ignoring case and whitespace layout."""
        # Lower-case, trim, then squeeze every whitespace run to one space.
        lowered = text.lower()
        trimmed = lowered.strip()
        collapsed = re.sub(r"\s+", " ", trimmed)
        return hashlib.md5(collapsed.encode("utf-8")).hexdigest()

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Return rapidfuzz's ``token_sort_ratio`` scaled to 0..1.

        Texts shorter than 10 characters are never considered similar and
        yield ``0.0``.
        """
        if min(len(text1), len(text2)) < 10:
            return 0.0

        score = fuzz.token_sort_ratio(text1, text2)
        return score / 100.0

    def deduplicate(
        self, documents: List[CleanDocument]
    ) -> Tuple[List[CleanDocument], int]:
        """Filter ``documents`` down to unique entries, keeping first occurrences.

        Returns:
            ``(kept_documents, number_removed)``.
        """
        if not documents:
            return [], 0

        kept = []
        fingerprints = set()
        dropped = 0

        for candidate in documents:
            # Cheap exact-match check first.
            fingerprint = self.get_text_hash(candidate.text)
            if fingerprint in fingerprints:
                dropped += 1
                continue

            # Lazily score against every kept document; stop at the first
            # score that reaches the threshold.
            scores = (
                self.calculate_similarity(candidate.text, earlier.text)
                for earlier in kept
            )
            hit = next((s for s in scores if s >= self.threshold), None)

            if hit is not None:
                print(f"Similar doc found: {hit:.3f} similarity")
                dropped += 1
                continue

            kept.append(candidate)
            fingerprints.add(fingerprint)

        return kept, dropped

In [None]:
# ============================================================================
# Cell 7: Document Pipeline
# ============================================================================


class DocumentPipeline:
    """End-to-end flow: load files, normalize text, deduplicate, save as JSONL.

    Args:
        convert_to_simplified: Passed to the normalizer as ``t2s``.
        convert_to_traditional: Passed to the normalizer as ``s2t``.
        dedup_threshold: Similarity threshold handed to the deduplicator.
    """

    def __init__(
        self,
        convert_to_simplified: bool = False,
        convert_to_traditional: bool = False,
        dedup_threshold: float = 0.95,
    ):
        normalizer = ChineseTextNormalizer(
            t2s=convert_to_simplified, s2t=convert_to_traditional
        )
        self.normalizer = normalizer
        self.loader = DocumentLoader(normalizer)
        self.deduplicator = DocumentDeduplicator(dedup_threshold)

    def _load_one(self, file_path: str, source: Path) -> List[CleanDocument]:
        """Load a single existing file and return its documents (possibly none)."""
        extension = source.suffix.lower()
        stem = source.stem

        if extension == ".pdf":
            return self.loader.load_pdf(file_path, source_id=stem)

        if extension not in (".html", ".htm", ".md", ".markdown", ".txt"):
            print(f"Unsupported file type: {source.suffix}")
            return []

        # All remaining formats are read as UTF-8 text.
        with open(file_path, "r", encoding="utf-8") as handle:
            content = handle.read()
        file_uri = f"file://{file_path}"

        if extension in (".html", ".htm"):
            # load_html may return None when nothing usable was extracted.
            html_doc = self.loader.load_html(content, uri=file_uri, source_id=stem)
            return [html_doc] if html_doc else []

        if extension in (".md", ".markdown"):
            return [self.loader.load_markdown(content, source_id=stem, uri=file_uri)]

        return [self.loader.load_text(content, source_id=stem, uri=file_uri)]

    def process_files(self, file_paths: List[str]) -> List[CleanDocument]:
        """Load every path, then return the deduplicated documents.

        Missing files, unsupported extensions and per-file errors are
        reported with ``print`` and skipped.
        """
        collected: List[CleanDocument] = []

        for file_path in file_paths:
            source = Path(file_path)
            if not source.exists():
                print(f"File not found: {file_path}")
                continue

            print(f"Processing: {file_path}")

            try:
                collected.extend(self._load_one(file_path, source))
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

        # Deduplicate
        unique_docs, removed = self.deduplicator.deduplicate(collected)
        print(f"Processed {len(collected)} documents, removed {removed} duplicates")

        return unique_docs

    def save_documents(self, documents: List[CleanDocument], output_path: str):
        """Write ``documents`` to ``output_path`` as UTF-8 JSONL, one object per line.

        The parent directory is created if needed; an existing file is overwritten.
        """
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as sink:
            sink.writelines(
                json.dumps(doc.to_dict(), ensure_ascii=False) + "\n"
                for doc in documents
            )

        print(f"Saved {len(documents)} documents to {output_path}")

In [None]:
# ============================================================================
# Cell 8: Smoke Test
# ============================================================================

# Directory that holds the generated sample files
test_data_dir = Path("data/test_samples")
test_data_dir.mkdir(parents=True, exist_ok=True)

# Sample HTML content (Chinese)
sample_html = """
<html>
<head><title>人工智慧簡介</title></head>
<body>
<h1>什麼是人工智慧？</h1>
<p>人工智慧（Artificial Intelligence，簡稱AI）是電腦科學的一個分支，致力於創造能夠執行通常需要人類智慧的任務的機器。</p>
<p>AI的應用領域包括：</p>
<ul>
<li>機器學習（Machine Learning）</li>
<li>自然語言處理（Natural Language Processing）</li>
<li>電腦視覺（Computer Vision）</li>
</ul>
<p>隨著深度學習技術的發展，AI在許多領域都取得了突破性的進展。</p>
</body>
</html>
"""

# Sample Markdown content (Traditional Chinese)
sample_markdown = """
# RAG 系統架構

## 什麼是 RAG？
檢索增強生成（Retrieval-Augmented Generation，RAG）是一種結合資訊檢索與文本生成的AI技術。

## 核心組件
1. **文檔載入器** - 處理各種格式的文檔
2. **向量化** - 將文本轉換為向量表示
3. **檢索器** - 根據查詢找到相關文檔
4. **生成器** - 基於檢索結果生成回答

## 優勢
- 提供有據可依的回答
- 可以處理最新資訊
- 減少模型幻覺問題
"""

# Write each sample to disk as UTF-8, HTML first and then Markdown
for sample_name, sample_body in (
    ("sample.html", sample_html),
    ("sample.md", sample_markdown),
):
    (test_data_dir / sample_name).write_text(sample_body, encoding="utf-8")

print("✓ Test data created")

# Build the pipeline (Traditional -> Simplified conversion enabled for the demo)
pipeline = DocumentPipeline(convert_to_simplified=True, dedup_threshold=0.95)

# Run it over the two sample files written above
test_files = [str(test_data_dir / name) for name in ("sample.html", "sample.md")]

documents = pipeline.process_files(test_files)

# Show a short summary of every resulting document
print(f"\n📄 Processed {len(documents)} documents:")
for position, doc in enumerate(documents, start=1):
    print(f"\nDoc {position}:")
    print(f"  Source: {doc.meta.source_id}")
    print(f"  Type: {doc.meta.content_type}")
    print(f"  Chars: {doc.char_count}")
    # Long texts are truncated to a 100-character preview.
    if len(doc.text) > 100:
        print(f"  Preview: {doc.text[:100]}...")
    else:
        print(f"  Text: {doc.text}")

# Persist the result as JSONL
output_dir = Path("outs/rag")
output_dir.mkdir(parents=True, exist_ok=True)
pipeline.save_documents(documents, "outs/rag/clean_docs.jsonl")

print("\n✅ Smoke test completed successfully!")
print("📁 Output saved to: outs/rag/clean_docs.jsonl")

In [None]:
# ============================================================================
# Cell 9: Key Parameters & Configuration
# ============================================================================

# Printed reference card for the classes defined in this notebook.
# Two corrections to the text: the last entry of the first tree now uses the
# closing "└──" glyph like the other trees, and the resource tip no longer
# claims that dedup_threshold affects memory use (it is only compared against
# the similarity score); it now points at the pairwise comparison in
# DocumentDeduplicator.deduplicate instead.
print("🔧 Key Parameters & Configuration:")
print(
    """
DocumentPipeline 主要參數：
├── convert_to_simplified: bool = False    # T2S 繁簡轉換
├── convert_to_traditional: bool = False   # S2T 繁簡轉換
└── dedup_threshold: float = 0.95          # 去重相似度門檻

ChineseTextNormalizer 功能：
├── normalize_whitespace()                 # 空白字符正規化
├── normalize_punctuation()               # 中文標點正規化
└── full_normalize()                      # 完整正規化流程

支援格式：
├── .pdf    → 分頁處理，保留頁碼資訊
├── .html   → trafilatura 抽取，去除標籤
├── .md     → Markdown 語法清理
└── .txt    → 純文字正規化

DocumentMeta 追蹤欄位：
├── source_id: 來源識別ID
├── uri: 檔案或網址路徑
├── page: 頁碼（PDF適用）
├── content_type: 文檔類型
└── title: 標題（如果有）

Low-VRAM 建議：
- 大檔案分批處理（batch_size < 100）
- 啟用 incremental processing
- 去重為兩兩比對（O(n²)），文檔量大時請分批去重以控制耗時
"""
)

In [None]:
# ============================================================================
# Cell 10: When to Use This & Next Steps
# ============================================================================

# Usage guidance, printed as two titled sections.
when_to_use_text = """
適用場景：
✅ 中文文檔RAG系統的資料預處理階段
✅ 多格式文檔統一清理與正規化
✅ 需要繁簡轉換的跨地區文本處理
✅ 建立乾淨、可追蹤來源的文檔語料庫

不適用：
❌ 需要保留格式資訊的場景（如表格結構）
❌ 即時線上文檔處理（批次處理設計）
❌ 非文字內容提取（圖片、影音）

Next Steps (nb11-nb19)：
→ nb11: 中文分段策略（chunk_size/overlap 調優）
→ nb12: bge-m3 嵌入向量化
→ nb13: FAISS 索引建構與儲存
→ nb14: 查詢與引用系統
"""

pitfalls_text = """
常見陷阱：
1. encoding 問題 → 統一使用 utf-8
2. 大檔案 OOM → 分批載入，控制記憶體使用
3. trafilatura 失效 → 提供 bare_extraction 備用方案
4. 過度去重 → threshold 過低會誤刪有用內容

Reproducibility Tips：
- 固定 dedup_threshold 和轉換設定
- 保存 pipeline 配置到 configs/rag.yaml
- 記錄處理統計資訊（文檔數、去重數、字符數）
"""

# Each section is a heading line followed by its body.
for heading, body in (
    ("🎯 When to Use This:", when_to_use_text),
    ("\n🔍 Pitfalls & Tips:", pitfalls_text),
):
    print(heading)
    print(body)