In [None]:
# Stage 3 - Tools & Function Calling
# nb23_content_extraction.ipynb
# 目標：trafilatura 內容抽取、去 HTML、中文友好處理

# Cell 1: Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook)
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "../ai_warehouse/cache")

# Environment variable -> cache directory, all rooted at AI_CACHE_ROOT.
cache_dirs = {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}

# Export each variable and make sure its directory exists.
for env_name, cache_dir in cache_dirs.items():
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# ============================================================================
# Cell 2: Dependencies and Imports
# ============================================================================

# Install required packages (run once, in its own cell; %pip targets the running kernel's environment)
# %pip install trafilatura requests beautifulsoup4 chardet opencc-python-reimplemented

import codecs
import json
import re
import time
import urllib.parse
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union

import chardet
import requests
import trafilatura
from bs4 import BeautifulSoup
from opencc import OpenCC

# Reaching this line means every import in this cell succeeded.
print("Dependencies loaded successfully")

In [None]:
# ============================================================================
# Cell 3: Basic HTML Content Extraction
# ============================================================================

def extract_basic_content(html_content: str) -> Dict[str, str]:
    """
    Basic content extraction using trafilatura.

    Runs trafilatura twice (plain text and formatting-preserving text, both
    without links) and reads the document metadata.

    Args:
        html_content: Raw HTML string

    Returns:
        Dictionary with the keys ``text``, ``formatted_text``, ``title``,
        ``author``, ``date``, ``description`` and ``language``. Every value is
        a ``str``; anything trafilatura could not determine is returned as an
        empty string, never ``None``.
    """

    # Main content without link markup.
    text = trafilatura.extract(html_content, include_links=False)

    # Same content, keeping structural formatting.
    formatted_text = trafilatura.extract(
        html_content,
        include_formatting=True,
        include_links=False
    )

    metadata = trafilatura.extract_metadata(html_content)

    def _meta(field: str) -> str:
        # Individual metadata attributes can be None even when a metadata
        # object is returned (e.g. a page with a title but no date), so each
        # field is coerced separately instead of only checking the object.
        value = getattr(metadata, field, None) if metadata else None
        return "" if value is None else str(value)

    return {
        "text": text or "",
        "formatted_text": formatted_text or "",
        "title": _meta("title"),
        "author": _meta("author"),
        "date": _meta("date"),
        "description": _meta("description"),
        "language": _meta("language"),
    }

# Test with sample HTML
sample_html = """
<!DOCTYPE html>
<html>
<head>
    <title>測試頁面 - AI 技術分享</title>
    <meta name="author" content="張三">
    <meta name="description" content="關於大型語言模型的技術文章">
</head>
<body>
    <header>
        <nav>導航菜單</nav>
    </header>
    <main>
        <h1>大型語言模型技術解析</h1>
        <p>本文將深入探討 LLM 的核心原理。</p>
        <h2>Transformer 架構</h2>
        <p>Transformer 是現代 LLM 的基礎架構，包含注意力機制等關鍵組件。</p>
        <ul>
            <li>自注意力機制</li>
            <li>位置編碼</li>
            <li>前饋網絡</li>
        </ul>
    </main>
    <footer>版權信息</footer>
    <script>console.log("ads");</script>
</body>
</html>
"""

extracted = extract_basic_content(sample_html)
print("Extracted content:")
for key, value in extracted.items():
    # A field the page does not declare (this sample has no date or language)
    # may come back as None; coerce before slicing so the preview never raises.
    preview = str(value or "")[:100]
    print(f"{key}: {preview}...")

In [None]:
# ============================================================================
# Cell 4: Advanced Extraction Options
# ============================================================================


def extract_advanced_content(
    html_content: str, config: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Advanced content extraction with configurable options.

    Args:
        html_content: Raw HTML string
        config: Optional overrides for the extraction options
            (``include_links``, ``include_images``, ``include_tables``,
            ``include_formatting``, ``deduplicate``, ``favor_precision``,
            ``favor_recall``). Keys other than these are ignored, so a larger
            settings dict can be passed through unchanged.

    Returns:
        Dictionary with the extracted ``text``, the string metadata fields
        ``title``, ``author``, ``date``, ``description``, ``language`` and
        ``sitename`` (empty string when unknown, never ``None``), the list
        ``links`` of ``{"text", "href"}`` dicts, and the integer counters
        ``word_count`` and ``char_count``.
    """

    options = {
        "include_links": True,
        "include_images": False,
        "include_tables": True,
        "include_formatting": True,
        "deduplicate": True,
        "favor_precision": True,
        "favor_recall": False,
    }
    option_names = tuple(options)

    if config:
        options.update(config)

    # Forward only the options this function knows about; `config` may carry
    # unrelated keys that trafilatura.extract() would not accept.
    text = trafilatura.extract(
        html_content, **{name: options[name] for name in option_names}
    ) or ""

    metadata = trafilatura.extract_metadata(html_content)

    def _meta(field: str) -> str:
        # A metadata object can exist while individual attributes are None.
        value = getattr(metadata, field, None) if metadata else None
        return "" if value is None else str(value)

    # Collect anchors straight from the HTML so callers get href/text pairs
    # independently of how links are rendered inside `text`.
    links = []
    if options["include_links"]:
        soup = BeautifulSoup(html_content, "html.parser")
        links = [
            {"text": anchor.get_text().strip(), "href": anchor["href"]}
            for anchor in soup.find_all("a", href=True)
        ]

    return {
        "text": text,
        "title": _meta("title"),
        "author": _meta("author"),
        "date": str(metadata.date) if metadata and metadata.date else "",
        "description": _meta("description"),
        "language": _meta("language"),
        "sitename": _meta("sitename"),
        "links": links,
        # Whitespace-delimited token count; text written without spaces
        # between words counts as very few "words".
        "word_count": len(text.split()),
        "char_count": len(text),
    }


# Exercise the advanced extractor on the sample page from the previous cell.
advanced_config = dict(
    include_links=True,
    include_formatting=True,
    favor_precision=True,
)

advanced_result = extract_advanced_content(sample_html, advanced_config)
print("Advanced extraction results:")
print(
    f"Word count: {advanced_result['word_count']}",
    f"Character count: {advanced_result['char_count']}",
    f"Links found: {len(advanced_result['links'])}",
    sep="\n",
)

In [None]:
# ============================================================================
# Cell 5: Chinese Web Page Processing
# ============================================================================


def detect_encoding(content: bytes) -> str:
    """
    Detect the text encoding of raw web content.

    Args:
        content: Undecoded response body

    Returns:
        A codec name that ``bytes.decode`` accepts. Falls back to ``"utf-8"``
        when the input is empty, when chardet reports no encoding, or when the
        reported name is not a codec Python knows.
    """
    if not content:
        return "utf-8"

    detected = chardet.detect(content) or {}
    encoding = detected.get("encoding")
    if not encoding:
        return "utf-8"

    # GB18030 is a superset of GBK, which is a superset of GB2312. Decoding
    # with the widest codec avoids failures on pages that are labelled with
    # the narrower name but contain characters outside it.
    if encoding.lower() in ("gb2312", "gbk"):
        return "gb18030"

    try:
        codecs.lookup(encoding)
    except LookupError:
        # Detector returned a label Python has no codec for.
        return "utf-8"
    return encoding


def normalize_chinese_text(text: str, convert_variant: Optional[str] = None) -> str:
    """
    Normalize Chinese text.

    Steps, in order: collapse every run of whitespace (including newlines) to
    a single space and trim the ends; optionally convert between Traditional
    and Simplified Chinese with OpenCC; map a fixed set of full-width
    punctuation marks to their ASCII counterparts.

    Args:
        text: Input text
        convert_variant: OpenCC configuration name, e.g. 't2s' for traditional
            to simplified or 's2t' for the reverse. ``None`` skips conversion.

    Returns:
        Normalized text, or an empty string for empty/``None`` input. If the
        OpenCC conversion fails, a warning is printed and the unconverted
        text is still normalized and returned.
    """
    if not text:
        return ""

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Convert between traditional/simplified if requested
    if convert_variant:
        try:
            text = OpenCC(convert_variant).convert(text)
        except Exception as exc:
            # Best effort: keep going with the unconverted text. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate, and the reason is reported to the reader.
            print(f"Warning: OpenCC conversion {convert_variant} failed: {exc}")

    # Normalize punctuation. All keys and values are single characters and no
    # value is itself a key, so one translate() pass equals chained replaces.
    punctuation_table = str.maketrans(
        {
            "（": "(",
            "）": ")",
            "？": "?",
            "！": "!",
            "，": ",",
            "。": ".",
            "；": ";",
            "：": ":",
        }
    )
    return text.translate(punctuation_table)


def extract_chinese_content(
    html_content: Union[str, bytes], url: str = ""
) -> Dict[str, Any]:
    """
    Extract content from Chinese web pages with proper encoding handling.

    Args:
        html_content: HTML content (string or bytes). Bytes are decoded with
            the detected encoding, falling back to UTF-8 with undecodable
            bytes dropped.
        url: Source URL for context

    Returns:
        The dictionary produced by ``extract_advanced_content`` extended with
        ``text_normalized``, ``text_simplified``, ``text_traditional``,
        ``source_url`` and ``extraction_time`` (a ``time.time()`` timestamp).
        The three ``text_*`` keys are always present; they are empty strings
        when no text was extracted.
    """

    # Handle encoding if bytes
    if isinstance(html_content, bytes):
        encoding = detect_encoding(html_content)
        try:
            html_content = html_content.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess or unknown codec name: salvage what UTF-8 can read.
            html_content = html_content.decode("utf-8", errors="ignore")

    # Basic extraction
    result = extract_advanced_content(html_content)

    # Normalize Chinese text. Done unconditionally so callers can rely on the
    # keys existing even for pages that yield no text.
    raw_text = result["text"]
    result["text_normalized"] = normalize_chinese_text(raw_text)
    result["text_simplified"] = normalize_chinese_text(raw_text, "t2s")
    result["text_traditional"] = normalize_chinese_text(raw_text, "s2t")

    # Add source info
    result["source_url"] = url
    result["extraction_time"] = time.time()

    return result


# Demo: run the Chinese-aware pipeline on a small Traditional Chinese page.
chinese_html = """
<!DOCTYPE html>
<html lang="zh-TW">
<head>
    <meta charset="UTF-8">
    <title>人工智慧技術發展趨勢</title>
</head>
<body>
    <h1>２０２４年人工智慧技術發展趨勢</h1>
    <p>近年來，大型語言模型（ＬＬＭ）技術突飛猛進。主要發展包括：</p>
    <ul>
        <li>模型規模持續擴大</li>
        <li>推理能力顯著提升</li>
        <li>多模態整合加速</li>
    </ul>
    <p>這些技術將對各行各業產生深遠影響。</p>
</body>
</html>
"""

chinese_result = extract_chinese_content(chinese_html, "https://example.com/ai-trends")
print("Chinese content extraction:")
# Show the first 100 characters of each text variant.
for label, key in (
    ("Original", "text"),
    ("Normalized", "text_normalized"),
    ("Simplified", "text_simplified"),
):
    print(f"{label}: {chinese_result[key][:100]}...")

In [None]:
# ============================================================================
# Cell 6: Batch URL Processing Tool
# ============================================================================


@dataclass
class ExtractionResult:
    """Outcome of fetching one URL and extracting its content."""

    url: str  # URL that was requested
    title: str  # Extracted page title; "" on failure
    text: str  # Extracted main text; "" on failure
    status: str  # fetch_and_extract sets "success", "network_error" or "extraction_error"
    error: str = ""  # str() of the exception when status is not "success"
    metadata: Optional[Dict] = None  # Full extraction dict on success; left as None on failure


def fetch_and_extract(url: str, timeout: int = 30) -> ExtractionResult:
    """
    Download one page and run the Chinese-aware extraction on its body.

    Never raises: failures are reported through the returned object's
    ``status`` ("network_error" for ``requests`` errors, "extraction_error"
    for anything else) and ``error`` fields.

    Args:
        url: Target URL
        timeout: Request timeout in seconds

    Returns:
        ExtractionResult object
    """

    def _failed(kind: str, exc: Exception) -> ExtractionResult:
        # Shared shape for both failure modes: empty payload plus the reason.
        return ExtractionResult(url=url, title="", text="", status=kind, error=str(exc))

    try:
        # Browser-like User-Agent for the request.
        browser_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }

        page = requests.get(url, headers=browser_headers, timeout=timeout)
        page.raise_for_status()

        # Hand over raw bytes so encoding detection happens downstream.
        content = extract_chinese_content(page.content, url)

        return ExtractionResult(
            url=url,
            title=content.get("title", ""),
            text=content.get("text", ""),
            status="success",
            metadata=content,
        )
    except requests.RequestException as exc:
        return _failed("network_error", exc)
    except Exception as exc:
        return _failed("extraction_error", exc)


def batch_extract_urls(urls: List[str], delay: float = 1.0) -> List[ExtractionResult]:
    """
    Fetch and extract several URLs one after another, pausing between requests.

    Args:
        urls: List of URLs to process
        delay: Delay between requests in seconds (no pause after the last URL)

    Returns:
        List of ExtractionResult objects, in the same order as ``urls``
    """

    total = len(urls)
    collected = []

    for position, target in enumerate(urls, start=1):
        print(f"Processing {position}/{total}: {target}")
        collected.append(fetch_and_extract(target))

        # Rate limiting: sleep only when another request follows.
        if position < total:
            time.sleep(delay)

    return collected


# Demo: batch processing against public test pages (requires network access).
sample_urls = [
    "https://httpbin.org/html",  # Simple test page
    "https://example.com",  # Basic example page
]

print("Batch URL processing test:")
# Only the first URL is fetched to keep the demo quick.
batch_results = batch_extract_urls(sample_urls[:1], delay=0.5)

for outcome in batch_results:
    report_lines = [
        f"URL: {outcome.url}",
        f"Status: {outcome.status}",
        f"Title: {outcome.title}",
        f"Text length: {len(outcome.text)}",
    ]
    if outcome.error:
        report_lines.append(f"Error: {outcome.error}")
    report_lines.append("-" * 50)
    print("\n".join(report_lines))

In [None]:
# ============================================================================
# Cell 7: Safety Checks and Content Filtering
# ============================================================================


def is_safe_url(url: str) -> bool:
    """
    Basic URL safety check.

    A URL passes when it is a string that parses, uses the ``http`` or
    ``https`` scheme, names a host, contains none of the blocked scheme
    markers anywhere in it, and its host is neither a blacklisted domain nor
    a subdomain of one.

    This is a coarse allow/deny filter only; it does not resolve DNS and does
    not block loopback or private-network addresses.

    Args:
        url: URL to check

    Returns:
        True if URL appears safe
    """

    if not isinstance(url, str) or not url.strip():
        return False

    # Parse URL. urlparse (and .hostname) raise ValueError on malformed
    # authorities such as an unbalanced IPv6 bracket.
    try:
        parsed = urllib.parse.urlparse(url)
        hostname = parsed.hostname
    except ValueError:
        return False

    # Check scheme
    if parsed.scheme not in ("http", "https"):
        return False

    # "http://" or "https:///path" has nothing to connect to.
    if not hostname:
        return False

    # Deliberately conservative: these markers are rejected anywhere in the
    # URL (path and query included), not just as the scheme.
    lowered = url.lower()
    suspicious_patterns = ("javascript:", "data:", "file:", "ftp:")
    if any(pattern in lowered for pattern in suspicious_patterns):
        return False

    # Check domain blacklist (basic example). Compare against the parsed
    # hostname rather than netloc: netloc also carries userinfo and port
    # ("user@malware.com:8080"), which would slip past an exact match.
    # hostname is already lower-cased; drop a trailing root dot as well.
    host = hostname.rstrip(".")
    domain_blacklist = ("malware.com", "phishing.site")
    for blocked in domain_blacklist:
        if host == blocked or host.endswith("." + blocked):
            return False

    return True


def filter_extracted_content(content: str) -> str:
    """
    Filter potentially harmful content from extracted text.

    Removes leftover ``<script>``/``<style>`` elements, replaces
    ``javascript:`` URLs and ``data:`` URIs with ``[FILTERED]``, and collapses
    a chunk of at least 10 characters that occurs four or more times in a row
    into a single copy followed by ``[REPEATED]``.

    Args:
        content: Extracted text content

    Returns:
        Filtered content (empty string for empty/``None`` input)
    """

    if not content:
        return ""

    # Remove script-like patterns
    tag_flags = re.DOTALL | re.IGNORECASE
    content = re.sub(r"<script.*?</script>", "", content, flags=tag_flags)
    content = re.sub(r"<style.*?</style>", "", content, flags=tag_flags)

    # Remove suspicious URLs. Inside a raw string the whitespace class is a
    # single backslash (\S / \s); a doubled backslash would instead mean
    # "a literal backslash" to the regex engine.
    content = re.sub(r"javascript:\S*", "[FILTERED]", content, flags=re.IGNORECASE)
    # A data URI always has a comma between its header and its payload, and
    # must not be the tail of a longer word. This keeps prose such as
    # "training data: 100 samples" or "metadata:..." untouched.
    content = re.sub(
        r"(?<![A-Za-z])data:[^\s,]*,\S*", "[FILTERED]", content, flags=re.IGNORECASE
    )

    # Limit excessive repetition. \1 is the backreference to the captured
    # chunk. The chunk length is capped so the lazy search stays cheap on very
    # long single-line inputs (whitespace-normalized text has no newlines).
    content = re.sub(r"(.{10,200}?)\1{3,}", r"\1[REPEATED]", content)

    return content


def safe_extract_content(url_or_html: str, is_url: bool = True) -> Dict[str, str]:
    """
    Extract content behind the URL check and the text filter.

    Args:
        url_or_html: URL to fetch or HTML content
        is_url: True if input is URL, False if HTML content

    Returns:
        Dictionary that always has ``text``, ``title``, ``status`` and
        ``safety_warnings``. On success it also has ``original_length`` and
        ``filtered_length``; on failure it has ``error`` (except for a
        rejected URL, which is reported via ``status`` "unsafe_url" and a
        warning only).
    """

    outcome = {"text": "", "title": "", "status": "error", "safety_warnings": []}

    try:
        if not is_url:
            # Input is already HTML: extract directly.
            parsed = extract_chinese_content(url_or_html)
            raw_text = parsed.get("text", "")
            page_title = parsed.get("title", "")
        elif not is_safe_url(url_or_html):
            outcome["safety_warnings"].append("Unsafe URL detected")
            outcome["status"] = "unsafe_url"
            return outcome
        else:
            fetched = fetch_and_extract(url_or_html)
            if fetched.status != "success":
                # Pass the fetcher's failure through unchanged.
                outcome["status"] = fetched.status
                outcome["error"] = fetched.error
                return outcome
            raw_text = fetched.text
            page_title = fetched.title

        cleaned = filter_extracted_content(raw_text)

        # Warn when filtering removed more than 20% of the text.
        if len(cleaned) < len(raw_text) * 0.8:
            outcome["safety_warnings"].append("Significant content filtered")

        outcome.update(
            text=cleaned,
            title=page_title,
            status="success",
            original_length=len(raw_text),
            filtered_length=len(cleaned),
        )
    except Exception as exc:
        outcome["status"] = "extraction_error"
        outcome["error"] = str(exc)

    return outcome


# Demo: a page mixing normal text with a script, a javascript: link and repeated text.
malicious_html = """
<html>
<head><title>Test Page</title></head>
<body>
    <h1>Normal Content</h1>
    <p>This is normal text content.</p>
    <script>alert('malicious code');</script>
    <p>More normal content here.</p>
    <a href="javascript:alert('xss')">Suspicious Link</a>
    <p>This sentence repeats. This sentence repeats. This sentence repeats. This sentence repeats.</p>
</body>
</html>
"""

safe_result = safe_extract_content(malicious_html, is_url=False)
print(
    "Safety filtering test:",
    f"Status: {safe_result['status']}",
    f"Warnings: {safe_result['safety_warnings']}",
    # Length keys exist only on success, hence .get with a default.
    f"Original length: {safe_result.get('original_length', 0)}",
    f"Filtered length: {safe_result.get('filtered_length', 0)}",
    f"Filtered content: {safe_result['text'][:200]}...",
    sep="\n",
)

In [None]:
# ============================================================================
# Cell 8: Smoke Test - Real World Example
# ============================================================================


def smoke_test_extraction():
    """
    Comprehensive smoke test for content extraction.

    Runs four checks against the helpers defined in this notebook: basic
    extraction, punctuation normalization, safety filtering and metadata
    extraction.

    Returns:
        True when every check passes.

    Raises:
        AssertionError: if any check fails.
    """

    print("=== Content Extraction Smoke Test ===")

    # Test 1: Basic HTML extraction
    print("\nTest 1: Basic HTML extraction")
    test_html = """
    <html>
    <head><title>測試文章</title></head>
    <body>
        <h1>AI 技術發展</h1>
        <p>人工智慧技術正在快速發展。</p>
        <p>主要包括機器學習、深度學習等領域。</p>
    </body>
    </html>
    """

    result1 = extract_basic_content(test_html)
    assert len(result1["text"]) > 0, "Basic extraction failed"
    assert "AI 技術發展" in result1["text"], "Title not extracted"
    print("✓ Basic extraction working")

    # Test 2: Chinese text normalization
    print("\nTest 2: Chinese text normalization")
    chinese_text = "這是測試文字（包含全形括號）。"
    normalized = normalize_chinese_text(chinese_text)
    assert "(" in normalized and ")" in normalized, "Punctuation normalization failed"
    print("✓ Chinese normalization working")

    # Test 3: Safety filtering
    print("\nTest 3: Safety filtering")
    unsafe_html = "<script>alert('test')</script><p>Safe content</p>"
    filtered_result = safe_extract_content(unsafe_html, is_url=False)
    assert filtered_result["status"] == "success", "Safety extraction failed"
    assert "script" not in filtered_result["text"].lower(), "Script not filtered"
    print("✓ Safety filtering working")

    # Test 4: Metadata extraction
    print("\nTest 4: Metadata extraction")
    meta_html = """
    <html>
    <head>
        <title>測試標題</title>
        <meta name="author" content="測試作者">
        <meta name="description" content="測試描述">
    </head>
    <body><p>內容</p></body>
    </html>
    """

    result4 = extract_advanced_content(meta_html)
    assert result4["title"] == "測試標題", "Title extraction failed"
    assert result4["author"] == "測試作者", "Author extraction failed"
    print("✓ Metadata extraction working")

    print("\n🎉 All smoke tests passed!")
    return True


# Run the smoke test now; any failed check raises AssertionError in this cell.
smoke_test_extraction()

In [None]:
# ============================================================================
# Cell 9: Reusable Content Extractor Class
# ============================================================================


class ContentExtractor:
    """
    Reusable content extraction tool with configurable options.

    Wraps the notebook's helper functions (URL check, extraction, Chinese
    normalization, text filtering) behind one object whose behavior is driven
    by ``self.config``. Methods report failures through the returned
    dictionary's ``status``/``error`` keys instead of raising.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize extractor with configuration.

        Args:
            config: Optional overrides merged on top of the defaults below.
        """

        self.config = {
            "include_links": True,
            "include_formatting": True,
            "chinese_normalization": True,
            "safety_filtering": True,
            "request_timeout": 30,
            "request_delay": 1.0,
            "convert_chinese": None,  # 't2s', 's2t', or None
            "max_content_length": 100000,
            "user_agent": "Mozilla/5.0 (compatible; ContentExtractor/1.0)",
        }

        if config:
            self.config.update(config)

    @staticmethod
    def _failure(status: str, error: str) -> Dict[str, Any]:
        """Build the result dictionary returned for any failed extraction."""
        return {"status": status, "error": error, "text": "", "metadata": {}}

    def extract_from_url(self, url: str) -> Dict[str, Any]:
        """
        Fetch ``url`` and extract its content.

        Returns:
            The ``extract_from_html`` result on success; otherwise a failure
            dictionary with ``status`` "unsafe_url" (URL rejected before any
            request is made) or "error" (request or HTTP failure).
        """

        if not is_safe_url(url):
            return self._failure("unsafe_url", "URL failed safety check")

        try:
            headers = {"User-Agent": self.config["user_agent"]}
            response = requests.get(
                url, headers=headers, timeout=self.config["request_timeout"]
            )
            response.raise_for_status()

            # Pass raw bytes so encoding detection runs on the real payload.
            return self.extract_from_html(response.content, source_url=url)

        except Exception as e:
            return self._failure("error", str(e))

    def extract_from_html(
        self, html_content: Union[str, bytes], source_url: str = ""
    ) -> Dict[str, Any]:
        """
        Extract content from HTML given as text or undecoded bytes.

        Returns:
            The ``extract_advanced_content`` dictionary with ``text``
            post-processed according to the configuration, plus ``truncated``
            (always present), ``status`` "success", ``source_url``,
            ``extraction_time`` and ``extractor_version``. On any exception a
            failure dictionary with ``status`` "error" is returned instead.
        """

        try:
            # Handle encoding
            if isinstance(html_content, bytes):
                encoding = detect_encoding(html_content)
                html_content = html_content.decode(encoding, errors="ignore")

            # Basic extraction
            result = extract_advanced_content(html_content, self.config)

            # Chinese processing
            if self.config["chinese_normalization"] and result["text"]:
                result["text"] = normalize_chinese_text(
                    result["text"], self.config["convert_chinese"]
                )

            # Safety filtering
            if self.config["safety_filtering"]:
                result["text"] = filter_extracted_content(result["text"])

            # Length limiting. The flag is set either way so callers can read
            # result["truncated"] without checking for the key first.
            max_len = self.config["max_content_length"]
            result["truncated"] = len(result["text"]) > max_len
            if result["truncated"]:
                result["text"] = result["text"][:max_len] + "..."

            # Add extraction metadata
            result.update(
                {
                    "status": "success",
                    "source_url": source_url,
                    "extraction_time": time.time(),
                    "extractor_version": "1.0",
                }
            )

            return result

        except Exception as e:
            return self._failure("error", str(e))

    def batch_extract(self, urls: List[str]) -> List[Dict[str, Any]]:
        """
        Extract content from multiple URLs, one at a time.

        Sleeps ``request_delay`` seconds between consecutive requests (not
        after the last one). Results are returned in input order.
        """

        results = []

        for i, url in enumerate(urls):
            print(f"Processing {i+1}/{len(urls)}: {url}")

            result = self.extract_from_url(url)
            results.append(result)

            # Rate limiting
            if i < len(urls) - 1:
                time.sleep(self.config["request_delay"])

        return results


# Test the extractor class
extractor = ContentExtractor(
    {
        "chinese_normalization": True,
        "convert_chinese": "s2t",  # Convert to traditional Chinese
        "safety_filtering": True,
        "max_content_length": 5000,
    }
)

# Test with HTML content
test_html_final = """
<html>
<head><title>RAG 系统实现指南</title></head>
<body>
    <h1>检索增强生成(RAG)系统实现</h1>
    <p>本文介绍如何构建一个完整的RAG系统。</p>
    <h2>核心组件</h2>
    <ul>
        <li>文档处理器</li>
        <li>向量数据库</li>
        <li>检索模块</li>
        <li>生成模块</li>
    </ul>
</body>
</html>
"""

final_result = extractor.extract_from_html(test_html_final)
# "\n" (single backslash) prints a blank line before the heading.
print("\nFinal extractor test:")
print(f"Status: {final_result['status']}")
print(f"Title: {final_result['title']}")
print(f"Text preview: {final_result['text'][:150]}...")
print(f"Word count: {final_result['word_count']}")

print("\n✅ Content extraction tool ready for use!")

# ============================================================================
# Summary
# ============================================================================

print(
    """
=== nb23 Content Extraction Summary ===

✅ Completed:
- Basic HTML content extraction with trafilatura
- Advanced extraction with metadata and formatting
- Chinese text normalization and encoding handling
- Batch URL processing with rate limiting
- Safety filtering and content validation
- Reusable ContentExtractor class

🔧 Key Features:
- Multi-format support (HTML, web pages)
- Chinese-friendly text processing
- Safety checks and content filtering
- Configurable extraction options
- Error handling and status reporting

📊 Performance:
- Handles encoding detection automatically
- Rate limiting for responsible web scraping
- Memory-efficient processing
- Robust error handling

🚀 Next Steps:
- Integrate with RAG document pipeline
- Add support for PDF extraction
- Implement content caching
- Add more sophisticated safety rules
"""
)

In [None]:
# Quick verification
def verify_extraction():
    """Check that basic extraction recovers the body text and title of a minimal page."""
    html_snippet = "<html><head><title>測試</title></head><body><p>這是測試內容。</p></body></html>"
    extracted_fields = extract_basic_content(html_snippet)
    assert "測試內容" in extracted_fields["text"]
    assert extracted_fields["title"] == "測試"
    print("✓ Content extraction verified")


# Run the quick check; an AssertionError here means the extracted text or title did not match.
verify_extraction()