In [None]:
# Document Information Extraction and Structuring
# 文件結構化抽取與驗證 - 使用 LLM 進行智能文件解析

## 1. Environment Setup & Dependencies
# 環境初始化與依賴管理

# === Shared Cache Bootstrap ===
import os, pathlib, torch
# Root directory shared by every model/dataset cache; override with the AI_CACHE_ROOT env var.
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")

# Export each cache location and make sure the directory exists, so the libraries
# imported by later cells read these variables instead of their per-user defaults.
for k, v in {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}.items():
    os.environ[k] = v
    pathlib.Path(v).mkdir(parents=True, exist_ok=True)

print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())
if torch.cuda.is_available():
    # True division: floor division would drop the fractional gigabytes before the
    # ".1f" format is applied, so the report would always end in ".0GB".
    print(f"[GPU] {torch.cuda.get_device_name(0)} | VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")


In [None]:
# Install required packages
# 安裝文件處理與驗證相關套件
!pip install -q transformers accelerate bitsandbytes
!pip install -q PyPDF2 python-docx beautifulsoup4 lxml
!pip install -q pydantic rich spacy nltk
!pip install -q pandas openpyxl tabulate

## 2. Document Parser - Multi-format Support
# 多格式文件解析器 - 統一文件讀取介面

import PyPDF2
import docx
import json
import re
from pathlib import Path
from typing import Dict, List, Union, Optional
from dataclasses import dataclass
from bs4 import BeautifulSoup
import pandas as pd


In [None]:
@dataclass
class DocumentContent:
    """Parsed document: raw text plus optional title, page count, tables and metadata.

    ``tables`` and ``metadata`` default to ``None`` in the signature and are
    replaced by a fresh empty list / dict per instance in ``__post_init__``,
    so instances never share a mutable default.
    """
    text: str
    title: Optional[str] = None
    pages: Optional[int] = None
    tables: List[pd.DataFrame] = None
    metadata: Dict = None

    def __post_init__(self):
        """Swap ``None`` placeholders for new empty containers."""
        self.tables = [] if self.tables is None else self.tables
        self.metadata = {} if self.metadata is None else self.metadata

class DocumentParser:
    """Universal document parser supporting PDF, DOCX, HTML, TXT, MD.

    ``parse`` validates the path and routes it, by lower-cased file extension,
    to one of the private ``_parse_*`` methods; each returns a
    ``DocumentContent`` carrying the text and a metadata dict.
    """

    def __init__(self):
        # Lower-case extensions (including the dot) accepted by parse().
        self.supported_formats = {'.pdf', '.docx', '.html', '.htm', '.txt', '.md'}

    def parse(self, file_path: Union[str, Path]) -> "DocumentContent":
        """Parse a document and return its structured content.

        Args:
            file_path: Location of the document (string or ``Path``).

        Returns:
            The ``DocumentContent`` produced by the format-specific parser.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the extension is not in ``supported_formats``.
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        extension = file_path.suffix.lower()
        if extension not in self.supported_formats:
            raise ValueError(f"Unsupported format: {extension}")

        if extension == '.pdf':
            return self._parse_pdf(file_path)
        if extension == '.docx':
            return self._parse_docx(file_path)
        if extension in ('.html', '.htm'):
            return self._parse_html(file_path)
        # Every remaining supported extension is plain text ('.txt', '.md'), so
        # the method always returns a DocumentContent instead of falling off
        # the end with an implicit None.
        return self._parse_text(file_path)

    @staticmethod
    def _guess_title(lines: List[str]) -> Optional[str]:
        """Return a title taken from the first non-blank line, or ``None``.

        The first non-blank line counts as a title when it starts with ``#``
        (Markdown heading) or is shorter than 100 characters. Leading blank
        lines are skipped so a file that begins with an empty line still
        yields its heading rather than an empty string.
        """
        for line in lines:
            candidate = line.strip()
            if not candidate:
                continue
            if candidate.startswith('#') or len(candidate) < 100:
                # A line made only of '#' characters leaves nothing: report no title.
                return candidate.lstrip('#').strip() or None
            return None
        return None

    def _parse_pdf(self, file_path: Path) -> "DocumentContent":
        """Extract text, page count and document-info metadata from a PDF."""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)

            # Join page texts with a blank line between pages. `or ''` guards
            # against a page yielding None, which would make str.join raise.
            text = '\n\n'.join((page.extract_text() or '') for page in reader.pages)

            metadata = {
                'file_path': str(file_path),
                'file_size': file_path.stat().st_size,
                'format': 'pdf'
            }

            # Document-info fields are only added when the PDF carries any.
            if reader.metadata:
                metadata.update({
                    'title': reader.metadata.get('/Title', ''),
                    'author': reader.metadata.get('/Author', ''),
                    'subject': reader.metadata.get('/Subject', ''),
                    'creator': reader.metadata.get('/Creator', '')
                })

            return DocumentContent(
                text=text,
                # A missing or empty '/Title' becomes None, matching Optional[str].
                title=metadata.get('title') or None,
                pages=len(reader.pages),
                metadata=metadata
            )

    def _parse_docx(self, file_path: Path) -> "DocumentContent":
        """Extract non-empty paragraphs and all tables from a DOCX file.

        Each table becomes a DataFrame whose column labels are the table's
        first row; the remaining rows are the data.
        """
        doc = docx.Document(file_path)

        # Paragraphs that are empty or whitespace-only are dropped.
        text_parts = [paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()]
        text = '\n\n'.join(text_parts)

        tables = []
        for table in doc.tables:
            data = [
                [cell.text.strip() for cell in row.cells]
                for row in table.rows
            ]
            if data:
                tables.append(pd.DataFrame(data[1:], columns=data[0]))

        metadata = {
            'file_path': str(file_path),
            'file_size': file_path.stat().st_size,
            'format': 'docx',
            'paragraphs': len(doc.paragraphs),  # counts empty paragraphs too
            'tables': len(tables)
        }

        return DocumentContent(
            text=text,
            tables=tables,
            metadata=metadata
        )

    def _parse_html(self, file_path: Path) -> "DocumentContent":
        """Extract visible text and the <title> from a UTF-8 HTML file."""
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'html.parser')

        # Script and style bodies are code, not document text.
        for script in soup(["script", "style"]):
            script.decompose()

        text = soup.get_text()
        # Collapse runs of blank / whitespace-only lines into one blank line.
        text = re.sub(r'\n\s*\n', '\n\n', text)

        title = soup.title.string if soup.title else None

        metadata = {
            'file_path': str(file_path),
            'file_size': file_path.stat().st_size,
            'format': 'html',
            'title': title
        }

        return DocumentContent(
            text=text,
            title=title,
            metadata=metadata
        )

    def _parse_text(self, file_path: Path) -> "DocumentContent":
        """Read a UTF-8 plain-text or Markdown file and guess its title."""
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        lines = text.split('\n')
        title = self._guess_title(lines)

        metadata = {
            'file_path': str(file_path),
            'file_size': file_path.stat().st_size,
            'format': file_path.suffix[1:],  # Remove dot
            'lines': len(lines)
        }

        return DocumentContent(
            text=text,
            title=title,
            metadata=metadata
        )

# Smoke-test the parser on a small Markdown report written to the working directory.
sample_text = """
# Company Annual Report 2024

## Executive Summary
Our company achieved record growth in 2024, with revenue increasing by 25% to $150M.
Key highlights include:
- New product launches in Q2 and Q4
- Expansion into Asian markets
- Strategic partnership with TechCorp

## Financial Performance
| Metric | 2023 | 2024 | Change |
|--------|------|------|--------|
| Revenue | $120M | $150M | +25% |
| Profit | $15M | $22M | +47% |
| Employees | 450 | 520 | +16% |

## Key Personnel
- CEO: John Smith (john.smith@company.com)
- CFO: Jane Doe (jane.doe@company.com)
- CTO: Bob Wilson (bob.wilson@company.com)

## Contact Information
Address: 123 Business St, Tech City, TC 12345
Phone: +1-555-0123
Website: www.company.com
"""

# Persist the sample so it goes through the same file-based path as real documents.
sample_path = Path("sample_report.md")
sample_path.write_text(sample_text, encoding='utf-8')

# Parse it back and show what the parser recovered.
parser = DocumentParser()
doc_content = parser.parse(sample_path)

print(
    "📄 Document Parsing Results:",
    f"Title: {doc_content.title}",
    f"Text length: {len(doc_content.text)} characters",
    f"Metadata: {doc_content.metadata}",
    "\n" + "=" * 50,
    sep="\n",
)

In [None]:
## 3. Schema Definition - Structured Output Models
# Schema 定義 - 使用 Pydantic 定義結構化輸出格式

from pydantic import BaseModel, Field, validator
from typing import List, Optional, Dict, Any
from datetime import datetime
from enum import Enum


class ContactInfo(BaseModel):
    """A person or contact point found in a document.

    All fields are optional; ``email`` is the only one that is validated.
    """

    name: Optional[str] = None
    email: Optional[str] = None
    phone: Optional[str] = None
    title: Optional[str] = None

    @validator("email")
    def validate_email(cls, v):
        """Pass through empty values; otherwise require an '@' in the address."""
        if not v or "@" in v:
            return v
        raise ValueError("Invalid email format")


class FinancialMetric(BaseModel):
    """Financial metric with value and unit.

    ``name`` and ``value`` are required; ``unit`` and ``period`` have defaults.
    """

    name: str  # required label for the figure
    value: float  # required numeric amount
    unit: str = "USD"  # falls back to "USD" when not supplied
    period: Optional[str] = None  # optional free-form period string


class CompanyInfo(BaseModel):
    """Company information schema.

    Every field is optional and defaults to ``None``, so a partially
    populated record is still valid.
    """

    company_name: Optional[str] = None
    industry: Optional[str] = None
    address: Optional[str] = None
    phone: Optional[str] = None
    website: Optional[str] = None
    employees: Optional[int] = None  # the only non-string field in this model


class DocumentSummary(BaseModel):
    """Complete document extraction schema.

    ``document_type`` and ``summary`` are required (declared with ``...``);
    every list field defaults to a new empty list and ``confidence_score``
    defaults to 0.0 and is constrained to the range [0.0, 1.0].
    """

    document_type: str = Field(
        ...,
        description="Type of document (e.g., 'annual_report', 'contract', 'resume')",
    )
    title: Optional[str] = None
    summary: str = Field(..., description="Brief summary of document content")

    # Entities
    contacts: List[ContactInfo] = Field(
        default_factory=list, description="People mentioned in document"
    )
    companies: List[CompanyInfo] = Field(
        default_factory=list, description="Companies mentioned"
    )
    financial_metrics: List[FinancialMetric] = Field(
        default_factory=list, description="Financial data"
    )

    # Metadata
    key_dates: List[str] = Field(
        default_factory=list, description="Important dates mentioned"
    )
    key_topics: List[str] = Field(
        default_factory=list, description="Main topics/themes"
    )
    confidence_score: float = Field(
        default=0.0, ge=0.0, le=1.0, description="Extraction confidence"
    )

    # Example payload attached to the model configuration.
    # NOTE(review): `schema_extra` inside `class Config` is the pydantic v1 spelling;
    # pydantic v2 is understood to expect `json_schema_extra` — confirm which major
    # version the unpinned `pip install pydantic` resolves to.
    class Config:
        schema_extra = {
            "example": {
                "document_type": "annual_report",
                "title": "Company Annual Report 2024",
                "summary": "Annual financial performance report showing 25% revenue growth",
                "contacts": [
                    {
                        "name": "John Smith",
                        "title": "CEO",
                        "email": "john.smith@company.com",
                    }
                ],
                "companies": [{"company_name": "TechCorp", "industry": "Technology"}],
                "financial_metrics": [
                    {
                        "name": "Revenue",
                        "value": 150000000,
                        "unit": "USD",
                        "period": "2024",
                    }
                ],
            }
        }


# Contract-specific schema
class ContractClause(BaseModel):
    """Contract clause structure.

    ``clause_type`` and ``content`` are required; ``importance`` defaults to
    "medium". The field is a plain ``str``, so the listed levels are a
    convention rather than an enforced set.
    """

    clause_type: str  # e.g., "payment_terms", "termination", "liability"
    content: str  # clause text
    importance: str = "medium"  # low, medium, high, critical


class ContractInfo(BaseModel):
    """Contract document schema.

    Only ``contract_type`` is required. Dates are stored as strings (no date
    parsing is performed by this model) and list fields default to new empty
    lists.
    """

    contract_type: str  # e.g., "service_agreement", "employment", "NDA"
    parties: List[str] = Field(default_factory=list)
    effective_date: Optional[str] = None
    expiration_date: Optional[str] = None
    contract_value: Optional[float] = None
    currency: str = "USD"  # used when no currency is supplied
    key_clauses: List[ContractClause] = Field(default_factory=list)
    obligations: List[str] = Field(default_factory=list)
    risks: List[str] = Field(default_factory=list)


# Informational only: confirms the cell ran. The printed list names four of the
# models defined in this cell; CompanyInfo and ContractClause are also defined.
print("✅ Schema definitions created successfully")
print("Available schemas: DocumentSummary, ContractInfo, ContactInfo, FinancialMetric")

In [None]:
## 4. LLM Extraction Engine - Structured Information Extraction
# LLM 抽取引擎 - 結構化資訊抽取與提示工程

import json
import re
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)
import torch


class LLMExtractor:
    """LLM-based information extraction engine.

    Prompts a causal language model for JSON, validates the JSON against a
    schema class, and falls back to regex heuristics over the document text
    when the model is unavailable, generation fails, or the output cannot be
    parsed or validated.
    """

    def __init__(
        self, model_name: str = "microsoft/DialoGPT-medium", use_4bit: bool = True
    ):
        """Load tokenizer and model with low-VRAM friendly settings.

        Args:
            model_name: Model identifier handed to ``from_pretrained``.
            use_4bit: Request 4-bit NF4 quantization; only applied when CUDA
                is available.

        Loading errors are caught and reported: ``self.model`` and
        ``self.tokenizer`` are then set to ``None`` and every extraction uses
        the regex fallback.
        """
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Configure quantization for low VRAM
        if use_4bit and torch.cuda.is_available():
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
            )
        else:
            bnb_config = None

        # Load model with device mapping for multi-GPU or CPU fallback
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name, padding_side="left"
            )
            # Reuse EOS as the padding token when the tokenizer defines none.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=bnb_config,
                device_map="auto" if torch.cuda.is_available() else None,
                torch_dtype=(
                    torch.float16 if torch.cuda.is_available() else torch.float32
                ),
                low_cpu_mem_usage=True,
            )

            print(f"✅ Model loaded: {model_name} on {self.device}")
            if torch.cuda.is_available():
                print(f"💾 VRAM usage: {torch.cuda.memory_allocated() / 1e9:.2f}GB")

        except Exception as e:
            # Deliberate best-effort: keep the notebook usable without a model.
            print(f"❌ Error loading model: {e}")
            print("💡 Falling back to simpler extraction methods...")
            self.model = None
            self.tokenizer = None

    def extract_structured_info(self, text: str, schema_type: str = "general") -> Dict:
        """Extract structured information from ``text``.

        Args:
            text: Raw document text.
            schema_type: "general" validates against ``DocumentSummary``,
                "contract" against ``ContractInfo``; any other value goes
                straight to the regex fallback.

        Returns:
            A plain dict. The regex fallback always returns the
            general-summary shape, whichever schema was requested.
        """
        if schema_type == "general":
            prompt = self._create_general_extraction_prompt(text)
            return self._extract_with_prompt(prompt, DocumentSummary, source_text=text)
        elif schema_type == "contract":
            prompt = self._create_contract_extraction_prompt(text)
            return self._extract_with_prompt(prompt, ContractInfo, source_text=text)
        else:
            return self._fallback_extraction(text)

    def _create_general_extraction_prompt(self, text: str) -> str:
        """Build the general-document prompt (first 2000 characters of ``text``)."""
        return f"""
Analyze the following document and extract structured information in JSON format.

Document text:
{text[:2000]}...

Extract the following information:
1. Document type (e.g., annual_report, contract, resume, email)
2. Title or subject
3. Brief summary (1-2 sentences)
4. People mentioned (name, title, email, phone)
5. Companies mentioned (name, industry, contact info)
6. Financial metrics (amounts, percentages, revenues)
7. Important dates
8. Key topics/themes

Return only valid JSON in this exact format:
{{
    "document_type": "document_type_here",
    "title": "title_here",
    "summary": "summary_here",
    "contacts": [
        {{"name": "John Doe", "title": "CEO", "email": "john@company.com", "phone": "+1-555-0123"}}
    ],
    "companies": [
        {{"company_name": "TechCorp", "industry": "Technology", "website": "www.techcorp.com"}}
    ],
    "financial_metrics": [
        {{"name": "Revenue", "value": 150000000, "unit": "USD", "period": "2024"}}
    ],
    "key_dates": ["2024-01-01", "Q2 2024"],
    "key_topics": ["growth", "expansion", "partnership"],
    "confidence_score": 0.8
}}

JSON:
"""

    def _create_contract_extraction_prompt(self, text: str) -> str:
        """Build the contract prompt (first 2000 characters of ``text``)."""
        return f"""
Analyze this contract document and extract key legal information in JSON format.

Contract text:
{text[:2000]}...

Extract:
1. Contract type (service_agreement, employment, NDA, etc.)
2. Parties involved
3. Effective and expiration dates
4. Contract value and currency
5. Key clauses (payment terms, termination, liability, etc.)
6. Obligations for each party
7. Potential risks or red flags

Return valid JSON:
{{
    "contract_type": "service_agreement",
    "parties": ["Company A", "Company B"],
    "effective_date": "2024-01-01",
    "expiration_date": "2024-12-31",
    "contract_value": 100000,
    "currency": "USD",
    "key_clauses": [
        {{"clause_type": "payment_terms", "content": "Payment due within 30 days", "importance": "high"}}
    ],
    "obligations": ["Company A must deliver software", "Company B must pay fees"],
    "risks": ["No penalty for late delivery", "Unclear IP ownership"]
}}

JSON:
"""

    def _extract_with_prompt(
        self, prompt: str, schema_class, source_text: Optional[str] = None
    ) -> Dict:
        """Generate JSON for ``prompt`` and validate it with ``schema_class``.

        Args:
            prompt: Full prompt sent to the model.
            schema_class: Model class instantiated with the parsed JSON.
            source_text: Document text the regex fallback should scan. When
                omitted the prompt itself is scanned (the previous behaviour).

        Returns:
            ``schema_class(...).dict()`` on success, otherwise the result of
            ``_fallback_extraction``.
        """
        # The fallback must scan the document rather than the prompt: the prompt
        # embeds example JSON (a sample e-mail address, sample values) that the
        # regexes would otherwise report as extracted data.
        fallback_text = prompt if source_text is None else source_text

        if self.model is None:
            return self._fallback_extraction(fallback_text)

        try:
            # NOTE(review): up to 1024 prompt tokens plus 512 new tokens may
            # exceed the context window of small models — confirm against the
            # configured model.
            inputs = self.tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=1024
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=512,
                    temperature=0.1,  # Low temperature for structured output
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    num_return_sequences=1,
                )

            # Decode only the newly generated tokens. Slicing the decoded string
            # by len(prompt) is wrong whenever the prompt was truncated to
            # max_length or does not round-trip through the tokenizer unchanged.
            prompt_token_count = inputs["input_ids"].shape[1]
            json_part = self.tokenizer.decode(
                outputs[0][prompt_token_count:], skip_special_tokens=True
            ).strip()

            # Greedy match from the first "{" to the last "}" of the completion.
            json_match = re.search(r"\{.*\}", json_part, re.DOTALL)
            if json_match:
                json_str = json_match.group()
                try:
                    extracted_data = json.loads(json_str)
                    # Validate against schema
                    validated = schema_class(**extracted_data)
                    return validated.dict()
                except (json.JSONDecodeError, ValueError) as e:
                    print(f"⚠️ JSON parsing error: {e}")
                    return self._fallback_extraction(fallback_text)
            else:
                return self._fallback_extraction(fallback_text)

        except Exception as e:
            print(f"❌ Extraction error: {e}")
            return self._fallback_extraction(fallback_text)

    def _fallback_extraction(self, text: str) -> Dict:
        """Regex-based extraction used when no usable LLM output exists.

        Returns a dict in the general-summary shape holding up to three
        e-mail contacts, up to three dollar amounts, up to five dates, and a
        fixed confidence score of 0.3.
        """
        print("📋 Using fallback regex-based extraction...")

        # "[A-Za-z]" for the TLD: the earlier "[A-Z|a-z]" also accepted a literal "|".
        emails = re.findall(
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", text
        )
        dates = re.findall(
            r"\b\d{4}[-/]\d{2}[-/]\d{2}\b|\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b", text
        )
        # Dollar amounts with an optional K/M/B magnitude suffix, so "$150M" is
        # read as 150,000,000 rather than 150. The lookahead rejects a suffix
        # letter that is really the first letter of a following word.
        amounts = re.findall(
            r"\$\s*(\d+(?:,\d{3})*(?:\.\d+)?)([KkMmBb](?![A-Za-z]))?", text
        )
        multipliers = {"": 1, "k": 1e3, "m": 1e6, "b": 1e9}

        # Title: first non-blank line without Markdown '#' markers, so text that
        # begins with a newline no longer produces an empty title.
        first_line = next(
            (line.strip() for line in text.splitlines() if line.strip()), ""
        )
        title = first_line.lstrip("#").strip()[:100] or "Unknown"

        return {
            "document_type": "unknown",
            "title": title,
            "summary": text[:200] + "..." if len(text) > 200 else text,
            "contacts": [{"email": email} for email in emails[:3]],
            "companies": [],
            "financial_metrics": [
                {
                    "name": "Amount",
                    "value": float(number.replace(",", ""))
                    * multipliers[suffix.lower()],
                    "unit": "USD",
                }
                for number, suffix in amounts[:3]
            ],
            "key_dates": dates[:5],
            "key_topics": ["document_analysis"],
            "confidence_score": 0.3,  # Low confidence for regex-based extraction
        }


# Initialize extractor with fallback-friendly model
# NOTE(review): LLMExtractor.__init__ already catches model-loading exceptions and
# switches itself to regex fallback mode, so the except branch below only runs for
# errors raised outside that inner try block; when it does run, it retries with the
# constructor's default model name.
try:
    # Try with smaller model first for testing
    extractor = LLMExtractor(model_name="microsoft/DialoGPT-small", use_4bit=True)
except Exception as e:
    print(f"⚠️ Model loading failed: {e}")
    print("💡 Creating extractor with fallback mode...")
    extractor = LLMExtractor()

# Test extraction on sample document
# Runs the general-schema extraction on the sample report text and pretty-prints the dict.
print("\n🔍 Testing Information Extraction:")
print("=" * 50)

extracted_info = extractor.extract_structured_info(sample_text, schema_type="general")
print("📊 Extracted Information:")
print(json.dumps(extracted_info, indent=2, ensure_ascii=False))

In [None]:
## 5. Batch Processing Pipeline - Scalable Document Processing
# 批量處理管線 - 可擴展的文件處理工作流

import concurrent.futures
from tqdm import tqdm
import logging
from datetime import datetime
from typing import Generator


class DocumentProcessingPipeline:
    """Document processing pipeline: parse, extract, collect, export.

    Per-document failures are converted into ``status == "error"`` records
    instead of aborting a batch.
    """

    def __init__(
        self, parser: "DocumentParser", extractor: "LLMExtractor", max_workers: int = 2
    ):
        """Store collaborators and configure logging.

        Args:
            parser: Object whose ``parse(path)`` returns the document content.
            extractor: Object providing ``extract_structured_info``.
            max_workers: Thread count for ``process_batch``; values of 1 or
                less force sequential processing.
        """
        self.parser = parser
        self.extractor = extractor
        self.max_workers = max_workers
        self.results = []

        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def process_single_document(
        self, file_path: Path, schema_type: str = "general"
    ) -> Dict:
        """Run one document through parsing and extraction.

        Returns:
            A dict with ``status == "success"`` plus metadata and extracted
            info, or ``status == "error"`` plus the error message. Exceptions
            are logged and never propagated.
        """
        try:
            # Parse document
            doc_content = self.parser.parse(file_path)

            # Extract structured information
            extracted_info = self.extractor.extract_structured_info(
                doc_content.text, schema_type=schema_type
            )

            # Combine results
            result = {
                "file_path": str(file_path),
                "processing_time": datetime.now().isoformat(),
                "document_metadata": doc_content.metadata,
                "extracted_info": extracted_info,
                "status": "success",
            }

            # Only table dimensions are recorded, not table contents.
            if doc_content.tables:
                result["tables_found"] = len(doc_content.tables)
                result["table_summaries"] = [
                    {"rows": len(table), "columns": len(table.columns)}
                    for table in doc_content.tables
                ]

            return result

        except Exception as e:
            self.logger.error(f"Error processing {file_path}: {e}")
            return {
                "file_path": str(file_path),
                "processing_time": datetime.now().isoformat(),
                "error": str(e),
                "status": "error",
            }

    def process_batch(
        self, file_paths: List[Path], schema_type: str = "general"
    ) -> List[Dict]:
        """Process several documents and store the results on ``self.results``.

        Runs sequentially when ``max_workers`` is 1 or less or when CUDA is
        unavailable; otherwise uses a thread pool.

        Returns:
            One result dict per input path, in the same order as
            ``file_paths`` in both modes.
        """
        # "<= 1" rather than "== 1": zero or negative worker counts would make
        # ThreadPoolExecutor raise, so they are treated as sequential too.
        if self.max_workers <= 1 or not torch.cuda.is_available():
            results = [
                self.process_single_document(file_path, schema_type)
                for file_path in tqdm(file_paths, desc="Processing documents")
            ]
        else:
            # Parallel processing (use with caution on GPU). Each future is
            # mapped to its input position so results keep the input order
            # even though futures finish in arbitrary order.
            results = [None] * len(file_paths)
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=self.max_workers
            ) as executor:
                future_to_index = {
                    executor.submit(
                        self.process_single_document, file_path, schema_type
                    ): index
                    for index, file_path in enumerate(file_paths)
                }

                for future in tqdm(
                    concurrent.futures.as_completed(future_to_index),
                    total=len(file_paths),
                    desc="Processing documents",
                ):
                    results[future_to_index[future]] = future.result()

        self.results = results
        return results

    def export_results(self, output_path: Path, format: str = "json"):
        """Write ``self.results`` to ``output_path``.

        Args:
            output_path: Destination file.
            format: "json" (full records) or "csv" (one flattened row per
                document); matched case-insensitively.

        Raises:
            ValueError: For any other format. Previously nothing was written
                yet a success message was still printed.
        """
        fmt = format.lower()
        if fmt == "json":
            with open(output_path, "w", encoding="utf-8") as f:
                # default=str stringifies metadata values json cannot encode
                # instead of aborting part-way through the file.
                json.dump(
                    self.results, f, indent=2, ensure_ascii=False, default=str
                )
        elif fmt == "csv":
            # Flatten results for CSV export
            flattened = []
            for result in self.results:
                if result["status"] == "success":
                    info = result["extracted_info"]
                    flat_result = {
                        "file_path": result["file_path"],
                        "processing_time": result["processing_time"],
                        "document_type": info.get("document_type", ""),
                        "title": info.get("title", ""),
                        "summary": info.get("summary", ""),
                        "contacts_count": len(info.get("contacts", [])),
                        "companies_count": len(info.get("companies", [])),
                        "confidence_score": info.get("confidence_score", 0),
                    }
                else:
                    flat_result = {
                        "file_path": result["file_path"],
                        "processing_time": result["processing_time"],
                        "error": result.get("error", ""),
                        "status": result["status"],
                    }
                flattened.append(flat_result)

            df = pd.DataFrame(flattened)
            df.to_csv(output_path, index=False, encoding="utf-8")
        else:
            raise ValueError(
                f"Unsupported export format: {format!r} (expected 'json' or 'csv')"
            )

        print(f"✅ Results exported to: {output_path}")

    def get_processing_stats(self) -> Dict:
        """Summarise ``self.results``.

        Returns:
            ``{"message": ...}`` when there are no results; otherwise counts,
            success rate in percent, mean confidence, and a three-bucket
            confidence histogram. Only successful results that carry a
            ``confidence_score`` contribute to the confidence figures.
        """
        if not self.results:
            return {"message": "No results to analyze"}

        total_docs = len(self.results)
        successful = sum(1 for r in self.results if r["status"] == "success")
        failed = total_docs - successful

        # Calculate confidence score distribution
        confidence_scores = [
            r["extracted_info"]["confidence_score"]
            for r in self.results
            if r["status"] == "success" and "confidence_score" in r["extracted_info"]
        ]

        avg_confidence = (
            sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0
        )

        return {
            "total_documents": total_docs,
            "successful": successful,
            "failed": failed,
            "success_rate": successful / total_docs * 100,
            "average_confidence": avg_confidence,
            "confidence_distribution": {
                "high (>0.8)": sum(1 for c in confidence_scores if c > 0.8),
                "medium (0.5-0.8)": sum(
                    1 for c in confidence_scores if 0.5 <= c <= 0.8
                ),
                "low (<0.5)": sum(1 for c in confidence_scores if c < 0.5),
            },
        }


# Create processing pipeline
# NOTE(review): this rebinds the name `pipeline`, which the extraction cell imported
# from `transformers`; from here on `pipeline` is the DocumentProcessingPipeline instance.
pipeline = DocumentProcessingPipeline(
    parser=parser,
    extractor=extractor,
    max_workers=1,  # Sequential processing for VRAM safety
)

# Build two extra sample documents (a contract and a resume) for the batch test.

# Sample contract document
contract_text = """
SERVICE AGREEMENT

This Service Agreement ("Agreement") is entered into on January 1, 2024,
between TechCorp Inc. ("Provider") and Business Solutions Ltd. ("Client").

PARTIES:
- Provider: TechCorp Inc., 123 Tech Street, Silicon Valley, CA 94000
- Client: Business Solutions Ltd., 456 Business Ave, New York, NY 10001

SCOPE OF WORK:
The Provider agrees to deliver custom software development services including:
1. Web application development
2. Mobile app development
3. System integration
4. Technical support and maintenance

PAYMENT TERMS:
- Total contract value: $250,000 USD
- Payment schedule: 50% upfront, 25% at milestone 1, 25% upon completion
- Invoices due within 30 days of receipt

TIMELINE:
- Effective Date: January 1, 2024
- Project Duration: 12 months
- Completion Date: December 31, 2024

TERMINATION:
Either party may terminate this agreement with 30 days written notice.
Client retains rights to completed work upon termination.

CONTACTS:
- Provider Contact: John Tech (john.tech@techcorp.com, +1-555-TECH)
- Client Contact: Jane Business (jane.business@bizsolve.com, +1-555-BUSI)
"""

contract_path = Path("sample_contract.txt")
contract_path.write_text(contract_text, encoding="utf-8")

# Sample resume document
resume_text = """
ALICE CHEN
Software Engineer

Contact Information:
Email: alice.chen@email.com
Phone: +1-555-0199
LinkedIn: linkedin.com/in/alicechen
Location: San Francisco, CA

PROFESSIONAL SUMMARY:
Experienced software engineer with 8+ years in full-stack development.
Specialized in Python, JavaScript, and cloud technologies.

WORK EXPERIENCE:

Senior Software Engineer | Meta | 2020-2024
- Led development of microservices handling 10M+ daily requests
- Reduced API response time by 40% through optimization
- Mentored 5 junior developers

Software Engineer | Google | 2018-2020
- Developed machine learning pipelines for search algorithms
- Collaborated with cross-functional teams of 20+ members
- Contributed to open-source TensorFlow projects

Junior Developer | Startup Co | 2016-2018
- Built web applications using React and Node.js
- Implemented CI/CD pipelines reducing deployment time by 60%

EDUCATION:
Master of Science in Computer Science | Stanford University | 2016
Bachelor of Science in Software Engineering | UC Berkeley | 2014

SKILLS:
- Programming: Python, JavaScript, Java, Go, SQL
- Frameworks: React, Django, Flask, Node.js
- Cloud: AWS, GCP, Docker, Kubernetes
- Databases: PostgreSQL, MongoDB, Redis

CERTIFICATIONS:
- AWS Solutions Architect Professional (2023)
- Google Cloud Professional Data Engineer (2022)
"""

resume_path = Path("sample_resume.txt")
resume_path.write_text(resume_text, encoding="utf-8")

# Files handed to the batch test, contract first.
sample_files = [contract_path, resume_path]

# Test batch processing
print("\n🔄 Testing Batch Processing Pipeline:")
print("=" * 50)

# Route each sample to the schema that fits it: files whose name contains
# "contract" use the contract schema, everything else the general one.
results = []
for file_path in sample_files:
    schema_type = "contract" if "contract" in file_path.name else "general"
    result = pipeline.process_single_document(file_path, schema_type=schema_type)
    results.append(result)

# Add original sample document
results.append(pipeline.process_single_document(sample_path, schema_type="general"))

# Stats and export read from pipeline.results, so publish the list there.
pipeline.results = results

# Display processing statistics
stats = pipeline.get_processing_stats()
print("\n📊 Processing Statistics:")
for key, value in stats.items():
    print(f"  {key}: {value}")

# Export results
output_path = Path("extraction_results.json")
pipeline.export_results(output_path, format="json")

# Report the real outcome: process_single_document returns an "error" record
# instead of raising, so the number of inputs is not the number of successes.
succeeded = sum(1 for r in results if r["status"] == "success")
print(f"\n✅ Processed {succeeded}/{len(results)} documents successfully")

In [None]:
## 6. Quality Assessment Module - Extraction Accuracy Evaluation
# 品質評估模組 - 抽取準確率評估

from sklearn.metrics import precision_recall_fscore_support
import difflib
from typing import Tuple


class ExtractionEvaluator:
    """Evaluate extraction quality against ground truth.

    Scores are computed per ground-truth field (string similarity, list
    overlap or exact match), averaged into an overall accuracy, and
    complemented by name-based precision/recall/F1 for the "contacts" and
    "companies" entity lists.
    """

    # Keys that may carry an entity's name, tried in order. Contact records
    # use "name"; company records use "company_name".
    ENTITY_NAME_KEYS = ("name", "company_name")

    def __init__(self):
        # No method of this class writes to this list; it is available for
        # callers that want to accumulate results themselves.
        self.evaluation_results = []

    @staticmethod
    def _as_list(value) -> List:
        """Coerce a value to a list so list scoring never fails on odd input.

        None becomes an empty list, list-like containers are copied, and any
        other value (for example a bare string) becomes a one-element list.
        """
        if value is None:
            return []
        if isinstance(value, (list, tuple, set, frozenset)):
            return list(value)
        return [value]

    def _list_overlap(self, actual, expected: List) -> float:
        """Return the share of expected items found in ``actual``.

        Items are compared by their lower-cased string form. An empty
        expectation scores 1.0 only when nothing was extracted either.
        """
        actual_items = self._as_list(actual)
        if not expected:
            return 1.0 if not actual_items else 0.0
        found = {str(item).lower() for item in actual_items}
        wanted = {str(item).lower() for item in expected}
        return len(found & wanted) / len(wanted)

    def _entity_names(self, entities) -> set:
        """Collect the normalised (lower-cased, stripped) names of entities.

        Entries that are not dicts, or that have no non-empty value under any
        of ENTITY_NAME_KEYS, are ignored.
        """
        names = set()
        for entity in self._as_list(entities):
            if not isinstance(entity, dict):
                continue
            for key in self.ENTITY_NAME_KEYS:
                value = entity.get(key)
                if value:
                    names.add(str(value).lower().strip())
                    break
        return names

    def evaluate_extraction(self, extracted: Dict, ground_truth: Dict) -> Dict:
        """Evaluate extraction against ground truth data.

        Args:
            extracted: Extraction output keyed by field name. None is treated
                as an empty result.
            ground_truth: Reference values keyed by field name. Every key
                here is scored; keys only present in ``extracted`` are ignored.

        Returns:
            Dict with "overall_accuracy" (mean of the field scores),
            "field_accuracy" (score per ground-truth field), "entity_metrics"
            (per entity type, when present on both sides) and
            "confidence_correlation".
        """
        extracted = extracted or {}

        results = {
            "overall_accuracy": 0.0,
            "field_accuracy": {},
            "entity_metrics": {},
            "confidence_correlation": 0.0,
        }

        # Field-level accuracy: the scoring rule follows the ground-truth type.
        field_scores = []
        for field, expected in ground_truth.items():
            if field not in extracted:
                score = 0.0
            elif isinstance(expected, str):
                # Case-insensitive text similarity for string fields
                score = difflib.SequenceMatcher(
                    None, str(extracted[field]).lower(), str(expected).lower()
                ).ratio()
            elif isinstance(expected, list):
                score = self._list_overlap(extracted[field], expected)
            else:
                # Exact match for other types
                score = 1.0 if extracted[field] == expected else 0.0
            field_scores.append(score)
            results["field_accuracy"][field] = score

        results["overall_accuracy"] = (
            sum(field_scores) / len(field_scores) if field_scores else 0.0
        )

        # Entity-level evaluation, only when both sides provide the list
        for entity_type in ("contacts", "companies"):
            if entity_type in ground_truth and entity_type in extracted:
                results["entity_metrics"][entity_type] = self._evaluate_entities(
                    extracted[entity_type], ground_truth[entity_type]
                )

        # Despite its name this is the product of the self-reported confidence
        # and the measured accuracy, capped at 1.0. Non-numeric confidences
        # are skipped instead of raising.
        confidence = extracted.get("confidence_score")
        if isinstance(confidence, (int, float)) and not isinstance(confidence, bool):
            results["confidence_correlation"] = min(
                confidence * results["overall_accuracy"], 1.0
            )

        return results

    def _evaluate_entities(
        self, extracted_entities: List[Dict], ground_truth_entities: List[Dict]
    ) -> Dict:
        """Evaluate entity extraction (precision, recall, F1) by name matching.

        Returns a dict with "precision", "recall", "f1", "true_positives",
        "extracted_count" and "ground_truth_count"; the counts refer to
        distinct normalised names.
        """
        extracted_names = self._entity_names(extracted_entities)
        ground_truth_names = self._entity_names(ground_truth_entities)
        true_positives = len(extracted_names & ground_truth_names)

        def metrics(precision: float, recall: float, f1: float) -> Dict:
            # Single place that fixes the shape of every return value
            return {
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "true_positives": true_positives,
                "extracted_count": len(extracted_names),
                "ground_truth_count": len(ground_truth_names),
            }

        if not ground_truth_entities:
            # Nothing to find: perfect only if nothing was extracted
            if not extracted_entities:
                return metrics(1.0, 1.0, 1.0)
            return metrics(0.0, 1.0, 0.0)

        if not extracted_entities:
            return metrics(1.0, 0.0, 0.0)

        if not ground_truth_names:
            # Ground-truth entities exist but none carries a usable name
            precision = 1.0 if not extracted_names else 0.0
            return metrics(precision, 1.0, 1.0 if precision == 1.0 else 0.0)

        precision = true_positives / len(extracted_names) if extracted_names else 0.0
        recall = true_positives / len(ground_truth_names)
        f1 = (
            2 * (precision * recall) / (precision + recall)
            if (precision + recall) > 0
            else 0.0
        )
        return metrics(precision, recall, f1)

    def create_evaluation_report(self, evaluations: List[Dict]) -> Dict:
        """Create comprehensive evaluation report.

        Args:
            evaluations: Result dicts as returned by evaluate_extraction.

        Returns:
            {"error": ...} for an empty input, otherwise a dict with
            "summary" (count, mean, min, max and population standard deviation
            of the overall accuracies), "field_performance",
            "entity_performance" and "recommendations".
        """
        if not evaluations:
            return {"error": "No evaluations provided"}

        # Aggregate metrics; the mean is computed once and reused for the std
        overall_accuracies = [e["overall_accuracy"] for e in evaluations]
        mean_accuracy = sum(overall_accuracies) / len(overall_accuracies)
        variance = sum((x - mean_accuracy) ** 2 for x in overall_accuracies) / len(
            overall_accuracies
        )

        report = {
            "summary": {
                "total_evaluations": len(evaluations),
                "average_accuracy": mean_accuracy,
                "min_accuracy": min(overall_accuracies),
                "max_accuracy": max(overall_accuracies),
                "std_accuracy": variance ** 0.5,
            },
            "field_performance": {},
            "entity_performance": {},
            "recommendations": [],
        }

        # Field-level performance; a field absent from an evaluation counts as 0.0
        all_fields = set()
        for eval_result in evaluations:
            all_fields.update(eval_result["field_accuracy"].keys())

        for field in all_fields:
            field_scores = [e["field_accuracy"].get(field, 0.0) for e in evaluations]
            report["field_performance"][field] = {
                "average": sum(field_scores) / len(field_scores),
                "min": min(field_scores),
                "max": max(field_scores),
            }

        # Entity-level performance, averaged over evaluations that report the type
        for entity_type in ["contacts", "companies"]:
            entity_metrics = [
                eval_result["entity_metrics"][entity_type]
                for eval_result in evaluations
                if entity_type in eval_result["entity_metrics"]
            ]
            if entity_metrics:
                count = len(entity_metrics)
                report["entity_performance"][entity_type] = {
                    "precision": sum(m["precision"] for m in entity_metrics) / count,
                    "recall": sum(m["recall"] for m in entity_metrics) / count,
                    "f1": sum(m["f1"] for m in entity_metrics) / count,
                }

        # Generate recommendations
        if report["summary"]["average_accuracy"] < 0.7:
            report["recommendations"].append(
                "Overall accuracy is low. Consider improving prompts or using a larger model."
            )

        for field, performance in report["field_performance"].items():
            if performance["average"] < 0.5:
                report["recommendations"].append(
                    f"Field '{field}' shows poor extraction accuracy. Review extraction logic."
                )

        return report


# Exercise the evaluator on the sample document
evaluator = ExtractionEvaluator()

# Hand-written reference answer for the sample document.
# NOTE(review): "confidence_score" is scored like any other field, i.e. by
# exact match against the extractor's own value — confirm this is intended.
ground_truth_sample = dict(
    document_type="annual_report",
    title="Company Annual Report 2024",
    summary="Annual financial performance report showing growth",
    contacts=[
        dict(name=person, email=address, title=role)
        for person, address, role in (
            ("John Smith", "john.smith@company.com", "CEO"),
            ("Jane Doe", "jane.doe@company.com", "CFO"),
            ("Bob Wilson", "bob.wilson@company.com", "CTO"),
        )
    ],
    companies=[dict(company_name="TechCorp", industry="Technology")],
    financial_metrics=[
        dict(name=metric, value=amount, unit="USD", period="2024")
        for metric, amount in (("Revenue", 150000000), ("Profit", 22000000))
    ],
    key_dates=["2024"],
    key_topics=["growth", "revenue", "partnership"],
    confidence_score=0.8,
)

# Score the earlier extraction against the reference
evaluation_result = evaluator.evaluate_extraction(extracted_info, ground_truth_sample)

print("\n🎯 Extraction Quality Evaluation:", "=" * 50, sep="\n")
print("Overall Accuracy: {:.2f}".format(evaluation_result["overall_accuracy"]))
print("\nField-level Accuracy:")
for field_name, field_score in evaluation_result["field_accuracy"].items():
    print(f"  {field_name}: {field_score:.2f}")

# Entity metrics are printed only when at least one entity type was scored
if evaluation_result["entity_metrics"]:
    print("\nEntity Extraction Metrics:")
    for entity_type, metrics in evaluation_result["entity_metrics"].items():
        print(f"  {entity_type}:")
        for label, metric_key in (
            ("Precision", "precision"),
            ("Recall", "recall"),
            ("F1-Score", "f1"),
        ):
            print(f"    {label}: {metrics[metric_key]:.2f}")

In [None]:
## 7. Real-world Use Cases - Contract, Resume, and Academic Paper Analysis
# 實戰案例 - 合約、簡歷、學術論文分析


class SpecializedExtractors:
    """Specialized extractors for different document types.

    Each method builds a document-type-specific prompt around an excerpt of
    the input text and hands it to the wrapped extractor.
    """

    # Default number of leading characters of the document placed in a prompt
    DEFAULT_MAX_CHARS = 1500

    def __init__(self, base_extractor: LLMExtractor):
        self.base_extractor = base_extractor

    @staticmethod
    def _excerpt(text: str, max_chars: int) -> str:
        """Return the first ``max_chars`` characters of ``text``.

        The "..." marker is appended only when text was actually cut, so a
        short document is not presented to the model as truncated.
        """
        if len(text) <= max_chars:
            return text
        return text[:max_chars] + "..."

    def extract_resume_info(self, text: str, max_chars: int = DEFAULT_MAX_CHARS) -> Dict:
        """Extract structured information from resume/CV.

        Args:
            text: Raw resume text.
            max_chars: Maximum number of leading characters of ``text`` to
                include in the prompt.

        Returns:
            Whatever ``base_extractor._extract_with_prompt`` returns for the
            resume prompt.
        """
        excerpt = self._excerpt(text, max_chars)
        prompt = f"""
Extract professional information from this resume/CV in JSON format:

{excerpt}

Extract:
1. Personal information (name, email, phone, location)
2. Professional summary/objective
3. Work experience (company, role, duration, achievements)
4. Education (degree, school, year)
5. Skills (technical, soft skills)
6. Certifications
7. Years of experience (total)

Return JSON:
{{
    "personal_info": {{
        "name": "Full Name",
        "email": "email@domain.com",
        "phone": "+1-555-0123",
        "location": "City, State"
    }},
    "professional_summary": "Brief summary...",
    "work_experience": [
        {{
            "company": "Company Name",
            "role": "Job Title",
            "duration": "2020-2024",
            "achievements": ["Achievement 1", "Achievement 2"]
        }}
    ],
    "education": [
        {{
            "degree": "Master of Science",
            "field": "Computer Science",
            "school": "University Name",
            "year": "2016"
        }}
    ],
    "skills": {{
        "technical": ["Python", "JavaScript"],
        "soft": ["Leadership", "Communication"]
    }},
    "certifications": ["AWS Certified", "Google Cloud"],
    "total_experience_years": 8
}}

JSON:
"""
        # NOTE(review): DocumentSummary is passed as the schema although the
        # prompt requests a resume-shaped JSON — confirm how
        # _extract_with_prompt uses its second argument.
        return self.base_extractor._extract_with_prompt(prompt, DocumentSummary)

    def extract_academic_paper_info(
        self, text: str, max_chars: int = DEFAULT_MAX_CHARS
    ) -> Dict:
        """Extract information from academic papers.

        Args:
            text: Raw paper text.
            max_chars: Maximum number of leading characters of ``text`` to
                include in the prompt.

        Returns:
            Whatever ``base_extractor._extract_with_prompt`` returns for the
            academic-paper prompt.
        """
        excerpt = self._excerpt(text, max_chars)
        prompt = f"""
Extract academic paper information in JSON format:

{excerpt}

Extract:
1. Title and abstract
2. Authors and affiliations
3. Keywords
4. Methodology/approach
5. Key findings/results
6. References count (estimate)
7. Research field/domain

Return JSON:
{{
    "title": "Paper Title",
    "abstract": "Abstract text...",
    "authors": [
        {{"name": "Author Name", "affiliation": "University Name"}}
    ],
    "keywords": ["keyword1", "keyword2"],
    "methodology": "Research approach description",
    "key_findings": ["Finding 1", "Finding 2"],
    "research_field": "Computer Science",
    "estimated_references": 25,
    "publication_info": {{
        "journal": "Journal Name",
        "year": "2024",
        "volume": "12",
        "pages": "1-15"
    }}
}}

JSON:
"""
        # NOTE(review): same schema question as in extract_resume_info.
        return self.base_extractor._extract_with_prompt(prompt, DocumentSummary)


# Wrap the shared extractor with the document-type-specific prompts
specialized = SpecializedExtractors(extractor)

# Resume extraction demo: the printed JSON is cut to its first 1000 characters
print("\n👤 Testing Resume Information Extraction:", "=" * 50, sep="\n")

resume_info = specialized.extract_resume_info(resume_text)
print("📄 Resume Extraction Results:")
_resume_json = json.dumps(resume_info, indent=2, ensure_ascii=False)
print(f"{_resume_json[:1000]}...")

# Sample academic paper used as input for the academic-paper extractor.
# Illustrative test data only: a title, abstract, author list, keywords,
# numbered sections, a reference count and a publication line.
academic_paper_text = """
Deep Learning for Natural Language Processing: A Comprehensive Survey

Abstract:
This paper presents a comprehensive survey of deep learning techniques applied to natural language processing (NLP) tasks. We review the evolution from traditional machine learning approaches to modern transformer-based architectures, analyzing their strengths and limitations across various NLP applications.

Authors:
- Dr. Sarah Johnson, Stanford University, Computer Science Department
- Prof. Michael Chen, MIT, Artificial Intelligence Lab
- Dr. Elena Rodriguez, Google Research, NLP Team

Keywords: deep learning, natural language processing, transformers, BERT, GPT, neural networks, language models

1. Introduction
Natural Language Processing has undergone significant transformation with the advent of deep learning techniques. Traditional approaches based on rule-based systems and statistical methods have largely been superseded by neural network architectures.

2. Methodology
We conducted a systematic literature review of 150+ papers published between 2018-2024, focusing on transformer architectures and their applications. Our analysis covers supervised, unsupervised, and semi-supervised learning paradigms.

3. Key Findings
- Transformer models achieve state-of-the-art performance across multiple NLP benchmarks
- Pre-training on large corpora followed by fine-tuning shows consistent improvements
- Attention mechanisms enable better handling of long-range dependencies
- Model size correlates with performance up to a saturation point

4. Applications
The reviewed techniques show effectiveness in:
- Machine Translation (BLEU scores improved by 15-20%)
- Sentiment Analysis (accuracy gains of 10-12%)
- Question Answering (F1 scores increased by 8-15%)
- Text Summarization (ROUGE scores improved by 12-18%)

5. Conclusion
Deep learning has revolutionized NLP, with transformer architectures leading current advances. Future research should focus on efficiency improvements and multi-modal integration.

References: [1-47 academic citations listed]

Published in: Journal of Artificial Intelligence Research, Vol. 28, 2024, pp. 1-25
"""

# Academic paper extraction demo: the printed JSON is cut to its first 1000 characters
print("\n📚 Testing Academic Paper Information Extraction:", "=" * 50, sep="\n")

academic_info = specialized.extract_academic_paper_info(academic_paper_text)
print("📄 Academic Paper Extraction Results:")
_academic_json = json.dumps(academic_info, indent=2, ensure_ascii=False)
print(f"{_academic_json[:1000]}...")

In [None]:
## 8. Validation and Schema Compliance
# 驗證與 Schema 符合性檢查

from pydantic import ValidationError
import jsonschema


class ExtractionValidator:
    """Validate extraction results against predefined schemas.

    Works with schema classes from either Pydantic major version: field
    metadata is read from ``model_fields`` (v2) or ``__fields__`` (v1), and
    models are dumped with ``model_dump()`` when available, else ``dict()``.
    """

    # Types for which a missing required field gets a zero-value placeholder
    # ("" / [] / 0.0); calling the type itself produces that value.
    _PLACEHOLDER_TYPES = (str, list, float)

    def __init__(self):
        # No method of this class writes to this list.
        self.validation_results = []

    @staticmethod
    def _format_errors(exc) -> List[Dict]:
        """Convert a validation exception into ``{"field", "message"}`` dicts.

        ``errors`` is a method on the exception and must be called; iterating
        the bound method itself raises TypeError.
        """
        return [{"field": err["loc"], "message": err["msg"]} for err in exc.errors()]

    @staticmethod
    def _dump_model(model) -> Dict:
        """Return the model as a plain dict, preferring ``model_dump``."""
        dump = getattr(model, "model_dump", None)
        if not callable(dump):
            dump = model.dict
        return dump()

    @staticmethod
    def _schema_fields(schema_class) -> Dict:
        """Return the schema's field-name → field-info mapping, or {}."""
        for attribute in ("model_fields", "__fields__"):
            fields = getattr(schema_class, attribute, None)
            if isinstance(fields, dict):
                return fields
        return {}

    @staticmethod
    def _is_required(field_info) -> bool:
        """Tell whether a field has no default, for either field-info flavour."""
        checker = getattr(field_info, "is_required", None)
        if callable(checker):
            return bool(checker())
        return bool(getattr(field_info, "required", False))

    @staticmethod
    def _base_type(field_info):
        """Return the field's container/base type, e.g. ``list`` for List[str]."""
        annotation = None
        for attribute in ("annotation", "outer_type_", "type_"):
            annotation = getattr(field_info, attribute, None)
            if annotation is not None:
                break
        # Generic aliases such as List[str] expose the runtime type as __origin__
        return getattr(annotation, "__origin__", None) or annotation

    def validate_against_schema(self, data: Dict, schema_class) -> Dict:
        """Validate extracted data against Pydantic schema.

        Args:
            data: Extracted values, passed to ``schema_class`` as keywords.
            schema_class: Class used to build and validate the model.

        Returns:
            Dict with "is_valid", "validated_data" (the dumped model, or None
            on failure) and "errors" (a list of ``{"field", "message"}``).
            Any exception is reported in "errors" rather than raised.
        """
        try:
            # Attempt to create model instance
            validated_model = schema_class(**data)
            return {
                "is_valid": True,
                "validated_data": self._dump_model(validated_model),
                "errors": [],
            }
        except ValidationError as e:
            return {
                "is_valid": False,
                "validated_data": None,
                "errors": self._format_errors(e),
            }
        except Exception as e:
            return {
                "is_valid": False,
                "validated_data": None,
                "errors": [{"field": "general", "message": str(e)}],
            }

    def clean_and_retry_validation(self, data: Dict, schema_class) -> Dict:
        """Attempt to clean data and retry validation.

        Cleaning strips string values, drops None entries from list values,
        and fills missing *required* str/list/float fields with "" / [] / 0.0.
        Fields that have a default are left out so the schema applies its own
        default. ``data`` itself is not modified.

        Returns:
            The result of ``validate_against_schema`` on the cleaned copy.
        """
        cleaned_data = dict(data or {})

        # Common cleaning operations (values are replaced, keys are unchanged)
        for key, value in cleaned_data.items():
            if isinstance(value, str):
                cleaned_data[key] = value.strip()
            elif isinstance(value, list):
                cleaned_data[key] = [item for item in value if item is not None]

        # Placeholders for required fields the extraction did not produce
        for field_name, field_info in self._schema_fields(schema_class).items():
            if field_name in cleaned_data or not self._is_required(field_info):
                continue
            base_type = self._base_type(field_info)
            for placeholder_type in self._PLACEHOLDER_TYPES:
                if base_type is placeholder_type:
                    cleaned_data[field_name] = placeholder_type()
                    break

        return self.validate_against_schema(cleaned_data, schema_class)


# Exercise the validator on the earlier extraction result
validator = ExtractionValidator()

print("\n✅ Testing Schema Validation:", "=" * 50, sep="\n")

validation_result = validator.validate_against_schema(extracted_info, DocumentSummary)

if validation_result["is_valid"]:
    print("Validation Result: ✅ VALID")
else:
    print("Validation Result: ❌ INVALID")
    print("Validation Errors:")
    for error in validation_result["errors"]:
        print("  - {}: {}".format(error["field"], error["message"]))

    # Second attempt after cleaning the data and filling placeholders
    print("\n🔧 Attempting to clean and re-validate...")
    cleaned_validation = validator.clean_and_retry_validation(
        extracted_info, DocumentSummary
    )
    if cleaned_validation["is_valid"]:
        print("Cleaned Validation: ✅ VALID")
    else:
        print("Cleaned Validation: ❌ STILL INVALID")

In [None]:
## 9. Performance Monitoring and Optimization
# 性能監控與優化

import time
import psutil
import gc


class PerformanceMonitor:
    """Monitor extraction performance and resource usage.

    Every call to measure_extraction_performance appends one metrics dict to
    ``self.metrics``; get_performance_summary aggregates those dicts.
    """

    def __init__(self):
        self.metrics = []

    def measure_extraction_performance(
        self, extractor: "LLMExtractor", text: str, schema_type: str = "general"
    ) -> Dict:
        """Measure extraction performance metrics.

        Args:
            extractor: Object whose ``extract_structured_info(text, schema_type)``
                is timed. (The annotation is a string so this class can be
                defined without LLMExtractor being in scope.)
            text: Document text to extract from.
            schema_type: Schema name forwarded to the extractor.

        Returns:
            Metrics dict with the elapsed time, RAM/VRAM deltas in MB, input
            length, throughput, success flag, error text and a timestamp.
            Extraction errors are recorded in the dict, not raised.
        """
        cuda_available = torch.cuda.is_available()

        # Clear GPU cache before measurement
        initial_vram = 0
        if cuda_available:
            torch.cuda.empty_cache()
            initial_vram = torch.cuda.memory_allocated()

        # Resident memory of this process, so activity of unrelated processes
        # does not leak into the reading.
        process = psutil.Process()
        initial_ram = process.memory_info().rss
        # perf_counter is monotonic and high-resolution, unlike time.time()
        start_time = time.perf_counter()

        # Perform extraction; only success/failure is recorded, the result is dropped
        try:
            extractor.extract_structured_info(text, schema_type)
            success, error = True, None
        except Exception as e:
            success, error = False, str(e)

        processing_time = time.perf_counter() - start_time
        final_ram = process.memory_info().rss

        # Net change in allocated GPU memory across the call (not the peak)
        if cuda_available:
            vram_used = (torch.cuda.memory_allocated() - initial_vram) / 1e6  # MB
        else:
            vram_used = 0

        ram_used = (final_ram - initial_ram) / 1e6  # MB; may be negative

        metrics = {
            "processing_time_seconds": processing_time,
            "ram_used_mb": ram_used,
            "vram_used_mb": vram_used,
            "text_length": len(text),
            # Whitespace-separated words per second, not model tokens
            "tokens_per_second": (
                len(text.split()) / processing_time if processing_time > 0 else 0
            ),
            "success": success,
            "error": error,
            "timestamp": datetime.now().isoformat(),
        }

        self.metrics.append(metrics)
        return metrics

    def get_performance_summary(self) -> Dict:
        """Generate performance summary statistics.

        Returns:
            ``{"message": ...}`` when nothing was measured. When every run
            failed, the message plus the run counts and a 0.0 success rate.
            Otherwise counts, success rate (percent) and time/RAM/VRAM/
            throughput aggregates over the successful runs.
        """
        if not self.metrics:
            return {"message": "No performance data available"}

        total_runs = len(self.metrics)
        successful_runs = [m for m in self.metrics if m["success"]]

        if not successful_runs:
            # Keep the counts so an all-failed session is still visible
            return {
                "message": "No successful runs to analyze",
                "total_runs": total_runs,
                "successful_runs": 0,
                "success_rate": 0.0,
            }

        def average(key: str) -> float:
            # Mean of one metric over the successful runs
            return sum(m[key] for m in successful_runs) / len(successful_runs)

        processing_times = [m["processing_time_seconds"] for m in successful_runs]

        return {
            "total_runs": total_runs,
            "successful_runs": len(successful_runs),
            "success_rate": len(successful_runs) / total_runs * 100,
            "average_processing_time": average("processing_time_seconds"),
            "min_processing_time": min(processing_times),
            "max_processing_time": max(processing_times),
            "average_ram_usage_mb": average("ram_used_mb"),
            "average_vram_usage_mb": average("vram_used_mb"),
            "average_tokens_per_second": average("tokens_per_second"),
        }


# Exercise the performance monitor on the sample documents
monitor = PerformanceMonitor()

print("\n⚡ Performance Testing:", "=" * 50, sep="\n")

# The resume is cut to its first 1000 characters for faster testing
test_texts = [sample_text, contract_text, resume_text[:1000]]

for doc_number, text in enumerate(test_texts, start=1):
    print(f"\nTesting document {doc_number}...")
    perf_metrics = monitor.measure_extraction_performance(extractor, text)
    outcome_icon = "✅" if perf_metrics["success"] else "❌"
    print(
        f"  Processing time: {perf_metrics['processing_time_seconds']:.2f}s",
        f"  RAM used: {perf_metrics['ram_used_mb']:.1f}MB",
        f"  VRAM used: {perf_metrics['vram_used_mb']:.1f}MB",
        f"  Success: {outcome_icon}",
        sep="\n",
    )

# Aggregate view: floats are shown with two decimals, everything else as-is
perf_summary = monitor.get_performance_summary()
print("\n📊 Performance Summary:")
for key, value in perf_summary.items():
    rendered = f"{value:.2f}" if isinstance(value, float) else f"{value}"
    print(f"  {key}: {rendered}")

In [None]:
## 10. Smoke Test & Validation
# 驗收測試與驗證


def run_smoke_test():
    """Comprehensive smoke test for document extraction pipeline.

    Runs five stages (parser, extractor, pipeline, schema validation,
    performance monitoring). Each stage runs in its own try/except, so one
    failure neither hides the outcome of the others nor suppresses the final
    per-stage table. A stage that needs an object from a failed earlier stage
    is reported as failed with an explanatory message.

    Returns:
        True when every stage passed, False otherwise.
    """
    print("\n🧪 Running Comprehensive Smoke Test...")
    print("=" * 60)

    test_results = {
        "parser_test": False,
        "extractor_test": False,
        "pipeline_test": False,
        "validation_test": False,
        "performance_test": False,
    }

    # Objects handed from one stage to the next
    shared = {}

    def require(key):
        """Fetch an object produced by an earlier stage or explain its absence."""
        if key not in shared:
            raise RuntimeError(
                f"skipped: '{key}' is unavailable because an earlier stage failed"
            )
        return shared[key]

    def check_parser():
        test_parser = DocumentParser()
        test_doc = test_parser.parse(sample_path)
        assert test_doc.text is not None, "parsed document has no text"
        assert len(test_doc.text) > 0, "parsed document text is empty"
        shared["parser"] = test_parser

    def check_extractor():
        test_extractor = LLMExtractor()
        # Stored right away so later stages can still use the extractor even
        # if the assertions below fail.
        shared["extractor"] = test_extractor
        test_extraction = test_extractor.extract_structured_info(sample_text[:500])
        assert isinstance(test_extraction, dict), "extraction result is not a dict"
        assert "document_type" in test_extraction, "'document_type' missing from extraction"
        shared["extraction"] = test_extraction

    def check_pipeline():
        test_pipeline = DocumentProcessingPipeline(
            require("parser"), require("extractor"), max_workers=1
        )
        pipeline_result = test_pipeline.process_single_document(sample_path)
        assert pipeline_result["status"] == "success", (
            f"pipeline status is {pipeline_result['status']!r}"
        )
        assert "extracted_info" in pipeline_result, "'extracted_info' missing from result"

    def check_validation():
        test_extraction = require("extraction")
        test_validator = ExtractionValidator()
        validation_result = test_validator.validate_against_schema(
            test_extraction, DocumentSummary
        )
        # Should either be valid or cleanable
        if not validation_result["is_valid"]:
            cleaned_result = test_validator.clean_and_retry_validation(
                test_extraction, DocumentSummary
            )
            # Fewer than five remaining errors is tolerated
            assert cleaned_result["is_valid"] or len(cleaned_result["errors"]) < 5, (
                f"{len(cleaned_result['errors'])} validation errors remain after cleaning"
            )

    def check_performance():
        test_monitor = PerformanceMonitor()
        perf_result = test_monitor.measure_extraction_performance(
            require("extractor"), sample_text[:300]
        )
        assert "processing_time_seconds" in perf_result, "processing time not reported"
        assert perf_result["processing_time_seconds"] > 0, "processing time is not positive"

    stages = [
        ("parser_test", "1️⃣ Testing Document Parser...", "Parser works correctly", check_parser),
        ("extractor_test", "2️⃣ Testing Information Extractor...", "Extractor works correctly", check_extractor),
        ("pipeline_test", "3️⃣ Testing Processing Pipeline...", "Pipeline works correctly", check_pipeline),
        ("validation_test", "4️⃣ Testing Schema Validation...", "Validation works correctly", check_validation),
        ("performance_test", "5️⃣ Testing Performance Monitoring...", "Performance monitoring works correctly", check_performance),
    ]

    for result_key, banner, success_message, stage in stages:
        print(banner)
        try:
            stage()
        except Exception as e:
            # The type name is included because a bare AssertionError has no text
            print(f"   ❌ Test failed: {type(e).__name__}: {e}")
        else:
            test_results[result_key] = True
            print(f"   ✅ {success_message}")

    # Final results
    all_passed = all(test_results.values())
    print("\n🎯 Smoke Test Results:")
    for test_name, passed in test_results.items():
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"   {test_name}: {status}")

    print(f"\n{'🎉 ALL TESTS PASSED!' if all_passed else '⚠️ SOME TESTS FAILED'}")
    return all_passed


# Run comprehensive smoke test
smoke_test_result = run_smoke_test()

# Cleanup test files
import os

# Each file is removed independently, so one failure does not leave the
# remaining files behind. A file that is already gone counts as cleaned up;
# any other OS error is reported by name.
leftover_files = []
for test_file in (sample_path, contract_path, resume_path, "extraction_results.json"):
    try:
        os.remove(test_file)
    except FileNotFoundError:
        pass
    except OSError as exc:
        leftover_files.append(f"{test_file} ({exc.strerror or exc})")

if leftover_files:
    print("\n⚠️ Some test files could not be removed")
    for leftover in leftover_files:
        print(f"   - {leftover}")
else:
    print("\n🧹 Cleanup completed - test files removed")

# Closing banner followed by the bilingual completion summary
_summary_rule = "=" * 60
print("\n" + _summary_rule)
print("📋 NOTEBOOK COMPLETION SUMMARY")
print(_summary_rule)

_completion_summary = """
✅ 完成項目 (Completed Items):
  • 多格式文件解析器 (PDF/DOCX/HTML/TXT/MD)
  • Pydantic Schema 定義與驗證
  • LLM 結構化資訊抽取引擎
  • 批量處理管線與錯誤處理
  • 專門抽取器 (簡歷/合約/學術論文)
  • 品質評估與準確率計算
  • 性能監控與資源使用追蹤
  • 完整的驗收測試流程

🧠 核心概念 (Core Concepts):
  • Schema-driven Extraction: 使用 Pydantic 定義結構化輸出格式
  • Multi-format Parsing: 統一介面處理不同文件格式
  • LLM Prompting for IE: 結構化提示工程進行資訊抽取
  • Quality Assessment: 準確率、召回率、F1-score 評估
  • Performance Optimization: 低 VRAM 配置與批量處理
  • Error Handling: 容錯機制與降級策略

⚠️ 常見問題 (Common Issues):
  • 模型載入失敗 → 使用 4-bit 量化或降級至 CPU
  • JSON 解析錯誤 → 實作正規表達式後備方案
  • 記憶體不足 → 減少 batch size 或使用序列處理
  • 抽取準確率低 → 調整提示詞或使用更大模型
  • Schema 驗證失敗 → 實作資料清理與預設值

🚀 下一步建議 (Next Steps):
  1. 整合向量資料庫進行語意搜尋
  2. 加入多語言支援 (中文/英文混合)
  3. 實作增量學習機制改善抽取品質
  4. 開發 Web UI 進行互動式文件分析
  5. 整合 OCR 功能處理掃描文件
"""
print(_completion_summary)

In [None]:
## 11. Advanced Features & Extensions (Optional)
# 進階功能與擴展 (可選)

# Banner for the optional-extensions preview
_preview_rule = "=" * 60
print("\n" + _preview_rule)
print("🔬 ADVANCED FEATURES PREVIEW")
print(_preview_rule)

# Static overview of optional extensions and the follow-up notebooks
_optional_extensions = """
💡 可選擴展功能 (Optional Extensions):

1. 📸 OCR Integration (光學字符識別)
   - 使用 pytesseract 或 EasyOCR 處理掃描文件
   - 支援中英文混合識別
   - 表格結構保持

2. 🌐 Multilingual Support (多語言支援)
   - 中文實體識別與關係抽取
   - 繁簡轉換 (opencc)
   - 跨語言 Schema 映射

3. 🔄 Active Learning (主動學習)
   - 標註介面整合
   - 不確定性採樣
   - 漸進式模型改善

4. 📊 Dashboard & Analytics (儀表板分析)
   - Streamlit/Gradio 網頁介面
   - 即時抽取準確率監控
   - 文件類型分布分析

5. 🔗 API Integration (API 整合)
   - RESTful API endpoints
   - Webhook 支援批量處理
   - 雲端儲存整合 (S3/GCS)

6. 🎯 Domain Adaptation (領域適應)
   - 醫療文件專用抽取器
   - 法律合約風險評估
   - 財務報表數據驗證

使用指令查看具體實作:
  • nb17_multilingual_ocr.ipynb
  • nb18_active_learning_pipeline.ipynb
  • nb19_document_analytics_dashboard.ipynb
"""
print(_optional_extensions)

# Learning recommendations
_learning_recommendations = """
1. 先熟練基本抽取管線，再嘗試進階功能
2. 針對特定領域收集高品質標註資料
3. 比較不同 LLM 在結構化抽取的表現
4. 建立評估基準以量化改善效果
5. 考慮成本效益平衡 (準確率 vs 處理速度)
"""
print("\n🎓 學習建議 (Learning Recommendations):")
print(_learning_recommendations)


### 📋 本章小結

**✅ 完成項目 (Completed Items):**
- 多格式文件解析器 (PDF/DOCX/HTML/TXT/MD 統一介面)
- Pydantic Schema 定義與驗證 (DocumentSummary, ContractInfo)
- LLM 結構化資訊抽取引擎 (支援低 VRAM 配置)
- 批量處理管線與錯誤處理 (並行處理與容錯機制)
- 專門抽取器 (簡歷/合約/學術論文特化版本)
- 品質評估模組 (準確率、召回率、F1-score)
- 性能監控與資源使用追蹤 (RAM/VRAM 監控)
- 完整的驗收測試流程 (端到端功能驗證)

**🧠 核心原理 (Core Concepts):**
- **Schema-driven Extraction**: 使用 Pydantic 定義結構化輸出格式，確保資料一致性
- **Multi-format Parsing**: 統一介面處理不同文件格式，降低整合複雜度  
- **LLM Prompting for IE**: 結構化提示工程進行資訊抽取，平衡準確率與可控性
- **Quality Assessment**: 多維度評估指標 (欄位準確率、實體 F1、信心度相關性)
- **Performance Optimization**: 低 VRAM 配置策略與批量處理最佳化

**⚠️ 常見陷阱 (Common Pitfalls):**
- 模型載入失敗 → 使用 4-bit 量化或降級至 CPU 模式
- JSON 解析錯誤 → 實作正規表達式後備抽取方案  
- 記憶體不足 → 減少 batch size 或改用序列處理
- 抽取準確率低 → 調整提示詞模板或使用更大模型
- Schema 驗證失敗 → 實作資料清理與預設值機制

**🚀 下一步行動 (Next Actions):**
1. **整合向量檢索**: 結合 `nb26_rag_basic_faiss.ipynb` 進行語意搜尋
2. **多語言支援**: 加入中文實體識別與繁簡轉換
3. **增量學習**: 實作主動學習機制持續改善抽取品質  
4. **Web UI 開發**: 建立互動式文件分析介面
5. **領域特化**: 針對醫療/法律/財務等特定領域客製化

**💡 實務建議:**
- 優先建立高品質的標註資料集作為評估基準
- 比較不同 LLM (GPT/Qwen/DeepSeek) 在結構化抽取的表現差異
- 考慮成本效益平衡：準確率提升 vs 處理速度與資源消耗
- 建立監控儀表板追蹤生產環境的抽取品質變化

這個 notebook 為後續的 RAG 應用和 Agent 系統提供了強大的文件理解基礎，是構建智能文件處理系統的重要里程碑。