
## Stage 1 - nb07: Dataset JSONL Messages 格式與校驗

**目標**：建立標準化的對話資料格式、Pydantic 驗證器與實用工具
 
 **前提**：Stage 1 基礎環境已設定完成
 
 **主要功能**：
 - 定義 OpenAI 風格的 messages 格式
 - Pydantic 模型驗證與型別安全
 - JSONL 檔案處理與錯誤容忍
 - 資料統計與品質檢查

In [None]:
# Cell1:  Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# The cache root can be overridden through the AI_CACHE_ROOT environment variable.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

# (environment variable, sub-directory below the cache root)
_CACHE_LAYOUT = (
    ("HF_HOME", "hf"),
    ("TRANSFORMERS_CACHE", "hf/transformers"),
    ("HF_DATASETS_CACHE", "hf/datasets"),
    ("HUGGINGFACE_HUB_CACHE", "hf/hub"),
    ("TORCH_HOME", "torch"),
)

for _env_name, _subdir in _CACHE_LAYOUT:
    _cache_dir = f"{AI_CACHE_ROOT}/{_subdir}"
    # Export the location and make sure the directory exists.
    os.environ[_env_name] = _cache_dir
    pathlib.Path(_cache_dir).mkdir(parents=True, exist_ok=True)

print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

In [None]:
# %%
# Cell 2: Dependencies and Setup
import json
import jsonlines
from typing import List, Dict, Any, Optional, Union, Literal
from pathlib import Path
from pydantic import BaseModel, Field, validator, field_validator, ValidationError
from datetime import datetime
import tiktoken
from collections import Counter

# Make sure the working directories used by the later cells exist.
for _dir_name in ("data", "outs"):
    Path(_dir_name).mkdir(exist_ok=True)

print("Dependencies loaded successfully")

In [None]:
# %%
# Cell 3: Messages Schema Definition with Pydantic
class Message(BaseModel):
    """Single message in a conversation.

    ``role`` must be one of "system", "user" or "assistant". ``content`` must
    be a non-empty string and is stored with surrounding whitespace removed.
    """

    role: Literal["system", "user", "assistant"] = Field(
        ..., description="Message role"
    )
    content: str = Field(..., min_length=1, description="Message content")

    @field_validator("content", mode="after")
    @classmethod
    def content_not_empty(cls, v: str) -> str:
        """Reject whitespace-only content and return the stripped text.

        Args:
            v: The content string after Pydantic's own type/length checks.

        Returns:
            ``v`` with leading and trailing whitespace removed.

        Raises:
            ValueError: If ``v`` consists only of whitespace (reported by
                Pydantic as a validation error).
        """
        stripped = v.strip()
        if not stripped:
            raise ValueError("Content cannot be empty or whitespace only")
        return stripped


class Conversation(BaseModel):
    """Complete conversation with metadata.

    Attributes:
        messages: Ordered, non-empty list of validated messages.
        conversation_id: Optional unique identifier.
        metadata: Free-form extra information; defaults to an empty dict.
    """

    # ``min_length`` is the Pydantic v2 spelling of the deprecated ``min_items``.
    messages: List[Message] = Field(
        ..., min_length=1, description="List of messages"
    )
    conversation_id: Optional[str] = Field(None, description="Unique conversation ID")
    metadata: Optional[Dict[str, Any]] = Field(
        default_factory=dict, description="Additional metadata"
    )

    @field_validator("messages", mode="after")
    @classmethod
    def validate_conversation_flow(cls, v: List[Message]) -> List[Message]:
        """Ensure the conversation has a proper flow.

        Raises:
            ValueError: If the message list is empty.
        """
        if not v:
            raise ValueError("Conversation must have at least one message")

        # Advisory only: a multi-message conversation that does not end with
        # an assistant turn is still accepted, but a warning is printed.
        if len(v) > 1 and v[-1].role != "assistant":
            print("Warning: Conversation doesn't end with assistant response")

        return v

    def token_count(self, model_name: str = "gpt-3.5-turbo") -> int:
        """Estimate the token count of the conversation using tiktoken.

        Args:
            model_name: Model whose encoding should be used. If tiktoken does
                not know the model, the ``cl100k_base`` encoding is used.

        Returns:
            Sum over all messages of the encoded length of
            ``"<role>: <content>"``. This is a rough estimate, not an exact
            count of what a chat API would bill.
        """
        try:
            enc = tiktoken.encoding_for_model(model_name)
        except KeyError:
            enc = tiktoken.get_encoding("cl100k_base")  # fallback

        return sum(
            len(enc.encode(f"{msg.role}: {msg.content}")) for msg in self.messages
        )

    def get_stats(self) -> Dict[str, Any]:
        """Return per-conversation statistics.

        Returns:
            A dict with the message count, per-role message counts, total and
            average content length in characters, the token estimate from
            ``token_count()`` (default model) and the conversation id.
        """
        role_counts = Counter(msg.role for msg in self.messages)
        content_lengths = [len(msg.content) for msg in self.messages]
        total_chars = sum(content_lengths)

        return {
            "message_count": len(self.messages),
            "role_distribution": dict(role_counts),
            "total_chars": total_chars,
            # messages is validated to be non-empty, so this cannot divide by zero
            "avg_message_length": total_chars / len(content_lengths),
            "token_estimate": self.token_count(),
            "conversation_id": self.conversation_id,
        }

In [None]:
# %%
# Cell 4: JSONL File Processing Tools
class DatasetProcessor:
    """Handle JSONL dataset processing with error tolerance.

    Records that cannot be parsed or validated are skipped and recorded in
    ``validation_errors`` instead of aborting the whole load.
    """

    def __init__(self, error_tolerance: float = 0.1):
        """
        Args:
            error_tolerance: Maximum fraction of invalid records to tolerate
                before ``load_jsonl`` prints a warning.
        """
        self.error_tolerance = error_tolerance
        # One entry per rejected record: {"line": int, "data": ..., "error": str}.
        # Entries accumulate across successive load_jsonl() calls.
        self.validation_errors: List[Dict[str, Any]] = []

    def load_jsonl(self, file_path: Union[str, Path]) -> List[Conversation]:
        """Load and validate a JSONL file.

        Each line is parsed independently. Lines that are not valid JSON, are
        not JSON objects, or fail validation are skipped, counted as errors
        and appended to ``self.validation_errors``.

        Args:
            file_path: Path of the JSONL file to read.

        Returns:
            The conversations that were parsed and validated successfully.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        conversations: List[Conversation] = []
        errors: List[str] = []
        total_lines = 0

        print(f"Loading JSONL from: {file_path}")

        with jsonlines.open(file_path, "r") as reader:
            while True:
                # Read line by line (instead of iterating the reader) so that a
                # malformed line is recorded and skipped rather than aborting
                # the whole load.
                try:
                    data = reader.read()
                except EOFError:
                    break
                except jsonlines.InvalidLineError as e:
                    total_lines += 1
                    self._record_error(
                        errors, total_lines, e.line, f"Invalid JSON - {e}"
                    )
                    continue

                total_lines += 1
                try:
                    conversations.append(self._parse_record(data, total_lines))
                except ValidationError as e:
                    self._record_error(errors, total_lines, data, str(e))
                except Exception as e:
                    # Deliberately broad: one bad record must never stop the
                    # load, whatever goes wrong while converting it.
                    self._record_error(
                        errors, total_lines, data, f"Unexpected error - {e}"
                    )

        # Check error tolerance
        error_rate = len(errors) / total_lines if total_lines > 0 else 0

        print(f"Loaded {len(conversations)}/{total_lines} conversations")
        print(f"Error rate: {error_rate:.1%}")

        if error_rate > self.error_tolerance:
            print(
                f"WARNING: Error rate {error_rate:.1%} exceeds tolerance {self.error_tolerance:.1%}"
            )
            print("First 3 errors:")
            for err in errors[:3]:
                print(f"  - {err}")

        return conversations

    def save_jsonl(
        self, conversations: List[Conversation], file_path: Union[str, Path]
    ):
        """Save conversations to a JSONL file, one conversation per line.

        Parent directories are created if needed; an existing file is
        overwritten.
        """
        file_path = Path(file_path)
        file_path.parent.mkdir(parents=True, exist_ok=True)

        with jsonlines.open(file_path, "w") as writer:
            for conv in conversations:
                # model_dump() is the Pydantic v2 replacement for .dict()
                writer.write(conv.model_dump())

        print(f"Saved {len(conversations)} conversations to: {file_path}")

    def _record_error(
        self, errors: List[str], line_num: int, data: Any, detail: str
    ) -> None:
        """Append a rejected record to both the per-load and the instance log."""
        errors.append(f"Line {line_num}: {detail}")
        self.validation_errors.append(
            {"line": line_num, "data": data, "error": detail}
        )

    def _parse_record(self, data: Any, line_num: int) -> Conversation:
        """Turn one decoded JSONL record into a validated Conversation.

        Raises:
            ValueError: If the record is not a JSON object or has an unknown
                legacy layout.
            ValidationError: If the record fails schema validation.
        """
        if not isinstance(data, dict):
            raise ValueError(f"Expected a JSON object, got {type(data).__name__}")
        if "messages" in data:
            # Direct format
            return Conversation(**data)
        # Legacy format conversion
        return self._convert_legacy_format(data, line_num)

    def _convert_legacy_format(self, data: Dict, line_num: int) -> Conversation:
        """Convert legacy record layouts to the standard messages format.

        Supported layouts:
            * Alpaca style: ``instruction`` + ``output`` (+ optional ``input``)
            * Prompt/response: ``prompt`` + ``response``

        Args:
            data: Decoded JSON object of one record.
            line_num: 1-based line number, used for the fallback conversation
                id and for error messages.

        Raises:
            ValueError: If the record matches none of the supported layouts.
        """
        if "instruction" in data and "output" in data:
            # Alpaca-style format. Build the full user text *before* creating
            # the Message so that it goes through validation and stripping.
            user_content = data["instruction"]
            extra_input = data.get("input")
            if extra_input is not None and str(extra_input).strip():
                user_content = f"{data['instruction']}\n\nInput: {extra_input}"
            messages = [
                Message(role="user", content=user_content),
                Message(role="assistant", content=data["output"]),
            ]

        elif "prompt" in data and "response" in data:
            # Simple prompt-response format
            messages = [
                Message(role="user", content=data["prompt"]),
                Message(role="assistant", content=data["response"]),
            ]
        else:
            raise ValueError(f"Unknown format in line {line_num}")

        return Conversation(
            messages=messages,
            conversation_id=data.get("id", f"conv_{line_num}"),
            metadata=data.get("metadata", {}),
        )

In [None]:
# %%
# Cell 5: Dataset Validation and Statistics
class DatasetAnalyzer:
    """Analyze and validate datasets."""

    # Character thresholds used by the quality checks.
    SHORT_MESSAGE_CHARS = 10
    LONG_MESSAGE_CHARS = 2000

    @staticmethod
    def analyze_dataset(conversations: List[Conversation]) -> Dict[str, Any]:
        """Generate comprehensive dataset statistics.

        Args:
            conversations: Conversations to aggregate.

        Returns:
            ``{"error": ...}`` when ``conversations`` is empty; otherwise a
            dict with dataset totals, the overall role distribution,
            min/max/avg/median summaries of messages and estimated tokens per
            conversation, and the quality-check counters.
        """
        if not conversations:
            return {"error": "No conversations to analyze"}

        # Aggregate statistics
        all_stats = [conv.get_stats() for conv in conversations]

        # Role distribution across all conversations
        role_counts = Counter()
        for stats in all_stats:
            role_counts.update(stats["role_distribution"])

        # Message and token statistics
        message_counts = [stats["message_count"] for stats in all_stats]
        token_counts = [stats["token_estimate"] for stats in all_stats]
        char_counts = [stats["total_chars"] for stats in all_stats]

        return {
            "dataset_size": len(conversations),
            "total_messages": sum(message_counts),
            "total_estimated_tokens": sum(token_counts),
            "total_characters": sum(char_counts),
            "role_distribution": dict(role_counts),
            "conversation_length": DatasetAnalyzer._summarize(message_counts),
            "token_distribution": DatasetAnalyzer._summarize(token_counts),
            "quality_checks": DatasetAnalyzer._quality_checks(conversations),
        }

    @staticmethod
    def _summarize(values: List[int]) -> Dict[str, Any]:
        """Return min, max, mean and median of a non-empty list of numbers."""
        import statistics  # stdlib; imported here because the file does not import it

        return {
            "min": min(values),
            "max": max(values),
            "avg": sum(values) / len(values),
            # True median: averages the two middle values for even-sized input
            "median": statistics.median(values),
        }

    @staticmethod
    def _quality_checks(conversations: List[Conversation]) -> Dict[str, Any]:
        """Count quality issues across the dataset.

        A message is counted in at most one of the three message categories:
        empty (whitespace only), very short, or very long.
        """
        checks = {
            "empty_messages": 0,
            "very_short_messages": 0,  # < SHORT_MESSAGE_CHARS chars
            "very_long_messages": 0,  # > LONG_MESSAGE_CHARS chars
            "single_message_convs": 0,
            "incomplete_convs": 0,  # don't end with assistant
        }

        for conv in conversations:
            if len(conv.messages) == 1:
                checks["single_message_convs"] += 1

            if conv.messages[-1].role != "assistant":
                checks["incomplete_convs"] += 1

            for msg in conv.messages:
                if not msg.content.strip():
                    checks["empty_messages"] += 1
                elif len(msg.content) < DatasetAnalyzer.SHORT_MESSAGE_CHARS:
                    checks["very_short_messages"] += 1
                elif len(msg.content) > DatasetAnalyzer.LONG_MESSAGE_CHARS:
                    checks["very_long_messages"] += 1

        return checks

    @staticmethod
    def print_analysis(analysis: Dict[str, Any]):
        """Pretty print analysis results.

        Args:
            analysis: Result of ``analyze_dataset``. If it is the
                ``{"error": ...}`` form returned for an empty dataset, only
                the error is printed.
        """
        print("=== Dataset Analysis ===")
        if "error" in analysis:
            # analyze_dataset() returns this shape for an empty dataset; the
            # regular keys are absent, so report the error and stop.
            print(f"Analysis unavailable: {analysis['error']}")
            return

        print(f"Dataset size: {analysis['dataset_size']} conversations")
        print(f"Total messages: {analysis['total_messages']}")
        print(f"Estimated tokens: {analysis['total_estimated_tokens']:,}")

        print("\n--- Role Distribution ---")
        for role, count in analysis["role_distribution"].items():
            percentage = count / analysis["total_messages"] * 100
            print(f"{role}: {count} ({percentage:.1f}%)")

        print("\n--- Conversation Length ---")
        conv_len = analysis["conversation_length"]
        print(
            f"Min: {conv_len['min']}, Max: {conv_len['max']}, Avg: {conv_len['avg']:.1f}"
        )

        print("\n--- Token Distribution ---")
        token_dist = analysis["token_distribution"]
        print(
            f"Min: {token_dist['min']}, Max: {token_dist['max']}, Avg: {token_dist['avg']:.1f}"
        )

        print("\n--- Quality Issues ---")
        quality = analysis["quality_checks"]
        for issue, count in quality.items():
            if count > 0:
                print(f"{issue}: {count}")

In [None]:
# %%
# Cell 6: Format Conversion Tools
class FormatConverter:
    """Convert validated conversations into other common dataset layouts."""

    @staticmethod
    def to_alpaca_format(conversations: List[Conversation]) -> List[Dict[str, str]]:
        """Convert to Alpaca instruction format.

        Only conversations whose first two messages are a user turn followed
        by an assistant turn are converted; all others (including those that
        start with a system message) are skipped. ``input`` is always empty.
        """
        records = []

        for conv in conversations:
            opening = conv.messages[:2]
            # Guard: need exactly the pattern user -> assistant at the start.
            if [turn.role for turn in opening] != ["user", "assistant"]:
                continue
            question, answer = opening
            records.append(
                {
                    "instruction": question.content,
                    "input": "",
                    "output": answer.content,
                }
            )

        print(f"Converted {len(records)} conversations to Alpaca format")
        return records

    @staticmethod
    def to_chat_format(conversations: List[Conversation]) -> List[Dict[str, Any]]:
        """Convert to pure chat format (list of messages plus id).

        The ``metadata`` key is included only when the conversation has
        non-empty metadata.
        """
        entries = []

        for conv in conversations:
            turns = [
                {"role": turn.role, "content": turn.content} for turn in conv.messages
            ]
            entry = {"messages": turns, "id": conv.conversation_id}
            if conv.metadata:
                entry["metadata"] = conv.metadata
            entries.append(entry)

        return entries

    @staticmethod
    def to_prompt_response_format(
        conversations: List[Conversation],
    ) -> List[Dict[str, str]]:
        """Convert to simple prompt-response format.

        The prompt is every user message joined by newlines; the response is
        the last assistant message. Conversations lacking either role are
        skipped.
        """
        pairs = []

        for conv in conversations:
            # Bucket message contents by role in a single pass.
            by_role = {"user": [], "assistant": []}
            for turn in conv.messages:
                if turn.role in by_role:
                    by_role[turn.role].append(turn.content)

            if not (by_role["user"] and by_role["assistant"]):
                continue

            pairs.append(
                {
                    "prompt": "\n".join(by_role["user"]),
                    "response": by_role["assistant"][-1],  # Last assistant response
                }
            )

        print(f"Converted {len(pairs)} conversations to prompt-response format")
        return pairs

In [None]:
# %%
# Cell 7: Create Sample Data and Smoke Test
def create_sample_data():
    """Build three sample conversations (as plain dicts) for testing.

    Returns:
        A new list of dicts in the standard format, each with ``messages``,
        ``conversation_id`` and ``metadata`` keys.
    """

    def turn(role, content):
        # One message in the {"role", "content"} layout.
        return {"role": role, "content": content}

    programming = {
        "messages": [
            turn(
                "system",
                "You are a helpful AI assistant specializing in programming.",
            ),
            turn("user", "What is Python list comprehension?"),
            turn(
                "assistant",
                "Python list comprehension is a concise way to create lists. It follows the syntax: [expression for item in iterable if condition]. For example: [x**2 for x in range(10) if x%2==0] creates a list of squares of even numbers.",
            ),
        ],
        "conversation_id": "sample_001",
        "metadata": {"topic": "programming", "difficulty": "beginner"},
    }

    rag = {
        "messages": [
            turn("user", "Explain RAG in AI"),
            turn(
                "assistant",
                "RAG (Retrieval-Augmented Generation) is an AI technique that combines information retrieval with text generation. It first retrieves relevant documents from a knowledge base, then uses that context to generate more accurate and informed responses.",
            ),
        ],
        "conversation_id": "sample_002",
        "metadata": {"topic": "AI", "difficulty": "intermediate"},
    }

    cooking = {
        "messages": [
            turn("system", "You are a cooking assistant."),
            turn("user", "How to make scrambled eggs?"),
            turn(
                "assistant",
                "To make scrambled eggs: 1) Crack 2-3 eggs into a bowl, 2) Add a pinch of salt and pepper, 3) Whisk well, 4) Heat butter in a non-stick pan over medium-low heat, 5) Pour in eggs and gently stir continuously until just set. Don't overcook!",
            ),
            turn("user", "What if I want them creamier?"),
            turn(
                "assistant",
                "For creamier scrambled eggs: 1) Add 1-2 tablespoons of cream or milk when whisking, 2) Cook on lower heat, 3) Remove from heat while still slightly wet as they'll continue cooking, 4) You can also add a pat of cold butter at the end and stir it in.",
            ),
        ],
        "conversation_id": "sample_003",
        "metadata": {"topic": "cooking", "difficulty": "beginner"},
    }

    return [programming, rag, cooking]


# Smoke test: run the save -> load -> analyze -> convert pipeline end to end
# and fail loudly if a stage drops records, so that the final "All tests
# passed" line is only printed when the checks really held.
print("=== Smoke Test: JSONL Messages Processing ===")

# Create sample data
sample_data = create_sample_data()
processor = DatasetProcessor(error_tolerance=0.2)

# Save sample data to file
sample_file = Path("data/sample_conversations.jsonl")
sample_file.parent.mkdir(parents=True, exist_ok=True)
with jsonlines.open(sample_file, "w") as writer:
    for item in sample_data:
        writer.write(item)

print(f"✓ Created sample file: {sample_file}")

# Test loading and validation: every sample record is valid and must load
conversations = processor.load_jsonl(sample_file)
if len(conversations) != len(sample_data):
    raise AssertionError(
        f"Expected {len(sample_data)} conversations, loaded {len(conversations)}"
    )
print(f"✓ Loaded {len(conversations)} conversations")

# Test analysis
analyzer = DatasetAnalyzer()
analysis = analyzer.analyze_dataset(conversations)
if analysis.get("dataset_size") != len(conversations):
    raise AssertionError("Analysis does not cover every loaded conversation")
analyzer.print_analysis(analysis)

# Test format conversion
converter = FormatConverter()
alpaca_format = converter.to_alpaca_format(conversations)
chat_format = converter.to_chat_format(conversations)
# Chat format is a 1:1 mapping, so no conversation may be lost
if len(chat_format) != len(conversations):
    raise AssertionError("Chat format conversion dropped conversations")

print("\n✓ Format conversions successful:")
print(f"  - Alpaca format: {len(alpaca_format)} items")
print(f"  - Chat format: {len(chat_format)} items")

# Test individual conversation
conv = conversations[0]
print("\n✓ Sample conversation stats:")
print(f"  - Messages: {len(conv.messages)}")
print(f"  - Tokens: {conv.token_count()}")
# Length of the second message, read directly from the model instead of
# going through the deprecated .dict() serialisation.
print(f"  - Characters: {len(conv.messages[1].content)}")

print("\n🎉 All tests passed! JSONL messages processing is ready.")




 ## 重要參數與設定

 ### 低顯存選項
 - 此 notebook 主要處理文字資料，不使用 GPU
 - 記憶體使用：主要受資料集大小影響，建議分批處理大型資料集

 ### 關鍵參數
 - `error_tolerance`: 容錯率設定（預設 10%）
 - `min_length=1`: 訊息最小長度
 - Token 估算：使用 tiktoken，支援多種模型

 ### 安全考量
 - 輸入驗證：以 Pydantic 拒絕空白內容與不合法的 role（不包含惡意內容過濾）
 - 檔案路徑檢查：載入前僅確認檔案存在；未實作路徑遍歷防護，處理不受信任的路徑時請自行限制可存取的目錄
 - 錯誤容忍：避免單一錯誤影響整個處理流程

 ## 使用時機

 **適用情況**：
 - 準備訓練資料集
 - 驗證對話格式正確性
 - 資料集品質檢查與統計
 - 格式轉換與標準化

 **不適用情況**：
 - 即時對話處理（太重）
 - 小量資料（過度工程化）
 - 非結構化文字處理