In [None]:
# nb23_dataset_curation_cleaning.ipynb
# 資料集整理與清洗 (Dataset Curation & Cleaning)

# === Cell 1: Shared Cache Bootstrap ===
import os, pathlib, torch

# Root directory for all shared caches; overridable through the environment.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "/mnt/ai/cache")

# Map each cache-related environment variable to its directory under the root.
cache_dirs = {
    "HF_HOME": f"{AI_CACHE_ROOT}/hf",
    "TRANSFORMERS_CACHE": f"{AI_CACHE_ROOT}/hf/transformers",
    "HF_DATASETS_CACHE": f"{AI_CACHE_ROOT}/hf/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{AI_CACHE_ROOT}/hf/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}

# Export every variable and make sure the directory it points to exists.
for env_name in cache_dirs:
    cache_dir = cache_dirs[env_name]
    os.environ[env_name] = cache_dir
    pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

gpu_available = torch.cuda.is_available()
print("[Cache]", AI_CACHE_ROOT, "| GPU:", gpu_available)

# Standard imports for dataset curation
import pandas as pd
import numpy as np
import json
import re
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Set
import hashlib
from difflib import SequenceMatcher
import warnings

warnings.filterwarnings("ignore")

In [None]:
# === Cell 2: Dataset Loading & Initial Exploration ===
print("=== 資料集載入與初步探索 (Dataset Loading & Exploration) ===")


# Sample instruction dataset creation (simulating multiple sources)
def create_sample_datasets():
    """Build three small in-memory instruction datasets for the demo.

    Returns:
        A 3-tuple ``(clean, noisy, chat)`` of lists of dicts:
        - ``clean``: two well-formed Alpaca-style records.
        - ``noisy``: four records with deliberate problems (lower-cased
          near-duplicate topic, PII, repeated content, and one record using
          ``prompt``/``response`` keys instead of the Alpaca keys).
        - ``chat``: one record in ``messages`` (ChatML-like) form.
    """

    def alpaca_row(instruction, input_text, output):
        # Key order matters for the JSON previews printed by the notebook.
        return {"instruction": instruction, "input": input_text, "output": output}

    # Source 1: clean Alpaca-style records
    clean_rows = [
        alpaca_row(
            "Explain the concept of machine learning",
            "",
            "Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed.",
        ),
        alpaca_row(
            "Translate the following English text to Spanish",
            "Hello, how are you?",
            "Hola, ¿cómo estás?",
        ),
    ]

    # Source 2: records carrying the quality problems the pipeline should catch
    noisy_rows = [
        # lower-cased instruction on an already-covered topic, terse answer
        alpaca_row(
            "explain machine learning",
            "",
            "ML is when computers learn stuff automatically.",
        ),
        # instruction contains a name and a street address (PII)
        alpaca_row(
            "My name is John Smith and I live at 123 Main St",
            "",
            "I can't help with personal information.",
        ),
        # same input/output as the clean translation sample
        alpaca_row(
            "Translate to Spanish",
            "Hello, how are you?",
            "Hola, ¿cómo estás?",
        ),
        # uses prompt/response keys rather than the Alpaca keys
        {"prompt": "What is AI?", "response": "Artificial Intelligence is..."},
    ]

    # Source 3: a single conversation in "messages" form
    user_turn = {"role": "user", "content": "What is deep learning?"}
    assistant_turn = {
        "role": "assistant",
        "content": "Deep learning is a subset of machine learning that uses neural networks with multiple layers.",
    }
    chat_rows = [{"messages": [user_turn, assistant_turn]}]

    return clean_rows, noisy_rows, chat_rows


alpaca_data, noisy_data, chatml_data = create_sample_datasets()

# (size label, preview label, rows) for each source, in display order
_sources = (
    ("Clean", "Alpaca", alpaca_data),
    ("Noisy", "Noisy", noisy_data),
    ("ChatML", "ChatML", chatml_data),
)

# Report how many samples each source contributes
for _position, (_size_label, _preview_label, _rows) in enumerate(_sources, start=1):
    print(f"Dataset {_position} ({_size_label}): {len(_rows)} samples")

# Show the first record of each source so the differing schemas are visible
print("\n--- Sample Data Structures ---")
for _size_label, _preview_label, _rows in _sources:
    print(f"{_preview_label} format:", json.dumps(_rows[0], indent=2, ensure_ascii=False))

In [None]:
# === Cell 3: Format Standardization ===
print("\n=== 格式檢查與標準化 (Format Checking & Standardization) ===")


class DatasetStandardizer:
    """Convert heterogeneous instruction samples into Alpaca-style records.

    Every standardized record is a dict with exactly the keys
    ``instruction``, ``input`` and ``output``, each holding a ``str``
    (never ``None``), so downstream steps can call string methods safely.
    """

    def __init__(self):
        self.target_format = ["instruction", "input", "output"]

    @staticmethod
    def _as_text(value) -> str:
        """Coerce a field value to ``str``; ``None`` becomes the empty string."""
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    def detect_format(self, sample: Dict) -> str:
        """Classify a sample by the keys it carries.

        Returns:
            ``"chatml"`` if it has ``messages``; ``"alpaca"`` if it has both
            ``instruction`` and ``output``; ``"prompt_response"`` if it has
            both ``prompt`` and ``response``; otherwise ``"unknown"``.
        """
        keys = set(sample.keys())

        if "messages" in keys:
            return "chatml"
        elif "instruction" in keys and "output" in keys:
            return "alpaca"
        elif "prompt" in keys and "response" in keys:
            return "prompt_response"
        else:
            return "unknown"

    def _convert(self, sample: Dict, format_type: str) -> Dict:
        """Build the standardized record for a sample of a known format type."""
        if format_type == "alpaca":
            return {
                "instruction": self._as_text(sample.get("instruction")),
                "input": self._as_text(sample.get("input")),
                "output": self._as_text(sample.get("output")),
            }

        if format_type == "chatml":
            # Tolerate a null message list and malformed entries instead of
            # raising KeyError and losing the whole conversation.
            messages = sample.get("messages") or []

            def first_content(role: str) -> str:
                for message in messages:
                    if isinstance(message, dict) and message.get("role") == role:
                        return self._as_text(message.get("content"))
                return ""

            # Only the first user turn and first assistant turn are used.
            return {
                "instruction": first_content("user"),
                "input": "",
                "output": first_content("assistant"),
            }

        if format_type == "prompt_response":
            return {
                "instruction": self._as_text(sample.get("prompt")),
                "input": "",
                "output": self._as_text(sample.get("response")),
            }

        # Unknown format: nothing recognisable to map, emit an empty record.
        return {"instruction": "", "input": "", "output": ""}

    def standardize_sample(self, sample: Dict) -> Dict:
        """Convert one sample to the standard Alpaca format.

        Args:
            sample: A dict in Alpaca, ``messages`` or prompt/response form.

        Returns:
            A dict with string-valued ``instruction``, ``input`` and
            ``output`` keys. Unknown formats yield three empty strings.
        """
        return self._convert(sample, self.detect_format(sample))

    def standardize_dataset(self, dataset: List[Dict]) -> List[Dict]:
        """Standardize every sample and print the detected format distribution.

        Samples that raise during detection or conversion are reported and
        skipped rather than aborting the run.

        Returns:
            The list of standardized records, in input order.
        """
        standardized = []
        format_counts = defaultdict(int)

        for sample in dataset:
            try:
                # Detect once and reuse the result for both counting and conversion.
                format_type = self.detect_format(sample)
                format_counts[format_type] += 1
                standardized.append(self._convert(sample, format_type))
            except Exception as e:
                print(f"Error standardizing sample: {e}")
                continue

        print(f"Format distribution: {dict(format_counts)}")
        return standardized


# Exercise the standardizer on every sample source at once
standardizer = DatasetStandardizer()

# Merge the three source lists into a single work list
all_samples = [*alpaca_data, *noisy_data, *chatml_data]
print(f"Total samples before standardization: {len(all_samples)}")

standardized_dataset = standardizer.standardize_dataset(all_samples)
print(f"Total samples after standardization: {len(standardized_dataset)}")

# Preview the first three converted records
print("\n--- Standardization Results ---")
for i, sample in enumerate(standardized_dataset[:3], start=1):
    print(f"Sample {i}:", json.dumps(sample, indent=2, ensure_ascii=False))

In [None]:
# === Cell 4: Duplicate Detection & Removal ===
print("\n=== 重複資料檢測與去除 (Duplicate Detection & Removal) ===")


class DuplicateDetector:
    """Detect and remove exact and near-duplicate instruction samples.

    Exact duplicates are found by hashing the normalized (stripped,
    lower-cased) fields. Near duplicates are found by pairwise
    ``difflib.SequenceMatcher`` similarity of instructions and outputs,
    which is quadratic in the dataset size.
    """

    def __init__(self, similarity_threshold: float = 0.8):
        # Minimum averaged instruction/output similarity for a near-duplicate.
        self.similarity_threshold = similarity_threshold

    def get_content_hash(self, sample: Dict) -> str:
        """Return a hex fingerprint of the sample's normalized content.

        The three fields are serialized as a JSON list before hashing so the
        field boundaries are unambiguous; joining with a plain delimiter would
        let different samples collide when a field contains that delimiter.
        MD5 is used only as a fingerprint, not for security.
        """
        fields = [
            sample[key].strip().lower()
            for key in ("instruction", "input", "output")
        ]
        content = json.dumps(fields)
        return hashlib.md5(content.encode("utf-8")).hexdigest()

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Return the case-insensitive SequenceMatcher ratio in [0, 1]."""
        return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()

    def find_exact_duplicates(self, dataset: List[Dict]) -> List[int]:
        """Return indices of samples whose fingerprint was already seen.

        The first occurrence of each fingerprint is never reported.
        """
        seen_hashes = {}
        duplicate_indices = []

        for i, sample in enumerate(dataset):
            content_hash = self.get_content_hash(sample)

            if content_hash in seen_hashes:
                duplicate_indices.append(i)
                print(
                    f"Exact duplicate found at index {i} (same as {seen_hashes[content_hash]})"
                )
            else:
                seen_hashes[content_hash] = i

        return duplicate_indices

    def find_near_duplicates(self, dataset: List[Dict]) -> List[Tuple[int, int, float]]:
        """Return ``(i, j, similarity)`` for every pair ``i < j`` at or above the threshold.

        The similarity is the mean of the instruction similarity and the
        output similarity. Pairs are returned ordered by ``i`` then ``j``.
        """
        near_duplicates = []

        for i in range(len(dataset)):
            for j in range(i + 1, len(dataset)):
                inst_sim = self.calculate_similarity(
                    dataset[i]["instruction"], dataset[j]["instruction"]
                )

                # Even a perfect output match cannot reach the threshold:
                # skip the (expensive) output comparison.
                if (inst_sim + 1.0) / 2 < self.similarity_threshold:
                    continue

                out_sim = self.calculate_similarity(
                    dataset[i]["output"], dataset[j]["output"]
                )

                combined_sim = (inst_sim + out_sim) / 2

                if combined_sim >= self.similarity_threshold:
                    near_duplicates.append((i, j, combined_sim))

        return near_duplicates

    def remove_duplicates(self, dataset: List[Dict]) -> List[Dict]:
        """Return the dataset without exact and near duplicates.

        For each duplicate pair the earlier sample is kept and the later one
        dropped. A sample is dropped only when it duplicates a sample that is
        itself kept, so a chain A~B~C (with A and C dissimilar) removes B but
        keeps C.
        """
        exact_dups = self.find_exact_duplicates(dataset)
        near_dups = self.find_near_duplicates(dataset)

        indices_to_remove = set(exact_dups)

        # Pairs are ordered by i, so any removal of i was decided earlier.
        for i, j, similarity in near_dups:
            print(f"Near duplicate: samples {i} and {j} (similarity: {similarity:.3f})")
            if i in indices_to_remove:
                # The reference sample is already gone; j is not a duplicate
                # of anything that remains because of this pair.
                continue
            indices_to_remove.add(j)

        cleaned_dataset = [
            sample for i, sample in enumerate(dataset) if i not in indices_to_remove
        ]

        print(f"Removed {len(indices_to_remove)} duplicate samples")
        return cleaned_dataset


# Deduplicate the standardized samples (pairs scoring >= 0.8 count as near-duplicates)
detector = DuplicateDetector(similarity_threshold=0.8)
deduplicated_dataset = detector.remove_duplicates(dataset=standardized_dataset)

_remaining = len(deduplicated_dataset)
print(f"Dataset size after deduplication: {_remaining}")

In [None]:
# === Cell 5: Content Quality Assessment ===
print("\n=== 內容品質評估 (Content Quality Assessment) ===")


class QualityAssessor:
    """Score instruction samples with simple heuristics and filter by score.

    Seven boolean checks are computed per sample (three length checks and
    four content checks); the quality score is the fraction that pass.
    """

    def __init__(
        self,
        min_instruction_length: int = 10,
        min_output_length: int = 20,
        max_output_length: int = 2000,
        pass_threshold: float = 0.7,
    ):
        """
        Args:
            min_instruction_length: Minimum instruction length in characters.
            min_output_length: Minimum output length in characters.
            max_output_length: Maximum output length in characters.
            pass_threshold: Minimum fraction of checks a sample must pass.
        """
        self.min_instruction_length = min_instruction_length
        self.min_output_length = min_output_length
        self.max_output_length = max_output_length
        self.pass_threshold = pass_threshold

    def check_length_requirements(self, sample: Dict) -> Dict[str, bool]:
        """Check instruction/output lengths and that neither is blank."""
        return {
            "instruction_length_ok": len(sample["instruction"])
            >= self.min_instruction_length,
            "output_length_ok": self.min_output_length
            <= len(sample["output"])
            <= self.max_output_length,
            "not_empty": bool(
                sample["instruction"].strip() and sample["output"].strip()
            ),
        }

    def check_content_quality(self, sample: Dict) -> Dict[str, bool]:
        """Compute the four content heuristics for a sample.

        Keyword and repetition checks are case-insensitive and therefore run
        on lower-cased text. The coherence check inspects capitalization, so
        it must see the output in its original case.
        """
        instruction = sample["instruction"].lower()
        raw_output = sample["output"]
        output = raw_output.lower()

        quality_checks = {
            "has_question_words": any(
                word in instruction
                for word in [
                    "what",
                    "how",
                    "why",
                    "when",
                    "where",
                    "which",
                    "who",
                    "explain",
                    "describe",
                    "tell",
                ]
            ),
            "appropriate_response": not any(
                phrase in output
                for phrase in ["i don't know", "i can't help", "sorry", "i cannot"]
            ),
            "no_repetition": not self._has_excessive_repetition(output),
            # Pass the original-case text: on lower-cased text the
            # "has capital letters" test could never succeed.
            "coherent_language": self._check_language_coherence(raw_output),
        }

        return quality_checks

    def _has_excessive_repetition(self, text: str) -> bool:
        """Return True if one word makes up more than 30% of a 10+ word text."""
        words = text.split()
        if len(words) < 10:
            return False

        word_counts = Counter(words)
        max_count = max(word_counts.values())

        return max_count > len(words) * 0.3

    def _check_language_coherence(self, text: str) -> bool:
        """Return True if text has sentence punctuation, some capitals, and under 50% capitals.

        Expects the text in its original case.
        """
        has_punctuation = any(p in text for p in ".!?")
        has_capital_letters = any(c.isupper() for c in text)
        not_too_much_caps = sum(1 for c in text if c.isupper()) < len(text) * 0.5

        return has_punctuation and has_capital_letters and not_too_much_caps

    def assess_sample(self, sample: Dict) -> Dict:
        """Run all checks on a sample.

        Returns:
            A dict with ``sample``, ``checks`` (name -> bool),
            ``quality_score`` (fraction of checks passed) and ``passed``
            (score at or above ``pass_threshold``).
        """
        length_checks = self.check_length_requirements(sample)
        content_checks = self.check_content_quality(sample)

        all_checks = {**length_checks, **content_checks}

        quality_score = sum(all_checks.values()) / len(all_checks)

        return {
            "sample": sample,
            "checks": all_checks,
            "quality_score": quality_score,
            "passed": quality_score >= self.pass_threshold,
        }

    def filter_dataset(self, dataset: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
        """Split a dataset into passing samples and rejection records.

        Returns:
            ``(high_quality, low_quality)`` where ``high_quality`` holds the
            passing samples unchanged and each ``low_quality`` entry is a
            dict with ``sample``, ``issues`` (names of failed checks) and
            ``score``.
        """
        high_quality = []
        low_quality = []

        for sample in dataset:
            assessment = self.assess_sample(sample)

            if assessment["passed"]:
                high_quality.append(sample)
            else:
                low_quality.append(
                    {
                        "sample": sample,
                        "issues": [k for k, v in assessment["checks"].items() if not v],
                        "score": assessment["quality_score"],
                    }
                )

        return high_quality, low_quality


# Run the quality gate over the deduplicated samples
assessor = QualityAssessor()
high_quality_data, low_quality_data = assessor.filter_dataset(deduplicated_dataset)

for _label, _bucket in (("High", high_quality_data), ("Low", low_quality_data)):
    print(f"{_label} quality samples: {len(_bucket)}")

# List the failed checks, score and a short preview for each rejected sample
print("\n--- Quality Issues Found ---")
for item in low_quality_data:
    _preview = item["sample"]["instruction"][:50]
    print(
        f"Issues: {item['issues']}",
        f"Score: {item['score']:.2f}",
        f"Sample: {_preview}...",
        "",
        sep="\n",
    )

In [None]:
# === Cell 6: Privacy Information Filtering ===
print("\n=== 隱私資訊過濾 (Privacy Information Filtering) ===")


class PrivacyFilter:
    """Detect and mask personal/sensitive information in instruction samples.

    Detection is regex- and keyword-based and therefore heuristic: it can
    both miss PII and flag harmless text.
    """

    def __init__(self):
        # Regex patterns for PII detection. All are applied with
        # re.IGNORECASE. A pattern may define a named group ``value`` to mark
        # the part of the match that is the actual PII; without it the whole
        # match is treated as PII.
        self.patterns = {
            "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
            "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
            "address": r"\b\d{1,5}\s+\w+\s+(?:street|st|avenue|ave|road|rd|lane|ln|drive|dr)\b",
            "credit_card": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",
            # The introducing phrase is case-insensitive, but the name itself
            # must be capitalized ((?-i:...) switches IGNORECASE off), so
            # "i am very happy" is not reported as a name.
            "person_name": r"\b(?:my name is|i am|i\'m)\s+(?-i:(?P<value>[A-Z][a-z]+\s+[A-Z][a-z]+))\b",
        }

        # Sensitive keywords (matched as case-insensitive substrings)
        self.sensitive_keywords = [
            "password",
            "secret",
            "confidential",
            "private",
            "personal",
            "credit card",
            "social security",
            "bank account",
            "medical record",
        ]

    @staticmethod
    def _pii_span(match) -> Tuple[int, int]:
        """Return the (start, end) of the PII part of a regex match."""
        if "value" in match.re.groupindex:
            span = match.span("value")
            if span != (-1, -1):
                return span
        return match.span()

    def detect_pii(self, text: str) -> Dict[str, List[str]]:
        """Find PII in text.

        Returns:
            A dict mapping each PII type with at least one hit to the list of
            matched strings, in order of appearance. Types without hits are
            omitted.
        """
        detected = {}

        for pii_type, pattern in self.patterns.items():
            # finditer + explicit spans: findall would return only the
            # captured groups for patterns that contain groups.
            matches = []
            for match in re.finditer(pattern, text, re.IGNORECASE):
                start, end = self._pii_span(match)
                matches.append(text[start:end])
            if matches:
                detected[pii_type] = matches

        return detected

    def check_sensitive_content(self, text: str) -> List[str]:
        """Return the sensitive keywords that occur in text (case-insensitive)."""
        text_lower = text.lower()
        found_keywords = [kw for kw in self.sensitive_keywords if kw in text_lower]
        return found_keywords

    def mask_pii(self, text: str) -> str:
        """Return text with every detected PII value replaced by a placeholder."""
        masked_text = text

        replacements = {
            "email": "[EMAIL]",
            "phone": "[PHONE]",
            "ssn": "[SSN]",
            "address": "[ADDRESS]",
            "credit_card": "[CREDIT_CARD]",
            "person_name": "[NAME]",
        }

        for pii_type, pattern in self.patterns.items():
            placeholder = replacements.get(pii_type)
            if placeholder is None:
                continue

            def substitute(match, placeholder=placeholder):
                # Replace only the PII part, keeping context such as
                # "My name is" intact.
                start, end = self._pii_span(match)
                offset = match.start()
                whole = match.group(0)
                return whole[: start - offset] + placeholder + whole[end - offset :]

            masked_text = re.sub(
                pattern, substitute, masked_text, flags=re.IGNORECASE
            )

        return masked_text

    def filter_sample(self, sample: Dict) -> Dict:
        """Inspect one sample and produce a masked copy if anything was flagged.

        Returns:
            A dict with ``original``, ``masked``, ``pii_found`` (type -> all
            matches across the three fields), ``sensitive_keywords`` (unique,
            in order of first appearance) and ``action`` (``"mask"`` when PII
            or a keyword was found, else ``"keep"``). Keywords only flag the
            sample; they are not themselves masked.
        """
        pii_found = {}
        sensitive_keywords = []

        for field in ["instruction", "input", "output"]:
            text = sample[field]
            # Accumulate per type so hits in later fields do not overwrite
            # hits from earlier fields.
            for pii_type, matches in self.detect_pii(text).items():
                pii_found.setdefault(pii_type, []).extend(matches)
            for keyword in self.check_sensitive_content(text):
                if keyword not in sensitive_keywords:
                    sensitive_keywords.append(keyword)

        has_pii = bool(pii_found)
        has_sensitive = bool(sensitive_keywords)

        if has_pii or has_sensitive:
            # Option 1: Remove the sample entirely
            # Option 2: Mask the PII and keep the sample (used here)
            masked_sample = {
                "instruction": self.mask_pii(sample["instruction"]),
                "input": self.mask_pii(sample["input"]),
                "output": self.mask_pii(sample["output"]),
            }

            return {
                "original": sample,
                "masked": masked_sample,
                "pii_found": pii_found,
                "sensitive_keywords": sensitive_keywords,
                "action": "mask",  # or "remove"
            }
        else:
            return {
                "original": sample,
                "masked": sample,
                "pii_found": {},
                "sensitive_keywords": [],
                "action": "keep",
            }

    def filter_dataset(self, dataset: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
        """Apply ``filter_sample`` to every sample.

        Returns:
            ``(clean_samples, flagged_samples)``: kept samples (masked where
            flagged) and the full result dicts of every flagged sample.
        """
        clean_samples = []
        flagged_samples = []

        for sample in dataset:
            result = self.filter_sample(sample)

            if result["action"] == "keep":
                clean_samples.append(result["original"])
            elif result["action"] == "mask":
                clean_samples.append(result["masked"])  # Use masked version
                flagged_samples.append(result)
            else:  # remove
                flagged_samples.append(result)

        return clean_samples, flagged_samples


# Run the privacy filter over the samples that passed the quality gate
privacy_filter = PrivacyFilter()
privacy_clean_data, privacy_flagged = privacy_filter.filter_dataset(high_quality_data)

for _label, _bucket in (("Clean", privacy_clean_data), ("Flagged", privacy_flagged)):
    print(f"{_label} samples: {len(_bucket)}")

# For every flagged sample, show what was found and the before/after instruction
print("\n--- Privacy Issues Found ---")
for item in privacy_flagged:
    _before = item["original"]["instruction"][:50]
    _after = item["masked"]["instruction"][:50]
    print(
        f"PII found: {item['pii_found']}",
        f"Sensitive keywords: {item['sensitive_keywords']}",
        f"Action: {item['action']}",
        f"Original: {_before}...",
        f"Masked: {_after}...",
        "",
        sep="\n",
    )

In [None]:
# === Cell 7: Data Augmentation & Diversification ===
print("\n=== 資料增強與多樣化 (Data Augmentation & Diversification) ===")


class DataAugmenter:
    """Create instruction variations by verb substitution and templating."""

    def __init__(self):
        # Templates filled with a verb and the remaining instruction text.
        self.instruction_templates = [
            "Please {verb} {object}",
            "Can you {verb} {object}?",
            "I need you to {verb} {object}",
            "Help me {verb} {object}",
            "{verb} {object}, please",
        ]

        # Verb -> interchangeable alternatives used for paraphrasing.
        self.paraphrase_patterns = {
            "explain": ["describe", "clarify", "elaborate on", "tell me about"],
            "write": ["create", "compose", "draft", "generate"],
            "translate": ["convert", "transform", "render"],
            "analyze": ["examine", "evaluate", "assess", "review"],
        }

    def paraphrase_instruction(self, instruction: str) -> List[str]:
        """Return paraphrases made by swapping a known verb for its alternatives.

        The verb is matched case-insensitively as a whole word, so "Rewrite"
        or "explaining" are not touched. The rest of the instruction keeps its
        original casing; only the first character of the result is
        upper-cased.
        """
        paraphrases = []

        for original, alternatives in self.paraphrase_patterns.items():
            verb_pattern = re.compile(rf"\b{re.escape(original)}\b", re.IGNORECASE)
            if not verb_pattern.search(instruction):
                continue

            for alt in alternatives:
                # A callable replacement keeps ``alt`` literal (no escape processing).
                paraphrased = verb_pattern.sub(lambda _m, alt=alt: alt, instruction)
                paraphrased = paraphrased[:1].upper() + paraphrased[1:]
                paraphrases.append(paraphrased)

        return paraphrases

    def generate_instruction_variants(self, instruction: str) -> List[str]:
        """Return template-based variants for "explain ..." instructions.

        The topic is the instruction with the whole word "explain" removed,
        in its original casing. Returns an empty list when the instruction
        does not contain that verb or nothing remains after removing it.
        """
        variants = []

        verb_pattern = re.compile(r"\bexplain\b", re.IGNORECASE)
        if verb_pattern.search(instruction):
            topic = verb_pattern.sub("", instruction).strip()
            if topic:
                for template in self.instruction_templates:
                    variants.append(template.format(verb="explain", object=topic))

        return variants

    def augment_sample(self, sample: Dict) -> List[Dict]:
        """Return the original sample followed by its augmented copies.

        At most two paraphrases and one template variant are added; each copy
        differs from the original only in ``instruction``.
        """
        augmented = [sample]  # Include original

        paraphrases = self.paraphrase_instruction(sample["instruction"])
        for paraphrase in paraphrases[:2]:  # Limit to 2 paraphrases
            augmented_sample = sample.copy()
            augmented_sample["instruction"] = paraphrase
            augmented.append(augmented_sample)

        variants = self.generate_instruction_variants(sample["instruction"])
        for variant in variants[:1]:  # Limit to 1 variant
            augmented_sample = sample.copy()
            augmented_sample["instruction"] = variant
            augmented.append(augmented_sample)

        return augmented

    def augment_dataset(
        self, dataset: List[Dict], augmentation_ratio: float = 0.3
    ) -> List[Dict]:
        """Augment the leading portion of the dataset.

        Args:
            dataset: Standardized samples.
            augmentation_ratio: Fraction of samples (taken from the start of
                the list) to augment; values outside [0, 1] are clamped.

        Returns:
            Augmented samples (originals included) followed by the untouched
            remainder.
        """
        augmented_dataset = []

        # Clamp so a negative or >1 ratio cannot produce surprising slices.
        num_to_augment = int(len(dataset) * augmentation_ratio)
        num_to_augment = max(0, min(len(dataset), num_to_augment))
        samples_to_augment = dataset[:num_to_augment]
        remaining_samples = dataset[num_to_augment:]

        print(f"Augmenting {num_to_augment} out of {len(dataset)} samples")

        for sample in samples_to_augment:
            augmented_dataset.extend(self.augment_sample(sample))

        augmented_dataset.extend(remaining_samples)

        return augmented_dataset


# Augment the first 30% of the privacy-cleaned samples
augmenter = DataAugmenter()
augmented_dataset = augmenter.augment_dataset(
    dataset=privacy_clean_data,
    augmentation_ratio=0.3,
)
_augmented_size = len(augmented_dataset)
print(f"Dataset size after augmentation: {_augmented_size}")

# Demonstrate the variants generated for the first cleaned sample
print("\n--- Augmentation Examples ---")
original_sample = privacy_clean_data[0]
augmented_samples = augmenter.augment_sample(original_sample)

print(f"Original: {original_sample['instruction']}")
# Index 0 is the original itself, so the variants start at position 1
_variants = augmented_samples[1:]
for i, aug_sample in enumerate(_variants, start=1):
    print(f"Variant {i}: {aug_sample['instruction']}")

In [None]:
# === Cell 8: Final Dataset Output & Validation ===
print("\n=== 最終資料集輸出與驗證 (Final Dataset Output & Validation) ===")

class DatasetValidator:
    """Validate the final curated dataset and summarize it in a text report."""

    def __init__(self):
        pass

    @staticmethod
    def _text(value) -> str:
        """Coerce a field value to ``str``; ``None`` becomes the empty string."""
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    @staticmethod
    def _length_summary(lengths: List[int]) -> Dict:
        """Return min/max/avg of lengths, or zeros when the list is empty."""
        if not lengths:
            return {"min": 0, "max": 0, "avg": 0.0}
        return {
            "min": min(lengths),
            "max": max(lengths),
            "avg": sum(lengths) / len(lengths),
        }

    def validate_format(self, dataset: List[Dict]) -> Dict:
        """Check that every sample has the required keys and count blank fields.

        Returns:
            A dict with ``total_samples``, ``format_valid`` (samples carrying
            all of instruction/input/output), ``missing_fields`` (one entry
            per missing key occurrence) and ``empty_fields`` (per-field count
            of present-but-blank values; ``None`` counts as blank).
        """
        required_fields = {"instruction", "input", "output"}
        validation_results = {
            "total_samples": len(dataset),
            "format_valid": 0,
            "missing_fields": [],
            "empty_fields": {"instruction": 0, "input": 0, "output": 0}
        }

        for sample in dataset:
            sample_fields = set(sample.keys())

            if required_fields.issubset(sample_fields):
                validation_results["format_valid"] += 1
            else:
                missing = required_fields - sample_fields
                validation_results["missing_fields"].extend(missing)

            # Check for empty fields
            for field in required_fields:
                if field in sample and not self._text(sample[field]).strip():
                    validation_results["empty_fields"][field] += 1

        return validation_results

    def calculate_statistics(self, dataset: List[Dict]) -> Dict:
        """Compute length and uniqueness statistics.

        Missing or ``None`` fields are treated as empty strings, and an empty
        dataset yields all-zero statistics instead of raising.

        Returns:
            A dict with ``instruction_lengths`` and ``output_lengths`` (each
            ``min``/``max``/``avg`` in characters), ``unique_instructions``
            and ``samples_with_input``.
        """
        instructions = [self._text(s.get("instruction")) for s in dataset]
        outputs = [self._text(s.get("output")) for s in dataset]

        stats = {
            "instruction_lengths": self._length_summary([len(i) for i in instructions]),
            "output_lengths": self._length_summary([len(o) for o in outputs]),
            "unique_instructions": len(set(instructions)),
            "samples_with_input": sum(
                1 for s in dataset if self._text(s.get("input")).strip()
            )
        }

        return stats

    def generate_report(self, dataset: List[Dict]) -> str:
        """Return a human-readable report combining validation and statistics.

        The recommendation is "ready for training" only for a non-empty
        dataset in which every sample has all required fields.
        """
        format_validation = self.validate_format(dataset)
        statistics = self.calculate_statistics(dataset)

        total = format_validation['total_samples']
        # Guard the percentage against an empty dataset.
        unique_pct = statistics['unique_instructions'] / total * 100 if total else 0.0

        if total == 0:
            recommendation = '⚠️ Dataset is empty'
        elif format_validation['format_valid'] == total:
            recommendation = '✅ Dataset ready for training'
        else:
            recommendation = '⚠️ Format issues need attention'

        report = f"""
=== Dataset Curation Report ===

Format Validation:
- Total samples: {format_validation['total_samples']}
- Format valid: {format_validation['format_valid']}
- Missing fields found: {set(format_validation['missing_fields'])}
- Empty instructions: {format_validation['empty_fields']['instruction']}
- Empty outputs: {format_validation['empty_fields']['output']}

Content Statistics:
- Instruction lengths: min={statistics['instruction_lengths']['min']}, max={statistics['instruction_lengths']['max']}, avg={statistics['instruction_lengths']['avg']:.1f}
- Output lengths: min={statistics['output_lengths']['min']}, max={statistics['output_lengths']['max']}, avg={statistics['output_lengths']['avg']:.1f}
- Unique instructions: {statistics['unique_instructions']} ({unique_pct:.1f}% unique)
- Samples with input: {statistics['samples_with_input']}

Recommendation: {recommendation}
"""
        return report

# Build the curation report for the augmented dataset and display it
validator = DatasetValidator()
final_report = validator.generate_report(dataset=augmented_dataset)
print(final_report)

# Save final curated dataset
output_path = f"{AI_CACHE_ROOT}


## 6. 本章小結 (Stage Summary)

### ✅ 完成項目 (Completed Items)
- **多格式標準化**: 支援 Alpaca, ChatML, Prompt-Response 等格式轉換
- **智慧去重系統**: 精確（雜湊比對）與近似（字元序列相似度）重複檢測，可調整相似度閾值
- **全面品質評估**: 長度、內容、語言連貫性多維度檢查
- **隱私保護機制**: PII 檢測與遮罩，平衡安全性與可用性
- **資料增強技術**: 指令改寫與變換，提升資料集多樣性
- **驗證與分割**: 自動 train/val 切分，完整品質報告

### 🔑 核心原理 (Core Concepts)
- **資料品質金字塔**: 格式 → 去重 → 品質 → 隱私 → 增強
- **平衡性原則**: 嚴格過濾 vs 資料保留的權衡
- **語義理解**: 不僅看表面文字，更關注語義相似性（本筆記本以 `SequenceMatcher` 的字元相似度作為近似；真正的語義去重需使用嵌入向量等方法）
- **隱私優先**: 預設安全的資料處理策略
- **可重現性**: 設定隨機種子，確保結果一致性

### 🚨 常見陷阱 (Common Pitfalls)
- 過度去重可能移除有價值的樣本變化
- 品質標準過於嚴格會損失邊界有用樣本
- PII 檢測可能有漏檢或誤檢
- 資料增強需避免語義漂移
- 忽略不平衡分佈問題

### 🎯 下一步建議 (Next Steps)
1. **微調驗證**: 在實際模型上測試清洗後資料效果
2. **品質迭代**: 根據模型表現調整過濾標準
3. **自動化監控**: 建立資料品質的持續監控系統
4. **進階技術**: 探索基於神經網路的語義去重方法

**何時使用這些技術 (When to Use)**:
- 準備微調資料前的必要步驟
- 合併多來源資料集時  
- 發現模型輸出品質下降時
- 需要確保資料隱私合規時
- 資料量不足需要適度增強時

這個資料集整理流程為後續的 LoRA/QLoRA 微調奠定了堅實基礎，確保訓練資料的高品質與一致性！