In [None]:
# 進階大型資料集處理 (Advanced Large-Scale Dataset Processing)
# Goal: Master complex preprocessing methods for TB-scale datasets

# === Cell 1: Advanced Environment Setup ===
import os, pathlib, torch
import warnings

warnings.filterwarnings("ignore")

# Shared cache: every Hugging Face / PyTorch cache location lives under a
# single root (overridable through the AI_CACHE_ROOT environment variable).
AI_CACHE_ROOT = os.getenv("AI_CACHE_ROOT", "/mnt/ai/cache")
cache_paths = dict(
    HF_HOME=f"{AI_CACHE_ROOT}/hf",
    TRANSFORMERS_CACHE=f"{AI_CACHE_ROOT}/hf/transformers",
    HF_DATASETS_CACHE=f"{AI_CACHE_ROOT}/hf/datasets",
    HUGGINGFACE_HUB_CACHE=f"{AI_CACHE_ROOT}/hf/hub",
    TORCH_HOME=f"{AI_CACHE_ROOT}/torch",
)

# Export each location, then make sure the directory actually exists.
for key, path in cache_paths.items():
    os.environ[key] = path
    os.makedirs(path, exist_ok=True)

# Short environment summary: cache root, GPU availability, CPU core count.
print(
    f"[Cache] Root: {AI_CACHE_ROOT}",
    f"[GPU] Available: {torch.cuda.is_available()}",
    f"[CPU] Cores: {os.cpu_count()}",
    sep="\n",
)


In [None]:
# Check and install advanced packages
import importlib
import subprocess
import sys

# pip distribution names of the optional dependencies imported below.
_ADVANCED_PIP_PACKAGES = [
    "opencv-python",
    "albumentations",
    "imagehash",
    "scikit-learn",
    "noisereduce",
]

try:
    import cv2
    import albumentations as A
    import imagehash
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import noisereduce as nr
    import torch.nn.functional as F

    print("✅ All advanced packages available")
except ImportError as e:
    print(f"⚠️ Installing missing packages: {e}")
    # Run pip through the kernel's own interpreter so the packages land in the
    # environment this notebook is executing in (a bare "pip" on PATH may
    # belong to a different Python installation).
    install = subprocess.run(
        [sys.executable, "-m", "pip", "install", *_ADVANCED_PIP_PACKAGES],
        check=False,
    )
    if install.returncode != 0:
        print(f"⚠️ pip exited with status {install.returncode}; imports may fail")

    # Make the import system notice the freshly installed distributions, then
    # bind the names that the following cells rely on. Without this second
    # import pass, cv2 / A / imagehash / nr / F would stay undefined and later
    # cells would fail with NameError instead of a clear ImportError here.
    importlib.invalidate_caches()
    import cv2
    import albumentations as A
    import imagehash
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import noisereduce as nr
    import torch.nn.functional as F

    print("✅ All advanced packages available")

In [None]:
# === Cell 2: Advanced Imports & Large Dataset Loading ===
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from datasets import Features, Value, Image, Audio, Sequence
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoProcessor
import numpy as np
import pandas as pd
from PIL import Image, ImageFilter, ImageEnhance
import librosa
import soundfile as sf
from typing import Dict, List, Any, Optional, Tuple, Union
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import hashlib
import json
import gc
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import multiprocessing as mp
from functools import partial
import time
import psutil


# Memory monitoring utility
def get_memory_usage():
    """Return this process's resident memory as a string such as "1.23GB"."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    # bytes -> KiB -> MiB -> GiB
    return "{:.2f}GB".format(rss_bytes / 1024 / 1024 / 1024)


# Large dataset streaming with sharding
class AdvancedDatasetLoader:
    """Load a Hugging Face dataset as a stream or as a bounded slice, and shard it.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Name of the split to load.
        max_shard_size: Default number of examples per shard for
            ``create_shards``.
        streaming: When True, ``load_with_sharding`` returns a streaming
            dataset; otherwise it loads a (possibly sliced) split.
        config_name: Optional dataset configuration, forwarded to
            ``load_dataset`` as ``name``. Datasets that ship several
            configurations (for example ``wikitext``) cannot be loaded
            without one.
    """

    def __init__(
        self,
        dataset_name: str,
        split: str = "train",
        max_shard_size: int = 10000,
        streaming: bool = True,
        config_name: Optional[str] = None,
    ):
        self.dataset_name = dataset_name
        self.split = split
        self.max_shard_size = max_shard_size
        self.streaming = streaming
        self.config_name = config_name
        # None lets the library pick its default cache location instead of
        # raising KeyError when the environment-setup cell has not been run.
        self.cache_dir = os.environ.get("HF_DATASETS_CACHE")

    def load_with_sharding(self, num_samples: Optional[int] = None):
        """Load the configured split.

        Args:
            num_samples: Maximum number of examples to load. ``None`` loads
                the whole split; ``0`` yields an empty dataset.

        Returns:
            The object returned by ``load_dataset`` (limited with ``take`` in
            streaming mode, or loaded through a ``split[:n]`` slice otherwise).
        """
        print(f"Loading {self.dataset_name} with sharding...")

        if self.streaming:
            # Use streaming for very large datasets
            dataset = load_dataset(
                self.dataset_name,
                name=self.config_name,
                split=self.split,
                streaming=True,
                cache_dir=self.cache_dir,
            )
            # Explicit None check so that num_samples=0 is honoured.
            if num_samples is not None:
                dataset = dataset.take(num_samples)
            return dataset

        # Load specific slice for manageable datasets
        split_slice = (
            self.split if num_samples is None else f"{self.split}[:{num_samples}]"
        )
        return load_dataset(
            self.dataset_name,
            name=self.config_name,
            split=split_slice,
            cache_dir=self.cache_dir,
        )

    def create_shards(self, dataset, shard_size: Optional[int] = None):
        """Split an iterable of examples into a list of ``Dataset`` shards.

        Args:
            dataset: Iterable yielding example dicts.
            shard_size: Examples per shard; falls back to ``max_shard_size``
                when omitted (or 0).

        Returns:
            List of ``Dataset`` objects; the last one may be smaller.

        Note:
            All shards are kept in the returned list, so the complete input
            ends up in memory by the time this method returns.
        """
        shard_size = shard_size or self.max_shard_size
        shards = []

        current_shard = []
        for example in dataset:
            current_shard.append(example)

            if len(current_shard) >= shard_size:
                shards.append(Dataset.from_list(current_shard))
                current_shard = []
                print(f"Created shard {len(shards)}, Memory: {get_memory_usage()}")

        # Add remaining examples
        if current_shard:
            shards.append(Dataset.from_list(current_shard))

        return shards


# Demo with manageable datasets
print("Setting up demo datasets...")

# Text dataset - use a medium-sized one
try:
    # "wikitext" publishes several configurations and load_dataset refuses to
    # guess one, so the configuration has to be named explicitly.
    loader = AdvancedDatasetLoader(
        "wikitext", "train", streaming=False, config_name="wikitext-2-raw-v1"
    )
    wiki_dataset = loader.load_with_sharding(num_samples=5000)
    print(f"✅ Wiki dataset loaded: {len(wiki_dataset)} samples")
except Exception as e:
    # Deliberate best-effort: without network/cache access fall back to
    # synthetic text so the remaining cells can still run.
    print(f"⚠️ Wiki dataset failed, creating synthetic: {e}")
    # Create synthetic text dataset
    wiki_dataset = Dataset.from_dict(
        {
            "text": [
                f"This is a sample article about topic {i}. " * 20 for i in range(1000)
            ]
        }
    )

In [None]:
# === Cell 3: Advanced Text Preprocessing ===
print("\n=== Advanced Text Preprocessing ===")

import re
import unicodedata
from collections import defaultdict


class AdvancedTextProcessor:
    """Clean raw text, guess its language and score its quality.

    Attributes:
        languages: Language codes for which a spaCy pipeline is loaded when
            spaCy and the model are installed.
        stats: Counters updated by ``advanced_clean`` ("processed" and
            "lang_<code>").
        nlp_models: Mapping of language code to loaded spaCy pipeline.
    """

    # Languages used when the caller does not pass any.
    DEFAULT_LANGUAGES = ("en", "zh", "es", "fr")

    # Accented letters that appear in only one of the two character sets used
    # for Spanish/French detection, and the letters both sets share.
    _SPANISH_ONLY = re.compile(r"[ñáíóú]")
    _FRENCH_ONLY = re.compile(r"[àâäèêëïîôùûÿç]")
    _SHARED_ACCENTS = re.compile(r"[éü]")

    def __init__(self, languages: Optional[List[str]] = None):
        """Create a processor.

        Args:
            languages: Language codes to load spaCy models for. Defaults to
                ``["en", "zh", "es", "fr"]``.
        """
        # Copy so neither the default nor the caller's list is shared between
        # instances.
        self.languages = list(
            self.DEFAULT_LANGUAGES if languages is None else languages
        )
        self.stats = defaultdict(int)

        # Load language-specific resources
        self.nlp_models = {}
        try:
            import spacy
        except ImportError:
            print("⚠️ Spacy not available, using basic processing")
            return

        for lang in self.languages:
            model_name = {"en": "en_core_web_sm", "zh": "zh_core_web_sm"}.get(
                lang, f"{lang}_core_news_sm"
            )
            try:
                self.nlp_models[lang] = spacy.load(model_name)
            except Exception:
                # A missing model is not fatal; processing continues without it.
                print(f"⚠️ Spacy model for {lang} not available")

    def detect_language(self, text: str) -> str:
        """Guess the language of ``text`` from character patterns.

        Returns one of "zh", "ar", "ru", "es", "fr" or "en" (the fallback).
        Script-based checks run first; Spanish and French are then told apart
        by counting the accented letters that are specific to each. Ties, and
        text containing only the shared letters (é, ü), resolve to "es".
        """
        # Chinese characters
        if re.search(r"[\u4e00-\u9fff]", text):
            return "zh"
        # Arabic characters
        if re.search(r"[\u0600-\u06ff]", text):
            return "ar"
        # Cyrillic characters
        if re.search(r"[\u0400-\u04ff]", text):
            return "ru"

        lowered = text.lower()
        spanish_hits = len(self._SPANISH_ONLY.findall(lowered))
        french_hits = len(self._FRENCH_ONLY.findall(lowered))

        # "é" and "ü" occur in both languages, so they must not decide on their
        # own while French-specific letters are present.
        if french_hits > spanish_hits:
            return "fr"
        if spanish_hits or self._SHARED_ACCENTS.search(lowered):
            return "es"
        return "en"

    def advanced_clean(self, text: str) -> Dict[str, Any]:
        """Normalize ``text`` and return the cleaned text with metadata.

        Returns:
            Dict with keys ``cleaned_text``, ``original_length``,
            ``cleaned_length``, ``detected_language``, ``quality_score`` and
            ``compression_ratio`` (cleaned length / original length, 0 for
            empty input).

        Side effects:
            Increments ``stats["processed"]`` and ``stats["lang_<code>"]``.
        """
        original_length = len(text)

        # 1. Unicode normalization
        text = unicodedata.normalize("NFKC", text)

        # 2. Remove control characters except line breaks and tabs. "\r" is
        #    kept so it becomes a space in step 4 instead of gluing two words
        #    together.
        text = "".join(
            char
            for char in text
            if unicodedata.category(char) != "Cc" or char in "\n\t\r"
        )

        # 3. Fix common encoding issues
        text = text.replace("\ufffd", "")  # Remove replacement characters

        # 4. Normalize whitespace
        text = re.sub(r"\s+", " ", text)
        text = text.strip()

        # 5. Squeeze runs of 4+ identical characters down to two
        #    ("sooooo" -> "soo"). Digits are excluded so that numbers such as
        #    "10000" are not altered.
        text = re.sub(r"(\D)\1{3,}", r"\1\1", text)

        # 6. Language detection
        detected_lang = self.detect_language(text)

        # 7. Language-specific cleaning
        if detected_lang == "zh":
            # Remove English mixed in Chinese text (if too much)
            english_ratio = (
                len(re.findall(r"[a-zA-Z]", text)) / len(text) if text else 0
            )
            if english_ratio > 0.5:
                text = re.sub(r"[a-zA-Z]+", "", text)

        # 8. Quality metrics
        quality_score = self.calculate_quality_score(text, original_length)

        self.stats["processed"] += 1
        self.stats[f"lang_{detected_lang}"] += 1

        return {
            "cleaned_text": text,
            "original_length": original_length,
            "cleaned_length": len(text),
            "detected_language": detected_lang,
            "quality_score": quality_score,
            "compression_ratio": (
                len(text) / original_length if original_length > 0 else 0
            ),
        }

    def calculate_quality_score(self, text: str, original_length: int) -> float:
        """Calculate a heuristic text quality score in the range 0-1.

        Args:
            text: Cleaned text.
            original_length: Length before cleaning (currently not used by
                the heuristic).
        """
        if len(text) < 10:
            return 0.0

        score = 1.0

        # Penalize very short or very long texts
        if len(text) < 50:
            score *= 0.5
        elif len(text) > 10000:
            score *= 0.8

        # Penalize a high ratio of digits/punctuation/symbols. Letters of any
        # script count as regular text, so non-English languages are not
        # penalized merely for being non-ASCII.
        special_ratio = len(re.findall(r"[^\w\s]|[\d_]", text)) / len(text)
        if special_ratio > 0.3:
            score *= 1 - special_ratio

        # Reward proper sentence structure
        sentences = text.split(".")
        if len(sentences) > 1:
            score *= min(1.2, 1 + len(sentences) * 0.01)

        return min(1.0, score)


# Shared processor instance used by the batch function below
text_processor = AdvancedTextProcessor()


def process_text_advanced(examples: Dict[str, List]) -> Dict[str, List]:
    """Clean every text of a batch and return the results column by column.

    Each key produced by ``text_processor.advanced_clean`` becomes one output
    column holding a list with one entry per input text.
    """
    columns: Dict[str, List] = {}

    for raw_text in examples["text"]:
        record = text_processor.advanced_clean(raw_text)
        for field, field_value in record.items():
            columns.setdefault(field, []).append(field_value)

    return columns


# Apply advanced processing
print("Applying advanced text processing...")
start_time = time.time()

# os.cpu_count() may return None when the core count cannot be determined;
# fall back to a single worker in that case.
processed_text_dataset = wiki_dataset.map(
    process_text_advanced,
    batched=True,
    batch_size=64,
    num_proc=min(4, os.cpu_count() or 1),
)

processing_time = time.time() - start_time
print(f"Processing completed in {processing_time:.2f}s")

# Analyze results
quality_scores = processed_text_dataset["quality_score"]
lang_distribution = Counter(processed_text_dataset["detected_language"])

# With num_proc > 1 the cleaning runs in worker processes, so the counters on
# the parent's text_processor are never updated. Rebuild the same statistics
# from the output columns instead.
processing_stats = {"processed": len(processed_text_dataset)}
processing_stats.update(
    {f"lang_{lang}": count for lang, count in lang_distribution.items()}
)
print(f"Processing stats: {processing_stats}")

print(
    f"Quality score stats: mean={np.mean(quality_scores):.3f}, std={np.std(quality_scores):.3f}"
)
print(f"Language distribution: {dict(lang_distribution)}")

In [None]:
# === Cell 4: Complex Image Processing Pipeline ===
print("\n=== Complex Image Processing Pipeline ===")


class AdvancedImageProcessor:
    """Image preprocessing with quality metrics, perceptual hashing and augmentation.

    Attributes:
        target_size: (height, width) that both pipelines resize/crop to.
        stats: Counters updated by ``process_image_advanced`` ("processed",
            "quality_passed").
        augmentation_pipeline: Albumentations pipeline with random transforms.
        quality_pipeline: Deterministic resize + normalize pipeline.
    """

    def __init__(self, target_size: Tuple[int, int] = (224, 224)):
        self.target_size = target_size
        self.stats = defaultdict(int)

        # Define augmentation pipeline
        # NOTE(review): newer Albumentations releases appear to expect
        # RandomResizedCrop(size=(h, w)) instead of height=/width= — confirm
        # against the installed version.
        self.augmentation_pipeline = A.Compose(
            [
                A.RandomResizedCrop(
                    height=target_size[0], width=target_size[1], scale=(0.8, 1.0)
                ),
                A.HorizontalFlip(p=0.5),
                A.OneOf(
                    [
                        A.MotionBlur(blur_limit=3),
                        A.MedianBlur(blur_limit=3),
                        A.Blur(blur_limit=3),
                    ],
                    p=0.3,
                ),
                A.OneOf(
                    [
                        A.CLAHE(clip_limit=2),
                        A.RandomBrightnessContrast(
                            brightness_limit=0.2, contrast_limit=0.2
                        ),
                        A.RandomGamma(gamma_limit=(80, 120)),
                    ],
                    p=0.5,
                ),
                A.HueSaturationValue(
                    hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=0.3
                ),
                A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ]
        )

        # Quality control pipeline
        self.quality_pipeline = A.Compose(
            [
                A.Resize(height=target_size[0], width=target_size[1]),
                A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ]
        )

    @staticmethod
    def _to_uint8_array(image: Any) -> np.ndarray:
        """Coerce an array-like image (ndarray or nested lists) to a uint8 ndarray.

        Rows read back from a ``datasets`` table arrive as nested Python lists,
        which have no ``.shape`` and would otherwise become int64 arrays that
        the OpenCV calls below do not accept.
        """
        array = np.asarray(image)
        if array.dtype == np.uint8:
            return array
        # Assumption: floating-point images whose maximum is <= 1.0 are scaled
        # to [0, 1]; everything else is treated as already being on a 0-255
        # scale. TODO confirm for non-synthetic inputs.
        if np.issubdtype(array.dtype, np.floating) and array.size and array.max() <= 1.0:
            array = array * 255.0
        return np.clip(array, 0, 255).astype(np.uint8)

    def detect_image_quality(self, image: np.ndarray) -> Dict[str, float]:
        """Compute blur, brightness, contrast, edge-density and noise metrics.

        Args:
            image: Grayscale (2-D) or RGB (3-D) image array.

        Returns:
            Dict with ``blur_score`` (Laplacian variance), ``brightness`` and
            ``contrast`` (mean / std of the gray image divided by 255),
            ``edge_density`` (fraction of Canny edge pixels) and
            ``noise_score``.
        """
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        else:
            gray = image

        # Blur detection using Laplacian variance
        blur_score = cv2.Laplacian(gray, cv2.CV_64F).var()

        # Brightness analysis
        brightness = np.mean(gray)

        # Contrast analysis
        contrast = np.std(gray)

        # Edge density
        edges = cv2.Canny(gray, 50, 150)
        edge_density = np.sum(edges > 0) / edges.size

        # Noise estimation (using high-frequency content). The difference is
        # taken in floating point: subtracting uint8 arrays wraps around for
        # negative values and would inflate the estimate.
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        noise_score = np.std(blurred.astype(np.float64) - gray.astype(np.float64))

        return {
            "blur_score": float(blur_score),
            "brightness": float(brightness / 255.0),
            "contrast": float(contrast / 255.0),
            "edge_density": float(edge_density),
            "noise_score": float(noise_score),
        }

    def calculate_image_hash(self, image: np.ndarray) -> str:
        """Return the perceptual hash (``imagehash.phash``) of ``image`` as a string."""
        pil_image = Image.fromarray(image) if isinstance(image, np.ndarray) else image
        return str(imagehash.phash(pil_image))

    def process_image_advanced(
        self, image: Union[np.ndarray, "Image.Image"], apply_augmentation: bool = False
    ) -> Dict[str, Any]:
        """Analyze one image and run it through one of the two pipelines.

        Args:
            image: PIL image, ndarray, or nested lists of pixel values.
            apply_augmentation: Use the random augmentation pipeline instead
                of the deterministic resize + normalize pipeline.

        Returns:
            Dict with ``processed_image``, ``original_shape``,
            ``quality_metrics``, ``quality_score``, ``image_hash`` and
            ``is_augmented``.

        Side effects:
            Increments ``stats["processed"]`` and, when the quality score is
            above 0.5, ``stats["quality_passed"]``.
        """
        # Convert to numpy array if PIL Image
        if isinstance(image, Image.Image):
            image_array = np.array(image)
        else:
            image_array = image
        # Also covers nested lists and non-uint8 arrays.
        image_array = self._to_uint8_array(image_array)

        original_shape = image_array.shape

        # Quality analysis on original image
        quality_metrics = self.detect_image_quality(image_array)

        # Calculate hash for duplicate detection
        image_hash = self.calculate_image_hash(image_array)

        # Apply appropriate pipeline
        if apply_augmentation:
            processed = self.augmentation_pipeline(image=image_array)
        else:
            processed = self.quality_pipeline(image=image_array)

        processed_image = processed["image"]

        # Calculate quality score
        quality_score = self.calculate_quality_score(quality_metrics)

        self.stats["processed"] += 1
        self.stats["quality_passed"] += 1 if quality_score > 0.5 else 0

        return {
            "processed_image": processed_image,
            "original_shape": original_shape,
            "quality_metrics": quality_metrics,
            "quality_score": quality_score,
            "image_hash": image_hash,
            "is_augmented": apply_augmentation,
        }

    def calculate_quality_score(self, metrics: Dict[str, float]) -> float:
        """Combine the metrics from ``detect_image_quality`` into a 0-1 score.

        The score starts at 1.0 and is multiplied by a penalty for each
        failed check (blur, brightness, contrast, edge density).
        """
        score = 1.0

        # Penalize very blurry images
        if metrics["blur_score"] < 100:
            score *= 0.3
        elif metrics["blur_score"] < 500:
            score *= 0.7

        # Penalize very dark or very bright images
        brightness = metrics["brightness"]
        if brightness < 0.1 or brightness > 0.9:
            score *= 0.5

        # Penalize very low contrast images
        if metrics["contrast"] < 0.1:
            score *= 0.4

        # Penalize images with very few edges (possibly corrupted)
        if metrics["edge_density"] < 0.01:
            score *= 0.2

        return min(1.0, score)


# Create synthetic image dataset for demonstration
def create_synthetic_image_dataset(num_images: int = 100) -> Dataset:
    """Build a dataset of random 224x224 RGB images in four rotating variants.

    Image ``i`` uses variant ``i % 4`` and receives label ``i % 10``.
    """
    image_shape = (224, 224, 3)
    # (randint low, randint high, Gaussian blur kernel or None) per variant
    variants = (
        (50, 200, (1, 1)),  # high quality, slight blur
        (0, 255, (15, 15)),  # low quality, heavy blur
        (0, 50, None),  # dark
        (200, 255, None),  # bright
    )

    images = []
    for index in range(num_images):
        low, high, blur_kernel = variants[index % 4]
        pixels = np.random.randint(low, high, image_shape, dtype=np.uint8)
        if blur_kernel is not None:
            pixels = cv2.GaussianBlur(pixels, blur_kernel, 0)
        images.append(pixels)

    labels = [index % 10 for index in range(num_images)]  # 10 classes

    return Dataset.from_dict({"image": images, "label": labels})


# Build the demo image dataset and the processor shared by the batch function
print("Creating synthetic image dataset...")
image_dataset = create_synthetic_image_dataset(200)

image_processor = AdvancedImageProcessor()


def process_images_batch(examples: Dict[str, List]) -> Dict[str, List]:
    """Run every image of a batch through ``image_processor``.

    Each image is augmented with probability 0.5. The result has one column
    per key returned by ``process_image_advanced``.
    """
    columns: Dict[str, List] = {}

    for img in examples["image"]:
        # Coin flip: augment roughly half of the images
        augment = np.random.random() > 0.5
        outcome = image_processor.process_image_advanced(
            img, apply_augmentation=augment
        )
        for field, field_value in outcome.items():
            columns.setdefault(field, []).append(field_value)

    return columns


# Process images
print("Processing images with quality control...")
start_time = time.time()

processed_image_dataset = image_dataset.map(
    process_images_batch,
    batched=True,
    batch_size=16,
    num_proc=2,  # Lower for image processing
)

processing_time = time.time() - start_time
print(f"Image processing completed in {processing_time:.2f}s")

# Analyze image quality distribution
quality_scores = processed_image_dataset["quality_score"]

# The map above runs in two worker processes, so image_processor.stats in
# this (parent) process is never updated. Derive the same counters from the
# results instead.
image_processing_stats = {
    "processed": len(quality_scores),
    "quality_passed": sum(1 for score in quality_scores if score > 0.5),
}
print(f"Image processing stats: {image_processing_stats}")

print(
    f"Image quality stats: mean={np.mean(quality_scores):.3f}, std={np.std(quality_scores):.3f}"
)
print(
    f"High quality images (>0.7): {sum(1 for score in quality_scores if score > 0.7)}"
)

In [None]:
# === Cell 5: Advanced Audio Processing ===
print("\n=== Advanced Audio Processing ===")


class AdvancedAudioProcessor:
    """Audio preprocessing: resampling, noise reduction, silence and feature analysis.

    Attributes:
        target_sr: Sampling rate every clip is resampled to.
        segment_length: Maximum clip length in seconds; longer clips are
            cut to their middle segment.
        stats: Counters updated by ``process_audio_advanced`` ("processed",
            "noise_reduced").
    """

    def __init__(self, target_sr: int = 16000, segment_length: float = 10.0):
        self.target_sr = target_sr
        self.segment_length = segment_length
        self.stats = defaultdict(int)

    def detect_silence(
        self,
        audio: np.ndarray,
        sr: int,
        threshold: float = 0.01,
        min_duration: float = 0.5,
    ) -> List[Tuple[float, float]]:
        """Detect silent segments in audio.

        Energy is measured in 25 ms frames with 50% overlap and normalized by
        the loudest frame; frames below ``threshold`` are silent.

        Args:
            audio: 1-D sample array (or sequence of samples).
            sr: Sampling rate in Hz.
            threshold: Normalized-energy threshold below which a frame is
                silent.
            min_duration: Minimum length in seconds for a silent run to be
                reported.

        Returns:
            List of ``(start_seconds, end_seconds)`` tuples, including a
            silent run that lasts until the end of the clip. Empty when the
            clip is shorter than one frame.
        """
        audio = np.asarray(audio, dtype=float)

        # Calculate energy in sliding windows
        frame_length = int(0.025 * sr)  # 25ms frames
        if frame_length < 1 or len(audio) < frame_length:
            # Not even one full frame: nothing to measure (and np.max on an
            # empty energy array would raise).
            return []
        hop_length = max(1, frame_length // 2)

        # "+ 1" so that the last complete frame is included.
        energy = np.array(
            [
                np.sum(audio[i : i + frame_length] ** 2)
                for i in range(0, len(audio) - frame_length + 1, hop_length)
            ]
        )
        peak_energy = np.max(energy)
        if peak_energy > 0:
            energy = energy / peak_energy

        # Find silent regions
        silent_frames = energy < threshold
        silent_segments = []

        def close_segment(first_frame: int, end_frame: int) -> None:
            """Record frames [first_frame, end_frame) if the run is long enough."""
            duration = (end_frame - first_frame) * hop_length / sr
            if duration >= min_duration:
                silent_segments.append(
                    (first_frame * hop_length / sr, end_frame * hop_length / sr)
                )

        in_silence = False
        start_frame = 0

        for i, is_silent in enumerate(silent_frames):
            if is_silent and not in_silence:
                start_frame = i
                in_silence = True
            elif not is_silent and in_silence:
                close_segment(start_frame, i)
                in_silence = False

        # A clip that ends in silence never sees a non-silent frame to close
        # the run, so close it explicitly.
        if in_silence:
            close_segment(start_frame, len(silent_frames))

        return silent_segments

    def extract_advanced_features(self, audio: np.ndarray, sr: int) -> Dict[str, Any]:
        """Extract energy, spectral, MFCC, chroma and tempo features with librosa.

        Returns:
            Dict with ``rms_energy``, ``zero_crossing_rate``,
            ``spectral_centroid_mean``/``_std``, ``spectral_rolloff_mean``,
            ``mfcc_mean``/``mfcc_std`` (lists of 13 values), ``chroma_mean``
            and ``tempo`` (0.0 when beat tracking fails).
        """
        features = {}

        # Basic stats
        features["rms_energy"] = float(np.sqrt(np.mean(audio**2)))
        features["zero_crossing_rate"] = float(
            np.mean(librosa.feature.zero_crossing_rate(audio))
        )

        # Spectral features
        spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
        features["spectral_centroid_mean"] = float(np.mean(spectral_centroids))
        features["spectral_centroid_std"] = float(np.std(spectral_centroids))

        spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
        features["spectral_rolloff_mean"] = float(np.mean(spectral_rolloff))

        # MFCC features
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        features["mfcc_mean"] = mfccs.mean(axis=1).tolist()
        features["mfcc_std"] = mfccs.std(axis=1).tolist()

        # Chroma features
        chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
        features["chroma_mean"] = float(np.mean(chroma))

        # Tempo estimation
        try:
            tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
            # Depending on the librosa version, tempo may be a scalar or a
            # one-element array; handle both without relying on implicit
            # array-to-float conversion.
            features["tempo"] = float(np.atleast_1d(tempo)[0])
        except Exception:
            # Beat tracking is best-effort; 0.0 marks "unknown".
            features["tempo"] = 0.0

        return features

    def process_audio_advanced(self, audio_data: Dict[str, Any]) -> Dict[str, Any]:
        """Resample, normalize, denoise, segment and score one audio clip.

        Args:
            audio_data: Dict with ``"array"`` (samples, ndarray or list) and
                ``"sampling_rate"``.

        Returns:
            Dict with ``processed_audio`` and ``noise_reduced_audio`` (lists
            covering the same segment), ``sampling_rate``,
            ``original_length`` and ``processed_length`` (seconds),
            ``silence_ratio`` (measured on the full clip before
            segmentation), ``audio_features`` and ``quality_score``.

        Side effects:
            Increments ``stats["processed"]`` and, when noise reduction
            changed the signal noticeably, ``stats["noise_reduced"]``.
        """
        # Rows read back from a dataset arrive as Python lists.
        audio_array = np.asarray(audio_data["array"], dtype=np.float64)
        original_sr = audio_data["sampling_rate"]
        original_num_samples = len(audio_array)

        # Resample if needed
        if original_sr != self.target_sr:
            audio_array = librosa.resample(
                audio_array, orig_sr=original_sr, target_sr=self.target_sr
            )

        # Normalize audio
        peak = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        if peak > 0:
            audio_array = audio_array / peak

        # Apply noise reduction; fall back to the unprocessed signal on failure
        try:
            noise_reduced = np.asarray(
                nr.reduce_noise(y=audio_array, sr=self.target_sr)
            )
        except Exception as exc:
            print(f"⚠️ Noise reduction failed, using unprocessed audio: {exc}")
            noise_reduced = audio_array

        # Detect silence
        total_duration = len(audio_array) / self.target_sr
        silent_segments = self.detect_silence(audio_array, self.target_sr)
        silence_ratio = (
            sum(end - start for start, end in silent_segments) / total_duration
            if total_duration > 0
            else 0.0
        )

        # Segment audio if too long
        max_samples = int(self.segment_length * self.target_sr)
        if len(audio_array) > max_samples:
            # Take middle segment to avoid silence at beginning/end. The same
            # window is applied to the denoised signal so both stay aligned
            # and can be compared below.
            start_idx = (len(audio_array) - max_samples) // 2
            window = slice(start_idx, start_idx + max_samples)
            audio_array = audio_array[window]
            noise_reduced = noise_reduced[window]

        # Extract features
        features = self.extract_advanced_features(audio_array, self.target_sr)

        # Calculate quality score
        quality_score = self.calculate_audio_quality_score(features, silence_ratio)

        self.stats["processed"] += 1
        # Compare over the common length in case the denoiser changed the
        # number of samples.
        overlap = min(len(noise_reduced), len(audio_array))
        noise_delta = (
            np.mean(np.abs(noise_reduced[:overlap] - audio_array[:overlap]))
            if overlap
            else 0.0
        )
        self.stats["noise_reduced"] += 1 if noise_delta > 0.01 else 0

        return {
            "processed_audio": audio_array.tolist(),
            "noise_reduced_audio": noise_reduced.tolist(),
            "sampling_rate": self.target_sr,
            "original_length": original_num_samples / original_sr,
            "processed_length": len(audio_array) / self.target_sr,
            "silence_ratio": silence_ratio,
            "audio_features": features,
            "quality_score": quality_score,
        }

    def calculate_audio_quality_score(
        self, features: Dict[str, Any], silence_ratio: float
    ) -> float:
        """Combine silence ratio and features into a 0-1 quality score.

        Reads ``features["rms_energy"]`` and
        ``features["spectral_centroid_std"]``.
        """
        score = 1.0

        # Penalize too much silence
        if silence_ratio > 0.7:
            score *= 0.3
        elif silence_ratio > 0.5:
            score *= 0.6

        # Penalize very low energy
        if features["rms_energy"] < 0.01:
            score *= 0.4

        # Penalize monotone audio (very low spectral centroid std)
        if features["spectral_centroid_std"] < 100:
            score *= 0.7

        return min(1.0, score)


# Create synthetic audio dataset
def create_synthetic_audio_dataset(num_samples: int = 50) -> Dataset:
    """Build a dataset of random 16 kHz clips in four rotating signal types.

    Clip ``i`` uses type ``i % 4`` (decaying harmonic tone, three-tone chord,
    noise, near-silence with blips). Each clip lasts between 3 and 15
    seconds and gets a little extra Gaussian noise.
    """
    sample_rate = 16000
    blip_length = 1000
    # 1 kHz burst added to the near-silent clips
    blip = 0.5 * np.sin(
        2 * np.pi * 1000 * np.linspace(0, 1000 / 16000, blip_length)
    )

    clips = []
    for index in range(num_samples):
        duration = np.random.uniform(3, 15)  # 3-15 seconds
        samples = int(duration * sample_rate)
        signal_type = index % 4

        if signal_type == 0:
            # Speech-like signal: decaying fundamental plus two harmonics
            fundamental = 200 + np.random.normal(0, 50)
            t = np.linspace(0, duration, samples)
            audio = np.sin(2 * np.pi * fundamental * t) * np.exp(-t / 2)
            for harmonic, gain in ((2, 0.3), (3, 0.2)):
                audio += gain * np.sin(2 * np.pi * fundamental * harmonic * t)
        elif signal_type == 1:
            # Music-like signal: A major chord
            t = np.linspace(0, duration, samples)
            audio = 0
            for freq in (440, 554, 659):
                audio = audio + np.sin(2 * np.pi * freq * t)
            audio = audio / 3
        elif signal_type == 2:
            # Noisy signal
            audio = np.random.normal(0, 0.1, samples)
        else:
            # Silent signal with occasional blips
            audio = np.random.normal(0, 0.01, samples)
            for _ in range(5):
                pos = np.random.randint(0, samples - blip_length)
                audio[pos : pos + blip_length] += blip

        # Add some noise to all signals
        audio += np.random.normal(0, 0.02, len(audio))

        clips.append({"array": audio, "sampling_rate": sample_rate})

    return Dataset.from_dict({"audio": clips, "id": list(range(num_samples))})


# Build the demo audio dataset and the processor shared by the batch function
print("Creating synthetic audio dataset...")
audio_dataset = create_synthetic_audio_dataset(30)

audio_processor = AdvancedAudioProcessor()


def process_audio_batch(examples: Dict[str, List]) -> Dict[str, List]:
    """Run every clip of a batch through ``audio_processor``.

    The result has one column per key returned by ``process_audio_advanced``.
    """
    columns: Dict[str, List] = {}

    for clip in examples["audio"]:
        outcome = audio_processor.process_audio_advanced(clip)
        for field, field_value in outcome.items():
            columns.setdefault(field, []).append(field_value)

    return columns


# Run the audio pipeline over the whole dataset and time it
print("Processing audio with advanced pipeline...")
start_time = time.time()

# Single worker: audio processing is CPU intensive
processed_audio_dataset = audio_dataset.map(
    process_audio_batch, batched=True, batch_size=8, num_proc=1
)

processing_time = time.time() - start_time
print(f"Audio processing completed in {processing_time:.2f}s")
print(f"Audio processing stats: {dict(audio_processor.stats)}")

# Summarize quality scores and silence ratios
audio_quality_scores = processed_audio_dataset["quality_score"]
silence_ratios = processed_audio_dataset["silence_ratio"]

for label, values in (
    ("Audio quality", audio_quality_scores),
    ("Silence ratio", silence_ratios),
):
    print(f"{label} stats: mean={np.mean(values):.3f}, std={np.std(values):.3f}")

In [None]:
# === Cell 6: Data Quality Control & Anomaly Detection ===
print("\n=== Data Quality Control & Anomaly Detection ===")


class DataQualityController:
    """Dataset quality analysis: score statistics, outliers, duplicates, missing values.

    Attributes:
        stats: Counter dictionary (not updated by the methods in this class).
        quality_thresholds: Minimum acceptable quality score per modality,
            keyed ``"<modality>_quality"``.
        duplicate_hashes: Set reserved for duplicate tracking (not used by
            the methods in this class).
    """

    def __init__(self):
        self.stats = defaultdict(int)
        self.quality_thresholds = {
            "text_quality": 0.3,
            "image_quality": 0.5,
            "audio_quality": 0.4,
        }
        self.duplicate_hashes = set()

    def detect_text_duplicates(
        self, texts: List[str], similarity_threshold: float = 0.85
    ) -> List[Tuple[int, int, float]]:
        """Detect near-duplicate texts using TF-IDF cosine similarity.

        Returns:
            ``(i, j, similarity)`` for every pair ``i < j`` whose similarity
            exceeds ``similarity_threshold``, ordered by ``i`` then ``j``.
            Empty when fewer than two texts are given or vectorization fails.
        """
        if len(texts) < 2:
            return []

        # Use TF-IDF vectorization
        vectorizer = TfidfVectorizer(
            max_features=1000, stop_words="english", ngram_range=(1, 2)
        )

        try:
            tfidf_matrix = vectorizer.fit_transform(texts)
            # Calculate cosine similarity
            similarity_matrix = cosine_similarity(tfidf_matrix)
        except Exception as e:
            print(f"⚠️ TF-IDF similarity calculation failed: {e}")
            return []

        # Only the strict upper triangle holds distinct pairs (i < j).
        rows, cols = np.nonzero(
            np.triu(similarity_matrix > similarity_threshold, k=1)
        )
        return [
            (int(i), int(j), float(similarity_matrix[i, j]))
            for i, j in zip(rows, cols)
        ]

    def detect_outliers_iqr(self, data: List[float], factor: float = 1.5) -> List[int]:
        """Return the indices of values outside ``[Q1 - f*IQR, Q3 + f*IQR]``.

        Fewer than four values yield an empty list.
        """
        if len(data) < 4:
            return []

        values = np.asarray(list(data), dtype=float)
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1

        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        return np.flatnonzero((values < lower_bound) | (values > upper_bound)).tolist()

    def _resolve_threshold(self, quality_column: str, data_type: Optional[str]) -> float:
        """Pick the quality threshold for a column, preferring the modality-specific one."""
        candidate_keys = []
        if data_type:
            candidate_keys.append(f"{data_type}_quality")
        candidate_keys.append(quality_column.replace("_score", ""))
        for candidate in candidate_keys:
            if candidate in self.quality_thresholds:
                return self.quality_thresholds[candidate]
        return 0.5

    @staticmethod
    def _infer_data_type(dataset_name: str) -> Optional[str]:
        """Guess the modality ("text", "image", "audio") from a dataset name, or None."""
        lowered = dataset_name.lower()
        for modality in ("text", "image", "audio"):
            if modality in lowered:
                return modality
        return None

    def analyze_dataset_quality(
        self,
        dataset: "Dataset",
        text_column: str = None,
        quality_column: str = None,
        data_type: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Comprehensive dataset quality analysis.

        Args:
            dataset: Dataset to analyze.
            text_column: Column checked for near-duplicate texts (on a random
                sample of at most 1000 rows).
            quality_column: Column holding quality scores.
            data_type: Modality ("text", "image" or "audio") used to select
                the entry of ``quality_thresholds``. Without it the key is
                derived from ``quality_column`` and 0.5 is used when no entry
                matches.

        Returns:
            Dict with ``total_samples``, ``memory_usage_mb``, ``columns`` and
            ``quality_issues``; plus ``quality_stats``,
            ``low_quality_samples``, ``low_quality_ratio`` and
            ``quality_outliers`` when scores were analyzed; plus
            ``duplicate_pairs`` and ``estimated_duplicate_ratio`` (fraction
            of sampled texts that take part in at least one duplicate pair)
            when texts were analyzed.
        """
        total_samples = len(dataset)
        analysis = {
            "total_samples": total_samples,
            "memory_usage_mb": (
                dataset.data.nbytes / (1024 * 1024)
                if hasattr(dataset.data, "nbytes")
                else 0
            ),
            "columns": dataset.column_names,
            "quality_issues": [],
        }

        # Analyze quality scores if available (skipped for empty datasets,
        # where the statistics are undefined)
        if (
            quality_column
            and quality_column in dataset.column_names
            and total_samples > 0
        ):
            quality_scores = list(dataset[quality_column])
            analysis["quality_stats"] = {
                "mean": float(np.mean(quality_scores)),
                "std": float(np.std(quality_scores)),
                "min": float(np.min(quality_scores)),
                "max": float(np.max(quality_scores)),
                "median": float(np.median(quality_scores)),
            }

            # Find low quality samples
            threshold = self._resolve_threshold(quality_column, data_type)
            low_quality_count = sum(1 for score in quality_scores if score < threshold)
            analysis["low_quality_samples"] = low_quality_count
            analysis["low_quality_ratio"] = low_quality_count / total_samples

            # Detect outliers in quality scores
            outliers = self.detect_outliers_iqr(quality_scores)
            analysis["quality_outliers"] = len(outliers)

        # Analyze text duplicates if text column provided
        if text_column and text_column in dataset.column_names:
            sample_size = min(1000, total_samples)  # Sample for large datasets
            sample_indices = np.random.choice(total_samples, sample_size, replace=False)
            sample_texts = [dataset[int(i)][text_column] for i in sample_indices]

            duplicates = self.detect_text_duplicates(sample_texts)
            analysis["duplicate_pairs"] = len(duplicates)
            # Count each text once, however many pairs it appears in; counting
            # pairs would push the ratio above 100% for clusters of duplicates.
            duplicated_texts = {idx for i, j, _ in duplicates for idx in (i, j)}
            analysis["estimated_duplicate_ratio"] = (
                len(duplicated_texts) / sample_size if sample_size else 0.0
            )

        # Check for missing values in the first 100 rows. Slicing rows first
        # avoids materializing whole columns.
        head_rows = dataset[:100]
        for column in dataset.column_names:
            try:
                sample_values = head_rows[column]
                none_count = sum(
                    1
                    for v in sample_values
                    if v is None or (isinstance(v, str) and v == "")
                )
            except Exception as exc:
                analysis["quality_issues"].append(
                    f"Column '{column}' could not be checked for missing values: {exc}"
                )
                continue
            if none_count > 0:
                analysis["quality_issues"].append(
                    f"Column '{column}' has missing values"
                )

        return analysis

    def create_quality_report(self, datasets: Dict[str, "Dataset"]) -> str:
        """Generate a Markdown quality report for several datasets.

        For each dataset the first column containing "quality_score" and the
        first column containing "text" are analyzed; the modality used for
        the quality threshold is inferred from the dataset's name.
        """
        report = ["# Dataset Quality Report", ""]

        for name, dataset in datasets.items():
            report.append(f"## Dataset: {name}")
            report.append("")

            # Determine quality column
            quality_cols = [
                col for col in dataset.column_names if "quality_score" in col
            ]
            quality_col = quality_cols[0] if quality_cols else None

            # Determine text column
            text_cols = [col for col in dataset.column_names if "text" in col.lower()]
            text_col = text_cols[0] if text_cols else None

            analysis = self.analyze_dataset_quality(
                dataset, text_col, quality_col, data_type=self._infer_data_type(name)
            )

            report.append(f"- **Total Samples**: {analysis['total_samples']:,}")
            report.append(f"- **Memory Usage**: {analysis['memory_usage_mb']:.1f} MB")
            report.append(f"- **Columns**: {', '.join(analysis['columns'])}")

            if "quality_stats" in analysis:
                stats = analysis["quality_stats"]
                report.append(
                    f"- **Quality Score**: {stats['mean']:.3f} ± {stats['std']:.3f}"
                )
                report.append(
                    f"- **Low Quality Samples**: {analysis['low_quality_samples']} ({analysis['low_quality_ratio']:.1%})"
                )

            if "duplicate_pairs" in analysis:
                report.append(
                    f"- **Estimated Duplicates**: {analysis['estimated_duplicate_ratio']:.1%}"
                )

            if analysis["quality_issues"]:
                report.append("- **Issues**: " + ", ".join(analysis["quality_issues"]))

            report.append("")

        return "\n".join(report)


# Controller used for the cross-dataset quality report
quality_controller = DataQualityController()

# Report sections, keyed by the name shown in the report
datasets_to_analyze = dict(
    processed_text=processed_text_dataset,
    processed_images=processed_image_dataset,
    processed_audio=processed_audio_dataset,
)

print("Analyzing dataset quality...")
quality_report = quality_controller.create_quality_report(datasets_to_analyze)
print(quality_report)

In [None]:
# === Cell 7: GPU-Accelerated Processing & Distributed Processing ===
print("\n=== GPU-Accelerated & Distributed Processing ===")


class GPUAcceleratedProcessor:
    """Tensor-based preprocessing that runs on CUDA when present, otherwise on CPU.

    The device is chosen once in ``__init__``. Every method moves its inputs to
    that device, computes there and hands back NumPy arrays.
    """

    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

    def gpu_batch_normalize(self, data: List[np.ndarray]) -> List[np.ndarray]:
        """Min-max scale every array independently to the [0, 1] range.

        Args:
            data: Arrays to rescale. Each array uses its own min and max.

        Returns:
            One float32 array per input, in input order. An empty list is
            returned as is. Arrays with no value range (constant or
            zero-element) are converted to float32 but not rescaled.
        """
        if not data:
            return data

        normalized = []
        # One array at a time, so only a single tensor lives on the device.
        for arr in data:
            tensor = torch.from_numpy(arr).float().to(self.device)

            # min()/max() raise on zero-element tensors, so skip scaling there.
            if tensor.numel() > 0:
                tensor_min = tensor.min()
                tensor_max = tensor.max()
                if tensor_max > tensor_min:
                    tensor = (tensor - tensor_min) / (tensor_max - tensor_min)

            normalized.append(tensor.cpu().numpy())

        return normalized

    def gpu_image_transforms(
        self, images: List[np.ndarray], transforms: Optional[List[str]] = None
    ) -> List[np.ndarray]:
        """Resize and/or standardize images on ``self.device``.

        The work runs on whichever device ``__init__`` selected, so a CPU-only
        machine gets the same resized/normalized output as a CUDA machine
        instead of the untouched input.

        Args:
            images: 3-D arrays are treated as HWC images, 2-D arrays as
                single-channel (H, W) images.
            transforms: Any of ``"resize"`` (bilinear to 224x224) and
                ``"normalize"`` (zero mean / unit std per image). Defaults to
                both when omitted.

        Returns:
            One float32 array per input in the input's layout (HWC or HW).
            An empty list is returned as is.
        """
        if not images:
            return images
        if transforms is None:
            transforms = ["resize", "normalize"]

        processed_images = []
        for img in images:
            tensor = torch.from_numpy(img).float().to(self.device)
            if tensor.dim() == 3:
                tensor = tensor.permute(2, 0, 1)  # HWC to CHW

            if "resize" in transforms:
                if tensor.dim() == 2:
                    # Bilinear interpolation needs (N, C, H, W); a grayscale
                    # image therefore gets both a batch and a channel axis.
                    tensor = torch.nn.functional.interpolate(
                        tensor[None, None],
                        size=(224, 224),
                        mode="bilinear",
                        align_corners=False,
                    )[0, 0]
                else:
                    tensor = torch.nn.functional.interpolate(
                        tensor.unsqueeze(0),
                        size=(224, 224),
                        mode="bilinear",
                        align_corners=False,
                    ).squeeze(0)

            if "normalize" in transforms:
                # Epsilon keeps constant images from dividing by zero.
                tensor = (tensor - tensor.mean()) / (tensor.std() + 1e-8)

            # Convert back to numpy in the caller's layout
            if tensor.dim() == 3:
                tensor = tensor.permute(1, 2, 0)  # CHW to HWC
            processed_images.append(tensor.cpu().numpy())

        return processed_images


class DistributedProcessor:
    """Apply a batched ``Dataset.map`` function to a dataset chunk by chunk,
    with the chunks handled by a pool of worker processes."""

    def __init__(self, num_workers: int = None):
        """
        Args:
            num_workers: Size of the process pool. When omitted (or 0) the CPU
                count is used, capped at 8.
        """
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to a single worker instead of failing inside min().
        self.num_workers = num_workers or min(8, os.cpu_count() or 1)

    def parallel_map(
        self, dataset: Dataset, processing_func: callable, chunk_size: int = 1000
    ) -> Dataset:
        """Split ``dataset`` into chunks and map ``processing_func`` over them in parallel.

        Args:
            dataset: Dataset to process; must support ``len`` and ``select``.
            processing_func: Batched map function. It is sent to worker
                processes, so it has to be picklable.
            chunk_size: Maximum number of rows per chunk (>= 1).

        Returns:
            The concatenation of all chunks in their original order, or
            ``dataset`` itself when it is empty.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

        print(f"Processing dataset with {self.num_workers} workers...")

        # Split dataset into chunks
        dataset_size = len(dataset)
        chunks = [
            dataset.select(range(start, min(start + chunk_size, dataset_size)))
            for start in range(0, dataset_size, chunk_size)
        ]
        print(f"Created {len(chunks)} chunks of size ~{chunk_size}")

        # Nothing to do for an empty dataset; do not spin up a pool for it.
        if not chunks:
            return dataset

        processed_chunks = []
        with ProcessPoolExecutor(max_workers=self.num_workers) as executor:
            start_time = time.time()

            # Submit everything first so the chunks run concurrently ...
            futures = [
                executor.submit(self._process_chunk, chunk, processing_func)
                for chunk in chunks
            ]

            # ... then collect in submission order to keep row order stable.
            for i, future in enumerate(futures):
                try:
                    processed_chunks.append(future.result(timeout=300))  # 5 minute timeout
                    print(f"Completed chunk {i+1}/{len(chunks)}")
                except Exception as e:
                    print(f"❌ Chunk {i+1} failed: {e}")
                    # Best effort: keep the rows by falling back to the
                    # unprocessed chunk.
                    # NOTE(review): if processing_func adds or retypes columns,
                    # mixing processed and unprocessed chunks may make
                    # concatenate_datasets reject the result - confirm.
                    processed_chunks.append(chunks[i])

            processing_time = time.time() - start_time
            print(f"Parallel processing completed in {processing_time:.2f}s")

        return concatenate_datasets(processed_chunks)

    def _process_chunk(self, chunk: Dataset, processing_func: callable) -> Dataset:
        """Map ``processing_func`` over one chunk (runs inside a worker process).

        Errors are reported and the unprocessed chunk is returned, so one bad
        chunk does not abort the whole run.
        """
        try:
            return chunk.map(processing_func, batched=True, batch_size=32)
        except Exception as e:
            print(f"Chunk processing error: {e}")
            return chunk


# GPU demo: the processor is always created; the timing run only happens
# when CUDA is actually present.
gpu_processor = GPUAcceleratedProcessor()

if not torch.cuda.is_available():
    print("GPU not available, skipping GPU acceleration demo")
else:
    print("Testing GPU-accelerated processing...")

    # Ten random 100x100 RGB frames stand in for real images
    test_images = [
        np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8) for _ in range(10)
    ]

    # Wall-clock time for the whole batch, including host<->device copies
    start_time = time.time()
    gpu_processed = gpu_processor.gpu_image_transforms(test_images)
    gpu_time = time.time() - start_time

    print(f"GPU processing time: {gpu_time:.4f}s for {len(test_images)} images")
    print(f"Processed image shape: {gpu_processed[0].shape}")

# Two workers are enough for the small distributed demo that follows
distributed_processor = DistributedProcessor(num_workers=2)


def simple_text_length_calc(examples):
    """Batched map function: character count of each ``cleaned_text`` entry.

    Returns a dict with a single ``char_count`` list, aligned with the input batch.
    """
    lengths = list(map(len, examples["cleaned_text"]))
    return {"char_count": lengths}


print("Testing distributed processing...")
if len(processed_text_dataset) <= 100:
    # The demo needs more than 100 rows; smaller datasets only get this notice.
    print("Dataset too small for distributed processing demo")
else:
    # First 100 rows, split into four chunks of 25
    subset = processed_text_dataset.select(range(100))
    distributed_result = distributed_processor.parallel_map(
        subset, simple_text_length_calc, chunk_size=25
    )
    print(f"Distributed processing completed: {len(distributed_result)} samples")

In [None]:
# === Cell 8: Adaptive Batch Processing & Dynamic Memory Management ===
# Section banner so this cell's output is easy to find in the run log.
print("\n=== Adaptive Batch Processing & Dynamic Memory Management ===")


class AdaptiveBatchProcessor:
    """Run a batched ``map`` over a dataset while tuning the batch size on the fly.

    A small random sample is processed first to estimate per-item time and
    memory. The full dataset is then processed slice by slice, and the batch
    size shrinks or grows depending on how long each slice took and how much
    memory the process is using.
    """

    def __init__(self, initial_batch_size: int = 32, memory_threshold_gb: float = 8.0):
        """
        Args:
            initial_batch_size: Stored on the instance for callers; the
                starting size used by ``adaptive_map`` comes from its own
                sample-based estimate.
            memory_threshold_gb: Process memory ceiling (GB) the batch sizing
                tries to stay under.
        """
        self.initial_batch_size = initial_batch_size
        self.memory_threshold_gb = memory_threshold_gb
        # Filled by adaptive_map: one entry per successfully processed batch.
        self.batch_size_history = []
        self.processing_times = []

    def get_current_memory_usage(self) -> float:
        """Get current memory usage (resident set size of this process) in GB"""
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1024 / 1024 / 1024

    def estimate_optimal_batch_size(
        self, sample_processing_time: float, sample_memory_usage: float
    ) -> int:
        """Estimate a batch size that respects both the memory and time budgets.

        Args:
            sample_processing_time: Seconds needed per item.
            sample_memory_usage: Memory growth per item in GB. Negative
                measurements (memory shrank while sampling) are treated as 0.

        Returns:
            A batch size between 1 and 128.
        """
        available_memory = self.memory_threshold_gb - self.get_current_memory_usage()

        # An RSS delta can be negative; clamp it so the denominator below
        # never reaches zero or flips sign. The +0.1 GB is a safety margin.
        per_sample_memory = max(sample_memory_usage, 0.0)
        memory_based_batch = max(1, int(available_memory / (per_sample_memory + 0.1)))

        # Calculate batch size based on time constraint (target: 1-5 seconds per batch)
        target_batch_time = 3.0  # seconds
        time_based_batch = max(
            1, int(target_batch_time / max(sample_processing_time, 0.01))
        )

        # Take the minimum to respect both constraints
        optimal_batch = min(memory_based_batch, time_based_batch, 128)  # Cap at 128

        return max(1, optimal_batch)

    def adaptive_map(
        self, dataset: Dataset, processing_func: callable, sample_size: int = 10
    ) -> Dataset:
        """Apply ``processing_func`` to ``dataset`` with adaptive batch sizing.

        Args:
            dataset: Dataset to process; must support ``len``, ``select`` and ``map``.
            processing_func: Batched map function.
            sample_size: Number of random rows used for the initial estimate
                (at least one row is always sampled).

        Returns:
            The concatenated results of all successful batches. When a batch
            fails, the batch size is halved and the same rows are retried; a
            row that still fails on its own is skipped, so the result can be
            shorter than the input. ``dataset`` itself is returned when it is
            empty or when no batch succeeded.
        """
        print("Starting adaptive batch processing...")

        total_samples = len(dataset)
        if total_samples == 0:
            # Nothing to sample from; the per-item averages below would divide by zero.
            print("Dataset is empty, nothing to process")
            return dataset

        # Sample small batch to estimate resource usage
        sample_count = max(1, min(sample_size, total_samples))
        sample_indices = np.random.choice(total_samples, sample_count, replace=False)
        sample_data = dataset.select(sample_indices.tolist())

        # Measure processing time and memory for sample
        initial_memory = self.get_current_memory_usage()
        start_time = time.time()

        sample_data.map(processing_func, batched=True, batch_size=1)

        sample_time = (time.time() - start_time) / len(sample_data)
        memory_per_sample = (self.get_current_memory_usage() - initial_memory) / len(
            sample_data
        )

        # Estimate optimal batch size
        optimal_batch_size = self.estimate_optimal_batch_size(
            sample_time, memory_per_sample
        )

        print(f"Estimated optimal batch size: {optimal_batch_size}")
        print(f"Sample processing time: {sample_time:.4f}s per item")
        print(f"Memory usage per sample: {memory_per_sample*1024:.2f}MB")

        # Process full dataset with adaptive batching
        processed_samples = 0
        current_batch_size = optimal_batch_size
        batch_results = []

        while processed_samples < total_samples:
            end_idx = min(processed_samples + current_batch_size, total_samples)
            batch_data = dataset.select(range(processed_samples, end_idx))

            batch_start_time = time.time()
            batch_result = None
            try:
                batch_result = batch_data.map(
                    processing_func, batched=True, batch_size=current_batch_size
                )
                batch_time = time.time() - batch_start_time
            except Exception as e:
                print(
                    f"❌ Batch processing failed with batch size {current_batch_size}: {e}"
                )
                if current_batch_size == 1:
                    # A single row failed on its own: give up on that row only.
                    print("❌ Cannot process even single samples, skipping batch")
                    processed_samples = end_idx
                else:
                    # Reduce batch size and retry the same rows
                    current_batch_size = max(1, current_batch_size // 2)

            # Measured on every iteration so the cleanup check below never
            # reads a missing or stale value after a failed batch.
            memory_after = self.get_current_memory_usage()

            if batch_result is not None:
                batch_results.append(batch_result)
                self.batch_size_history.append(current_batch_size)
                self.processing_times.append(batch_time)

                # Log batch statistics
                print(
                    f"Processed batch {len(batch_results)}: {len(batch_data)} samples in {batch_time:.2f}s, "
                    f"Memory: {memory_after:.2f}GB"
                )

                # Adjust batch size for next iteration
                if batch_time > 5.0:  # Too slow
                    current_batch_size = max(1, int(current_batch_size * 0.8))
                elif (
                    batch_time < 1.0 and memory_after < self.memory_threshold_gb * 0.8
                ):  # Can go faster
                    # Grow by at least one; int(size * 1.2) alone would leave
                    # sizes 1-4 stuck forever.
                    current_batch_size = min(
                        128, max(current_batch_size + 1, int(current_batch_size * 1.2))
                    )

                processed_samples = end_idx

            # Memory cleanup
            if memory_after > self.memory_threshold_gb * 0.9:
                gc.collect()
                print(
                    f"Memory cleanup triggered. Usage: {self.get_current_memory_usage():.2f}GB"
                )

        # Concatenate all batch results
        if batch_results:
            final_result = concatenate_datasets(batch_results)
            print(f"✅ Adaptive processing completed: {len(final_result)} samples")
            return final_result

        print("❌ No batches processed successfully")
        return dataset


# Demo adaptive processing
# Demo settings are tighter than the constructor defaults (32 / 8.0 GB):
# a 4 GB memory ceiling and an initial batch size of 16.
adaptive_processor = AdaptiveBatchProcessor(
    initial_batch_size=16, memory_threshold_gb=4.0
)


def memory_intensive_processing(examples):
    """Batched demo function that allocates a throwaway array per text.

    For each ``cleaned_text`` entry a fresh 1000x100 random float array is
    drawn (about 800 KB at 8 bytes per value) and its sum plus the text
    length is reported, so both the allocation and the input are used.
    """
    return {
        "memory_intensive_result": [
            np.sum(np.random.randn(1000, 100)) + len(text)
            for text in examples["cleaned_text"]
        ]
    }


# Run the adaptive demo on the first 50 rows only. Datasets with 50 rows or
# fewer skip the demo; nothing beyond the line below is printed for them.
print("Testing adaptive batch processing...")
if len(processed_text_dataset) > 50:
    subset = processed_text_dataset.select(range(50))
    adaptive_result = adaptive_processor.adaptive_map(
        subset, memory_intensive_processing
    )
    print(f"Adaptive processing result: {len(adaptive_result)} samples")

In [None]:
# === Cell 9: Comprehensive Performance Testing & Optimization Recommendations ===
# Section banner so this cell's output is easy to find in the run log.
print("\n=== Performance Testing & Optimization ===")


class PerformanceBenchmark:
    """Time different ``Dataset.map`` strategies and turn the numbers into advice."""

    def __init__(self):
        # Results of the most recent benchmark_processing_methods call.
        self.results = {}

    @staticmethod
    def _rss_mb() -> float:
        """Resident set size of the current process in MB."""
        return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024

    def benchmark_processing_methods(
        self,
        dataset: Dataset,
        processing_func: callable,
        methods: Optional[List[str]] = None,
    ) -> Dict[str, Dict]:
        """Benchmark ``processing_func`` under several map strategies.

        Only the first 100 rows (or fewer) of ``dataset`` are used.

        Args:
            dataset: Dataset to sample from; must support ``len``, ``select`` and ``map``.
            processing_func: Function handed to ``map``. It must cope with the
                input shape of every requested method (single example for
                ``"sequential"``, batch otherwise).
            methods: Any of ``"sequential"``, ``"batched"``, ``"parallel"``.
                Defaults to all three.

        Returns:
            ``{method: stats}`` where stats holds ``processing_time``,
            ``samples_per_second``, ``memory_used_mb`` and ``success``; failed
            or unknown methods additionally carry ``error``.
        """
        if methods is None:
            methods = ["sequential", "batched", "parallel"]

        # map() keyword arguments per strategy
        map_kwargs = {
            "sequential": {"batched": False},
            "batched": {"batched": True, "batch_size": 16},
            "parallel": {"batched": True, "batch_size": 16, "num_proc": 2},
        }

        results = {}
        sample_size = min(100, len(dataset))
        test_dataset = dataset.select(range(sample_size))

        for method in methods:
            print(f"Benchmarking {method} processing...")

            memory_before = self._rss_mb()
            start_time = time.time()

            try:
                if method not in map_kwargs:
                    # Without this an unknown name would run nothing and
                    # still be reported as the fastest "successful" method.
                    raise ValueError(f"Unknown processing method: {method!r}")

                test_dataset.map(processing_func, **map_kwargs[method])

                processing_time = time.time() - start_time
                memory_used = self._rss_mb() - memory_before
                # A run faster than the clock resolution must not turn a
                # successful benchmark into a division error.
                samples_per_second = (
                    sample_size / processing_time
                    if processing_time > 0
                    else float("inf")
                )

                results[method] = {
                    "processing_time": processing_time,
                    "samples_per_second": samples_per_second,
                    "memory_used_mb": memory_used,
                    "success": True,
                }

                print(
                    f"  ✅ {method}: {processing_time:.2f}s, {samples_per_second:.1f} samples/s"
                )

            except Exception as e:
                results[method] = {
                    "processing_time": float("inf"),
                    "samples_per_second": 0,
                    "memory_used_mb": 0,
                    "success": False,
                    "error": str(e),
                }
                print(f"  ❌ {method}: Failed - {e}")

        self.results = results
        return results

    def generate_optimization_recommendations(
        self, benchmark_results: Dict[str, Dict], dataset_info: Dict[str, Any]
    ) -> List[str]:
        """Generate optimization recommendations based on benchmark results.

        Args:
            benchmark_results: Output of ``benchmark_processing_methods``;
                entries without a truthy ``success`` are ignored.
            dataset_info: Optional ``total_samples`` and ``memory_usage_mb``
                figures for dataset-level advice.

        Returns:
            Human-readable recommendation strings (possibly empty).
        """
        recommendations = []

        # Find best performing method
        successful_methods = {
            k: v for k, v in benchmark_results.items() if v.get("success", False)
        }

        if successful_methods:
            best_method = min(
                successful_methods,
                key=lambda name: successful_methods[name]["processing_time"],
            )
            recommendations.append(f"🏆 Best performing method: {best_method}")

            # Memory optimization
            peak_memory_mb = max(
                v["memory_used_mb"] for v in successful_methods.values()
            )
            if peak_memory_mb > 1000:  # > 1GB
                recommendations.append(
                    "🔧 Consider reducing batch size to lower memory usage"
                )

            # Speed optimization
            best_speed = max(
                v["samples_per_second"] for v in successful_methods.values()
            )
            if best_speed < 10:  # Less than 10 samples/second
                recommendations.append(
                    "⚡ Consider GPU acceleration or simpler preprocessing"
                )

            # Parallel processing only pays off when clearly faster than
            # plain batching (1.5x is the bar used here).
            if "parallel" in successful_methods and "batched" in successful_methods:
                parallel_speed = successful_methods["parallel"]["samples_per_second"]
                batched_speed = successful_methods["batched"]["samples_per_second"]

                if parallel_speed > batched_speed * 1.5:
                    recommendations.append(
                        "🚀 Parallel processing shows significant improvement"
                    )
                else:
                    recommendations.append(
                        "⚠️ Parallel processing overhead may not be worth it"
                    )

        # Dataset-specific recommendations
        if dataset_info.get("total_samples", 0) > 100000:
            recommendations.append(
                "📊 Large dataset: Consider streaming mode and chunked processing"
            )

        if dataset_info.get("memory_usage_mb", 0) > 5000:
            recommendations.append(
                "💾 High memory usage: Consider data compression or feature selection"
            )

        return recommendations


# Run comprehensive benchmarks
# One benchmark instance, reused for timing and for recommendations below.
benchmark = PerformanceBenchmark()


# Simple processing function for benchmarking
def benchmark_processing_func(examples):
    """Count whitespace-separated words in ``cleaned_text``.

    Handles both input shapes the benchmark produces: a single example, where
    ``cleaned_text`` is one string, and a batch, where it is a sequence of
    strings. The shape is decided by the field's type, because a single
    example is also a mapping that contains the ``cleaned_text`` key.

    Returns:
        ``{"word_count": int}`` for a single example,
        ``{"word_count": [int, ...]}`` for a batch.
    """
    text_field = examples["cleaned_text"]
    if isinstance(text_field, str):
        # Single sample mode
        return {"word_count": len(text_field.split())}
    # Batched mode
    return {"word_count": [len(text.split()) for text in text_field]}


print("Running comprehensive performance benchmarks...")

# Text benchmark: time all three map strategies, then derive advice from the
# timings plus the dataset's own quality/size analysis. Skipped when empty.
if len(processed_text_dataset) > 0:
    text_benchmark_results = benchmark.benchmark_processing_methods(
        processed_text_dataset,
        benchmark_processing_func,
        methods=["sequential", "batched", "parallel"],
    )
    text_dataset_info = quality_controller.analyze_dataset_quality(
        processed_text_dataset
    )
    text_recommendations = benchmark.generate_optimization_recommendations(
        text_benchmark_results, text_dataset_info
    )

    print("\n📈 Text Processing Benchmark Results:")
    for method, results in text_benchmark_results.items():
        # Failed methods were already reported while benchmarking
        if not results["success"]:
            continue
        print(
            f"  {method}: {results['processing_time']:.2f}s, {results['samples_per_second']:.1f} samples/s"
        )

    print("\n💡 Text Processing Recommendations:")
    for rec in text_recommendations:
        print(f"  {rec}")


In [None]:
# === Final Smoke Test & Validation ===
# 70-character rule above and below the title to set the smoke test output apart.
print("\n" + "=" * 70)
print("🧪 COMPREHENSIVE SMOKE TEST")
print("=" * 70)


def run_comprehensive_smoke_test():
    """Exercise each advanced processing component once and report pass/fail.

    Every check runs in isolation: any exception (including a failed
    assertion) is printed next to the check's label and counted as a failure,
    and the remaining checks still run.

    Returns:
        True when every check passed, False otherwise.
    """

    def check_text():
        cleaned = AdvancedTextProcessor().advanced_clean(
            "This is a test sentence with répéated characters!!!"
        )
        assert "quality_score" in cleaned
        assert "detected_language" in cleaned

    def check_image():
        image_proc = AdvancedImageProcessor()
        random_image = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8)
        processed = image_proc.process_image_advanced(random_image)
        assert "quality_score" in processed
        assert "quality_metrics" in processed

    def check_audio():
        audio_proc = AdvancedAudioProcessor()
        # Half a second of noise at 16 kHz
        noise = {"array": np.random.randn(8000), "sampling_rate": 16000}
        processed = audio_proc.process_audio_advanced(noise)
        assert "quality_score" in processed
        assert "audio_features" in processed

    def check_quality_control():
        controller = DataQualityController()
        tiny_dataset = Dataset.from_dict(
            {"text": ["test1", "test2"], "quality_score": [0.8, 0.6]}
        )
        analysis = controller.analyze_dataset_quality(
            tiny_dataset, "text", "quality_score"
        )
        assert "total_samples" in analysis
        assert "quality_stats" in analysis

    def check_gpu():
        # Runs on CPU as well; the processor picks its own device
        gpu_proc = GPUAcceleratedProcessor()
        arrays = [np.random.randn(10, 10) for _ in range(3)]
        assert len(gpu_proc.gpu_batch_normalize(arrays)) == len(arrays)

    def check_adaptive():
        memory_usage = AdaptiveBatchProcessor().get_current_memory_usage()
        assert isinstance(memory_usage, float)
        assert memory_usage > 0

    def check_benchmark():
        recommendations = PerformanceBenchmark().generate_optimization_recommendations(
            {
                "method1": {
                    "success": True,
                    "processing_time": 1.0,
                    "samples_per_second": 10,
                    "memory_used_mb": 100,
                }
            },
            {"total_samples": 1000, "memory_usage_mb": 100},
        )
        assert isinstance(recommendations, list)

    # (label shown in the report, check to run) in execution order
    checks = [
        ("Advanced text processing", check_text),
        ("Advanced image processing", check_image),
        ("Advanced audio processing", check_audio),
        ("Data quality control", check_quality_control),
        ("GPU acceleration", check_gpu),
        ("Adaptive batch processing", check_adaptive),
        ("Performance benchmarking", check_benchmark),
    ]

    tests_passed = 0
    for label, check in checks:
        try:
            check()
        except Exception as e:
            print(f"❌ {label}: {e}")
        else:
            tests_passed += 1
            print(f"✅ {label}")

    total_tests = len(checks)
    print(f"\n🎯 Advanced Processing Tests: {tests_passed}/{total_tests} passed")
    return tests_passed == total_tests


# Run the smoke test first; its verdict is reported after the summary.
all_advanced_tests_passed = run_comprehensive_smoke_test()

# === Summary & Advanced Insights ===
# The summary is static text, so it is kept as data and printed in one pass.
# Entries that start with "\n" open a new section.
summary_lines = [
    "\n" + "=" * 70,
    "📋 ADVANCED DATASET PROCESSING SUMMARY",
    "=" * 70,
    "\n✅ 完成項目 (Completed Advanced Items):",
    "• 超大型資料集處理：TB級資料的串流處理與分片管理",
    "• 進階多模態前處理：複雜文本清理、圖像品質檢測、音頻降噪",
    "• 智能品質控制：異常檢測、重複去除、自動化品質評分",
    "• GPU加速處理：批次正規化、圖像變換的GPU優化",
    "• 分散式處理架構：多程序並行、動態記憶體管理",
    "• 自適應批次處理：根據記憶體與時間約束動態調整batch size",
    "• 全面性能測試：基準測試與優化建議自動生成",
    "\n🧠 進階核心概念 (Advanced Core Concepts):",
    "• 記憶體映射 vs 串流處理：何時使用哪種策略處理超大資料集",
    "• 品質感知處理：整合品質評分到處理管線中的重要性",
    "• GPU記憶體管理：避免OOM的tensor生命週期管理",
    "• 工作負載分散：CPU密集 vs I/O密集任務的不同並行策略",
    "• 自適應系統設計：根據運行時資源動態調整處理參數",
    "• 效能剖析驅動優化：從基準測試數據得出可行的優化策略",
    "\n⚠️ 進階陷阱 (Advanced Pitfalls):",
    "• GPU記憶體碎片：頻繁的tensor創建/釋放導致記憶體碎片",
    "• 過度並行化：worker數量超過CPU核心數反而降低效能",
    "• 批次大小調優：忽略記憶體與計算的非線性關係",
    "• 品質閾值設定：過嚴格的品質要求可能移除有用資料",
    "• 串流處理狀態：無狀態處理vs有狀態聚合的設計選擇",
    "• 異步處理錯誤：分散式處理中的錯誤處理與復原機制",
    "\n🚀 生產環境建議 (Production Recommendations):",
    "• 資料管線監控：實施處理速度、記憶體使用、錯誤率的即時監控",
    "• 彈性擴展設計：支援從單機到叢集的無縫擴展",
    "• 資料版本控制：追蹤處理參數變更對下游模型效能的影響",
    "• 快取策略優化：多層快取(記憶體/SSD/網路)的智能管理",
    "• 容錯處理機制：graceful degradation與自動重試策略",
    "• 效能基線建立：定期基準測試以檢測效能退化",
    "\n📊 處理規模指引 (Scale Guidelines):",
    "• < 1GB: 直接記憶體處理，標準batch size",
    "• 1-10GB: 使用記憶體映射，增加batch size，開啟多程序",
    "• 10-100GB: 串流處理 + 分片，GPU加速，分散式處理",
    "• > 100GB: 叢集處理，資料分區，增量處理策略",
    "\n💡 下一步進階主題 (Next Advanced Topics):",
    "• 聯邦學習資料處理：分散式資料的隱私保護處理",
    "• 即時資料流處理：Apache Kafka + Spark Streaming整合",
    "• 多模態融合前處理：跨模態特徵對齊與同步",
    "• 自動化資料清理：ML驅動的資料品質提升",
    "• 邊緣計算優化：移動裝置上的輕量化處理管線",
]
for summary_line in summary_lines:
    print(summary_line)

# Final memory cleanup, then report whatever get_memory_usage() returns
gc.collect()
final_memory = get_memory_usage()
print(f"\n💾 Final Memory Usage: {final_memory}")

print(
    f"\n🎯 Overall Advanced Processing Status: {'SUCCESS' if all_advanced_tests_passed else 'NEEDS_REVIEW'}"
)

# Collect per-modality counters for the session report below (nothing is
# written to disk). A processor that was never created in this session
# contributes an empty dict.
processing_stats = {}
for modality in ("text", "image", "audio"):
    processor_name = f"{modality}_processor"
    if processor_name in locals():
        processing_stats[f"{modality}_processing"] = dict(
            locals()[processor_name].stats
        )
    else:
        processing_stats[f"{modality}_processing"] = {}
processing_stats["memory_usage_gb"] = final_memory
processing_stats["tests_passed"] = all_advanced_tests_passed

print("\n📈 Session Processing Statistics:")
for category, stats in processing_stats.items():
    # Only non-empty dict entries are listed; scalars were reported above
    if not (stats and isinstance(stats, dict)):
        continue
    print(f"  {category}: {stats}")

print("\n" + "=" * 70)
print("🎓 ADVANCED DATASET PROCESSING COMPLETE")
print("=" * 70)

In [None]:
# === Final Advanced Smoke Test (5-line validation) ===
from datasets import Dataset
import numpy as np

# Minimal end-to-end check: building a dataset still works ...
test_ds = Dataset.from_dict(
    dict(text=["advanced test"], data=[np.random.randn(10)])
)
# ... and the text pipeline returns a float quality score.
processor = AdvancedTextProcessor()
result = processor.advanced_clean("test")
assert "quality_score" in result
assert isinstance(result["quality_score"], float)
print("🎯 Advanced Notebook 03b 驗收通過！進階大型資料集處理管線運作正常")


## 6. 延伸章節小結

### ✅ 進階完成項目 (Advanced Completed Items)
- **TB級資料處理能力**：串流處理、分片管理、分散式架構
- **智能品質控制系統**：多維度品質評分、異常檢測、重複去除
- **GPU加速處理管線**：CUDA優化的批次處理、記憶體管理
- **自適應系統設計**：動態batch sizing、記憶體感知處理
- **全面效能分析框架**：基準測試、瓶頸識別、優化建議生成

### 🧠 進階核心原理 (Advanced Core Principles)
- **可擴展性設計模式**：從單機到叢集的無縫擴展策略
- **品質感知處理**：將資料品質作為處理決策的核心參數
- **資源感知調度**：根據系統資源動態調整處理策略
- **多模態整合處理**：統一的品質控制與異常檢測框架
- **生產級錯誤處理**：graceful degradation 與自動恢復機制

### 📈 實戰應用場景 (Real-world Applications)
1. **大型語料庫清理**：TB級文本資料的品質控制與去重
2. **多媒體內容管理**：圖片/音頻的品質評估與自動分類
3. **即時資料流處理**：高吞吐量的串流資料前處理
4. **分散式訓練資料準備**：多節點環境下的資料分發與同步
5. **邊緣計算優化**：資源受限環境下的高效處理策略

### 🚀 下一步整合建議 (Next Integration Steps)
1. **立即應用**：將進階處理技巧整合到後續的 fine-tuning 和 RAG 系統中
2. **擴展方向**：探索 Apache Spark / Dask 等大資料處理框架的整合
3. **監控完善**：建立資料處理的 MLOps 監控與告警系統
4. **效能優化**：針對特定硬體環境進行客製化優化

**何時使用這些進階技巧**：當資料量超過單機記憶體限制、需要保證資料品質、要求高吞吐量處理、或部署到生產環境時。這些技巧特別適用於構建robust的資料處理管線，為後續的模型訓練和推理提供高品質的資料基礎。