# Dataset Preview & Verification

This notebook provides tools to visualize and verify the mathematical art dataset. It helps ensure data quality and provides an interactive way to explore the dataset entries.

## Features:
- Load and display sample images with their associated formulas
- Check data quality and consistency
- Interactive exploration of dataset metadata
- Verification of image-formula pairs

In [None]:
# Import Required Libraries
import json
import os
import random
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load Dataset Metadata
def load_metadata(metadata_path="metadata.jsonl"):
    """Load dataset metadata from a JSON Lines file.

    Args:
        metadata_path: Path to a JSONL file holding one JSON value per line.

    Returns:
        A list with one parsed value per non-blank line, in file order.
        The list is empty when ``metadata_path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    metadata = []
    if not os.path.exists(metadata_path):
        return metadata

    # Decode explicitly as UTF-8 instead of the platform default so formulas
    # with non-ASCII symbols load the same everywhere; "utf-8-sig" also
    # tolerates a leading byte-order mark.
    with open(metadata_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            # Blank lines carry no record and would make json.loads() raise.
            if line:
                metadata.append(json.loads(line))
    return metadata

# Read the metadata file from its default location
metadata = load_metadata()
print(f"Loaded {len(metadata)} entries from metadata.jsonl")

# A DataFrame view of the same entries is handier for tabular inspection
df = pd.DataFrame(metadata)
if df.empty:
    print("No metadata found. Dataset appears to be empty.")
else:
    print("\nDataset Overview:")
    print(df.head())

In [None]:
# Dataset Quality Checks
def check_dataset_quality(metadata, images_dir="images/"):
    """Print a quality report for the dataset metadata.

    The report covers how many referenced image files exist, which entries
    lack a required field (``image_path``, ``formula``, ``description``), the
    distinct ``resolution`` values, and the five most frequent tags.

    Fields that are present but ``null`` in the JSON (``image_path``,
    ``resolution``, ``tags``) are treated like absent fields instead of
    raising, and a ``tags`` value that is a single string counts as one tag.

    Args:
        metadata: List of metadata entries (dicts).
        images_dir: Not consulted; every ``image_path`` is checked exactly as
            written. Kept so existing calls that pass it continue to work.

    Returns:
        None. The report is written to standard output.
    """
    from collections import Counter

    print("=== Dataset Quality Report ===\n")

    if not metadata:
        print("❌ No metadata entries found!")
        return

    # Check for missing images
    missing_images = []
    existing_images = []

    for entry in metadata:
        # `or ''` also covers an explicit null, which os.path.exists() rejects
        image_path = entry.get('image_path') or ''
        if os.path.exists(image_path):
            existing_images.append(image_path)
        else:
            missing_images.append(image_path)

    print(f"📊 Total entries: {len(metadata)}")
    print(f"✅ Images found: {len(existing_images)}")
    print(f"❌ Missing images: {len(missing_images)}")

    if missing_images:
        print("\nMissing image files:")
        for img in missing_images[:5]:  # Show first 5
            print(f"  - {img}")
        if len(missing_images) > 5:
            print(f"  ... and {len(missing_images) - 5} more")

    # Check for required fields (absent, null and empty all count as missing)
    required_fields = ['image_path', 'formula', 'description']
    incomplete_entries = []

    for i, entry in enumerate(metadata):
        missing_fields = [field for field in required_fields if not entry.get(field)]
        if missing_fields:
            incomplete_entries.append((i, missing_fields))

    print(f"\n📝 Complete entries: {len(metadata) - len(incomplete_entries)}")
    print(f"⚠️  Incomplete entries: {len(incomplete_entries)}")

    if incomplete_entries:
        print("\nIncomplete entries (missing fields):")
        for idx, fields in incomplete_entries[:3]:  # Show first 3
            print(f"  Entry {idx}: missing {fields}")

    # Check resolution consistency
    resolutions = []
    for entry in metadata:
        resolution = entry.get('resolution')
        # Absent or null resolutions are reported as (0, 0)
        resolutions.append(tuple([0, 0] if resolution is None else resolution))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # report is the same on every run (a set has no defined order).
    unique_resolutions = list(dict.fromkeys(resolutions))
    print(f"\n🖼️  Unique resolutions: {unique_resolutions}")

    # Check for common tags
    tag_counts = Counter()
    for entry in metadata:
        tags = entry.get('tags') or []
        if isinstance(tags, str):
            # A bare string is one tag, not a sequence of characters
            tags = [tags]
        tag_counts.update(tags)

    if tag_counts:
        print("\n🏷️  Top 5 tags:")
        for tag, count in tag_counts.most_common(5):
            print(f"  {tag}: {count}")

# Run the quality checks on the loaded metadata. The report is printed to
# stdout; images_dir is left at its default.
check_dataset_quality(metadata)

In [None]:
# Visualize Sample Entries
def display_sample_entries(metadata, num_samples=4, random_seed=42):
    """Display sample entries with images and metadata.

    Entries whose ``image_path`` exists on disk are sampled reproducibly,
    drawn in a two-column grid (one subplot per sample), and then their
    remaining metadata is printed.

    Args:
        metadata: List of metadata entries (dicts).
        num_samples: Maximum number of entries to show. The grid grows past
            the default 2x2 layout when more than four samples are drawn.
        random_seed: Seed for the sampling. A private generator is used, so
            the module-level ``random`` state is left untouched.

    Returns:
        None. Output is a matplotlib figure plus printed text.
    """
    if not metadata:
        print("No metadata available to display")
        return

    # Filter entries that have existing images; `or ''` makes an explicit
    # null path count as "no image" instead of crashing os.path.exists().
    valid_entries = [entry for entry in metadata
                     if os.path.exists(entry.get('image_path') or '')]

    if not valid_entries:
        print("No valid entries with existing images found")
        return

    # random.Random(seed).sample() picks the same entries as seeding the
    # global generator would, without reseeding it for the whole session.
    rng = random.Random(random_seed)
    sample_entries = rng.sample(valid_entries, min(num_samples, len(valid_entries)))

    # Keep the 2x2 layout for up to four samples and add rows beyond that,
    # so no sampled entry is silently left out of the figure.
    n_cols = 2
    n_rows = max(2, (len(sample_entries) + n_cols - 1) // n_cols)
    _fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows))
    axes = axes.flatten()

    for ax, entry in zip(axes, sample_entries):
        try:
            # Load and display image; the context manager releases the file
            # handle once matplotlib has taken the pixel data.
            with Image.open(entry['image_path']) as img:
                ax.imshow(img)
            ax.axis('off')

            # Create title with formula and description
            title = f"Formula: {entry.get('formula', 'N/A')}\n"
            title += f"Desc: {entry.get('description', 'N/A')}"
            ax.set_title(title, fontsize=10, wrap=True)

        except Exception as e:
            # Deliberately broad: one unreadable image should be reported in
            # its own subplot rather than abort the whole preview.
            ax.text(0.5, 0.5, f"Error loading\n{entry.get('image_path', 'Unknown')}\n{str(e)}",
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_title(f"Error: {entry.get('formula', 'N/A')}")

    # Hide unused subplots
    for ax in axes[len(sample_entries):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

    # Display detailed metadata
    print("\n=== Detailed Metadata ===")
    for i, entry in enumerate(sample_entries):
        print(f"\nEntry {i+1}:")
        for key, value in entry.items():
            if key != 'image_path':  # Don't repeat the path
                print(f"  {key}: {value}")

# Display samples using the default sample count and seed. When none of the
# entries point to an existing image file, only a notice is printed and no
# figure is drawn.
print("Sample visualization (will show actual images when they exist):")
display_sample_entries(metadata)

In [None]:
# Interactive Dataset Exploration
def explore_by_tag(metadata, tag=None):
    """List the available tags, or the entries carrying a given tag.

    Args:
        metadata: List of metadata entries (dicts). Nothing is printed when
            it is empty.
        tag: Tag to filter by, compared case-insensitively. When ``None``,
            every tag is printed with its count, most frequent first.

    Returns:
        None. Results are written to standard output.
    """
    from collections import Counter

    if not metadata:
        return

    def entry_tags(entry):
        # Absent or null tags mean "no tags"; a bare string is a single tag
        # rather than a sequence of characters.
        tags = entry.get('tags') or []
        return [tags] if isinstance(tags, str) else tags

    if tag is None:
        # Show all available tags
        tag_counts = Counter()
        for entry in metadata:
            tag_counts.update(entry_tags(entry))

        if tag_counts:
            print("Available tags:")
            for t, count in tag_counts.most_common():
                print(f"  {t}: {count} images")
        else:
            print("No tags found in metadata")
        return

    # Filter by tag; lower-case the query once instead of once per entry
    wanted = tag.lower()
    filtered_entries = [entry for entry in metadata
                        if any(str(t).lower() == wanted for t in entry_tags(entry))]

    print(f"Found {len(filtered_entries)} entries with tag '{tag}':")
    for i, entry in enumerate(filtered_entries):
        print(f"{i+1}. {entry.get('description', 'No description')} - {entry.get('formula', 'No formula')}")

def search_by_formula_keyword(metadata, keyword):
    """Print the entries whose formula contains ``keyword``.

    The match is a case-insensitive substring test. Entries whose formula is
    absent or null are treated as having an empty formula.

    Args:
        metadata: List of metadata entries (dicts). Nothing is printed when
            it is empty.
        keyword: Text to look for inside each entry's ``formula``.

    Returns:
        None. Results are written to standard output.
    """
    if not metadata:
        return

    # Lower-case the keyword once instead of once per entry
    needle = keyword.lower()
    matches = [entry for entry in metadata
               if needle in str(entry.get('formula') or '').lower()]

    print(f"Found {len(matches)} entries containing '{keyword}' in formula:")
    for i, entry in enumerate(matches):
        print(f"{i+1}. {entry.get('formula', 'No formula')} - {entry.get('description', 'No description')}")

# Example explorations: each numbered heading is printed first, followed by
# the output of the matching helper call on the loaded metadata.
print("=== Interactive Dataset Exploration ===\n")

# With no tag argument, explore_by_tag lists every tag with its count
print("1. All available tags:")
explore_by_tag(metadata)

# Tag filtering is case-insensitive
print("\n2. Entries with 'spiral' tag:")
explore_by_tag(metadata, 'spiral')

# Formula search is a case-insensitive substring match
print("\n3. Formulas containing 'sin':")
search_by_formula_keyword(metadata, 'sin')

print("\n4. Formulas containing 'theta':")
search_by_formula_keyword(metadata, 'theta')