# ARC Dataset Analysis

This notebook analyzes the ARC training set (grid sizes, color usage, and simple input-to-output transformation patterns) to identify regularities that can guide the design of a solver.

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import seaborn as sns

# Load ARC data
def load_arc_data(data_dir='../arc-prize-2025'):
    """Load the ARC training challenges and their solutions from JSON files.

    Args:
        data_dir: Directory containing ``arc-agi_training_challenges.json``
            and ``arc-agi_training_solutions.json``. Defaults to
            ``'../arc-prize-2025'``, the path that used to be hard-coded here.

    Returns:
        Tuple ``(train_challenges, train_solutions)`` holding the parsed
        contents of the two files, in that order.

    Raises:
        FileNotFoundError: If either file is missing from ``data_dir``.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    def _read_json(filename):
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default text encoding, which differs between operating systems.
        with open(f"{data_dir}/{filename}", 'r', encoding='utf-8') as f:
            return json.load(f)

    train_challenges = _read_json('arc-agi_training_challenges.json')
    train_solutions = _read_json('arc-agi_training_solutions.json')
    return train_challenges, train_solutions

# Load once at notebook scope; the analysis cells below read these two objects.
train_challenges, train_solutions = load_arc_data()
print(f"Loaded {len(train_challenges)} training tasks")

In [None]:
# Analyze grid sizes
def analyze_grid_sizes(challenges, top_n=10):
    """Collect the dimensions of every input grid and print the most common.

    Only ``'input'`` grids are measured; output grids are not inspected.

    Args:
        challenges: Mapping of task id to a task dict with ``'train'`` and
            ``'test'`` lists. Each entry carries an ``'input'`` grid given
            as a list of rows.
        top_n: How many of the most frequent sizes to print. Defaults to 10.

    Returns:
        List of ``(rows, cols)`` tuples, one per input grid, in iteration
        order (each task's train examples followed by its test examples).
    """
    sizes = []
    for task in challenges.values():
        for example in task['train'] + task['test']:
            input_grid = example['input']
            # An empty grid has no first row to measure; record it as 0x0
            # rather than raising IndexError.
            n_cols = len(input_grid[0]) if input_grid else 0
            sizes.append((len(input_grid), n_cols))

    size_counter = Counter(sizes)
    print("Most common grid sizes:")
    for (n_rows, n_cols), count in size_counter.most_common(top_n):
        print(f"  {n_rows}x{n_cols}: {count} grids")

    return sizes

# Keep the per-grid size list; the plotting cell aggregates it again.
sizes = analyze_grid_sizes(train_challenges)

In [None]:
# Analyze color usage
def analyze_colors(challenges):
    """Count how many cells of each color appear across all input grids.

    Only ``'input'`` grids are counted; output grids are not inspected.

    Args:
        challenges: Mapping of task id to a task dict with ``'train'`` and
            ``'test'`` lists. Each entry carries an ``'input'`` grid given
            as a list of rows.

    Returns:
        ``collections.Counter`` mapping each color value to the total number
        of cells holding it. Keys and counts are built-in Python numbers,
        not NumPy scalars.
    """
    color_usage = Counter()

    for task in challenges.values():
        for example in task['train'] + task['test']:
            grid = np.array(example['input'])
            # A single pass yields both the distinct values and their counts,
            # instead of re-scanning the whole grid once per color.
            values, counts = np.unique(grid, return_counts=True)
            # .tolist() converts NumPy scalars to native Python numbers so
            # keys and totals print as plain values (e.g. in tuple reprs).
            for color, count in zip(values.tolist(), counts.tolist()):
                color_usage[color] += count

    print("Color usage frequency:")
    for color, count in color_usage.most_common():
        print(f"  Color {color}: {count} cells")

    return color_usage

# Keep the Counter; the plotting cell reads per-color totals from it.
color_usage = analyze_colors(train_challenges)

In [None]:
# Analyze transformation patterns
def analyze_transformations(challenges, solutions):
    """Tally simple input->output relationships over every training example.

    Each training pair is first tested for identity; identical pairs are
    counted once under ``'identity'`` and skipped. Every other pair is
    checked against all remaining categories, which are not mutually
    exclusive, so one example may increment several counters.

    Args:
        challenges: Mapping of task id to a task dict whose ``'train'`` list
            holds examples with ``'input'`` and ``'output'`` grids.
        solutions: Container of task ids. Tasks whose id is not in it are
            skipped; its values are not read.

    Returns:
        Dict mapping pattern name to the number of examples matching it:

        * ``identity``: output equals input.
        * ``size_change``: grid shapes differ.
        * ``color_change``: the sets of values present differ.
        * ``shape_change``: same-sized grids whose non-zero cell positions
          differ (differently sized grids are covered by ``size_change``).
        * ``rotation``: output is the input rotated by 90, 180 or 270
          degrees.
        * ``reflection``: output is the input flipped left-right or up-down.
    """
    patterns = {
        'identity': 0,  # Input == Output
        'size_change': 0,  # Different grid sizes
        'color_change': 0,  # Different color sets
        'shape_change': 0,  # Different non-zero patterns
        'rotation': 0,  # Rotational symmetry
        'reflection': 0,  # Reflection symmetry
    }

    for task_id, task in challenges.items():
        if task_id not in solutions:
            continue

        for example in task['train']:
            input_grid = np.array(example['input'])
            output_grid = np.array(example['output'])

            # Identity is exclusive: nothing else is worth checking.
            if np.array_equal(input_grid, output_grid):
                patterns['identity'] += 1
                continue

            same_size = input_grid.shape == output_grid.shape
            if not same_size:
                patterns['size_change'] += 1

            input_colors = set(np.unique(input_grid))
            output_colors = set(np.unique(output_grid))
            if input_colors != output_colors:
                patterns['color_change'] += 1

            # Compare which cells are occupied (non-zero), ignoring the
            # actual values stored in them.
            if same_size and not np.array_equal(input_grid != 0, output_grid != 0):
                patterns['shape_change'] += 1

            # rot90 / fliplr / flipud need a 2-D array.
            if input_grid.ndim == 2:
                # Deliberately not restricted to same-sized pairs: rotating a
                # non-square grid by 90 or 270 degrees swaps its dimensions,
                # and np.array_equal already returns False on shape mismatch.
                if any(np.array_equal(np.rot90(input_grid, k), output_grid)
                       for k in (1, 2, 3)):
                    patterns['rotation'] += 1

                if (np.array_equal(np.fliplr(input_grid), output_grid) or
                        np.array_equal(np.flipud(input_grid), output_grid)):
                    patterns['reflection'] += 1

    print("Transformation patterns:")
    for pattern, count in patterns.items():
        print(f"  {pattern}: {count} examples")

    return patterns

# Keep the tallies; the plotting cell charts them.
patterns = analyze_transformations(train_challenges, train_solutions)

In [None]:
# Visualize analysis results
# Four summary charts on a 2x2 grid: sizes, colors, transformations, task length.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Grid sizes distribution
size_counter = Counter(sizes)
common_sizes = dict(size_counter.most_common(10))
size_labels = [f"{s[0]}x{s[1]}" for s in common_sizes.keys()]
axes[0,0].bar(range(len(common_sizes)), list(common_sizes.values()))
axes[0,0].set_xticks(range(len(common_sizes)))
axes[0,0].set_xticklabels(size_labels, rotation=45)
axes[0,0].set_title('Most Common Grid Sizes')
axes[0,0].set_ylabel('Frequency')

# Color usage
colors = list(range(10))  # ARC uses colors 0-9
color_counts = [color_usage.get(c, 0) for c in colors]
axes[0,1].bar(colors, color_counts)
# Label every color index; the automatic locator may skip some of them.
axes[0,1].set_xticks(colors)
axes[0,1].set_title('Color Usage Frequency')
axes[0,1].set_xlabel('Color')
axes[0,1].set_ylabel('Usage Count')

# Transformation patterns
pattern_names = list(patterns.keys())
pattern_counts = list(patterns.values())
axes[1,0].bar(pattern_names, pattern_counts)
axes[1,0].set_title('Transformation Patterns')
axes[1,0].set_ylabel('Frequency')
axes[1,0].tick_params(axis='x', rotation=45)

# Task complexity (number of training examples)
train_counts = [len(task['train']) for task in train_challenges.values()]
# One bin per integer count: edges run from 1 through max + 1.
axes[1,1].hist(train_counts, bins=range(1, max(train_counts)+2), alpha=0.7)
axes[1,1].set_title('Number of Training Examples per Task')
axes[1,1].set_xlabel('Training Examples')
axes[1,1].set_ylabel('Number of Tasks')

plt.tight_layout()
plt.show()

print("\nAnalysis Summary:")
print(f"- Total training tasks: {len(train_challenges)}")
# Reuse the Counter built for the chart instead of recounting `sizes`.
print(f"- Most common grid size: {size_counter.most_common(1)[0]}")
# Convert to built-in ints so the tuple prints as plain numbers even when the
# Counter holds NumPy scalars (whose repr is e.g. `np.int64(0)` on NumPy 2).
print(f"- Most used color: {tuple(int(v) for v in color_usage.most_common(1)[0])}")
print(f"- Most common transformation: {max(patterns.items(), key=lambda x: x[1])}")