## Overview
This notebook provides an exploratory data analysis of the CZII Cryo-ET Particle Detection Challenge dataset. It focuses on:
- Loading and visualizing tomogram data
- Analyzing particle distributions
- Visualizing different particle types:
  - apo-ferritin (easy)
  - beta-amylase (impossible, not scored)
  - beta-galactosidase (hard)
  - ribosome (easy)
  - thyroglobulin (hard)
  - virus-like-particle (easy)

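## Table of Contents
1. Loading Required Libraries
2. Reading Tomogram Data
3. Loading Particle Coordinates
4. Visualizing Particles in Tomogram Slices
5. Visualizing All Six Particle Types in Tomogram Slices (with difficulty levels)
6. Statistical Analysis of Particle Distribution (per-type counts and coordinate ranges printed at the end)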

## Loading Required Libraries

In [None]:
# First, let's import the necessary libraries
import zarr
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

## Reading Tomogram Data

In [None]:
# Define the path to the zarr file
zarr_path = Path('/home/naoya/kaggle/czii/input/czii-cryo-et-object-identification/train/static/ExperimentRuns/TS_5_4/VoxelSpacing10.000/denoised.zarr')

# Open the store read-only. zarr.open() defaults to mode='a' (read/write,
# create if missing), which would silently create an empty store when the path
# is mistyped and leaves the input data open to accidental writes. With
# mode='r' a wrong path fails loudly instead.
zarr_store = zarr.open(str(zarr_path), mode='r')

# Print basic information about the zarr store
print("Zarr store structure:")
print(zarr_store.tree())

In [None]:
# Read the array stored under key '0' of the store fully into memory
# (the notebook treats scale 0 as the highest-resolution level).
tomogram = zarr_store['0'][:]

# One-shot summary of the loaded volume: geometry, dtype and intensity statistics
print(
    f"Tomogram shape: {tomogram.shape}",
    f"Data type: {tomogram.dtype}",
    f"Min value: {tomogram.min()}",
    f"Max value: {tomogram.max()}",
    f"Mean value: {tomogram.mean()}",
    sep="\n",
)

In [None]:
# Reading and printing the structure of the apo-ferritin JSON
import json
from pathlib import Path

# Pick file with the apo-ferritin annotations of run TS_5_4
json_path = Path('/home/naoya/kaggle/czii/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns/TS_5_4/Picks/apo-ferritin.json')

# Parse the whole file into a Python dict
with json_path.open('r') as f:
    data = json.load(f)

# Show the top-level schema first ...
print("Keys in the JSON file:", data.keys())

# ... then pretty-print the first two pick records as a sample
print("\nFirst few points:")
print(json.dumps(data['points'][0:2], indent=2))

## Loading Particle Coordinates

In [None]:
import json
import numpy as np
from pathlib import Path

def load_apo_ferritin_coordinates(experiment_name='TS_5_4', base_path=None):
    """Load apo-ferritin pick coordinates for one experiment run.

    Parameters
    ----------
    experiment_name : str
        Name of the run directory below the ``ExperimentRuns`` folder.
    base_path : str or Path, optional
        ``ExperimentRuns`` directory containing ``<run>/Picks/apo-ferritin.json``.
        Defaults to the location this notebook originally hard-coded, so
        existing calls behave as before.

    Returns
    -------
    np.ndarray
        Float array of shape ``(n_points, 3)`` with columns ordered
        ``(z, y, x)``, exactly as stored in each point's ``location`` entry.
        An empty ``(0, 3)`` array is returned when the file cannot be read,
        is malformed, or contains no points, so ``coords[:, i]`` indexing
        stays valid for callers.
    """
    if base_path is None:
        base_path = '/home/naoya/kaggle/czii/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns'
    json_path = Path(base_path) / experiment_name / 'Picks' / 'apo-ferritin.json'

    try:
        with open(json_path, 'r') as f:
            data = json.load(f)

        # Reorder each point's location from the JSON's named fields to (z, y, x);
        # reshape keeps the result 2-D even when the points list is empty.
        coords = np.array(
            [[point['location'][axis] for axis in ('z', 'y', 'x')]
             for point in data['points']],
            dtype=float,
        ).reshape(-1, 3)
    except (OSError, ValueError, KeyError, TypeError) as e:
        # Best effort: missing file, invalid JSON or unexpected schema are
        # reported and yield an empty result rather than aborting the notebook.
        print(f"Error reading coordinates: {e}")
        return np.empty((0, 3), dtype=float)

    print(f"Loaded {len(coords)} apo-ferritin coordinates")

    # min()/max() are undefined on an empty array, so only summarize real data
    if len(coords) > 0:
        print("\nCoordinate ranges:")
        for column, axis_label in enumerate('ZYX'):
            values = coords[:, column]
            print(f"{axis_label} range: {values.min():.1f} to {values.max():.1f}")

    return coords

# Load the coordinates
apo_ferritin_coords = load_apo_ferritin_coordinates()

In [None]:
# Put the volume's array shape next to the raw pick coordinate ranges so the
# two can be compared before any rescaling is applied.
print("Tomogram shape:", tomogram.shape)
print("\nCoordinate ranges before scaling:")
print("\n".join(
    f"{axis_label} range: {apo_ferritin_coords[:, column].min():.1f} to {apo_ferritin_coords[:, column].max():.1f}"
    for column, axis_label in enumerate("ZYX")
))

In [None]:
# Let's create a function to scale the coordinates
def scale_coordinates(coords, tomogram_shape, voxel_spacing=None):
    """Scale (z, y, x) coordinates to match tomogram dimensions.

    Parameters
    ----------
    coords : array-like, shape (n_points, 3) or empty
        Coordinates with columns ordered ``(z, y, x)``.
    tomogram_shape : sequence of int
        Shape of the tomogram array; the first three entries are used as the
        Z, Y and X extents.
    voxel_spacing : float, optional
        When given, coordinates are simply divided by this value and
        ``tomogram_shape`` is not used. When omitted (default), the original
        heuristic is kept: each axis is stretched so that the largest
        coordinate on that axis maps onto the tomogram extent.

    Returns
    -------
    np.ndarray
        New float array with the scaled coordinates (the input is never
        modified). Empty input yields an empty ``(0, 3)`` array.

    Notes
    -----
    The default heuristic depends on the data passed in: two coordinate sets
    scaled separately get different factors.
    NOTE(review): the store path used in this notebook contains
    ``VoxelSpacing10.000``; if the pick coordinates are physical units,
    ``voxel_spacing=10.0`` is presumably the correct mapping - confirm
    against the dataset documentation.
    """
    # Always work on a fresh float copy: avoids mutating the caller's array and
    # avoids integer truncation when the input has an integer dtype.
    scaled_coords = np.array(coords, dtype=float)

    # The loaders return an empty array when a file is missing; there is
    # nothing to scale and column indexing would fail on a 1-D empty array.
    if scaled_coords.size == 0:
        return scaled_coords.reshape(0, 3)

    if voxel_spacing is not None:
        scaled_coords[:, :3] /= float(voxel_spacing)
        return scaled_coords

    # Legacy heuristic: per-axis factor = tomogram extent / largest coordinate.
    axis_max = scaled_coords[:, :3].max(axis=0)
    extent = np.asarray(tomogram_shape[:3], dtype=float)

    # An axis whose maximum is not positive cannot be stretched meaningfully
    # (division by zero / sign flip), so it is left unscaled.
    scale = np.ones(3, dtype=float)
    scalable = axis_max > 0
    scale[scalable] = extent[scalable] / axis_max[scalable]

    scaled_coords[:, :3] *= scale
    return scaled_coords

# Map the apo-ferritin picks onto the tomogram's index grid
scaled_coords = scale_coordinates(apo_ferritin_coords, tomogram.shape)

# Report the per-axis extent of the rescaled picks
print("\nCoordinate ranges after scaling:")
print("\n".join(
    f"{axis_label} range: {scaled_coords[:, column].min():.1f} to {scaled_coords[:, column].max():.1f}"
    for column, axis_label in enumerate("ZYX")
))

## Visualizing Particles in Tomogram Slices

In [None]:
from mpl_toolkits.axes_grid1 import ImageGrid

# Updated visualization function
def visualize_apo_ferritin(tomogram, coords, n_slices=3, slice_thickness=10):
    """
    Show evenly spaced Z slices of a tomogram with nearby apo-ferritin picks.

    Parameters
    ----------
    tomogram : np.ndarray
        3-D volume indexed as ``[z, y, x]``.
    coords : np.ndarray
        Particle coordinates with columns ``(z, y, x)`` in the tomogram's
        index space.
    n_slices : int
        Number of slices, spread evenly from the first to the last Z index.
    slice_thickness : number
        A particle is drawn on a slice when its Z distance to that slice is
        strictly less than this value.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    figure = plt.figure(figsize=(20, 10))
    grid_options = dict(
        nrows_ncols=(1, n_slices),
        axes_pad=0.3,
        share_all=True,
        cbar_location="right",
        cbar_mode="single",
        cbar_size="5%",
        cbar_pad=0.1,
    )
    panels = ImageGrid(figure, 111, **grid_options)

    # Stretch intensities between the 1st and 99th percentile into [0, 1]
    low, high = np.percentile(tomogram, (1, 99))
    display_volume = np.clip((tomogram - low) / (high - low), 0, 1)

    # Integer Z indices from the first to the last plane
    slice_indices = np.linspace(0, tomogram.shape[0] - 1, n_slices, dtype=int)

    for panel, z in zip(panels, slice_indices):
        # Grey-scale image of this plane
        image = panel.imshow(display_volume[z, :, :], cmap='gray', vmin=0, vmax=1)

        # Picks whose Z lies inside the slab around this plane
        near_slice = np.abs(coords[:, 0] - z) < slice_thickness
        if near_slice.any():
            panel.scatter(
                coords[near_slice, 2], coords[near_slice, 1],
                color='red', marker='o', s=100,
                facecolors='none', linewidth=2,
                label='apo-ferritin'
            )

        panel.set_title(f'Slice Z={z}\n({np.sum(near_slice)} particles visible)')
        panel.grid(False)

        # Pin the view to the image extent; the reversed Y limits keep the
        # origin in the top-left corner like the image itself.
        panel.set_xlim(0, tomogram.shape[2])
        panel.set_ylim(tomogram.shape[1], 0)

    # One shared colorbar, driven by the last image drawn
    panels.cbar_axes[0].colorbar(image)

    headline = (
        'Apo-ferritin Particles in Tomogram Slices\n' +
        f'Showing particles within ±{slice_thickness} units of each slice'
    )
    plt.suptitle(headline, fontsize=16, y=1.05)

    # Legend is attached to the first panel only
    panels[0].legend(bbox_to_anchor=(1.5, 1.0))

    plt.show()

In [None]:
# Create the visualization with scaled coordinates.
# Uses the function defaults (3 slices, picks within 10 units of each slice);
# the figure is shown by the function itself, which returns nothing.
visualize_apo_ferritin(tomogram, scaled_coords)

## Visualizing All Six Particle Types in Tomogram Slices (with Difficulty Levels)

In [None]:
# Define particle types with their properties.
# NOTE: a later cell rebinds PARTICLE_TYPES with a high-contrast palette; the
# loader below only relies on the keys (the particle names), which are the
# same in both definitions.
PARTICLE_TYPES = {
    'apo-ferritin': {'color': 'red', 'marker': 'o', 'difficulty': 'easy'},
    'beta-amylase': {'color': 'gray', 'marker': 's', 'difficulty': 'impossible'},
    'beta-galactosidase': {'color': 'blue', 'marker': '^', 'difficulty': 'hard'},
    'ribosome': {'color': 'green', 'marker': 'D', 'difficulty': 'easy'},
    'thyroglobulin': {'color': 'purple', 'marker': 'p', 'difficulty': 'hard'},
    'virus-like-particle': {'color': 'orange', 'marker': '*', 'difficulty': 'easy'}
}

# Candidate locations of the ExperimentRuns pick folder. The first entry is the
# path this function originally hard-coded; the second is the local path used
# by the other cells of this notebook. The first one that exists is used.
_DEFAULT_PICKS_ROOTS = (
    '/kaggle/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns',
    '/home/naoya/kaggle/czii/input/czii-cryo-et-object-identification/train/overlay/ExperimentRuns',
)

def load_all_particle_coordinates(experiment_name='TS_5_4', base_path=None):
    """Load coordinates for all particle types.

    Parameters
    ----------
    experiment_name : str
        Name of the run directory below the ``ExperimentRuns`` folder.
    base_path : str or Path, optional
        ``ExperimentRuns`` directory containing ``<run>/Picks/<type>.json``.
        When omitted, the first existing entry of ``_DEFAULT_PICKS_ROOTS`` is
        used, falling back to the originally hard-coded path.

    Returns
    -------
    dict[str, np.ndarray]
        One entry per key of ``PARTICLE_TYPES``. Each value is a float array of
        shape ``(n_points, 3)`` with columns ``(z, y, x)``; types whose file is
        missing, malformed or empty map to an empty ``(0, 3)`` array so that
        ``len(coords) > 0`` checks and column indexing both work downstream.
    """
    if base_path is None:
        base_path = next(
            (root for root in _DEFAULT_PICKS_ROOTS if Path(root).is_dir()),
            _DEFAULT_PICKS_ROOTS[0],
        )
    picks_dir = Path(base_path) / experiment_name / 'Picks'
    particle_coords = {}

    for particle_type in PARTICLE_TYPES:
        json_path = picks_dir / f'{particle_type}.json'
        try:
            with open(json_path, 'r') as f:
                data = json.load(f)
            # Reorder each location to (z, y, x); reshape keeps the array 2-D
            # even when the file lists no points.
            coords = np.array(
                [[point['location'][axis] for axis in ('z', 'y', 'x')]
                 for point in data['points']],
                dtype=float,
            ).reshape(-1, 3)
        except (OSError, ValueError, KeyError, TypeError) as e:
            # Best effort per type: report the problem and continue with the
            # remaining particle types.
            print(f"Error reading {particle_type} coordinates: {e}")
            coords = np.empty((0, 3), dtype=float)
        else:
            print(f"Loaded {len(coords)} {particle_type} coordinates")
        particle_coords[particle_type] = coords

    return particle_coords

In [None]:
# Updated particle types with high-contrast colors.
# This rebinds the PARTICLE_TYPES defined in the previous cell: same keys,
# markers and difficulty labels, but bright hex colors intended to stand out
# on the grey-scale tomogram slices. visualize_all_particles reads 'color',
# 'marker' and 'difficulty' from these entries.
PARTICLE_TYPES = {
    'apo-ferritin': {'color': '#FF3333', 'marker': 'o', 'difficulty': 'easy'},          # Bright red
    'beta-amylase': {'color': '#FFFFFF', 'marker': 's', 'difficulty': 'impossible'},     # White
    'beta-galactosidase': {'color': '#33FFFF', 'marker': '^', 'difficulty': 'hard'},    # Cyan
    'ribosome': {'color': '#33FF33', 'marker': 'D', 'difficulty': 'easy'},              # Bright green
    'thyroglobulin': {'color': '#FF33FF', 'marker': 'p', 'difficulty': 'hard'},         # Magenta
    'virus-like-particle': {'color': '#FFFF33', 'marker': '*', 'difficulty': 'easy'}     # Yellow
}

def _select_dense_z_slices(particle_coords, depth, n_slices, n_bins=50):
    """Pick integer Z indices at the centres of the most populated Z-histogram bins."""
    z_values = [coords[:, 0] for coords in particle_coords.values() if len(coords) > 0]
    if not z_values:
        # No particles at all: fall back to evenly spaced slices
        return np.linspace(0, depth - 1, n_slices, dtype=int)

    # Never use fewer bins than requested slices, otherwise some panels
    # would have no slice to show.
    counts, edges = np.histogram(np.concatenate(z_values), bins=max(n_bins, n_slices))
    centers = 0.5 * (edges[:-1] + edges[1:])

    # argsort is ascending, so the last n_slices entries are the densest bins
    densest = np.argsort(counts)[-n_slices:]

    # Clamp so that coordinates outside the volume can never produce an
    # out-of-range (or silently wrapped negative) slice index.
    return np.clip(centers[densest].astype(int), 0, depth - 1)


def visualize_all_particles(tomogram, particle_coords, n_slices=3, slice_thickness=20):
    """
    Visualize all particle types in tomogram slices with an overlay legend.

    Slices are taken at the centres of the Z-histogram bins that contain the
    most particles (all types pooled); panels are ordered by increasing
    particle density. When no particles are supplied, evenly spaced slices
    are shown instead.

    Parameters
    ----------
    tomogram : np.ndarray
        3-D volume indexed as ``[z, y, x]``.
    particle_coords : dict[str, np.ndarray]
        Maps a particle type (a key of ``PARTICLE_TYPES``) to its coordinates
        with columns ``(z, y, x)`` in the tomogram's index space. Empty arrays
        are skipped.
    n_slices : int
        Number of slices / panels to draw.
    slice_thickness : number
        A particle is drawn on a slice when its Z distance to that slice is
        strictly less than this value.

    The figure is displayed with ``plt.show()`` and per-type totals are
    printed afterwards; nothing is returned.
    """
    fig = plt.figure(figsize=(20, 10))
    grid = ImageGrid(fig, 111,
                     nrows_ncols=(1, n_slices),
                     axes_pad=0.3,
                     share_all=True,
                     cbar_location="right",
                     cbar_mode="single",
                     cbar_size="5%",
                     cbar_pad=0.1)

    # Contrast limits come from the whole volume, but only the displayed
    # planes are normalised, which avoids a full-size float copy of the volume.
    vmin, vmax = np.percentile(tomogram, (1, 99))
    span = (vmax - vmin) or 1.0  # constant volume: avoid dividing by zero

    z_positions = _select_dense_z_slices(particle_coords, tomogram.shape[0], n_slices)

    im = None
    for idx, (ax, z) in enumerate(zip(grid, z_positions)):
        z = int(z)

        # Show tomogram slice
        slice_image = np.clip((tomogram[z, :, :] - vmin) / span, 0, 1)
        im = ax.imshow(slice_image, cmap='gray', vmin=0, vmax=1)

        # Overlay every particle type that has picks inside this slab
        particle_counts = {}
        for particle_type, coords in particle_coords.items():
            if len(coords) == 0:
                continue
            mask = np.abs(coords[:, 0] - z) < slice_thickness
            if not np.any(mask):
                continue
            style = PARTICLE_TYPES[particle_type]
            ax.scatter(
                coords[mask, 2], coords[mask, 1],
                color=style['color'], marker=style['marker'],
                s=100, facecolors='none', linewidth=2,
                label=f"{particle_type}\n({style['difficulty']})"
            )
            particle_counts[particle_type] = int(np.sum(mask))

        # Title: slice index followed by one line per visible particle type
        title_parts = [f'Slice Z={z}']
        title_parts.extend(f'{ptype}: {count}' for ptype, count in particle_counts.items())
        ax.set_title('\n'.join(title_parts), fontsize=8)

        ax.grid(False)

        # Set the axes limits to match the tomogram dimensions
        ax.set_xlim(0, tomogram.shape[2])
        ax.set_ylim(tomogram.shape[1], 0)  # Inverted y-axis to match image coordinates

        # Legend with a semi-transparent dark background, first panel only.
        # The handles returned here are the scatter artists themselves, so
        # they must not be restyled: recolouring their edges would repaint the
        # hollow markers in this panel and make them disagree with the legend.
        if idx == 0:
            handles, labels = ax.get_legend_handles_labels()
            if handles:
                ax.legend(
                    handles, labels,
                    bbox_to_anchor=(0.02, 0.98),
                    loc='upper left',
                    borderaxespad=0.,
                    framealpha=0.8,
                    facecolor='black',
                    edgecolor='white',
                    labelcolor='white',
                    fontsize=8
                )

    # Add colorbar and title
    if im is not None:
        grid.cbar_axes[0].colorbar(im)

    plt.suptitle('All Particle Types in Tomogram Slices\n' +
                 f'Showing particles within ±{slice_thickness} units of each slice',
                 fontsize=16, y=1.05)

    plt.show()

    # Print overall particle statistics
    print("\nOverall Particle Statistics:")
    print("-" * 50)
    for particle_type, coords in particle_coords.items():
        if len(coords) > 0:
            print(f"\n{particle_type} ({PARTICLE_TYPES[particle_type]['difficulty']}):")
            print(f"Total particles: {len(coords)}")
            print(f"Z range: {coords[:, 0].min():.1f} to {coords[:, 0].max():.1f}")

In [None]:
# Load all particle coordinates
all_particle_coords = load_all_particle_coordinates()

# Scale all particle types TOGETHER.
# scale_coordinates derives its factors from the maxima of the array it is
# given, so scaling each type separately would stretch every type by a
# different amount and the types would no longer share one coordinate frame.
# Types that failed to load (empty arrays) are passed through untouched,
# because scale_coordinates cannot index an empty array.
loaded_types = [
    particle_type
    for particle_type, coords in all_particle_coords.items()
    if len(coords) > 0
]
scaled_particle_coords = dict(all_particle_coords)
if loaded_types:
    stacked_coords = np.vstack([all_particle_coords[t] for t in loaded_types])
    stacked_scaled = scale_coordinates(stacked_coords, tomogram.shape)
    # Row offsets at which one type ends and the next begins in the stack
    split_points = np.cumsum([len(all_particle_coords[t]) for t in loaded_types])[:-1]
    scaled_particle_coords.update(
        zip(loaded_types, np.split(stacked_scaled, split_points))
    )

# Create visualization with all particle types
visualize_all_particles(tomogram, scaled_particle_coords)

# Print statistics for each particle type
print("\nParticle Statistics:")
print("-" * 50)
for particle_type, coords in scaled_particle_coords.items():
    if len(coords) > 0:
        print(f"\n{particle_type} ({PARTICLE_TYPES[particle_type]['difficulty']}):")
        print(f"Number of particles: {len(coords)}")
        print(f"Z range: {coords[:, 0].min():.1f} to {coords[:, 0].max():.1f}")
        print(f"Y range: {coords[:, 1].min():.1f} to {coords[:, 1].max():.1f}")
        print(f"X range: {coords[:, 2].min():.1f} to {coords[:, 2].max():.1f}")