In [1]:
import numpy as np
import os
import json
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report
import copick
import zarr

def load_picks(picks_folder, voxel_spacing):
    picks = {}
    for json_file in os.listdir(picks_folder):
        if json_file.endswith('.json'):
            json_path = os.path.join(picks_folder, json_file)
            with open(json_path, 'r') as file:
                pick_data = json.load(file)
            picks[json_file[:-5]] = np.array([
                [
                    point['location']['x'] / voxel_spacing.voxel_size,
                    point['location']['y'] / voxel_spacing.voxel_size,
                    point['location']['z'] / voxel_spacing.voxel_size
                ]
                for point in pick_data['points']
            ])
    return picks

def extract_patches(data, picks, patch_size=16):
    patches = []
    labels = []
    half_size = patch_size // 2

    for particle, locations in picks.items():
        for loc in locations:
            x, y, z = map(int, loc)
            # Ensure patch is within bounds
            if (x - half_size >= 0 and x + half_size < data.shape[2] and
                y - half_size >= 0 and y + half_size < data.shape[1] and
                z - half_size >= 0 and z + half_size < data.shape[0]):
                patch = data[z-half_size:z+half_size,
                             y-half_size:y+half_size,
                             x-half_size:x+half_size]
                patches.append(patch)
                labels.append(particle)
    return np.array(patches), np.array(labels)

def process_dataset(config_path, dataset_type='train'):
    copick_root = copick.from_file(config_path)
    runs = copick_root.runs
    run = copick_root.get_run(runs[0].name)
    
    voxel_spacing = run.get_voxel_spacing(10.000)
    
    # Access the specific tomogram
    tomogram = voxel_spacing.get_tomogram("denoised")
    
    # Access the Zarr data
    zarr_store = tomogram.zarr()
    zarr_group = zarr.open(zarr_store)
    
    # Load the tomogram data
    tomogram_vals = zarr_group['0']  # Adjust the key if needed
    
    # Path to the Picks folder
    picks_folder = os.path.join(
        '/Users/jake.brannigan/Documents/Kaggle/CryoET/Data/czii-cryo-et-object-identification',
        dataset_type,
        'overlay',
        'ExperimentRuns',
        'TS_5_4',
        'Picks'
    )
    
    picks = load_picks(picks_folder, voxel_spacing)
    patches, labels = extract_patches(tomogram_vals, picks, patch_size=16)
    
    return patches, labels


In [3]:
# Path to your copick configuration
config_path = '../../copick_config.json'

# Process training data
train_patches, train_labels = process_dataset(config_path, dataset_type='train')

# Normalize patches
train_patches = train_patches / np.max(train_patches)

# Encode labels
label_map = {name: idx for idx, name in enumerate(set(train_labels))}
encoded_train_labels = np.array([label_map[label] for label in train_labels])

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_patches, encoded_train_labels, test_size=0.2, random_state=42
)

# Add a channel dimension for CNN
X_train = X_train[..., np.newaxis]
X_val = X_val[..., np.newaxis]

In [6]:
# Define the 3D CNN model
model = Sequential([
    Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(16, 16, 16, 1)),
    MaxPooling3D(pool_size=(2, 2, 2)),
    Conv3D(64, kernel_size=(3, 3, 3), activation='relu'),
    MaxPooling3D(pool_size=(2, 2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_map), activation='softmax')  # Output layer for classification
])

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.001), 
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    X_train, y_train, 
    epochs=20, 
    batch_size=32, 
    validation_data=(X_val, y_val), 
    verbose=1
)

# Save the trained model
# model.save('particle_detection_model.h5')

Epoch 1/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step - accuracy: 0.2393 - loss: 1.7408 - val_accuracy: 0.6071 - val_loss: 1.4230
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.3621 - loss: 1.5309 - val_accuracy: 0.6429 - val_loss: 1.2745
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.4526 - loss: 1.4058 - val_accuracy: 0.6071 - val_loss: 1.1371
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.6124 - loss: 1.1601 - val_accuracy: 0.7500 - val_loss: 0.9877
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - accuracy: 0.7186 - loss: 1.0318 - val_accuracy: 0.6786 - val_loss: 0.8279
Epoch 6/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.7560 - loss: 0.8568 - val_accuracy: 0.7857 - val_loss: 0.6869
Epoch 7/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━

In [27]:
def sliding_window_predict(model, tomogram, patch_size=16, step=8, threshold=0.5, label_map=None):
    """
    Performs sliding window predictions on the test tomogram.

    Args:
        model (Keras model): Trained CNN model.
        tomogram (numpy array): 3D tomogram volume.
        patch_size (int): Size of cubic patches.
        step (int): Step size for sliding window.
        threshold (float): Confidence threshold for predictions.
        label_map (dict): Map of label indices to particle names.

    Returns:
        list: Predicted particles with their positions and labels.
    """
    half_size = patch_size // 2
    z_max, y_max, x_max = tomogram.shape

    predictions = []
    for z in range(half_size, z_max - half_size, step):
        print(f'{z}/{z_max}')
        for y in range(half_size, y_max - half_size, step):
            for x in range(half_size, x_max - half_size, step):
                patch = tomogram[z - half_size:z + half_size,
                                 y - half_size:y + half_size,
                                 x - half_size:x + half_size]
                patch = patch[np.newaxis, ..., np.newaxis]  # Add batch and channel dimensions
                patch = patch / np.max(patch)  # Normalize
                
                pred = model.predict(patch, verbose=0)
                max_prob = np.max(pred)
                label_idx = np.argmax(pred)
                # print([key for key,value in label_map if value == label_idx][0])

                if max_prob > threshold:
                    predictions.append({
                        "position": (x, y, z),
                        "label": [key for key,value in label_map.items() if value == label_idx][0],
                        "confidence": max_prob
                    })

    return predictions

def prepare_test_data(config_path, dataset_type='test'):
    """
    Prepares test data for predictions.

    Args:
        config_path (str): Path to the copick configuration file.
        dataset_type (str): Dataset type ('test').

    Returns:
        tomogram_vals (numpy array): The test tomogram data.
    """
    copick_root = copick.from_file(config_path)
    runs = copick_root.runs
    run = copick_root.get_run(runs[0].name)
    
    voxel_spacing = run.get_voxel_spacing(10.000)
    
    # Access the specific tomogram
    tomogram = voxel_spacing.get_tomogram("denoised")
    
    # Access the Zarr data
    zarr_store = tomogram.zarr()
    zarr_group = zarr.open(zarr_store)
    
    # Load the tomogram data
    tomogram_vals = zarr_group['0']  # Adjust the key if needed
    
    return tomogram_vals

In [9]:
def save_predictions(predictions, output_path):
    """
    Saves predictions in overlay format.

    Args:
        predictions (list): List of predicted particles with positions and labels.
        output_path (str): Path to save the JSON file.
    """
    overlay_data = {
        "points": [
            {
                "location": {"x": pos[0], "y": pos[1], "z": pos[2]},
                "label": label,
                "confidence": confidence
            }
            for pred in predictions
            for pos, label, confidence in [(pred["position"], pred["label"], pred["confidence"])]
        ]
    }

    with open(output_path, 'w') as json_file:
        json.dump(overlay_data, json_file, indent=4)

In [28]:
# Load the trained model (if not already in memory)
# model = load_model('particle_detection_model.h5')

# Process test data
tomogram_vals =  prepare_test_data(config_path)

# Example usage
predictions = sliding_window_predict(model, tomogram_vals, patch_size=16, step=8, threshold=0.5, label_map=label_map)
save_predictions(predictions, 'output_predictions.json')

8/184


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x104eaa660>>
Traceback (most recent call last):
  File "/Users/jake.brannigan/Documents/Kaggle/CryoET/venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


KeyboardInterrupt: 