In [1]:
import os
from pathlib import Path
import time
import cv2
import numpy as np
import shutil
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

def find_organisms_in_raw_image(raw_image_path, organism_folder,
                                match_threshold=0.8, fallback_threshold=0.5):
    """
    Find extracted organisms in the raw image and return their bounding box coordinates.

    Every ``*.jpg`` file in ``organism_folder`` is used as a template and
    located in the raw image with normalized cross-correlation template
    matching (``cv2.TM_CCOEFF_NORMED``). Only the single best location per
    template is considered.

    Args:
        raw_image_path: Path to the raw plankton image
        organism_folder: Path to the folder containing extracted organism images
        match_threshold: Best-match score above which a template counts as found
        fallback_threshold: Lower score bound; a best match scoring between the
            two thresholds is still recorded, but reported as a weak match

    Returns:
        List of tuples (organism_identifier, x, y, width, height) where x,y is the
        top-left corner and organism_identifier is the template's file name

    Raises:
        FileNotFoundError: If the raw image cannot be read by OpenCV
    """
    # cv2.imread signals failure by returning None instead of raising
    raw_img = cv2.imread(str(raw_image_path))
    if raw_img is None:
        raise FileNotFoundError(f"Could not read raw image: {raw_image_path}")
    raw_img_rgb = cv2.cvtColor(raw_img, cv2.COLOR_BGR2RGB)
    raw_h, raw_w = raw_img_rgb.shape[:2]

    # Sorted so the detection order (and the numbering shown in the
    # visualization) is reproducible across runs and platforms
    organism_paths = sorted(Path(organism_folder).glob("*.jpg"))

    if not organism_paths:
        print(f"No organism images found in {organism_folder}")
        return []

    # Results list to store: (organism_identifier, x, y, width, height)
    detections = []
    total = len(organism_paths)

    for index, org_path in enumerate(organism_paths, start=1):
        print(f"Processing {index} out of {total}")

        org_img = cv2.imread(str(org_path))
        if org_img is None:
            print(f"Could not read {org_path.name}, skipping")
            continue
        org_img_rgb = cv2.cvtColor(org_img, cv2.COLOR_BGR2RGB)

        # Dimensions are needed by both the strong and the weak match branch,
        # so compute them up front
        h, w = org_img_rgb.shape[:2]

        # matchTemplate requires the template to fit inside the searched image
        if h > raw_h or w > raw_w:
            print(f"{org_path.name} is larger than the raw image, skipping")
            continue

        # Use template matching to find the organism in the raw image
        result = cv2.matchTemplate(raw_img_rgb, org_img_rgb, cv2.TM_CCOEFF_NORMED)
        _, max_val, _, max_loc = cv2.minMaxLoc(result)

        if max_val > match_threshold:
            detections.append((org_path.name, max_loc[0], max_loc[1], w, h))
            print(f"Found {org_path.name} at position {max_loc} with confidence {max_val:.2f}")
        else:
            print(f"Could not find a good match for {org_path.name}. Best match: {max_val:.2f}")

            # No second matching method is attempted: a weaker score is
            # simply accepted as the best available location
            if max_val > fallback_threshold:
                detections.append((org_path.name, max_loc[0], max_loc[1], w, h))
                print(f"Using best available match for {org_path.name}")

    return detections

def visualize_detections(raw_image_path, detections, output_path):
    """
    Create a visualization of the raw image with bounding boxes around detected organisms.

    Two files are written: a matplotlib figure with numbered, labelled boxes at
    ``output_path``, and a plain OpenCV rendering (boxes only, full resolution)
    next to it, named after ``output_path`` with its extension replaced by
    ``_cv2.jpg``.

    Args:
        raw_image_path: Path to the raw plankton image
        detections: List of tuples (organism_identifier, x, y, width, height)
        output_path: Path to save the visualization

    Raises:
        FileNotFoundError: If the raw image cannot be read by OpenCV
    """
    output_path = str(output_path)

    # cv2.imread signals failure by returning None instead of raising
    raw_img = cv2.imread(str(raw_image_path))
    if raw_img is None:
        raise FileNotFoundError(f"Could not read raw image: {raw_image_path}")
    raw_img_rgb = cv2.cvtColor(raw_img, cv2.COLOR_BGR2RGB)

    # Create figure and axis
    fig, ax = plt.subplots(1, figsize=(16, 10))

    # Display the raw image
    ax.imshow(raw_img_rgb)

    # Add a bounding box and a numbered label for each detection
    for number, (identifier, x, y, w, h) in enumerate(detections, start=1):
        ax.add_patch(
            Rectangle((x, y), w, h, linewidth=2, edgecolor='r', facecolor='none')
        )
        ax.text(x, y-5, f"{number}: {identifier}", color='red', fontsize=8,
                bbox=dict(facecolor='white', alpha=0.7))

    # Remove axis ticks
    ax.set_xticks([])
    ax.set_yticks([])

    # Set title
    ax.set_title(f"Detected Organisms: {len(detections)}")

    # Create output directory if needed; a bare file name has no directory
    # part and os.makedirs('') would raise
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the visualization
    fig.tight_layout()
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close(fig)

    print(f"Visualization saved to {output_path}")

    # Also create a version with CV2 for direct image manipulation
    # (raw_img is still BGR, so red is (0, 0, 255))
    for _, x, y, w, h in detections:
        x, y, w, h = int(x), int(y), int(w), int(h)
        cv2.rectangle(raw_img, (x, y), (x + w, y + h), (0, 0, 255), 2)

    # Derive the second file name from the extension only, so it can never
    # collide with output_path or alter a directory name containing ".png"
    cv2_output_path = os.path.splitext(output_path)[0] + '_cv2.jpg'
    cv2.imwrite(cv2_output_path, raw_img)
    
def _organism_type_from_filename(filename):
    """Return the class label encoded in an organism file name.

    The label is the text before the first underscore; a name without any
    underscore falls back to the generic label "organism".
    """
    # NOTE(review): this takes the FIRST underscore-separated field. Confirm
    # that field really is the organism type for your file naming scheme.
    parts = filename.split('_')
    return parts[0] if len(parts) > 1 else "organism"


def export_to_yolo_format(raw_image_path, detections, output_folder, class_mapping=None):
    """
    Export detections to YOLO format.

    YOLO format: <class> <x_center> <y_center> <width> <height>
    Where all values are normalized between 0 and 1.

    Files written under ``output_folder``:
      * ``<raw image base name>.txt`` - one annotation line per detection
      * ``classes.txt`` - class names, only when ``class_mapping`` is None
      * ``dataset.yaml`` - dataset configuration
      * ``images/<raw image file name>`` - a copy of the raw image
      * empty ``train``/``val``/``test`` folders, each with ``images`` and ``labels``

    Args:
        raw_image_path: Path to the raw plankton image
        detections: List of tuples (organism_filename, x, y, width, height)
        output_folder: Folder to save YOLO annotations
        class_mapping: Dictionary mapping organism types (the file name prefix
            before the first underscore) to class indices. When None, a
            mapping is built from the detections in alphabetical order.

    Raises:
        FileNotFoundError: If the raw image cannot be read by OpenCV
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Get raw image dimensions (cv2.imread returns None on failure)
    raw_img = cv2.imread(str(raw_image_path))
    if raw_img is None:
        raise FileNotFoundError(f"Could not read raw image: {raw_image_path}")
    img_height, img_width = raw_img.shape[:2]

    # If no class mapping provided, create a simple one
    if class_mapping is None:
        organism_types = sorted(
            {_organism_type_from_filename(filename) for filename, *_ in detections}
        )
        class_mapping = {organism_type: i for i, organism_type in enumerate(organism_types)}

        # Create class names file
        with open(os.path.join(output_folder, 'classes.txt'), 'w', encoding='utf-8') as f:
            f.writelines(f"{organism_type}\n" for organism_type in organism_types)

    # Create YOLO annotation file
    base_name = os.path.splitext(os.path.basename(raw_image_path))[0]
    annotation_path = os.path.join(output_folder, f"{base_name}.txt")

    with open(annotation_path, 'w', encoding='utf-8') as f:
        for filename, x, y, width, height in detections:
            organism_type = _organism_type_from_filename(filename)

            # Get class index
            if organism_type in class_mapping:
                class_idx = class_mapping[organism_type]
            else:
                print(f"Warning: No class mapping for {organism_type}, using 0")
                class_idx = 0

            # Convert to YOLO format (box center and size, normalized)
            x_center = (x + width / 2) / img_width
            y_center = (y + height / 2) / img_height
            w_normalized = width / img_width
            h_normalized = height / img_height

            # Write to file
            f.write(f"{class_idx} {x_center:.6f} {y_center:.6f} {w_normalized:.6f} {h_normalized:.6f}\n")

    print(f"YOLO annotations saved to {annotation_path}")
    print(f"Class mapping: {class_mapping}")

    # The position of a name in the list is its class id, so order the names
    # by class index rather than alphabetically (the two differ for a
    # caller-supplied mapping)
    class_names = [
        name for name, _ in sorted(class_mapping.items(), key=lambda item: item[1])
    ]

    # Create YOLO dataset configuration
    dataset_config = f"""
# YOLO Dataset Configuration
train: {output_folder}/train
val: {output_folder}/val
test: {output_folder}/test

# number of classes
nc: {len(class_mapping)}

# class names
names: {class_names}
"""

    dataset_yaml_path = os.path.join(output_folder, 'dataset.yaml')
    with open(dataset_yaml_path, 'w', encoding='utf-8') as f:
        f.write(dataset_config)

    print(f"Dataset configuration saved to {dataset_yaml_path}")

    # Copy the raw image to the YOLO images folder
    images_folder = os.path.join(output_folder, 'images')
    os.makedirs(images_folder, exist_ok=True)

    image_output_path = os.path.join(images_folder, os.path.basename(raw_image_path))
    cv2.imwrite(image_output_path, raw_img)

    # Create directory structure for YOLO dataset
    for split in ['train', 'val', 'test']:
        for subfolder in ('images', 'labels'):
            os.makedirs(os.path.join(output_folder, split, subfolder), exist_ok=True)

In [2]:
"""Main function to run the workflow."""
# Input/output locations -- edit these three paths to point at your own data.
# Raw plankton scan:
raw_image_path = r"C:\Users\acer\Desktop\Work_IGB\Georgia Zooplankton\igb-georgia\input\M3A_2011-08-27__45um_above200um_x1_2400dpi_1-of-3.jpg"
# Folder holding the extracted organism images:
organism_folder = r"C:\Users\acer\Desktop\Work_IGB\Georgia Zooplankton\igb-georgia\output"
# Folder that receives the results:
output_folder = r"C:\Users\acer\Desktop\Work_IGB\Georgia Zooplankton\igb-georgia\detection_output"

# Step 1: locate every extracted organism inside the raw scan
detections = find_organisms_in_raw_image(raw_image_path, organism_folder)

if not detections:
    print("No organisms were detected in the raw image")
else:
    # Step 2: draw the bounding boxes on the raw image
    visualization_path = os.path.join(output_folder, "detection_visualization.png")
    visualize_detections(raw_image_path, detections, visualization_path)

    # Step 3: write the YOLO annotations and dataset skeleton
    yolo_folder = os.path.join(output_folder, "yolo_dataset")
    export_to_yolo_format(raw_image_path, detections, yolo_folder)

    print(f"Successfully processed {len(detections)} organisms")

Processing 1 out of 5
Found M3A_2011-08-27__45um_above200um_x1_2400dpi_1-of-3_sorted_0_1_588_958.jpg at position (3984, 3259) with confidence 0.86
Processing 2 out of 5
Found M3A_2011-08-27__45um_above200um_x1_2400dpi_1-of-3_sorted_0_3_698_200.jpg at position (3421, 2300) with confidence 0.97
Processing 3 out of 5
Found M3A_2011-08-27__45um_above200um_x1_2400dpi_1-of-3_sorted_0_6_227_485.jpg at position (5797, 2955) with confidence 0.96
Processing 4 out of 5
Could not find a good match for M3A_2011-08-27__45um_above200um_x1_2400dpi_1-of-3_sorted_0_8_296_496.jpg. Best match: 0.48
Processing 5 out of 5
Could not find a good match for M3A_2011-08-27__45um_above200um_x1_2400dpi_1-of-3_sorted_0_9_111_104.jpg. Best match: 0.63
Using best available match for M3A_2011-08-27__45um_above200um_x1_2400dpi_1-of-3_sorted_0_9_111_104.jpg
Visualization saved to C:\Users\acer\Desktop\Work_IGB\Georgia Zooplankton\igb-georgia\detection_output\detection_visualization.png
YOLO annotations saved to C:\Users