# Copy raw images into one folder to use for CellProfiler processing

Currently, the images are nested deep within multiple folders.
As a best practice, we will copy the images (preserving metadata) into one folder that can be used for CellProfiler processing.
This file is modified from its original version: https://github.com/WayScience/GFF_2D_organoid_prototyping .

## Import libraries

In [1]:
import argparse
import pathlib
import shutil
import sys

import tqdm

## Set paths and variables

In [2]:
# Build the command-line parser under its own name so the imported `argparse`
# module is not overwritten (rebinding it would break re-running this cell).
parser = argparse.ArgumentParser(
    description="Copy files from one directory to another"
)
parser.add_argument("--HPC", action="store_true", help="Type of compute to run on")

# Parse arguments. When sys.argv[0] contains "ipykernel" (running inside a
# Jupyter kernel) the command line is ignored and an empty argument list is parsed.
args = parser.parse_args(args=sys.argv[1:] if "ipykernel" not in sys.argv[0] else [])
HPC = args.HPC

print(f"HPC: {HPC}")

HPC: False


In [3]:
# Define parent and destination directories in a single dictionary.
# Each key labels one dataset and maps to a dict with two entries:
#   "parent":      directory holding the raw, nested image folders. The first path is
#                  used when HPC is False, the second when HPC is True. It is resolved
#                  with strict=True, so a path that does not exist raises immediately.
#   "destination": directory the images are copied into, given relative to the
#                  current working directory and resolved to an absolute path.
# Entries that are commented out are not part of the mapping and are not processed.
dir_mapping = {
    # "NF0014": {
    #     "parent": pathlib.Path(
    #         "/media/18tbdrive/GFF_organoid_data/Cell Painting-NF0014 Thawed3-Pilot Drug Screening/NF0014-Thawed 3 (Raw image files)-Combined/NF0014-Thawed 3 (Raw image files)-Combined copy"
    #         if not HPC
    #         else "/pl/active/koala/GFF_Data/GFF-Raw/NF0014-Thawed 3 (Raw image files)-Combined/NF0014-Thawed 3 (Raw image files)-Combined copy"
    #     ).resolve(strict=True),
    #     "destination": pathlib.Path("../../data/NF0014_raw_images").resolve(),
    # },
    # "NF0016": {
    #     "parent": pathlib.Path(
    #         "/media/18tbdrive/GFF_organoid_data/NF0016 Cell Painting-Pilot Drug Screening-selected/NF0016-Cell Painting Images/NF0016-images copy"
    #         if not HPC
    #         else "/pl/active/koala/GFF_Data/GFF-Raw/NF0016 Cell Painting-Pilot Drug Screening-selected/NF0016-Cell Painting Images/NF0016-images copy"
    #     ).resolve(strict=True),
    #     "destination": pathlib.Path("../../data/NF0016_raw_images").resolve(),
    # },
    "NF0017": {
        "parent": pathlib.Path(
            "/media/18tbdrive/GFF_organoid_data/NF0017-T3-P7 (AGP, Mito Parameter optimization)/Acquisition 03-07-2025"
            if not HPC
            else "/pl/active/koala/GFF_Data/GFF-Raw/NF0017-T3-P7 (AGP, Mito Parameter optimization)/Acquisition 03-07-2025"  # TODO: Update this later if not correct
        ).resolve(strict=True),
        "destination": pathlib.Path(
            "../../data/raw_images/NF0017_raw_images"
        ).resolve(),
    },
    # "NF0018": {
    #     "parent": pathlib.Path(
    #         "/media/18tbdrive/GFF_organoid_data/NF0018 (T6) Cell Painting-Pilot Drug Screeining-selected/NF0018-Cell Painting Images/NF0018-All Acquisitions"
    #         if not HPC
    #         else "/pl/active/koala/GFF_Data/GFF-Raw/NF0018 (T6) Cell Painting-Pilot Drug Screeining-selected/NF0018-Cell Painting Images/NF0018-All Acquisitions"
    #     ).resolve(strict=True),
    #     "destination": pathlib.Path("../../data/NF0018_raw_images").resolve(),
    # },
}

# Image extensions that we are looking to copy.
# Lowercase, with the leading dot, because callers compare against `Path.suffix.lower()`.
image_extensions = {".tif", ".tiff"}

## Reach the nested images and copy to one folder

### Set QC functions that determine if a well/site is of good quality to process based on file structure

In [4]:
def has_consistent_naming(well_dir: pathlib.Path) -> bool:
    """Verify that every subfolder of a well directory holds a folder named after the well.

    Args:
        well_dir (pathlib.Path): Path to a single well directory.

    Returns:
        bool: True when the well directory has at least one immediate subdirectory and
            each of them directly contains a directory whose name equals the well
            directory's name. False otherwise, including when there are no subdirectories.
    """
    expected_name = well_dir.name

    # Only directories count; loose files next to them are ignored
    children = [entry for entry in well_dir.iterdir() if entry.is_dir()]

    # A well without any subdirectories is treated as inconsistent
    if not children:
        return False

    # Each child must directly contain a directory carrying the well's name
    return all(
        any(
            inner.is_dir() and inner.name == expected_name
            for inner in child.iterdir()
        )
        for child in children
    )


def is_image_folder_empty(nested_dir: pathlib.Path) -> bool:
    """Report whether a nested directory lacks image files.

    Args:
        nested_dir (pathlib.Path): Path to a directory nested within the well directory.

    Returns:
        bool: True if no entry anywhere below ``nested_dir`` has a suffix listed in
            ``image_extensions`` (compared case-insensitively), False as soon as one is found.
    """
    # Walk the tree recursively and stop at the first entry with an image suffix
    for entry in nested_dir.rglob("*"):
        if entry.suffix.lower() in image_extensions:
            return False
    return True


def has_equal_images_per_channel(
    nested_dir: pathlib.Path, channel_names: list[str]
) -> bool:
    """Check that every channel is represented by the same, non-zero number of images.

    An image is attributed to a channel when the channel name occurs as a
    substring of the image's filename.

    Args:
        nested_dir (pathlib.Path): Path to a directory nested within the well directory.
        channel_names (list[str]): Channel names to look for in the image filenames.

    Returns:
        bool: True if all channels have an identical image count greater than zero,
            False otherwise.
    """
    # Start every channel at zero images
    tally = dict.fromkeys(channel_names, 0)

    # Recursively visit everything below the directory, keeping only image files
    for entry in nested_dir.rglob("*"):
        if entry.suffix.lower() not in image_extensions:
            continue
        file_name = entry.name
        for channel in channel_names:
            if channel in file_name:
                tally[channel] += 1

    # Equal counts collapse to a single distinct value, which must not be zero
    distinct_counts = set(tally.values())
    return len(distinct_counts) == 1 and distinct_counts != {0}

Run this cell through the script

In [5]:
# Channel names expected to appear in the image filenames
channel_names = {"405", "488", "555", "640", "TRANS", "Merge"}

# Loop through each key in the mapping to copy data from the parent to the destination
for key, paths in dir_mapping.items():
    parent_dir = paths["parent"]
    dest_dir = paths["destination"]

    print(f"Processing {key}: {parent_dir} -> {dest_dir}")

    # Ensure the destination directory exists
    dest_dir.mkdir(parents=True, exist_ok=True)

    # Get all well-level directories
    well_dirs = [d for d in parent_dir.iterdir() if d.is_dir()]

    if not well_dirs:
        print(f"Skipping {key}: No well directories found")
        continue

    for well_dir in well_dirs:
        if not has_consistent_naming(well_dir):
            # Use .name rather than .stem: .stem would cut a directory name at its last dot
            print(
                f"Skipping {well_dir.name}: Inconsistent nested folder names within well"
            )
            continue
        for nested_dir in well_dir.iterdir():
            # Filter out stray files (e.g. .DS_Store) before logging, so only
            # directories that are actually inspected show up in the output
            if not nested_dir.is_dir():
                continue
            print(nested_dir.name)

            if is_image_folder_empty(nested_dir):
                print(f"Skipping {nested_dir}: No images found")
                continue

            if not has_equal_images_per_channel(nested_dir, channel_names):
                print(f"Skipping {nested_dir}: Unequal images per channel")
                continue  # Skip processing this folder

            # Create corresponding destination directory
            dest_well_dir = dest_dir / well_dir.name
            dest_well_dir.mkdir(parents=True, exist_ok=True)

            # Copy images to destination, skipping files with 'Tile' in their name.
            # copy2 is used so file metadata is preserved along with the contents.
            for image in tqdm.tqdm(nested_dir.rglob("*")):
                if (
                    image.suffix.lower() in image_extensions
                    and "Tile" not in image.name
                    # Guard against a directory whose name ends in an image suffix,
                    # which copy2 cannot copy
                    and image.is_file()
                ):
                    shutil.copy2(image, dest_well_dir)

    print(f"Completed processing {key}: {parent_dir} -> {dest_dir}")

Processing NF0017: /media/18tbdrive/GFF_organoid_data/NF0017-T3-P7 (AGP, Mito Parameter optimization)/Acquisition 03-07-2025 -> /media/18tbdrive/1.Github_Repositories/GFF_3D_organoid_profiling_pipeline/data/raw_images/NF0017_raw_images
C8-3-(5-185)-(19-320)


196it [00:05, 38.09it/s]


C8-3-(5-185)-(19-320)
C9-4-(10-115)-(24-350)


328it [00:11, 28.86it/s]  


C9-4-(10-115)-(24-350)
C9-1-(5-115)-(19-250)


240it [00:07, 30.07it/s]  


C9-1-(5-115)-(19-250)
C9-6-(5-115)-(19-250)


262it [00:08, 32.05it/s]  


C9-6-(5-115)-(19-250)
C9-1-(5-220)-(19-355)


240it [00:07, 30.34it/s]  


C9-1-(5-220)-(19-355)
C9-6-(14-115)-(29-350)


262it [00:08, 32.05it/s]  


C9-6-(14-115)-(29-350)
C9-3-(5-150)-(19-285)


262it [00:10, 25.98it/s]  


C9-3-(5-150)-(19-285)
C9-3-(5-115)-(19-250)


262it [00:08, 32.41it/s]  


C9-3-(5-115)-(19-250)
C8-1-(5-185)-(19-320)


130it [00:03, 33.49it/s]


C8-1-(5-185)-(19-320)
C9-6-(5-220)-(19-355)


262it [00:10, 25.25it/s]  


C9-6-(5-220)-(19-355)
C8-1-(14-115)-(29-350)
C8-1-(14-115)-(29-350)


130it [00:03, 35.72it/s]


C9-5-(14-115)-(29-350)


263it [00:08, 31.68it/s]  


.DS_Store
C9-5-(14-115)-(29-350)
C9-1-(5-150)-(19-285)


240it [00:08, 29.78it/s] 


C9-1-(5-150)-(19-285)
C9-5-(5-150)-(19-285)
C9-5-(5-150)-(19-285)


262it [00:08, 30.67it/s]  


C7-1-(14-115)-(29-350)


130it [00:04, 28.25it/s]


C7-1-(14-115)-(29-350)
C5-1-(5-185)-(19-320)


196it [00:04, 42.67it/s] 


C5-1-(5-185)-(19-320)
C9-2-(5-185)-(19-320)


152it [00:04, 34.69it/s]


C9-2-(5-185)-(19-320)
C9-3-(5-220)-(19-355)


262it [00:08, 32.63it/s] 


C9-3-(5-220)-(19-355)
C9-4-(14-115)-(29-350)


328it [00:10, 30.44it/s]  


C9-4-(14-115)-(29-350)
C8-3-(5-150)-(19-285)


196it [00:06, 31.51it/s]


C8-3-(5-150)-(19-285)
C5-1-(5-220)-(19-355)


196it [00:06, 28.50it/s]


C5-1-(5-220)-(19-355)
C8-2-(14-115)-(29-350)


108it [00:02, 38.63it/s]


C8-2-(14-115)-(29-350)
C9-3-(5-185)-(19-320)


262it [00:08, 30.30it/s] 


C9-3-(5-185)-(19-320)
C8-3-(5-220)-(19-355)


196it [00:06, 31.53it/s]


C8-3-(5-220)-(19-355)
C8-2-(5-115)-(19-250)


108it [00:04, 26.72it/s]


C8-2-(5-115)-(19-250)
C5-1-(14-115)-(29-350)
C5-1-(14-115)-(29-350)


196it [00:06, 28.06it/s]


C9-2-(10-115)-(24-350)


152it [00:04, 34.06it/s]


C9-2-(10-115)-(24-350)
C9-1-(10-115)-(24-350)


240it [00:08, 29.73it/s] 


C9-1-(10-115)-(24-350)
C9-4-(5-220)-(19-355)
C9-4-(5-220)-(19-355)


328it [00:10, 32.54it/s]  


C8-2-(5-220)-(19-355)
C8-2-(5-220)-(19-355)


108it [00:03, 31.04it/s]


C8-1-(5-220)-(19-355)


130it [00:04, 31.64it/s]


C8-1-(5-220)-(19-355)
C9-6-(10-115)-(24-350)


262it [00:10, 25.20it/s] 


C9-6-(10-115)-(24-350)
C9-6-(5-185)-(19-320)


262it [00:08, 32.11it/s]  


C9-6-(5-185)-(19-320)
C5-1-(10-115)-(24-350)
C5-1-(10-115)-(24-350)


196it [00:05, 33.22it/s]


C9-5-(5-220)-(19-355)


262it [00:08, 30.25it/s] 


C9-5-(5-220)-(19-355)
C7-1-(5-115)-(19-250)
C7-1-(5-115)-(19-250)


130it [00:04, 28.20it/s]


C9-3-(14-115)-(29-350)
C9-3-(14-115)-(29-350)


262it [00:09, 27.86it/s] 


C7-1-(10-115)-(24-350)
C7-1-(10-115)-(24-350)


130it [00:04, 32.11it/s]


C9-3-(10-115)-(24-350)
C9-3-(10-115)-(24-350)


262it [00:10, 25.10it/s]  


C8-2-(5-185)-(19-320)
C8-2-(5-185)-(19-320)


108it [00:03, 35.82it/s]


C5-1-(5-150)-(19-285)
C5-1-(5-150)-(19-285)


196it [00:05, 35.26it/s]


C7-1-(5-185)-(19-320)
C7-1-(5-185)-(19-320)


130it [00:06, 19.75it/s]


C8-3-(10-115)-(24-350)


196it [00:06, 31.92it/s]


C8-3-(10-115)-(24-350)
C8-1-(5-150)-(19-285)
C8-1-(5-150)-(19-285)


130it [00:03, 36.99it/s]


C8-3-(5-115)-(19-250)


196it [00:06, 31.27it/s]


C8-3-(5-115)-(19-250)
C9-1-(5-185)-(19-320)


240it [00:08, 29.44it/s] 


C9-1-(5-185)-(19-320)
C9-4-(5-115)-(19-250)
C9-4-(5-115)-(19-250)


328it [00:10, 31.53it/s]  


C8-2-(5-150)-(19-285)
C8-2-(5-150)-(19-285)


108it [00:03, 31.12it/s]


C9-4-(5-150)-(19-285)
C9-4-(5-150)-(19-285)


328it [00:10, 30.37it/s]  


C9-5-(5-115)-(19-250)
C9-5-(5-115)-(19-250)


262it [00:09, 26.84it/s]  


C9-6-(5-150)-(19-285)
C9-6-(5-150)-(19-285)


262it [00:08, 31.00it/s]  


C7-1-(5-220)-(19-355)
C7-1-(5-220)-(19-355)


130it [00:03, 34.59it/s]


.DS_Store
C5-1-(5-115)-(19-250)


196it [00:02, 71.14it/s] 


C5-1-(5-115)-(19-250)
C7-1-(5-150)-(19-285)
C7-1-(5-150)-(19-285)


130it [00:04, 28.08it/s]


C9-4-(5-185)-(19-320)


328it [00:10, 31.80it/s]  


C9-4-(5-185)-(19-320)
C8-2-(10-115)-(24-350)
C8-2-(10-115)-(24-350)


108it [00:03, 33.36it/s]


C9-5-(10-115)-(24-350)


262it [00:08, 30.25it/s] 


C9-5-(10-115)-(24-350)
C9-2-(14-115)-(29-350)
C9-2-(14-115)-(29-350)


152it [00:05, 26.96it/s]


C8-1-(10-115)-(24-350)


130it [00:03, 33.67it/s]


C8-1-(10-115)-(24-350)
C9-1-(14-115)-(29-350)
C9-1-(14-115)-(29-350)


240it [00:08, 27.68it/s] 


C9-2-(5-220)-(19-355)
C9-2-(5-220)-(19-355)


152it [00:04, 36.69it/s]


C9-2-(5-150)-(19-285)
C9-2-(5-150)-(19-285)


152it [00:04, 33.77it/s]


C9-2-(5-115)-(19-250)
C9-2-(5-115)-(19-250)


152it [00:06, 25.16it/s]


C9-5-(5-185)-(19-320)
C9-5-(5-185)-(19-320)


262it [00:08, 32.29it/s]  


C8-1-(5-115)-(19-250)
C8-1-(5-115)-(19-250)


130it [00:04, 28.26it/s]


C8-3-(14-115)-(29-350)
C8-3-(14-115)-(29-350)


196it [00:05, 35.38it/s]

Completed processing NF0017: /media/18tbdrive/GFF_organoid_data/NF0017-T3-P7 (AGP, Mito Parameter optimization)/Acquisition 03-07-2025 -> /media/18tbdrive/1.Github_Repositories/GFF_3D_organoid_profiling_pipeline/data/raw_images/NF0017_raw_images



