# Restructure data using key

This code processes images supplied by two institutes (CEFAS & WMR), with each class stored in a separate folder. The images in those folders are copied into new folders, sometimes merging several source folders into one, following a CSV file (the "key" file) provided by the researchers.

In [1]:
# import modules

import pandas as pd
import os
import shutil
from pathlib import Path

In [2]:
# Load the key file that maps every source folder to its merged category
key_path = Path("data") / "OSPAR_key.csv"
df = pd.read_csv(key_path)

# Show the first rows as a quick sanity check
df.head()

Unnamed: 0,folder,detailed_cat,Source,check,check2
0,artefact_cleaning_fibre,artefacts,WMR,artefact_cleaning_fibre,0
1,artefact_long_line,artefacts,WMR,artefact_long_line,0
2,nt-bubbles,bubbles,CEFAS,nt-bubbles,0
3,bubbles,bubbles,WMR,bubbles,0
4,copepod_calanoida_acartia-spp,copepods,CEFAS,copepod_calanoida_acartia-spp,0


In [3]:
# Location of the merged dataset, inside the 'data' directory
merged_dir = Path("data", "OSPAR_merged")

# Make sure the folder (and any missing parent) is present; no error if it
# is already there
merged_dir.mkdir(parents=True, exist_ok=True)

print(f"'merged' folder created at: {merged_dir.resolve()}")


'merged' folder created at: /lustre/backup/WUR/WMR/hoeke007/Project003c_Plankton_imager_03/data/OSPAR_merged


In [4]:
# OPTIONAL: Delete the merged folder to start from a clean slate (uncomment the lines below to run)

# Check if it exists, then delete
#if merged_dir.exists():
#    shutil.rmtree(merged_dir)
#    print(f"Deleted folder: {merged_dir.resolve()}")
#else:
#    print(f"Folder not found: {merged_dir.resolve()}")


In [5]:
%%time

# Copy the contents of every source folder listed in the key file into its
# merged category folder:
#     data/<Source>/<folder>  ->  data/OSPAR_merged/<detailed_cat>
# Several source folders may map to the same category; their contents end up
# side by side in one destination folder.
data_root = Path("data")
merged_root = data_root / "OSPAR_merged"

for source_name, folder_name, category in zip(
    df["Source"], df["folder"], df["detailed_cat"]
):
    source_folder = data_root / source_name / folder_name
    dest_folder = merged_root / category

    # Check the source BEFORE creating the destination, so that a missing or
    # mistyped folder in the key file does not leave an empty category folder
    # behind. is_dir() also rejects a path that exists but is not a folder,
    # which would otherwise make iterdir() raise.
    if not source_folder.is_dir():
        print(f"❌ Source folder does not exist: {source_folder}")
        continue  # Skip this row

    # Create destination folder if it doesn't exist
    dest_folder.mkdir(parents=True, exist_ok=True)

    for item in source_folder.iterdir():
        dest_path = dest_folder / item.name

        # Never overwrite: an item with the same name may already have been
        # copied from another source folder or during an earlier run.
        if dest_path.exists():
            print(f"⚠️ File already exists, skipping: {dest_path}")
            continue

        # Copy folders recursively; copy plain files with their metadata
        if item.is_dir():
            shutil.copytree(item, dest_path)
        else:
            shutil.copy2(item, dest_path)

    print(f"✅ Finished copying contents from {source_folder} to {dest_folder}")


✅ Finished copying contents from data/WMR/artefact_cleaning_fibre to data/OSPAR_merged/artefacts
✅ Finished copying contents from data/WMR/artefact_long_line to data/OSPAR_merged/artefacts
✅ Finished copying contents from data/CEFAS/nt-bubbles to data/OSPAR_merged/bubbles
✅ Finished copying contents from data/WMR/bubbles to data/OSPAR_merged/bubbles
✅ Finished copying contents from data/CEFAS/copepod_calanoida_acartia-spp to data/OSPAR_merged/copepods
✅ Finished copying contents from data/CEFAS/copepod_calanoida_calanus-spp to data/OSPAR_merged/copepods
✅ Finished copying contents from data/CEFAS/copepod_calanoida_centropages-spp to data/OSPAR_merged/copepods
✅ Finished copying contents from data/CEFAS/copepod_calanoida_temora-spp to data/OSPAR_merged/copepods
✅ Finished copying contents from data/CEFAS/copepod_cyclopoida_oithona-spp to data/OSPAR_merged/copepods
✅ Finished copying contents from data/CEFAS/copepod_cyclopoida_oncaea-spp to data/OSPAR_merged/copepods
✅ Finished copying c