In [1]:
import pandas as pd
import ast
import os
import shutil

# Input table for the whole notebook (presumably one row per crop — confirm against
# the producer of this CSV). Later cells read its "File Name", "Crop Position",
# "Centroid in Crop" and "Crop File Name" columns.
csv_path = "output_crops.csv"
data = pd.read_csv(csv_path)

In [2]:
def calculate_height(crop_position):
    """Return the vertical extent of a crop box.

    Args:
        crop_position: Text of a Python literal sequence; element 1 is used
            as y_min and element 3 as y_max.

    Returns:
        y_max - y_min.
    """
    box = ast.literal_eval(crop_position)  # string -> list/tuple of numbers
    y_min, y_max = box[1], box[3]
    return y_max - y_min

# Height of every crop, plus a 0/1 flag that is set when the height falls
# outside the accepted 225..600 range (both bounds themselves are valid).
data["Crop Height"] = data["Crop Position"].map(calculate_height)
data["Height Invalid"] = (
    (data["Crop Height"] < 225) | (data["Crop Height"] > 600)
).astype(int)

def calculate_width(crop_position):
    """Return the horizontal extent of a crop box.

    Args:
        crop_position: Text of a Python literal sequence; element 0 is used
            as x_min and element 2 as x_max.

    Returns:
        x_max - x_min.
    """
    box = ast.literal_eval(crop_position)  # string -> list/tuple of numbers
    x_min, x_max = box[0], box[2]
    return x_max - x_min

# Width of every crop, plus a 0/1 flag that is set when the width is below 210.
data["Crop Width"] = data["Crop Position"].map(calculate_width)
data["Width Invalid"] = (data["Crop Width"] < 210).astype(int)

In [3]:
def calculate_overlap(row, data, overlap_threshold=0.3):
    """Flag a crop that is substantially covered by another crop of the same file.

    The overlap ratio is intersection_area / area_of_this_crop. It is measured
    against this crop's own area (not the smaller of the two areas), so the
    flag is asymmetric: a small crop lying inside a large one is flagged,
    while the large one is flagged only if the small crop covers more than
    `overlap_threshold` of it.

    Args:
        row: One row of `data`; needs "File Name" and "Crop Position"
            (the text of a 4-element literal: x_min, y_min, x_max, y_max).
            `row.name` must be the row's index label in `data` so the row can
            be excluded from the comparison with itself.
        data: DataFrame holding all crops, with the same two columns.
        overlap_threshold: Ratio above which (strictly) the crop is flagged.
            Defaults to 0.3.

    Returns:
        1 if any other crop of the same file overlaps this crop by more than
        `overlap_threshold`, otherwise 0.
    """
    x_min_1, y_min_1, x_max_1, y_max_1 = ast.literal_eval(row["Crop Position"])

    # The denominator does not depend on the other crop, so compute it once.
    area_1 = (x_max_1 - x_min_1) * (y_max_1 - y_min_1)
    if area_1 <= 0:
        # A degenerate box cannot have a positive-area intersection with anything.
        return 0

    # Only the positions of crops from the same file are needed.
    same_file_positions = data.loc[data["File Name"] == row["File Name"], "Crop Position"]

    for other_index, other_position in same_file_positions.items():
        if other_index == row.name:
            continue  # do not compare the crop with itself

        x_min_2, y_min_2, x_max_2, y_max_2 = ast.literal_eval(other_position)

        # Side lengths of the intersection rectangle (non-positive => no overlap)
        inter_width = min(x_max_1, x_max_2) - max(x_min_1, x_min_2)
        inter_height = min(y_max_1, y_max_2) - max(y_min_1, y_min_2)
        if inter_width <= 0 or inter_height <= 0:
            continue

        if (inter_width * inter_height) / area_1 > overlap_threshold:
            return 1
    return 0

# Row-wise overlap flag: each row is checked against the other crops of the same file.
data["Overlap"] = data.apply(calculate_overlap, axis=1, args=(data,))

In [4]:
def is_above(centroid, crop_height):
    """Tell whether a centroid lies in the upper half of its crop.

    Args:
        centroid: Indexable pair whose element 1 is the y-coordinate.
        crop_height: Height of the crop the centroid belongs to.

    Returns:
        1 when the y-coordinate is strictly less than half the crop height,
        otherwise 0 (a centroid exactly on the midline counts as not above).
    """
    midline = crop_height / 2
    if centroid[1] < midline:
        return 1
    return 0

# Pair every parsed centroid with its crop's height and record the 0/1 result.
data["Above"] = [
    is_above(ast.literal_eval(centroid_text), crop_height)
    for centroid_text, crop_height in zip(data["Centroid in Crop"], data["Crop Height"])
]

In [5]:
# Remove the intermediate measurement columns before saving.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
data = data.drop(columns=["Crop Height", "Crop Width"], errors="ignore")

# A crop is usable only when its centroid is in the upper half AND none of
# the rejection flags (height, width, overlap) is set.
data["Use"] = (
    (data["Above"] == 1)
    & (data["Height Invalid"] == 0)
    & (data["Width Invalid"] == 0)
    & (data["Overlap"] == 0)
).astype(int)

output_path = "output_crops_selected.csv"
data.to_csv(output_path, index=False)
print(f"Updated CSV saved to {output_path}")

Updated CSV saved to output_crops_selected.csv


In [6]:
crops_folder = "crops"
output_root_folder = "crops_selected"
os.makedirs(output_root_folder, exist_ok=True)

# Flag column -> destination sub-folder name (each folder is named after its column).
conditions = {
    flag: flag
    for flag in ("Height Invalid", "Width Invalid", "Overlap", "Above", "Use")
}

for column, folder_name in conditions.items():

    print(f"Moving images to {folder_name} folder.")

    condition_folder = os.path.join(output_root_folder, folder_name)
    os.makedirs(condition_folder, exist_ok=True)

    # Names of all crops whose flag in this column is set.
    flagged_names = data.loc[data[column] == 1, "Crop File Name"]

    for crop_file_name in flagged_names:
        source_path = os.path.join(crops_folder, crop_file_name)

        # Missing crops are reported and skipped rather than aborting the run.
        if not os.path.exists(source_path):
            print(f"Crop {crop_file_name} not found in {crops_folder}. Skipping...")
            continue

        # The file is copied, so the original stays in `crops_folder`.
        shutil.copy(source_path, os.path.join(condition_folder, crop_file_name))


Moving images to Height Invalid folder.
Moving images to Width Invalid folder.
Moving images to Overlap folder.
Moving images to Above folder.
Moving images to Use folder.
