# Dataset preprocessing and splitting

## Import libraries

In [1]:
import os
import pandas as pd
import shutil
import random

## Dataset preprocessing

In [2]:
# Dataset preprocessing
# Moves the images whose metadata matches the selected diseases/parts from the raw
# dataset folder into a preprocessing folder and writes the matching metadata rows
# to 'filtered_metadata.csv' inside that folder.

# Define the source path
dataset_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'laguna_dataset')

# Define the directory path for preprocessing
dataset_preprocessing_path = os.path.join(dataset_path, 'laguna_dataset_preprocessed')

# The filtered metadata file is written as the very last step of this cell
csv_file_path = os.path.join(dataset_preprocessing_path, 'filtered_metadata.csv')

# Check if the preprocessing directory and CSV file already exist
if os.path.exists(dataset_preprocessing_path) and os.path.exists(csv_file_path):
    print("Preprocessing directory and metadata file already exist. Skipping data preprocessing.")
else:
    # Load metadata from the CSV file
    metadata_file_path = os.path.join(dataset_path, 'metadata.csv')  # Replace with your actual CSV file path
    if not os.path.exists(metadata_file_path):
        # Raise instead of calling exit(): exit() is meant for the interactive shell and
        # ends the session rather than reporting a normal, catchable error for this cell.
        raise FileNotFoundError(
            f"Metadata file not found at {metadata_file_path}. "
            "Please provide the correct path to your metadata file."
        )
    metadata = pd.read_csv(metadata_file_path)

    # Create the directory for dataset preprocessing inside the working directory
    os.makedirs(dataset_preprocessing_path, exist_ok=True)

    # Define the specific diseases and parts included
    diseases = ['Healthy', 'Bunchy top', 'Black sigatoka']
    parts = ['foliage', 'leaf']

    # Filter out rows with 'treeID' containing 'Unrecorded'
    metadata = metadata[metadata['treeID'] != 'Unrecorded']

    # Shuffle the metadata to distribute the data randomly; the fixed seed makes the
    # row order of the resulting CSV reproducible across runs
    metadata = metadata.sample(frac=1, random_state=42).reset_index(drop=True)

    # Keep only the rows with a selected disease and plant part
    # ('Unrecorded' treeIDs were already removed above)
    selected_metadata = metadata[metadata['disease'].isin(diseases) & metadata['part'].isin(parts)]

    # Rows whose image is available in the preprocessing folder after this cell ran
    kept_rows = []
    # imageIDs for which no image file could be found at all
    missing_images = []

    for _, row in selected_metadata.iterrows():
        # Remove the ".jpg" extension from imageID if present (suffix only)
        imageID = row['imageID']
        if imageID.endswith(".jpg"):
            imageID = imageID[:-len(".jpg")]

        source_path = os.path.join(dataset_path, f"{imageID}.jpg")
        dest_path = os.path.join(dataset_preprocessing_path, f"{imageID}.jpg")

        if os.path.exists(source_path):
            shutil.move(source_path, dest_path)
        elif not os.path.exists(dest_path):
            # Neither at the source nor already moved: leave the row out so the CSV
            # only describes images that are really there
            missing_images.append(row['imageID'])
            continue
        # else: the image was already moved by an earlier, interrupted run

        kept_rows.append(row)

    if missing_images:
        print(f"Warning: {len(missing_images)} image(s) listed in the metadata were not found and were skipped.")
        print(f"First missing imageIDs: {missing_images[:10]}")

    # Build the DataFrame once; passing the columns keeps the header even when no row matched
    filtered_metadata = pd.DataFrame(kept_rows, columns=metadata.columns)

    # Save the filtered metadata to a CSV file inside the preprocessing directory
    filtered_metadata.to_csv(csv_file_path, index=False)

Preprocessing directory and metadata file already exist. Skipping data preprocessing.


In [3]:
# Report how many rows (images) each treeID has in the filtered metadata
# Location of the 'filtered_metadata.csv' file
filtered_metadata_path = os.path.join(dataset_preprocessing_path, 'filtered_metadata.csv')

if not os.path.exists(filtered_metadata_path):
    # Nothing to analyse without the file produced by the preprocessing step
    print("The 'filtered_metadata.csv' file does not exist. Please make sure the file is in the correct location.")
else:
    # Read the filtered metadata back from disk
    filtered_metadata = pd.read_csv(filtered_metadata_path)

    # Number of rows per treeID, most frequent treeID first
    treeID_duplicate_counts = filtered_metadata['treeID'].value_counts()

    # One line per treeID with its row count
    print("treeID - Count of Duplicates:")
    for treeID, count in treeID_duplicate_counts.items():
        print(f"{treeID} - {count}")

    # The per-treeID counts added together, compared against the expected total of 883
    total_duplicates = treeID_duplicate_counts.sum()
    print(f"Total Duplicate Counts: {total_duplicates}")
    print(
        "The sum of duplicate counts is equal to 883."
        if total_duplicates == 883
        else "The sum of duplicate counts is not equal to 883."
    )

treeID - Count of Duplicates:
Tree9 -Black Sigatoka - 7
3468a492-f8ff-4cab-999e-187336aefa4a - 6
f993968c-66bc-4d76-9628-0f1b28e1e7b6 - 6
4885df8d-19c3-4de7-b153-38951cc8d6d3 - 5
02d2506c-134c-422f-8c25-2cf8c0198912 - 5
45773803-b36b-4a75-9738-8f3a2d8f50f0 - 5
c6e0e0d2-c31f-4380-ad57-79a7607dd798 - 5
Tree3 - Black Sigatoka - 4
cca48b8d-2949-474d-a0f1-b912f1304a4f - 4
35a08476-5dbc-4cc4-b3d1-5206b38368b4 - 4
73092e7b-f6a2-482e-ba40-32759aa9f295 - 4
9459384b-bd64-4e68-8a0a-32b37794205b - 4
17a74d32-7ae7-400e-8285-251da4f03d0a - 4
4ad9afde-b707-4714-b599-c5362cc1faff - 4
7044fe1e-4e6c-4ad9-bb6d-e901d7fc4858 - 4
a69e10cd-1b66-4d13-adbd-2c060d0fcf1b - 4
6ec26887-b6db-48ce-9a09-ee3d924ca9af - 4
8f94d5d1-e672-4ceb-92a1-4d02436a3263 - 4
BT - 4
41372fa4-5f14-4a4e-8c34-f7135a602ed9 - 4
67e1f1f1-ab15-4272-a6e0-f8faf0a42169 - 4
f266b4b3-7489-4ba7-87c3-1653915b053c - 4
05bccd15-a00e-4f0b-8d74-cab12cde44b5 - 4
18ef53fd-7b9f-4012-be2a-7e9282f583eb - 3
Tree2 -Bunchy Top - 3
3023658b-61b3-49d7-b786-e29

## Dataset splitting

In [4]:
# Dataset Splitting
# Assigns every treeID to exactly one subset (train, valid or test) and moves all images
# of that tree into <subset>/<disease>/ below the preprocessing directory, so images of
# the same tree are never spread over several subsets.

# Define the diseases (class labels)
diseases = ['Healthy', 'Black sigatoka', 'Bunchy top']

# Create directories for train, valid, and test datasets
train_dir = os.path.join(dataset_preprocessing_path, 'train')
valid_dir = os.path.join(dataset_preprocessing_path, 'valid')
test_dir = os.path.join(dataset_preprocessing_path, 'test')

# Create subdirectories for each disease in train, valid, and test directories
for subset_dir in [train_dir, valid_dir, test_dir]:
    for disease in diseases:
        os.makedirs(os.path.join(subset_dir, disease), exist_ok=True)

# Target share of treeIDs (not images) per subset. The test subset receives every tree
# that is left once train and valid reached their share.
train_ratio = 0.7
valid_ratio = 0.2
test_ratio = 0.1
assert abs(train_ratio + valid_ratio + test_ratio - 1.0) < 1e-9, "Split ratios must sum to 1."

# Fixed seed so the same metadata always produces the same tree-to-subset assignment;
# this is what makes re-running the cell safe
split_seed = 42

# Shuffle the unique treeIDs with a dedicated, seeded generator. The IDs are copied
# into a plain list because random's shuffle is defined for mutable sequences.
unique_treeIDs = filtered_metadata['treeID'].unique()
shuffled_treeIDs = unique_treeIDs.tolist()
random.Random(split_seed).shuffle(shuffled_treeIDs)

# treeID -> subset chosen in this run, and number of trees per subset
image_allocations = {treeID: None for treeID in shuffled_treeIDs}
allocated_counts = {'train': 0, 'valid': 0, 'test': 0}

# Image-level bookkeeping for the summary printed at the end
moved_count = 0
already_in_place_count = 0
missing_count = 0

# Split and move images to their respective subsets
for treeID in shuffled_treeIDs:
    # Fill train first, then valid; everything after that goes to test
    if allocated_counts['train'] / len(shuffled_treeIDs) < train_ratio:
        subset = 'train'
    elif allocated_counts['valid'] / len(shuffled_treeIDs) < valid_ratio:
        subset = 'valid'
    else:
        subset = 'test'
    image_allocations[treeID] = subset

    # Get all images with the current treeID
    treeID_images = filtered_metadata[filtered_metadata['treeID'] == treeID]

    for index, row in treeID_images.iterrows():
        # Remove the ".jpg" extension if present (suffix only)
        imageID = row['imageID']
        if imageID.endswith(".jpg"):
            imageID = imageID[:-len(".jpg")]
        disease = row['disease']  # Assumes 'disease' column in the DataFrame

        # Construct the source and target paths
        # (target_dir is the full destination file path, not a directory)
        image_path = os.path.join(dataset_preprocessing_path, f"{imageID}.jpg")
        target_dir = os.path.join(dataset_preprocessing_path, subset, disease, f"{imageID}.jpg")

        if os.path.exists(image_path):
            shutil.move(image_path, target_dir)
            moved_count += 1
        elif os.path.exists(target_dir):
            # Moved by an earlier run of this cell; nothing to do
            already_in_place_count += 1
        else:
            print(f"Source image {imageID} not found at {image_path}")
            missing_count += 1

    # Update allocated counts
    allocated_counts[subset] += 1

print(f"Trees per subset: {allocated_counts}")
print(f"Images moved: {moved_count}, already in place: {already_in_place_count}, not found: {missing_count}")
print("Data splitting and copying completed.")

Source image a7242018-611e-4621-b3dc-43cccc1f1e11 not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\a7242018-611e-4621-b3dc-43cccc1f1e11.jpg
Source image 51bce549-c676-4a88-93fe-c8bd3e466c18 not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\51bce549-c676-4a88-93fe-c8bd3e466c18.jpg
Source image 41f43d31-0a56-40e7-8488-08e5404c1789 not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\41f43d31-0a56-40e7-8488-08e5404c1789.jpg
Source image fd9c3c43-8f2c-4b0a-ab45-cc1e90eacf3c not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\fd9c3c43-8f2c-4b0a-ab45-cc1e90eacf3c.jpg
Source image 1010ac5f-5f2c-4baa-bd07-b096f4ce8008 not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\1010ac5f-5f2c-4baa-bd07-b096f4ce8008.jpg
Source image cd9e0ce6-5b74-46bb-a82c-693911eaa438 not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\cd9e0ce6-5b74-4

Source image 369dafd7-5c19-402a-bcb0-90dd758b241a not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\369dafd7-5c19-402a-bcb0-90dd758b241a.jpg
Source image 9d75b607-7dbd-40db-ac16-78ee4d92a774 not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\9d75b607-7dbd-40db-ac16-78ee4d92a774.jpg
Source image bff6d94f-4434-483a-808e-40e557e29296 not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\bff6d94f-4434-483a-808e-40e557e29296.jpg
Source image 2e2c6806-4672-4d7f-9874-84463776af77 not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\2e2c6806-4672-4d7f-9874-84463776af77.jpg
Source image dea22ea2-099d-40d5-88e5-d26601140260 not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\dea22ea2-099d-40d5-88e5-d26601140260.jpg
Source image 2f724707-62a2-45eb-8c7f-674743af271f not found at C:\Users\caskie\Desktop\laguna_dataset\laguna_dataset_preprocessed\2f724707-62a2-4

In [5]:
# Write one metadata CSV per <subset>/<disease> folder containing the rows of
# filtered_metadata whose image file is present in that folder.

# Define the diseases (class labels)
diseases = ['Healthy', 'Black sigatoka', 'Bunchy top']
subsets = ['train', 'valid', 'test']

# The earlier cells strip ".jpg" from imageID "if present", so the column may store the
# IDs with or without the extension. Compare on the extension-less name so both forms
# match the files on disk (which always end in ".jpg").
metadata_image_stems = filtered_metadata['imageID'].astype(str).map(
    lambda image_id: image_id[:-len('.jpg')] if image_id.endswith('.jpg') else image_id
)

# Iterate through subsets and diseases to create CSV files
for subset in subsets:
    for disease in diseases:
        folder_path = os.path.join(dataset_preprocessing_path, subset, disease)

        # Only image files count: the folder may also hold the CSV written by a
        # previous run of this cell
        images_in_folder = [name for name in os.listdir(folder_path) if name.endswith('.jpg')]
        image_stems_in_folder = {name[:-len('.jpg')] for name in images_in_folder}

        # Filter data from filtered_metadata.csv for images in the folder
        matching_data = filtered_metadata[metadata_image_stems.isin(image_stems_in_folder)]

        # Create a CSV file for the current folder
        csv_filename = os.path.join(folder_path, f"{subset}_{disease}.csv")

        # Write the matching data to the CSV file
        matching_data.to_csv(csv_filename, index=False)

print("CSV files created inside each folder.")

CSV files created inside each folder.


## Verify that images from the same treeID are in only one subset

In [6]:
# Print, for every <subset>/<disease> folder, how many images each treeID has there,
# to check by eye that a treeID only shows up in one subset (i.e., Train, Val, or Test).

# Define the diseases (class labels)
diseases = ['Healthy', 'Black sigatoka', 'Bunchy top']
subsets = ['train', 'valid', 'test']

# The earlier cells strip ".jpg" from imageID "if present", so the column may store the
# IDs with or without the extension. Compare on the extension-less name so both forms
# match the files on disk (which always end in ".jpg").
metadata_image_stems = filtered_metadata['imageID'].astype(str).map(
    lambda image_id: image_id[:-len('.jpg')] if image_id.endswith('.jpg') else image_id
)

# Iterate through subsets and diseases
for subset in subsets:
    for disease in diseases:
        folder_path = os.path.join(dataset_preprocessing_path, subset, disease)

        # Only image files count; other files in the folder (e.g. CSVs) are ignored
        images_in_folder = [name for name in os.listdir(folder_path) if name.endswith('.jpg')]
        image_stems_in_folder = {name[:-len('.jpg')] for name in images_in_folder}

        # Filter data from filtered_metadata.csv for images in the folder
        matching_data = filtered_metadata[metadata_image_stems.isin(image_stems_in_folder)]

        print(f"TreeID counts in {subset}/{disease} folder:")
        treeID_counts = matching_data['treeID'].value_counts()
        for treeID, count in treeID_counts.items():
            print(f"- TreeID {treeID}: {count} times")

print("Counts printed for each folder.")

TreeID counts in train/Healthy folder:
- TreeID 3468a492-f8ff-4cab-999e-187336aefa4a: 6 times
- TreeID 8f94d5d1-e672-4ceb-92a1-4d02436a3263: 4 times
- TreeID 7044fe1e-4e6c-4ad9-bb6d-e901d7fc4858: 4 times
- TreeID a69e10cd-1b66-4d13-adbd-2c060d0fcf1b: 4 times
- TreeID 17a74d32-7ae7-400e-8285-251da4f03d0a: 4 times
- TreeID 67e1f1f1-ab15-4272-a6e0-f8faf0a42169: 4 times
- TreeID 6ec26887-b6db-48ce-9a09-ee3d924ca9af: 4 times
- TreeID cca48b8d-2949-474d-a0f1-b912f1304a4f: 4 times
- TreeID 05bccd15-a00e-4f0b-8d74-cab12cde44b5: 4 times
- TreeID 32cd26be-173f-4eb3-9f1e-de4232dbed4a: 3 times
- TreeID 3e0c3d99-c382-4664-832e-093c55ee56df: 3 times
- TreeID b2b874bc-8403-4df8-991c-6e5ae4f08222: 3 times
- TreeID 21cfb16b-eb23-4958-9293-b01afbc19c14: 3 times
- TreeID 84c1bf5d-77b7-4ac7-b3ae-9ec344c5afd3: 3 times
- TreeID f8a9f3a0-1eb1-4a31-8c7a-e99fa5c78580: 3 times
- TreeID a5857c0b-3a9b-4912-be39-1c8248e0eb4c: 3 times
- TreeID 09461581-764f-4e4f-9e00-18ece3b54290: 3 times
- TreeID 2530636d-2d30-419

## Test case to ensure that images from the same treeID are in the same subset
- Create a set (data structure) for each subset (train, valid, test) and add every treeID found in a subset to that subset's set.

- Ensure that the sum of the sizes of the three sets is equal to the size of their union (all sets joined).

- This works because sets do not allow duplicates: if the sum of the sizes of the individual sets equals the size of the union, then no treeID appears in more than one subset.

In [None]:
# Automated check that no treeID occurs in more than one subset.
diseases = [
    'Healthy',
    'Black sigatoka',
    'Bunchy top',
]
# Map a set of treeIDs to each subset
subsets = {
    'train': set(),
    'valid': set(),
    'test': set(),
}

# The earlier cells strip ".jpg" from imageID "if present", so the column may store the
# IDs with or without the extension. Compare on the extension-less name so both forms
# match the files on disk (which always end in ".jpg").
metadata_image_stems = filtered_metadata['imageID'].astype(str).map(
    lambda image_id: image_id[:-len('.jpg')] if image_id.endswith('.jpg') else image_id
)

# Iterate through subsets and diseases
for subset in subsets:
    for disease in diseases:
        # Create a path leading to the subset and disease
        folder_path = os.path.join(dataset_preprocessing_path, subset, disease)

        # Only image files count; other files in the folder (e.g. CSVs) are ignored
        images_in_folder = [name for name in os.listdir(folder_path) if name.endswith('.jpg')]
        image_stems_in_folder = {name[:-len('.jpg')] for name in images_in_folder}

        # Filter data from filtered_metadata.csv for images in the folder
        matching_data = filtered_metadata[metadata_image_stems.isin(image_stems_in_folder)]

        # Add the treeIDs to the set of the corresponding subset
        subsets[subset].update(matching_data["treeID"])

# Create a joined train, valid, and test set
unique_ids = subsets["train"] | subsets["valid"] | subsets["test"]
# Get the sum of the size of each set
expected_total_unique_ids = len(subsets["train"]) + len(subsets["valid"]) + len(subsets["test"])

# treeIDs found in at least two subsets; only used to make a failure readable
overlapping_ids = (
    (subsets["train"] & subsets["valid"])
    | (subsets["train"] & subsets["test"])
    | (subsets["valid"] & subsets["test"])
)

print(expected_total_unique_ids, len(unique_ids))

# Without any treeID the size comparison below would pass trivially (0 == 0)
assert unique_ids, "No treeIDs were found in the subset folders; nothing was verified."

# Ensure that there is no duplicate by comparing the size of the joined set and the sum of the three sets
assert expected_total_unique_ids == len(unique_ids), (
    f"treeIDs present in more than one subset: {sorted(overlapping_ids, key=str)}"
)