In [None]:
import os
import pandas as pd
import shutil
import random

In [None]:
# Dataset preprocessing
#
# Selects the images whose metadata row has a wanted disease and part and a
# recorded treeID, moves them from the dataset folder into a dedicated
# preprocessing folder, and writes the matching metadata rows to
# 'filtered_metadata.csv' inside that folder.

# Seed for the metadata shuffle so a re-run produces the same row order
RANDOM_SEED = 42

# Define the source path
dataset_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'laguna_dataset')

# Define the directory path for preprocessing
dataset_preprocessing_path = os.path.join(dataset_path, 'laguna_dataset_preprocessed')

# Filtered metadata file; together with the directory it marks preprocessing as done
csv_file_path = os.path.join(dataset_preprocessing_path, 'filtered_metadata.csv')

# Check if the preprocessing directory and CSV file already exist
if os.path.exists(dataset_preprocessing_path) and os.path.exists(csv_file_path):
    print("Preprocessing directory and metadata file already exist. Skipping data preprocessing.")
else:
    # Load metadata from the CSV file. This is validated before anything is
    # created on disk, so a wrong path leaves no empty output directory behind.
    metadata_file_path = os.path.join(dataset_path, 'metadata.csv')  # Replace with your actual CSV file path
    if not os.path.exists(metadata_file_path):
        # Raise instead of calling exit(): inside a notebook exit() shuts the
        # kernel down, whereas an exception stops the run and shows the cause.
        raise FileNotFoundError(
            "Metadata file not found. Please provide the correct path to your metadata file. "
            f"(looked for: {metadata_file_path})"
        )
    metadata = pd.read_csv(metadata_file_path)

    # Create the directory for dataset preprocessing inside the working directory
    os.makedirs(dataset_preprocessing_path, exist_ok=True)

    # Define the specific diseases and parts included
    diseases = ['Healthy', 'Bunchy top', 'Black sigatoka']
    parts = ['foliage', 'leaf']

    # Filter out rows with 'treeID' containing 'Unrecorded'
    metadata = metadata[metadata['treeID'] != 'Unrecorded']

    # Shuffle the metadata to distribute the data randomly (seeded for reproducibility)
    metadata = metadata.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    # Apply the "disease" and "part" filters in one vectorised step
    wanted = metadata['disease'].isin(diseases) & metadata['part'].isin(parts)
    candidates = metadata[wanted]

    kept_rows = []       # index labels of rows whose image is in the preprocessing folder
    missing_images = []  # image IDs found neither in the source nor in the destination

    for row_label, imageID in candidates['imageID'].items():
        # Remove the ".jpg" extension from imageID if present, then add it back,
        # so IDs stored with and without the extension resolve to the same file
        imageID = imageID.replace(".jpg", "")
        source_path = os.path.join(dataset_path, f"{imageID}.jpg")
        dest_path = os.path.join(dataset_preprocessing_path, f"{imageID}.jpg")

        if os.path.exists(source_path):
            shutil.move(source_path, dest_path)
        elif not os.path.exists(dest_path):
            # Listed in the metadata but absent on disk: skip the row rather
            # than abort halfway with part of the images already moved.
            missing_images.append(imageID)
            continue
        # else: already moved by an earlier, interrupted run -- keep the row

        kept_rows.append(row_label)

    if missing_images:
        print(f"Warning: {len(missing_images)} image(s) listed in the metadata were not found and were skipped:")
        for imageID in missing_images:
            print(f"- {imageID}")

    # Selecting from the DataFrame keeps the column headers even when no row
    # matched, so the saved CSV can always be read back
    filtered_metadata = candidates.loc[kept_rows]

    # Save the filtered metadata to a CSV file inside the preprocessing directory
    filtered_metadata.to_csv(csv_file_path, index=False)

In [None]:
# Report how many rows of 'filtered_metadata.csv' share each treeID
# Location of the 'filtered_metadata.csv' file written by the preprocessing step
filtered_metadata_path = os.path.join(dataset_preprocessing_path, 'filtered_metadata.csv')

if not os.path.exists(filtered_metadata_path):
    # Nothing to analyse without the filtered metadata
    print("The 'filtered_metadata.csv' file does not exist. Please make sure the file is in the correct location.")
else:
    # Read the filtered metadata back from disk
    filtered_metadata = pd.read_csv(filtered_metadata_path)

    # Number of rows per treeID, most frequent first
    # (value_counts leaves out rows whose treeID is missing)
    treeID_duplicate_counts = filtered_metadata['treeID'].value_counts()

    # One output line per treeID with its row count
    print("treeID - Count of Duplicates:")
    for tree_id, occurrences in treeID_duplicate_counts.items():
        print(f"{tree_id} - {occurrences}")

    # Adding up the per-tree counts gives the number of rows that carry a
    # treeID; compare it with the expected total of 883
    total_duplicates = treeID_duplicate_counts.sum()
    print(f"Total Duplicate Counts: {total_duplicates}")
    print(
        "The sum of duplicate counts is equal to 883."
        if total_duplicates == 883
        else "The sum of duplicate counts is not equal to 883."
    )

In [None]:
# Dataset Splitting
#
# Splits the preprocessed images into train/valid/test at the tree level: every
# image of a given treeID goes to the same subset, so no tree is shared between
# subsets. Images are moved from the preprocessing folder into
# <subset>/<disease>/ subfolders.

# Seed for the tree shuffle so the split is identical after a kernel restart
SPLIT_SEED = 42

# Define the diseases (class labels)
diseases = ['Healthy', 'Black sigatoka', 'Bunchy top']

# Create directories for train, valid, and test datasets
train_dir = os.path.join(dataset_preprocessing_path, 'train')
valid_dir = os.path.join(dataset_preprocessing_path, 'valid')
test_dir = os.path.join(dataset_preprocessing_path, 'test')

# Create subdirectories for each disease in train, valid, and test directories
for subset_dir in [train_dir, valid_dir, test_dir]:
    for disease in diseases:
        os.makedirs(os.path.join(subset_dir, disease), exist_ok=True)

# Fraction of trees (not images) assigned to each subset
train_ratio = 0.7
valid_ratio = 0.2
test_ratio = 0.1

# 'test' receives whatever is left after train and valid, so its ratio is only
# honoured when the three ratios add up to 1
assert abs(train_ratio + valid_ratio + test_ratio - 1.0) < 1e-9, \
    "train_ratio + valid_ratio + test_ratio must sum to 1"

# Rows without a treeID cannot be grouped by tree; they are left where they are
rows_without_treeID = int(filtered_metadata['treeID'].isna().sum())
if rows_without_treeID:
    print(f"Warning: {rows_without_treeID} image(s) have no treeID and will not be assigned to any subset.")

# Shuffle the unique treeIDs to ensure randomness.
# Sorting first makes the result depend only on the set of IDs and the seed,
# not on the row order of the CSV; a private Random instance keeps the seed
# from affecting other users of the global `random` state.
unique_treeIDs = filtered_metadata['treeID'].dropna().unique()
shuffled_treeIDs = sorted(unique_treeIDs.tolist(), key=str)
random.Random(SPLIT_SEED).shuffle(shuffled_treeIDs)

# Initialize dictionaries to keep track of image allocations
image_allocations = {treeID: None for treeID in shuffled_treeIDs}  # treeID -> subset
allocated_counts = {'train': 0, 'valid': 0, 'test': 0}             # trees per subset
placed_counts = {'train': 0, 'valid': 0, 'test': 0}                # images per subset
total_trees = len(shuffled_treeIDs)

# Split and move images to their respective subsets
for treeID in shuffled_treeIDs:
    # Fill 'train' until it holds its share of trees, then 'valid'; the rest is 'test'
    if allocated_counts['train'] / total_trees < train_ratio:
        subset = 'train'
    elif allocated_counts['valid'] / total_trees < valid_ratio:
        subset = 'valid'
    else:
        subset = 'test'

    # Get all images with the current treeID
    treeID_images = filtered_metadata[filtered_metadata['treeID'] == treeID]

    for imageID, disease in zip(treeID_images['imageID'], treeID_images['disease']):
        imageID = imageID.replace(".jpg", "")  # Remove the ".jpg" extension

        # Construct the source and target file paths
        image_path = os.path.join(dataset_preprocessing_path, f"{imageID}.jpg")
        target_path = os.path.join(dataset_preprocessing_path, subset, disease, f"{imageID}.jpg")

        if os.path.exists(image_path):
            shutil.move(image_path, target_path)
            placed_counts[subset] += 1
        elif os.path.exists(target_path):
            # Already moved by an earlier run of this cell; with a fixed seed
            # the tree lands in the same subset, so there is nothing left to do
            placed_counts[subset] += 1
        else:
            print(f"Source image {imageID} not found at {image_path}")

    # Update allocation and allocated counts
    image_allocations[treeID] = subset
    allocated_counts[subset] += 1

# One summary line per subset instead of one line per moved image
for subset_name in ('train', 'valid', 'test'):
    print(f"{subset_name}: {allocated_counts[subset_name]} tree(s), {placed_counts[subset_name]} image(s)")

print("Data splitting and copying completed.")

In [None]:
# Verify that images from the same treeID are only in one subset (i.e., Train, Val, or Test).
# Define the diseases (class labels)
diseases = ['Healthy', 'Black sigatoka', 'Bunchy top']
subsets = ['train', 'valid', 'test']

# File name under which each metadata row's image is stored on disk. The
# preprocessing and splitting steps strip any ".jpg" from imageID and append it
# again, so the same rule is applied here; comparing the raw imageID with the
# folder listing would match nothing when the IDs carry no extension.
stored_filenames = filtered_metadata['imageID'].astype(str).str.replace(".jpg", "", regex=False) + ".jpg"

# treeIDs seen in each subset, collected across all disease folders
treeIDs_by_subset = {subset: set() for subset in subsets}

# Iterate through subsets and diseases
for subset in subsets:
    for disease in diseases:
        folder_path = os.path.join(dataset_preprocessing_path, subset, disease)
        if not os.path.isdir(folder_path):
            # The folder only exists once the splitting step has been run
            print(f"Folder {subset}/{disease} not found at {folder_path}; skipping.")
            continue
        images_in_folder = os.listdir(folder_path)

        # Filter data from filtered_metadata.csv for images in the folder
        matching_data = filtered_metadata[stored_filenames.isin(images_in_folder)]

        print(f"TreeID counts in {subset}/{disease} folder:")
        treeID_counts = matching_data['treeID'].value_counts()
        for treeID, count in treeID_counts.items():
            print(f"- TreeID {treeID}: {count} times")

        treeIDs_by_subset[subset].update(treeID_counts.index)

print("Counts printed for each folder.")

# The actual check: no treeID may be present in two different subsets
leak_found = False
for position, first_subset in enumerate(subsets):
    for second_subset in subsets[position + 1:]:
        shared_treeIDs = treeIDs_by_subset[first_subset] & treeIDs_by_subset[second_subset]
        if shared_treeIDs:
            leak_found = True
            print(
                f"LEAK: {len(shared_treeIDs)} treeID(s) appear in both {first_subset} and {second_subset}: "
                f"{sorted(shared_treeIDs, key=str)}"
            )

if not leak_found:
    print("OK: no treeID appears in more than one subset.")