# Dataset preprocessing

## WARNING: Do not use this notebook. Use update-dateset-splitting.ipynb instead, which both preprocesses and splits the dataset.

## Import libraries

In [11]:
import os
import pandas as pd
import shutil

## Dataset Preprocessing

In [12]:
# Preprocess the dataset: keep only images whose metadata row has one of the
# selected diseases and plant parts and a recorded treeID, move those image
# files into a preprocessing sub-directory, and write the matching metadata
# rows to 'filtered_metadata.csv' in that sub-directory.

# Change the working directory to the source directory
dataset_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'laguna_dataset')
os.chdir(dataset_path)

# Create a directory for dataset preprocessing inside the working directory
dataset_preprocessing_path = os.path.join(dataset_path, 'laguna_dataset_preprocessing')
os.makedirs(dataset_preprocessing_path, exist_ok=True)

# Load metadata from the CSV file
metadata = pd.read_csv('metadata.csv')  # Replace 'metadata.csv' with your actual CSV file name

# Define the specific diseases and parts included
diseases = ['Healthy', 'Bunchy top', 'Black sigatoka']
parts = ['foliage', 'leaf']

# Filter out rows whose 'treeID' is exactly 'Unrecorded'
metadata = metadata[metadata['treeID'] != 'Unrecorded']

# Shuffle the metadata to distribute the data randomly.
# NOTE: no random_state is given, so the row order differs on every run.
metadata = metadata.sample(frac=1).reset_index(drop=True)

# Select the rows matching the "disease" and "part" filters in one vectorised
# step. The 'treeID' filter was already applied above, so it is not repeated.
selected = metadata[metadata['disease'].isin(diseases) & metadata['part'].isin(parts)]

# Move the selected images, remembering which metadata rows have an image in
# the preprocessing directory afterwards.
image_extension = '.jpg'
kept_index = []
missing_images = []
for row_label, imageID in selected['imageID'].items():
    # Strip the extension only when it is a trailing suffix, so that a
    # '.jpg' occurring in the middle of an ID is left untouched.
    if imageID.endswith(image_extension):
        imageID = imageID[:-len(image_extension)]
    file_name = f"{imageID}{image_extension}"

    source_path = os.path.join(dataset_path, file_name)
    dest_path = os.path.join(dataset_preprocessing_path, file_name)

    if os.path.exists(source_path):
        shutil.move(source_path, dest_path)
    elif not os.path.exists(dest_path):
        # Neither in the source nor already moved: there is no image for
        # this row, so leave it out of the filtered metadata.
        missing_images.append(file_name)
        continue
    # else: the image was moved by an earlier run (or an earlier duplicate
    # row), so the row is kept and re-running this cell does not fail.

    kept_index.append(row_label)

if missing_images:
    print(f"Skipped {len(missing_images)} image(s) not found in {dataset_path}: {missing_images}")

# Slice the selected rows instead of rebuilding a DataFrame from row Series:
# this preserves column dtypes and keeps the column headers even when no row
# was kept.
filtered_metadata = selected.loc[kept_index]

# Save the filtered metadata to a CSV file inside the preprocessing directory
csv_file_path = os.path.join(dataset_preprocessing_path, 'filtered_metadata.csv')
filtered_metadata.to_csv(csv_file_path, index=False)