In [1]:
# Imports from Video 10 - Image Preparation for Convolutional Neural Networks with TensorFlow's Keras API
# Source link: https://www.youtube.com/watch?v=_L2uYfVV48I&list=PLkUrsn8FkQFb5Gr_CY7HQErZRr6mPb-2q&index=11&t=222s

import numpy as np # linear algebra
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Flatten, BatchNormalization, Conv2D, MaxPool2D
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import confusion_matrix
import itertools
import shutil
import random
import glob
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
# Change the working directory to the source directory.
# NOTE: os.chdir affects the whole kernel; it is kept because later cells may
# rely on relative paths, but nothing in this cell depends on it any more.
dataset_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'laguna_dataset')
os.chdir(dataset_path)

# Create a directory for dataset preprocessing inside the working directory
# (exist_ok=True makes this safe to re-run).
dataset_preprocessing_path = os.path.join(dataset_path, 'laguna_dataset_preprocessing')
os.makedirs(dataset_preprocessing_path, exist_ok=True)

# Load metadata from the CSV file (replace 'metadata.csv' with your actual CSV file name).
# The path is built from dataset_path so the read does not depend on the current
# working directory.
metadata = pd.read_csv(os.path.join(dataset_path, 'metadata.csv'))

# Define the specific diseases and parts included
diseases = ['Healthy', 'Bunchy top', 'Black sigatoka']
parts = ['foliage', 'leaf']

# Drop rows whose 'treeID' is exactly 'Unrecorded' (equality test, not a substring match)
metadata = metadata[metadata['treeID'] != 'Unrecorded']

# Shuffle the metadata to distribute the data randomly.
# A fixed seed keeps the row order identical across "Restart Kernel -> Run All".
SHUFFLE_SEED = 42
metadata = metadata.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Iterate through the metadata, apply the "disease," "part," and "treeID" filters,
# and move the matching images into the preprocessing directory.
#
# The loop is safe to re-run: images that were already moved are skipped, and
# images that cannot be found are collected and reported at the end instead of
# aborting the loop half-way with a FileNotFoundError.
moved_count = 0
already_moved_count = 0
missing_id_count = 0
missing_images = []

for _, row in metadata.iterrows():
    imageID = row['imageID']
    disease = row['disease']
    part = row['part']
    treeID = row['treeID']

    # Guard clause: skip rows outside the selected diseases/parts or without a recorded tree.
    if disease not in diseases or part not in parts or treeID == 'Unrecorded':
        continue

    # A missing imageID cannot be mapped to a file name; count it so it shows up in the summary.
    if pd.isna(imageID):
        missing_id_count += 1
        continue

    # Strip only a trailing ".jpg" (str.replace would also remove ".jpg" from
    # the middle of an ID); the extension is re-added below.
    imageID = str(imageID)
    if imageID.endswith(".jpg"):
        imageID = imageID[:-len(".jpg")]

    source_path = os.path.join(dataset_path, f"{imageID}.jpg")
    dest_path = os.path.join(dataset_preprocessing_path, f"{imageID}.jpg")

    if os.path.isfile(source_path):
        shutil.move(source_path, dest_path)
        moved_count += 1
    elif os.path.isfile(dest_path):
        # Source is gone but the destination exists: moved by a previous run.
        already_moved_count += 1
    else:
        # Neither location has the file: record it for the summary below.
        missing_images.append(f"{imageID}.jpg")

print(f"Moved {moved_count} image(s); {already_moved_count} already in place; "
      f"{len(missing_images)} missing; {missing_id_count} row(s) without an imageID.")
if missing_images:
    print("First missing files:", missing_images[:10])