# 0 - Data collection

In [48]:
import sys

# Verify Python executable path
# print(sys.executable)

In [49]:
# Install kaggle using the correct Python executable
#!{sys.executable} -m pip install kaggle

In [50]:
# conda list

In [51]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle API client used by every download cell below.
api = KaggleApi()
# Authenticate the client; credentials are resolved by the kaggle package itself
# (presumably ~/.kaggle/kaggle.json or KAGGLE_* environment variables - TODO confirm).
api.authenticate()

---

In [52]:
def data_structure_preview(CUSTOM_DIR, PARTITIONS, CATEGORIES):
    '''
    PARAMETERS:
    - CUSTOM_DIR [str]: Path to the dataset root that contains the partition folders
    - PARTITIONS [list]: Names of the partition folders to inspect (e.g., ['train', 'test', 'val'])
    - CATEGORIES [list]: Names of the category folders expected inside each partition

    FUNCTIONALITY:
    |  For every partition, print one line per category with the number of regular
    |  files found directly inside CUSTOM_DIR/<partition>/<category> (sub-directories
    |  are not counted), or a notice when that folder does not exist.
    |  Nothing is returned.
    '''
    import os

    for partition_name in PARTITIONS:
        print(f"\n--- {partition_name} folder ---")
        partition_dir = os.path.join(CUSTOM_DIR, partition_name)

        for category in CATEGORIES:
            category_dir = os.path.join(partition_dir, category)

            # Guard clause: report the missing folder and move on to the next category
            if not os.path.exists(category_dir):
                print(f"{category}: Folder does not exist")
                continue

            # Count only regular files; nested folders are ignored
            file_count = sum(
                1
                for entry in os.listdir(category_dir)
                if os.path.isfile(os.path.join(category_dir, entry))
            )
            print(f"{category}: {file_count} images")

---

## 0.1 - "Lettuce plant Disease Dataset"
[Kaggle • _santosh shaha_ Dataset](https://www.kaggle.com/datasets/santoshshaha/lettuce-plant-disease-dataset)

Structure:                                              <br>
📂 lettuce ( Total: *2813* _files_ )                   <br>
|   train ( Total: *2362* _files_ )                     <br> 
|   |--> Bacterial (*732* _files_)                       <br>
|   |--> fungal (*581* _files_)                          <br>
|   |--> healthy (*1049* _files_)                         <br>
|   test ( Total: *226* _files_ )                      <br>
|   |--> Bacterial (*75* _files_)                       <br>
|   |--> fungal (*75* _files_)                       <br>
|   |--> healthy (*76* _files_)                       <br>
|   valid ( Total: *225* _files_ )                      <br>
|   |--> Bacterial (*75* _files_)                       <br>
|   |--> fungal (*75* _files_)                       <br>
|   |--> healthy (*75* _files_)                       <br>

In [53]:
# Kaggle dataset reference in "<owner>/<dataset-slug>" form
dataset_1 = "santoshshaha/lettuce-plant-disease-dataset"

# Download target (relative path); the next cell appends "/lettuce" to this variable
custom_dir_1 = "../Datasets"
# Download the dataset into custom_dir_1 and unzip it there
api.dataset_download_files(dataset_1, path=custom_dir_1, unzip=True)

Dataset URL: https://www.kaggle.com/datasets/santoshshaha/lettuce-plant-disease-dataset


In [54]:
# Point custom_dir_1 at the extracted "lettuce" folder.
# The guard keeps this cell idempotent: without it, every re-run would append
# another "/lettuce" segment (e.g. "../Datasets/lettuce/lettuce").
if not custom_dir_1.endswith("/lettuce"):
    custom_dir_1 = custom_dir_1 + "/lettuce"
# Partition folders as they are named in this dataset (note: "valid", not "val")
partitions = ["train", "test", "valid"]
# "diseased" does not exist yet; it is listed so the preview shows it once it is created
categories = ["Bacterial", "fungal", "healthy", "diseased"]

data_structure_preview(CUSTOM_DIR=custom_dir_1, PARTITIONS=partitions, CATEGORIES=categories)


--- train folder ---
Bacterial: 732 images
fungal: 581 images
healthy: 1049 images
diseased: Folder does not exist

--- test folder ---
Bacterial: 75 images
fungal: 75 images
healthy: 76 images
diseased: Folder does not exist

--- valid folder ---
Bacterial: 75 images
fungal: 75 images
healthy: 75 images
diseased: Folder does not exist


## 0.2 - "PEST_AI Plant Leaf Disease Recognition Dataset"
[Kaggle • _Ibrahima Gabar Diop_ Dataset](https://www.kaggle.com/datasets/ibrahimagabardiop/niayes-crops-disease-v2)

---

In [55]:
def keep_desired_plants(DESIRED_PLANTS, CUSTOM_DIR, PARTITIONS):
    '''
    PARAMETERS:
    - DESIRED_PLANTS [list]: Names of the class subfolders to keep
    - CUSTOM_DIR [str]: Path to the custom directory where dataset is downloaded
    - PARTITIONS [list]: List of dataset partitions (e.g., ['train', 'test', 'val']);
      any number of partitions is accepted

    FUNCTIONALITY:
    |  Remove folders of undesired plant categories from the dataset.
    |  Every subfolder of CUSTOM_DIR/<partition> whose name is not in DESIRED_PLANTS
    |  is deleted together with its contents; loose files are left untouched.
    |  All partitions are scanned before anything is deleted, so a missing partition
    |  folder raises FileNotFoundError (from os.listdir) before any removal happens.
    '''
    import os
    import shutil

    # Scan phase: collect, per partition, the subfolders that are not in the keep-list
    unwanted_subfolders = {}
    for partition in PARTITIONS:
        partition_path = os.path.join(CUSTOM_DIR, partition)
        unwanted_subfolders[partition] = [
            name for name in os.listdir(partition_path)
            if os.path.isdir(os.path.join(partition_path, name)) and name not in DESIRED_PLANTS
        ]

    # Report phase: one summary line per partition, labelled with its real name
    for partition in PARTITIONS:
        print(f"Selected unwanted subfolders Length ({partition}):", len(unwanted_subfolders[partition]))

    # Delete phase: remove each unwanted subfolder recursively
    for partition in PARTITIONS:
        for subfolder in unwanted_subfolders[partition]:
            subfolder_path = os.path.join(CUSTOM_DIR, partition, subfolder)
            if os.path.exists(subfolder_path):
                shutil.rmtree(subfolder_path)
                print(f"Removed unwanted subfolder: {subfolder_path}")

---

In [56]:
# Kaggle dataset reference in "<owner>/<dataset-slug>" form
dataset_2 = "ibrahimagabardiop/niayes-crops-disease-v2"

# Download target (relative path) for the second dataset
custom_dir_2 = "../Datasets/lettuce_2"
# Download the dataset into custom_dir_2 and unzip it there
api.dataset_download_files(dataset_2, path=custom_dir_2, unzip=True)

Dataset URL: https://www.kaggle.com/datasets/ibrahimagabardiop/niayes-crops-disease-v2


In [57]:
# Class folders of the second dataset that are kept; everything else is removed
# by keep_desired_plants.
# Lettuce classes
desired_plants_lettuce_2 = ["Lettuce__Bacterial", "Lettuce__fungal", "Lettuce__healthy"]
# Cabbage classes later treated as bacterial (Black_Rot, Bacterial_spot_rot) plus the healthy class
desired_plants_cabbage_bacterialAndhealthy = [ "Cabbage__Black_Rot", "Cabbage__Bacterial_spot_rot", "Cabbage__Healthy"]
# Cabbage classes later treated as fungal
desired_plants_cabbage_fungal = ["Cabbage__Alternaria_Leaf_Spot", "Cabbage__Downy_Mildew", "Cabbage__ring_spot"]

# Full keep-list for the second dataset
desired_plants_2 = desired_plants_lettuce_2 + desired_plants_cabbage_bacterialAndhealthy + desired_plants_cabbage_fungal

In [58]:
# Partition folders used for the second dataset ("val" here, unlike "valid" in the first dataset's list)
partitions_2 = ["train", "test", "val"]

# Destructive step: deletes every class folder that is not in desired_plants_2
keep_desired_plants(DESIRED_PLANTS=desired_plants_2, CUSTOM_DIR=custom_dir_2, PARTITIONS=partitions_2)

Selected unwanted subfolders Length (train): 27
Selected unwanted subfolders Length (test): 27
Selected unwanted subfolders Length (val): 27
Removed unwanted subfolder: ../Datasets/lettuce_2\train\Bell_pepper__Bell_pepper_Bacterial_spot
Removed unwanted subfolder: ../Datasets/lettuce_2\train\Bell_pepper__Bell_pepper_Healthy
Removed unwanted subfolder: ../Datasets/lettuce_2\train\Brinjal__Diseased_Brinjal_Leaf_-_Cercospora_Leaf_Spot
Removed unwanted subfolder: ../Datasets/lettuce_2\train\Brinjal__Fresh_Brinjal_Leaf
Removed unwanted subfolder: ../Datasets/lettuce_2\train\Cabbage__Cabbage_aphid_colony
Removed unwanted subfolder: ../Datasets/lettuce_2\train\Cabbage__club_root
Removed unwanted subfolder: ../Datasets/lettuce_2\train\Citrus__Citrus_Black_spot
Removed unwanted subfolder: ../Datasets/lettuce_2\train\Citrus__Citrus_canker
Removed unwanted subfolder: ../Datasets/lettuce_2\train\Citrus__Citrus_greening
Removed unwanted subfolder: ../Datasets/lettuce_2\train\Citrus__Citrus_Healthy


In [59]:
# "diseased" does not exist yet; it is listed so the preview shows it once it is created
categories_2 = desired_plants_2 + ["diseased"]

# Count the files in each kept class folder of the second dataset
data_structure_preview(CUSTOM_DIR=custom_dir_2, PARTITIONS=partitions_2, CATEGORIES=categories_2)


--- train folder ---
Lettuce__Bacterial: 547 images
Lettuce__fungal: 476 images
Lettuce__healthy: 805 images
Cabbage__Black_Rot: 140 images
Cabbage__Bacterial_spot_rot: 140 images
Cabbage__Healthy: 140 images
Cabbage__Alternaria_Leaf_Spot: 140 images
Cabbage__Downy_Mildew: 140 images
Cabbage__ring_spot: 140 images
diseased: Folder does not exist

--- test folder ---
Lettuce__Bacterial: 118 images
Lettuce__fungal: 103 images
Lettuce__healthy: 173 images
Cabbage__Black_Rot: 30 images
Cabbage__Bacterial_spot_rot: 30 images
Cabbage__Healthy: 30 images
Cabbage__Alternaria_Leaf_Spot: 30 images
Cabbage__Downy_Mildew: 30 images
Cabbage__ring_spot: 30 images
diseased: Folder does not exist

--- val folder ---
Lettuce__Bacterial: 117 images
Lettuce__fungal: 102 images
Lettuce__healthy: 172 images
Cabbage__Black_Rot: 30 images
Cabbage__Bacterial_spot_rot: 30 images
Cabbage__Healthy: 30 images
Cabbage__Alternaria_Leaf_Spot: 30 images
Cabbage__Downy_Mildew: 30 images
Cabbage__ring_spot: 30 images


# 1 - Data pre-processing

## 1.1 - "Lettuce plant Disease Dataset"
[Kaggle • _santosh shaha_ Dataset](https://www.kaggle.com/datasets/santoshshaha/lettuce-plant-disease-dataset)

### Adjusting folders

---

In [60]:
def adjust_classes_lettuce(DATA_SOURCE, CUSTOM_DIR, PARTITIONS):
    '''
    PARAMETERS:
    - DATA_SOURCE [str]: Prefix added to the name of every moved file (e.g., "lettuce")
    - CUSTOM_DIR [str]: Path to the custom directory where dataset is downloaded
    - PARTITIONS [list]: List of dataset partitions (e.g., ['train', 'test', 'valid'])

    FUNCTIONALITY:
    |  Merge the "Bacterial" and "fungal" class folders of every partition into a
    |  single "diseased" folder (created if missing).
    |  Each regular file is moved and renamed to
    |  "<DATA_SOURCE>-bacterial_<name>" or "<DATA_SOURCE>-fungal_<name>".
    |  Sub-directories inside the source folders are left in place, and the emptied
    |  source folders themselves are not removed.
    '''
    import os

    # (source class folder, label embedded in the new file name)
    disease_sources = [("Bacterial", "bacterial"), ("fungal", "fungal")]

    for partition in PARTITIONS:
        print(f"\n--- Processing {partition} folder ---")

        # Create the diseased folder if it doesn't exist
        diseased_folder = os.path.join(CUSTOM_DIR, partition, "diseased")
        os.makedirs(diseased_folder, exist_ok=True)

        for folder_name, label in disease_sources:
            folder = os.path.join(CUSTOM_DIR, partition, folder_name)
            if not os.path.exists(folder):
                continue

            for filename in os.listdir(folder):
                src = os.path.join(folder, filename)
                if not os.path.isfile(src):
                    continue
                dst = os.path.join(diseased_folder, f"{DATA_SOURCE}-{label}_{filename}")
                # os.replace overwrites an existing target on every platform, so the
                # cell can be re-run after the dataset is re-extracted; os.rename
                # raises FileExistsError on Windows in that situation.
                os.replace(src, dst)

---

In [61]:
# Move the Bacterial and fungal images of the first dataset into a single "diseased" folder per partition
adjust_classes_lettuce(DATA_SOURCE="lettuce", CUSTOM_DIR=custom_dir_1, PARTITIONS=partitions)


--- Processing train folder ---

--- Processing test folder ---

--- Processing valid folder ---


In [62]:
# Check the counts after the merge: the source class folders should now hold 0 files
data_structure_preview(CUSTOM_DIR=custom_dir_1, PARTITIONS=partitions, CATEGORIES=categories)


--- train folder ---
Bacterial: 0 images
fungal: 0 images
healthy: 1049 images
diseased: 1313 images

--- test folder ---
Bacterial: 0 images
fungal: 0 images
healthy: 76 images
diseased: 150 images

--- valid folder ---
Bacterial: 0 images
fungal: 0 images
healthy: 75 images
diseased: 150 images


---

In [63]:
def delete_empty_lettuce_folders(PARTITIONS, CUSTOM_DIR=None):
    '''
    PARAMETERS:
    - PARTITIONS [list]: List of dataset partitions (e.g., ['train', 'test', 'valid'])
    - CUSTOM_DIR [str, optional]: Path to the custom directory where dataset is downloaded.
      When omitted, the notebook-level variable `custom_dir_1` is used, which keeps
      existing calls of the form delete_empty_lettuce_folders(partitions) working.

    FUNCTIONALITY:
    |  Delete the "Bacterial" and "fungal" folders of every partition when they are empty.
    |  A folder that is missing or still has content is reported and left untouched.
    '''
    import os

    if CUSTOM_DIR is None:
        # Backward-compatible fallback to the global the original implementation read
        CUSTOM_DIR = custom_dir_1

    for partition in PARTITIONS:
        print(f"\n--- Checking {partition} folder for empty directories ---")

        # Define the paths for the Bacterial and Fungal folders
        bacterial_folder = os.path.join(CUSTOM_DIR, partition, "Bacterial")
        fungal_folder = os.path.join(CUSTOM_DIR, partition, "fungal")

        for folder in [bacterial_folder, fungal_folder]:
            if os.path.exists(folder) and not os.listdir(folder):  # Check if folder exists and is empty
                os.rmdir(folder)
                print(f"Deleted empty folder: {folder}")
            else:
                print(f"Folder not deleted (either does not exist or is not empty): {folder}")

---

In [64]:
# Remove the now-empty Bacterial/fungal folders; the function reads the dataset root from the global custom_dir_1
delete_empty_lettuce_folders(partitions)


--- Checking train folder for empty directories ---
Deleted empty folder: ../Datasets/lettuce\train\Bacterial
Deleted empty folder: ../Datasets/lettuce\train\fungal

--- Checking test folder for empty directories ---
Deleted empty folder: ../Datasets/lettuce\test\Bacterial
Deleted empty folder: ../Datasets/lettuce\test\fungal

--- Checking valid folder for empty directories ---
Deleted empty folder: ../Datasets/lettuce\valid\Bacterial
Deleted empty folder: ../Datasets/lettuce\valid\fungal


In [65]:
# Confirm that only the healthy and diseased folders remain in the first dataset
data_structure_preview(CUSTOM_DIR=custom_dir_1, PARTITIONS=partitions, CATEGORIES=categories)


--- train folder ---
Bacterial: Folder does not exist
fungal: Folder does not exist
healthy: 1049 images
diseased: 1313 images

--- test folder ---
Bacterial: Folder does not exist
fungal: Folder does not exist
healthy: 76 images
diseased: 150 images

--- valid folder ---
Bacterial: Folder does not exist
fungal: Folder does not exist
healthy: 75 images
diseased: 150 images


## 1.2 - "PEST_AI Plant Leaf Disease Recognition Dataset" (_for_ Lettuce_2 files)
[Kaggle • _Ibrahima Gabar Diop_ Dataset](https://www.kaggle.com/datasets/ibrahimagabardiop/niayes-crops-disease-v2)

### Adjusting folders

---

In [66]:
def adjust_classes_lettuce_2(DATA_SOURCE, CUSTOM_DIR, PARTITIONS):
    '''
    PARAMETERS:
    - DATA_SOURCE [str]: Prefix added to the name of every moved file (e.g., "lettuce_2")
    - CUSTOM_DIR [str]: Path to the custom directory where dataset is downloaded
    - PARTITIONS [list]: List of dataset partitions (e.g., ['train', 'test', 'val'])

    FUNCTIONALITY:
    |  Merge the "Lettuce__Bacterial" and "Lettuce__fungal" class folders of every
    |  partition into a single "diseased" folder (created if missing).
    |  Each regular file is moved and renamed to
    |  "<DATA_SOURCE>-bacterial_<name>" or "<DATA_SOURCE>-fungal_<name>".
    |  Sub-directories inside the source folders are left in place, and the emptied
    |  source folders themselves are not removed.
    '''
    import os

    # Source class folder -> label embedded in the new file name
    label_by_folder = {
        "Lettuce__Bacterial": "bacterial",
        "Lettuce__fungal": "fungal",
    }

    for partition in PARTITIONS:
        print(f"\n--- Processing {partition} folder ---")

        # Create the diseased folder if it doesn't exist
        diseased_folder_2 = os.path.join(CUSTOM_DIR, partition, "diseased")
        os.makedirs(diseased_folder_2, exist_ok=True)

        for folder_name, label in label_by_folder.items():
            folder = os.path.join(CUSTOM_DIR, partition, folder_name)
            if not os.path.exists(folder):
                continue

            for filename in os.listdir(folder):
                src = os.path.join(folder, filename)
                if not os.path.isfile(src):
                    continue
                dst = os.path.join(diseased_folder_2, f"{DATA_SOURCE}-{label}_{filename}")
                # os.replace overwrites an existing target on every platform, so the
                # cell can be re-run after the dataset is re-extracted; os.rename
                # raises FileExistsError on Windows in that situation.
                os.replace(src, dst)

---

In [67]:
# Move the Lettuce__Bacterial and Lettuce__fungal images of the second dataset into "diseased"
adjust_classes_lettuce_2(DATA_SOURCE="lettuce_2", CUSTOM_DIR=custom_dir_2, PARTITIONS=partitions_2)


--- Processing train folder ---

--- Processing test folder ---

--- Processing val folder ---


In [68]:
# Check the counts after the merge: the two lettuce disease folders should now hold 0 files
data_structure_preview(CUSTOM_DIR=custom_dir_2, PARTITIONS=partitions_2, CATEGORIES=categories_2)


--- train folder ---
Lettuce__Bacterial: 0 images
Lettuce__fungal: 0 images
Lettuce__healthy: 805 images
Cabbage__Black_Rot: 140 images
Cabbage__Bacterial_spot_rot: 140 images
Cabbage__Healthy: 140 images
Cabbage__Alternaria_Leaf_Spot: 140 images
Cabbage__Downy_Mildew: 140 images
Cabbage__ring_spot: 140 images
diseased: 1023 images

--- test folder ---
Lettuce__Bacterial: 0 images
Lettuce__fungal: 0 images
Lettuce__healthy: 173 images
Cabbage__Black_Rot: 30 images
Cabbage__Bacterial_spot_rot: 30 images
Cabbage__Healthy: 30 images
Cabbage__Alternaria_Leaf_Spot: 30 images
Cabbage__Downy_Mildew: 30 images
Cabbage__ring_spot: 30 images
diseased: 221 images

--- val folder ---
Lettuce__Bacterial: 0 images
Lettuce__fungal: 0 images
Lettuce__healthy: 172 images
Cabbage__Black_Rot: 30 images
Cabbage__Bacterial_spot_rot: 30 images
Cabbage__Healthy: 30 images
Cabbage__Alternaria_Leaf_Spot: 30 images
Cabbage__Downy_Mildew: 30 images
Cabbage__ring_spot: 30 images
diseased: 219 images


---

In [69]:
def delete_empty_lettuce_2_folders(CUSTOM_DIR, PARTITIONS):
    '''
    PARAMETERS:
    - CUSTOM_DIR [str]: Path to the custom directory where dataset is downloaded
    - PARTITIONS [list]: List of dataset partitions (e.g., ['train', 'test', 'val'])

    FUNCTIONALITY:
    |  Delete the "Lettuce__Bacterial" and "Lettuce__fungal" folders of every
    |  partition when they are empty. A folder that is missing or still has
    |  content is reported and left untouched.
    '''
    import os

    class_folders = ("Lettuce__Bacterial", "Lettuce__fungal")

    for partition in PARTITIONS:
        print(f"\n--- Checking {partition} folder for empty directories ---")

        for class_folder in class_folders:
            target = os.path.join(CUSTOM_DIR, partition, class_folder)

            # Only an existing folder with no entries at all qualifies for removal
            is_empty_dir = os.path.exists(target) and len(os.listdir(target)) == 0
            if not is_empty_dir:
                print(f"Folder not deleted (either does not exist or is not empty): {target}")
                continue

            os.rmdir(target)
            print(f"Deleted empty folder: {target}")

---

In [70]:
# Remove the now-empty Lettuce__Bacterial / Lettuce__fungal folders of the second dataset
delete_empty_lettuce_2_folders(custom_dir_2, partitions_2)


--- Checking train folder for empty directories ---
Deleted empty folder: ../Datasets/lettuce_2\train\Lettuce__Bacterial
Deleted empty folder: ../Datasets/lettuce_2\train\Lettuce__fungal

--- Checking test folder for empty directories ---
Deleted empty folder: ../Datasets/lettuce_2\test\Lettuce__Bacterial
Deleted empty folder: ../Datasets/lettuce_2\test\Lettuce__fungal

--- Checking val folder for empty directories ---
Deleted empty folder: ../Datasets/lettuce_2\val\Lettuce__Bacterial
Deleted empty folder: ../Datasets/lettuce_2\val\Lettuce__fungal


## 1.3 - "PEST_AI Plant Leaf Disease Recognition Dataset" (_for_ Cabbage files)
[Kaggle • _Ibrahima Gabar Diop_ Dataset](https://www.kaggle.com/datasets/ibrahimagabardiop/niayes-crops-disease-v2)

### Adjusting folders

---

In [71]:
def adjust_classes_cabbage(DATA_SOURCE, CUSTOM_SRC, CUSTOM_DST, PARTITIONS):
    '''
    PARAMETERS:
    - DATA_SOURCE [str]: Prefix added to the name of every moved file (e.g., "cabbage")
    - CUSTOM_SRC [str]: Dataset root that holds the original "Cabbage__*" class folders
    - CUSTOM_DST [str]: Dataset root that receives the merged cabbage folders
    - PARTITIONS [list]: List of dataset partitions (e.g., ['train', 'test', 'val'])

    FUNCTIONALITY:
    |  For every partition, create CUSTOM_DST/<partition>/Cabbage_Diseased and
    |  CUSTOM_DST/<partition>/Cabbage_Healthy, then move every regular file of the
    |  six source class folders into them:
    |    Cabbage__Black_Rot            -> Cabbage_Diseased/<DATA_SOURCE>-bacterial_1_<name>
    |    Cabbage__Bacterial_spot_rot   -> Cabbage_Diseased/<DATA_SOURCE>-bacterial_2_<name>
    |    Cabbage__Alternaria_Leaf_Spot -> Cabbage_Diseased/<DATA_SOURCE>-fungal_1_<name>
    |    Cabbage__Downy_Mildew         -> Cabbage_Diseased/<DATA_SOURCE>-fungal_2_<name>
    |    Cabbage__ring_spot            -> Cabbage_Diseased/<DATA_SOURCE>-fungal_3_<name>
    |    Cabbage__Healthy              -> Cabbage_Healthy/<DATA_SOURCE>-healthy_<name>
    |  Missing source folders are skipped; emptied source folders are not removed.
    '''
    import os

    diseased_name = "Cabbage_Diseased"
    healthy_name = "Cabbage_Healthy"

    # (source class folder, destination folder, label embedded in the new file name)
    class_routes = [
        ("Cabbage__Black_Rot", diseased_name, "bacterial_1"),
        ("Cabbage__Bacterial_spot_rot", diseased_name, "bacterial_2"),
        ("Cabbage__Alternaria_Leaf_Spot", diseased_name, "fungal_1"),
        ("Cabbage__Downy_Mildew", diseased_name, "fungal_2"),
        ("Cabbage__ring_spot", diseased_name, "fungal_3"),
        ("Cabbage__Healthy", healthy_name, "healthy"),
    ]

    for partition in PARTITIONS:
        print(f"\n--- Processing {partition} folder ---")

        # Create both destination folders up front, even if a source class is missing
        for dst_name in (diseased_name, healthy_name):
            os.makedirs(os.path.join(CUSTOM_DST, partition, dst_name), exist_ok=True)

        for src_name, dst_name, label in class_routes:
            src_folder = os.path.join(CUSTOM_SRC, partition, src_name)
            if not os.path.exists(src_folder):
                continue

            dst_folder = os.path.join(CUSTOM_DST, partition, dst_name)
            for filename in os.listdir(src_folder):
                src = os.path.join(src_folder, filename)
                if not os.path.isfile(src):
                    continue
                dst = os.path.join(dst_folder, f"{DATA_SOURCE}-{label}_{filename}")
                # os.replace overwrites an existing target on every platform, so the
                # cell can be re-run after the dataset is re-extracted; os.rename
                # raises FileExistsError on Windows in that situation.
                # NOTE(review): like os.rename, this may fail if CUSTOM_SRC and
                # CUSTOM_DST are on different filesystems.
                os.replace(src, dst)

---

In [72]:
# Variables defined for the Cabbage data
# Destination root that receives the merged cabbage folders
custom_dir_3 = "../Datasets/cabbage"
# Same value as assigned in the earlier partitions_2 cell; repeated here
partitions_2 = ["train", "test", "val"]
# Folder names checked by the preview; some are listed only to confirm they are absent
categories_3 = ["Cabbage__Healthy", "Cabbage_Healthy", "Cabbage_Diseased", "diseased"]

In [73]:
# Move the cabbage images out of the second dataset (custom_dir_2) into the cabbage dataset root (custom_dir_3)
adjust_classes_cabbage(DATA_SOURCE="cabbage", CUSTOM_SRC=custom_dir_2, CUSTOM_DST=custom_dir_3, PARTITIONS=partitions_2)


--- Processing train folder ---

--- Processing test folder ---

--- Processing val folder ---


In [74]:
# Count the files in the newly created cabbage folders
data_structure_preview(CUSTOM_DIR=custom_dir_3, PARTITIONS=partitions_2, CATEGORIES=categories_3)


--- train folder ---
Cabbage__Healthy: Folder does not exist
Cabbage_Healthy: 140 images
Cabbage_Diseased: 700 images
diseased: Folder does not exist

--- test folder ---
Cabbage__Healthy: Folder does not exist
Cabbage_Healthy: 30 images
Cabbage_Diseased: 150 images
diseased: Folder does not exist

--- val folder ---
Cabbage__Healthy: Folder does not exist
Cabbage_Healthy: 30 images
Cabbage_Diseased: 150 images
diseased: Folder does not exist


---

In [79]:
def delete_empty_cabbage_folders(CUSTOM_DIR, PARTITIONS):
    '''
    PARAMETERS:
    - CUSTOM_DIR [str]: Path to the dataset root that holds the "Cabbage__*" class folders
    - PARTITIONS [list]: List of dataset partitions (e.g., ['train', 'test', 'val'])

    FUNCTIONALITY:
    |  Delete the six original "Cabbage__*" class folders of every partition when
    |  they are empty. A folder that is missing or still has content is reported
    |  and left untouched.
    '''
    import os

    # Checked in this order for every partition
    cabbage_classes = [
        "Cabbage__Black_Rot",
        "Cabbage__Bacterial_spot_rot",
        "Cabbage__Alternaria_Leaf_Spot",
        "Cabbage__Downy_Mildew",
        "Cabbage__ring_spot",
        "Cabbage__Healthy",
    ]

    for partition in PARTITIONS:
        print(f"\n--- Checking {partition} folder for empty directories ---")

        candidates = [os.path.join(CUSTOM_DIR, partition, class_name) for class_name in cabbage_classes]
        for candidate in candidates:
            # Missing folders and folders with any entry are only reported
            if not os.path.exists(candidate) or os.listdir(candidate):
                print(f"Folder not deleted (either does not exist or is not empty): {candidate}")
            else:
                os.rmdir(candidate)
                print(f"Deleted empty folder: {candidate}")

---

In [81]:
# The emptied "Cabbage__*" folders live in the second dataset root (custom_dir_2), not in custom_dir_3
delete_empty_cabbage_folders(custom_dir_2, partitions_2)


--- Checking train folder for empty directories ---
Deleted empty folder: ../Datasets/lettuce_2\train\Cabbage__Black_Rot
Deleted empty folder: ../Datasets/lettuce_2\train\Cabbage__Bacterial_spot_rot
Deleted empty folder: ../Datasets/lettuce_2\train\Cabbage__Alternaria_Leaf_Spot
Deleted empty folder: ../Datasets/lettuce_2\train\Cabbage__Downy_Mildew
Deleted empty folder: ../Datasets/lettuce_2\train\Cabbage__ring_spot
Deleted empty folder: ../Datasets/lettuce_2\train\Cabbage__Healthy

--- Checking test folder for empty directories ---
Deleted empty folder: ../Datasets/lettuce_2\test\Cabbage__Black_Rot
Deleted empty folder: ../Datasets/lettuce_2\test\Cabbage__Bacterial_spot_rot
Deleted empty folder: ../Datasets/lettuce_2\test\Cabbage__Alternaria_Leaf_Spot
Deleted empty folder: ../Datasets/lettuce_2\test\Cabbage__Downy_Mildew
Deleted empty folder: ../Datasets/lettuce_2\test\Cabbage__ring_spot
Deleted empty folder: ../Datasets/lettuce_2\test\Cabbage__Healthy

--- Checking val folder for em

## 1.4 - Data combining (_lettuce_ + _lettuce_2_)

In [82]:
# Show the root path, partition list and category list currently bound for each dataset
print(f"custom_dir_1: {custom_dir_1}; partitions: {partitions}; categories: {categories}")
print(f"custom_dir_2: {custom_dir_2}; partitions: {partitions_2}; categories: {categories_2}")
print(f"custom_dir_3: {custom_dir_3}; partitions: {partitions_2}; categories: {categories_3}")

custom_dir_1: ../Datasets/lettuce; partitions: ['train', 'test', 'valid']; categories: ['Bacterial', 'fungal', 'healthy', 'diseased']
custom_dir_2: ../Datasets/lettuce_2; partitions: ['train', 'test', 'val']; categories: ['Lettuce__Bacterial', 'Lettuce__fungal', 'Lettuce__healthy', 'Cabbage__Black_Rot', 'Cabbage__Bacterial_spot_rot', 'Cabbage__Healthy', 'Cabbage__Alternaria_Leaf_Spot', 'Cabbage__Downy_Mildew', 'Cabbage__ring_spot', 'diseased']
custom_dir_3: ../Datasets/cabbage; partitions: ['train', 'test', 'val']; categories: ['Cabbage__Healthy', 'Cabbage_Healthy', 'Cabbage_Diseased', 'diseased']


In [85]:
# Rebind the category lists to only the two class folders each dataset is left with.
# NOTE: this overwrites the longer lists defined in earlier cells.
categories = ['diseased', 'healthy']
categories_2 = ['diseased', 'Lettuce__healthy']
categories_3 = ['Cabbage_Diseased', 'Cabbage_Healthy']

# Final data structure preview
print("\n--- Lettuce Data Structure Preview: ---")
data_structure_preview(CUSTOM_DIR=custom_dir_1, PARTITIONS=partitions, CATEGORIES=categories)   # Lettuce Data
print("\n-----------------------------------------")
print("\n--- Lettuce_2 Data Structure Preview: ---")
data_structure_preview(CUSTOM_DIR=custom_dir_2, PARTITIONS=partitions_2, CATEGORIES=categories_2)   # Lettuce_2 Data
print("\n-----------------------------------------")
print("\n--- Cabbage Data Structure Preview: ---")
data_structure_preview(CUSTOM_DIR=custom_dir_3, PARTITIONS=partitions_2, CATEGORIES=categories_3)   # Cabbage Data


--- Lettuce Data Structure Preview: ---

--- train folder ---
diseased: 1313 images
healthy: 1049 images

--- test folder ---
diseased: 150 images
healthy: 76 images

--- valid folder ---
diseased: 150 images
healthy: 75 images

-----------------------------------------

--- Lettuce_2 Data Structure Preview: ---

--- train folder ---
diseased: 1023 images
Lettuce__healthy: 805 images

--- test folder ---
diseased: 221 images
Lettuce__healthy: 173 images

--- val folder ---
diseased: 219 images
Lettuce__healthy: 172 images

-----------------------------------------

--- Cabbage Data Structure Preview: ---

--- train folder ---
Cabbage_Diseased: 700 images
Cabbage_Healthy: 140 images

--- test folder ---
Cabbage_Diseased: 150 images
Cabbage_Healthy: 30 images

--- val folder ---
Cabbage_Diseased: 150 images
Cabbage_Healthy: 30 images


In [89]:
import os

# Renaming folder _valid_ in "lettuce" dataset to _val_ so that both lettuce
# datasets use the same partition names.
# The rename is keyed on the folder actually existing (not on the current content
# of `partitions`), so re-running this cell after the rename is a no-op instead
# of raising FileNotFoundError.
old_path = os.path.join(custom_dir_1, "valid")
new_path = os.path.join(custom_dir_1, "val")
if os.path.isdir(old_path):
    os.rename(old_path, new_path)
partitions = ["train", "test", "val"]

In [96]:
def combine_classes_lettuce(CUSTOM_SRC, CUSTOM_DST, PARTITIONS):
    '''
    PARAMETERS:
    - CUSTOM_SRC [str]: Path to the source dataset directory whose images are moved out
                        (reads the `<partition>/diseased` and `<partition>/Lettuce__healthy` folders)
    - CUSTOM_DST [str]: Path to the destination dataset directory that receives the images
                        (uses the `<partition>/diseased` and `<partition>/healthy` folders, created if missing)
    - PARTITIONS [list]: List of dataset partitions (e.g., ['train', 'test', 'val'])

    FUNCTIONALITY:
    |  For every partition: tag the healthy images of both datasets with a filename prefix,
    |  print a structure preview of both datasets, then move every source image into the
    |  matching destination class folder.
    |  Files that already carry a tag are not tagged again and a moved file never overwrites
    |  an existing destination file, so the function can be re-run on already merged data.
    '''
    import os
    DATA_SOURCE = "lettuce"

    # Filename prefixes used to tag healthy images.
    # NOTE(review): source files get the plain tag and destination files get the "_2" tag;
    # when CUSTOM_SRC is the "lettuce_2" dataset this looks swapped. The strings are kept
    # unchanged so names stay consistent with data that was already merged - confirm intent.
    SRC_HEALTHY_TAG = f"{DATA_SOURCE}-healthy_"
    DST_HEALTHY_TAG = f"{DATA_SOURCE}_2-healthy_"
    KNOWN_TAGS = (SRC_HEALTHY_TAG, DST_HEALTHY_TAG)

    def _free_path(folder, filename):
        # Return a path for `filename` inside `folder` that does not exist yet
        # (appends _1, _2, ... before the extension when the name is taken).
        candidate = os.path.join(folder, filename)
        stem, extension = os.path.splitext(filename)
        counter = 1
        while os.path.exists(candidate):
            candidate = os.path.join(folder, f"{stem}_{counter}{extension}")
            counter += 1
        return candidate

    def _tag_files(folder, tag):
        # Prefix every file in `folder` with `tag`, skipping files that are already tagged.
        for filename in os.listdir(folder):
            src = os.path.join(folder, filename)
            if not os.path.isfile(src) or filename.startswith(KNOWN_TAGS):
                continue
            os.rename(src, _free_path(folder, tag + filename))

    def _move_files(src_folder, dst_folder):
        # Move every file of `src_folder` into `dst_folder` without overwriting anything.
        for filename in os.listdir(src_folder):
            src = os.path.join(src_folder, filename)
            if os.path.isfile(src):
                os.rename(src, _free_path(dst_folder, filename))

    for partition in PARTITIONS:
        print("\n-----------------------------------------")
        print(f"\n--- Processing {partition} folder ---")

        # Define the paths for the _diseased_ and _Lettuce__healthy_ folders inside the source dataset
        diseased_SRC_folder = os.path.join(CUSTOM_SRC, partition, "diseased")
        healthy_SRC_folder = os.path.join(CUSTOM_SRC, partition, "Lettuce__healthy")

        # Create the destination _diseased_ and _healthy_ folders if they don't exist
        diseased_DST_folder = os.path.join(CUSTOM_DST, partition, "diseased")
        os.makedirs(diseased_DST_folder, exist_ok=True)
        healthy_DST_folder = os.path.join(CUSTOM_DST, partition, "healthy")
        os.makedirs(healthy_DST_folder, exist_ok=True)

        # Tag the healthy files of both datasets so their names cannot clash after the move
        if os.path.isdir(healthy_SRC_folder):
            _tag_files(healthy_SRC_folder, SRC_HEALTHY_TAG)
        _tag_files(healthy_DST_folder, DST_HEALTHY_TAG)

        # Preview both datasets using the function arguments instead of notebook globals
        print("\n--- Lettuce Data Structure Preview: ---")
        data_structure_preview(CUSTOM_DIR=CUSTOM_DST, PARTITIONS=PARTITIONS, CATEGORIES=["diseased", "healthy"])
        print("\n--- Lettuce_2 Data Structure Preview: ---")
        data_structure_preview(CUSTOM_DIR=CUSTOM_SRC, PARTITIONS=PARTITIONS, CATEGORIES=["diseased", "Lettuce__healthy"])

        # Move all files from the source dataset into the destination dataset
        for src_folder, dst_folder in (
            (diseased_SRC_folder, diseased_DST_folder),
            (healthy_SRC_folder, healthy_DST_folder),
        ):
            if os.path.isdir(src_folder):
                _move_files(src_folder, dst_folder)

In [97]:
# Merge the "lettuce_2" dataset (source, custom_dir_2) into the "lettuce" dataset (destination, custom_dir_1)
combine_classes_lettuce(CUSTOM_SRC=custom_dir_2, CUSTOM_DST=custom_dir_1, PARTITIONS=partitions)


-----------------------------------------

--- Processing train folder ---

--- Lettuce Data Structure Preview: ---

--- train folder ---
diseased: 2336 images
healthy: 1854 images

--- test folder ---
diseased: 371 images
healthy: 249 images

--- val folder ---
diseased: 369 images
healthy: 247 images

--- Lettuce_2 Data Structure Preview: ---

--- train folder ---
diseased: 0 images
Lettuce__healthy: 0 images

--- test folder ---
diseased: 0 images
Lettuce__healthy: 0 images

--- val folder ---
diseased: 0 images
Lettuce__healthy: 0 images

-----------------------------------------

--- Processing test folder ---

--- Lettuce Data Structure Preview: ---

--- train folder ---
diseased: 2336 images
healthy: 1854 images

--- test folder ---
diseased: 371 images
healthy: 249 images

--- val folder ---
diseased: 369 images
healthy: 247 images

--- Lettuce_2 Data Structure Preview: ---

--- train folder ---
diseased: 0 images
Lettuce__healthy: 0 images

--- test folder ---
diseased: 0 ima

In [102]:
# Clean up what is left of the "lettuce_2" dataset after combining classes
import os


for partition in partitions_2:
    print(f"\n--- Checking {partition} folder for empty directories ---")

    # Class folders that may remain inside this partition of "lettuce_2"
    diseased_folder = os.path.join(custom_dir_2, partition, "diseased")
    healthy_folder = os.path.join(custom_dir_2, partition, "Lettuce__healthy")

    # Remove each class folder only when it exists and holds no entries
    for folder in (diseased_folder, healthy_folder):
        removable = os.path.exists(folder) and not os.listdir(folder)
        if not removable:
            print(f"Folder not deleted (either does not exist or is not empty): {folder}")
            continue
        os.rmdir(folder)
        print(f"Deleted empty folder: {folder}")

    # Remove the partition folder itself once nothing is left inside it
    partition_folder = os.path.join(custom_dir_2, partition)
    if os.path.exists(partition_folder) and not os.listdir(partition_folder):
        os.rmdir(partition_folder)
        print("\n-----------------------------------------")
        print(f"\nDeleted empty partition folder: {partition_folder}")

# Move 'labels.csv' out of "lettuce_2" into the parent datasets folder under a dataset-specific name
labels_file_src = os.path.join(custom_dir_2, "labels.csv")
labels_file_dst = os.path.join("../Datasets", "lettuce_2_labels.csv")
if os.path.exists(labels_file_src):
    os.rename(labels_file_src, labels_file_dst)
    print("\n-----------------------------------------")
    print(f"\nMoved 'labels.csv' file to: {labels_file_dst}")

# Finally drop the "lettuce_2" folder itself when it exists and is empty
dataset_is_empty = os.path.exists(custom_dir_2) and not os.listdir(custom_dir_2)
if dataset_is_empty:
    os.rmdir(custom_dir_2)
    print("\n-----------------------------------------")
    print(f"\nDeleted empty folder: {custom_dir_2}")


--- Checking train folder for empty directories ---
Folder not deleted (either does not exist or is not empty): ../Datasets/lettuce_2\train\diseased
Folder not deleted (either does not exist or is not empty): ../Datasets/lettuce_2\train\Lettuce__healthy

--- Checking test folder for empty directories ---
Folder not deleted (either does not exist or is not empty): ../Datasets/lettuce_2\test\diseased
Folder not deleted (either does not exist or is not empty): ../Datasets/lettuce_2\test\Lettuce__healthy

--- Checking val folder for empty directories ---
Folder not deleted (either does not exist or is not empty): ../Datasets/lettuce_2\val\diseased
Folder not deleted (either does not exist or is not empty): ../Datasets/lettuce_2\val\Lettuce__healthy

-----------------------------------------

Moved 'labels.csv' file to: ../Datasets\lettuce_2_labels.csv

-----------------------------------------

Deleted empty folder: ../Datasets/lettuce_2


In [105]:
import os

# Renaming folders _diseased_ and _healthy_ in "lettuce" dataset
folder_renames = {"diseased": "Lettuce_Diseased", "healthy": "Lettuce_Healthy"}
for partition in partitions:
    for categorie in categories:
        if categorie not in folder_renames:
            continue
        old_path = os.path.join(custom_dir_1, partition, categorie)
        new_path = os.path.join(custom_dir_1, partition, folder_renames[categorie])
        # Guarded so the cell can be re-run: os.rename raises FileNotFoundError when the
        # folder was already renamed and can fail when the target folder already exists.
        if os.path.isdir(old_path) and not os.path.exists(new_path):
            os.rename(old_path, new_path)
        else:
            print(f"Skipped renaming (source missing or target already exists): {old_path} -> {new_path}")
# Keep the category list in sync with the folder names on disk
categories = ["Lettuce_Diseased", "Lettuce_Healthy"]

In [106]:
# Class folders of the cabbage dataset
categories_3 = ['Cabbage_Diseased', 'Cabbage_Healthy']

# Final data structure preview of the two remaining datasets
separator = "\n-----------------------------------------"

print("\n--- Lettuce Data Structure Preview: ---")
data_structure_preview(CUSTOM_DIR=custom_dir_1, PARTITIONS=partitions, CATEGORIES=categories)   # Lettuce Data

# The separator line is printed twice between the two previews
for _ in range(2):
    print(separator)

print("\n--- Cabbage Data Structure Preview: ---")
data_structure_preview(CUSTOM_DIR=custom_dir_3, PARTITIONS=partitions_2, CATEGORIES=categories_3)   # Cabbage Data


--- Lettuce Data Structure Preview: ---

--- train folder ---
Lettuce_Diseased: 2336 images
Lettuce_Healthy: 1854 images

--- test folder ---
Lettuce_Diseased: 371 images
Lettuce_Healthy: 249 images

--- val folder ---
Lettuce_Diseased: 369 images
Lettuce_Healthy: 247 images

-----------------------------------------

-----------------------------------------

--- Cabbage Data Structure Preview: ---

--- train folder ---
Cabbage_Diseased: 700 images
Cabbage_Healthy: 140 images

--- test folder ---
Cabbage_Diseased: 150 images
Cabbage_Healthy: 30 images

--- val folder ---
Cabbage_Diseased: 150 images
Cabbage_Healthy: 30 images
