# Data Collection

In [1]:
# Load the pycodestyle extension and switch on style checking for the
# cells that are run after this one.
%load_ext pycodestyle_magic
%pycodestyle_on


## Objectives

- Fetch data from Kaggle and prepare it for further processes.
- Load the cherry leaf image dataset.
- Clean the dataset by removing non-image files.
- Confirm dataset structure for use in later modeling steps.

## Inputs

- Kaggle JSON file - the authentication token.
- `inputs/cherry_leaves/cherry-leaves` directory containing image files for different classes.

## Outputs

- Generate Dataset: `inputs/cherry_leaves/cherry-leaves`, split into `train`, `validation` and `test` sub-folders.
- Cleaned dataset with only image files remaining.
- Verified file structure for downstream modeling.
- Conclusions and next steps

## Additional Comments

* No additional comments.



# Import packages

In [2]:
# Install the project's dependencies into this kernel's environment.
# NOTE(review): the absolute /workspaces/... path only resolves inside
# this workspace layout; confirm before running the notebook elsewhere.
%pip install -r /workspaces/milestone-project-mildew-detection-in-cherry-leaves/requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy
import os

---

# Change working directory

In [4]:
# Record the kernel's current working directory and display it.
current_dir = os.getcwd()
current_dir

'/workspaces/milestone-project-mildew-detection-in-cherry-leaves/jupyter_notebooks'

In [5]:
# Move the working directory one level up, to the parent of the folder
# recorded in `current_dir`.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [6]:
# Display the working directory without rebinding `current_dir`: the
# chdir cell above derives its target from that variable, so overwriting
# it here would make a re-run of that cell climb one more level.
os.getcwd()

'/workspaces/milestone-project-mildew-detection-in-cherry-leaves'

# Install Kaggle

In [7]:
# Install the Kaggle command-line client used below to download the dataset.
# NOTE(review): the version is not pinned here.
%pip install kaggle

In [8]:
# Point KAGGLE_CONFIG_DIR at the current working directory, where
# kaggle.json (the authentication token) is expected to be.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict the token file to owner read/write only.
! chmod 600 kaggle.json

In [9]:
# Kaggle dataset identifier and the local folder to download it into.
KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/cherry_leaves"

# The {...} placeholders interpolate the Python variables above into the
# shell command.
!kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

4:28: E225 missing whitespace around operator
4:51: E225 missing whitespace around operator


In [10]:
import zipfile
# Unpack the downloaded archive into the destination folder, then delete
# the zip so only the extracted files remain.
archive_path = DestinationFolder + '/cherry-leaves.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(DestinationFolder)

os.remove(archive_path)

---

# Data Preparation

## Data cleaning
- Check and remove non-image files

In [11]:
def remove_non_image_file(my_data_dir):
    """Delete non-image files from each class folder and print the counts.

    Looks at the sub-folders directly under ``my_data_dir`` and removes
    every file whose name does not end in .png, .jpg or .jpeg (case
    insensitive). Entries of ``my_data_dir`` that are not folders are
    skipped instead of being listed as if they were class folders.

    Note: a later cell defines a function with the same name; once that
    cell has run, it replaces this definition.

    Parameters
    ----------
    my_data_dir : str
        Folder holding one sub-folder per class label.
    """
    image_extension = ('.png', '.jpg', '.jpeg')
    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            # A stray file next to the class folders cannot be listed.
            continue

        image_count = 0
        non_image_count = 0
        for given_file in os.listdir(folder_path):
            if given_file.lower().endswith(image_extension):
                image_count += 1
            else:
                os.remove(os.path.join(folder_path, given_file))
                non_image_count += 1
        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", non_image_count)

In [12]:
def remove_non_image_file(my_data_dir):
    """Recursively delete every non-image file below ``my_data_dir``.

    A file is kept when its lower-cased name ends in .jpg, .jpeg or .png;
    anything else is removed. The number of removed files is printed.

    Parameters
    ----------
    my_data_dir : str
        Root folder to walk.
    """
    allowed_suffixes = ('.jpg', '.jpeg', '.png')
    deleted_paths = []

    for current_dir, _subdirs, filenames in os.walk(my_data_dir):
        unwanted = [
            name for name in filenames
            if not name.lower().endswith(allowed_suffixes)
        ]
        for name in unwanted:
            target = os.path.join(current_dir, name)
            os.remove(target)
            deleted_paths.append(target)

    print(f"Removed {len(deleted_paths)} non-image files.")

In [13]:
# Remove non-image files from the extracted dataset folder.
remove_non_image_file(my_data_dir='inputs/cherry_leaves/cherry-leaves')

In [14]:
# Report how many files and sub-folders each directory level holds.
dataset_root = 'inputs/cherry_leaves/cherry-leaves'
for folder, subfolders, filenames in os.walk(dataset_root):
    n_files, n_dirs = len(filenames), len(subfolders)
    print(f"{folder} contains {n_files} files and {n_dirs} directories")

## Split into train, validation and test sets

In [15]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(
    my_data_dir,
    train_set_ratio,
    validation_set_ratio,
    test_set_ratio,
    seed=None
):
    """Split each class folder into train, validation and test sub-folders.

    The files of every class folder directly under ``my_data_dir`` are
    shuffled and moved to ``<my_data_dir>/train/<label>``,
    ``<my_data_dir>/validation/<label>`` and ``<my_data_dir>/test/<label>``.
    The emptied class folder is then removed.

    Parameters
    ----------
    my_data_dir : str
        Folder holding one sub-folder per class label.
    train_set_ratio, validation_set_ratio, test_set_ratio : float
        Fractions of each class assigned to the three subsets; they must
        sum to 1.0 (within floating-point tolerance).
    seed : int, optional
        When given, files are shuffled with a dedicated generator seeded
        with this value so the split is repeatable. When omitted, the
        module-level ``random`` state is used.

    Returns
    -------
    None
        If the ratios do not sum to 1.0 a message is printed and nothing
        is moved. If a ``test`` entry already exists in ``my_data_dir``
        the folder is treated as already split and left untouched.
    """
    ratio_total = train_set_ratio + validation_set_ratio + test_set_ratio
    # Compare with a tolerance: sums such as 0.6 + 0.3 + 0.1 are not
    # exactly 1.0 in binary floating point.
    if abs(ratio_total - 1.0) > 1e-9:
        print(
            "train_set_ratio + validation_set_ratio + "
            "test_set_ratio should sum to 1.0")
        return

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        # Already split; splitting again would nest the folders.
        return

    # Only sub-folders are class labels; stray files are left alone.
    labels = [
        entry for entry in entries
        if os.path.isdir(os.path.join(my_data_dir, entry))
    ]
    shuffler = random if seed is None else random.Random(seed)

    # Create the train/validation/test folders with one sub-folder per label.
    for subset in ('train', 'validation', 'test'):
        for label in labels:
            os.makedirs(os.path.join(my_data_dir, subset, label))

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        # Sort first so a given seed yields the same split regardless of
        # the order in which the file system lists the files.
        files = sorted(os.listdir(label_dir))
        shuffler.shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)
        subsets = (
            ('train', files[:train_end]),
            ('validation', files[train_end:validation_end]),
            ('test', files[validation_end:]),  # whatever is left over
        )
        for subset, subset_files in subsets:
            for file_name in subset_files:
                shutil.move(
                    os.path.join(label_dir, file_name),
                    os.path.join(my_data_dir, subset, label, file_name))

        os.rmdir(label_dir)

In [16]:
# Split the dataset into 70% train, 10% validation and 20% test.
split_train_validation_test_images(
    my_data_dir="inputs/cherry_leaves/cherry-leaves",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2)

---

##  Conclusions and Next Steps

- The dataset was successfully loaded and cleaned.
- Non-image files were removed to ensure error-free loading during modelling.
- Directory structures and image counts for each class and subset were confirmed.

###  Next Steps:
- Begin data visualisation to explore image properties.
- Assess class balance and visual differences between healthy and infected leaves.