# **Data Collection**

## Objectives

* Fetch data from Kaggle

## Inputs

* Kaggle JSON file - the authentication token.  

## Outputs

* Generate Dataset: inputs/mildew_detection_in_cherry_leaves


# Change working directory

Select current directory

In [1]:
# Record the directory the notebook kernel is currently running in.
import os
current_dir = os.getcwd()
# A bare expression on the last line of a cell makes Jupyter display its value.
current_dir

'/workspace/mildew-detection-in-cherry-leaves/jupyter_notebooks'

Change the working directory to its parent directory
* os.path.dirname() gets the parent directory
* os.chdir() sets the new current directory

In [2]:
# Step up one level: os.path.dirname() of the current directory is its parent,
# so the relative paths used in later cells (kaggle.json, inputs/...) resolve
# from there.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [3]:
# Re-read the working directory to verify that the change took effect.
current_dir = os.getcwd()
# Displayed as the cell output.
current_dir

'/workspace/mildew-detection-in-cherry-leaves'

# Data Collection

Change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON

In [4]:
# Use the current working directory as the Kaggle configuration directory,
# i.e. the place where the kaggle.json authentication token is expected.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict the token file to owner read/write only (mode 600).
! chmod 600 kaggle.json

Set the Kaggle Dataset and Download it.

In [5]:
# Kaggle dataset identifier to download.
KaggleDatasetPath = "codeinstitute/cherry-leaves"
# Local folder (relative to the working directory) that receives the archive.
DestinationFolder = "inputs/mildew_detection_in_cherry_leaves"   
# IPython shell escape: {name} is replaced with the Python variable's value.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Downloading cherry-leaves.zip to inputs/mildew_detection_in_cherry_leaves
 98%|█████████████████████████████████████▎| 54.0M/55.0M [00:02<00:00, 38.3MB/s]
100%|██████████████████████████████████████| 55.0M/55.0M [00:02<00:00, 28.7MB/s]


Unzip the dataset


In [6]:
import zipfile

# Extract the downloaded archive into its own folder, then delete the .zip
# so only the extracted files remain.
archive_path = DestinationFolder + '/cherry-leaves.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(DestinationFolder)

os.remove(archive_path)

---

# Data Preparation

Data cleaning

In [7]:
import os

def remove_non_image_file(my_data_dir):
    """Delete every non-image file found one level below ``my_data_dir``.

    Each sub-directory of ``my_data_dir`` is scanned (non-recursively). Files
    whose name does not end in .png, .jpg or .jpeg (case-insensitive) are
    removed; nested directories and files sitting directly in ``my_data_dir``
    are skipped. A summary line is printed per scanned folder.

    Returns None. Prints an error and returns early if ``my_data_dir`` does
    not exist; a failed deletion is reported and scanning continues.
    """
    allowed_suffixes = ('.png', '.jpg', '.jpeg')

    # Nothing to scan when the dataset directory is missing
    if not os.path.exists(my_data_dir):
        print(f"Error: Directory {my_data_dir} does not exist.")
        return

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)

        # Plain files at the top level are left untouched
        if not os.path.isdir(folder_path):
            print(f"Skipping {folder_path} as it is not a directory.")
            continue

        image_count = 0
        non_image_count = 0

        for entry_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, entry_name)

            # os.remove() cannot delete directories, so leave them alone
            if os.path.isdir(file_path):
                print(f"Skipping directory: {file_path}")
                continue

            # Image files are only counted, never touched
            is_image = entry_name.lower().endswith(allowed_suffixes)
            if is_image:
                image_count += 1
                continue

            # Best effort: report a failed deletion and keep going
            try:
                os.remove(file_path)
                non_image_count += 1
                print(f"Removed non-image file: {file_path}")
            except Exception as e:
                print(f"Error removing file {file_path}: {e}")

        # Per-folder summary
        print(f"Folder: {folder} - contains {image_count} image files and {non_image_count} non-image files.")



## Split train validation test set

In [8]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir: str,
                                       train_set_ratio: float,
                                       validation_set_ratio: float,
                                       test_set_ratio: float) -> None:
    """Split each class folder of a dataset into train/validation/test folders.

    Every sub-directory of ``my_data_dir`` is treated as a class label. Its
    files are shuffled and moved into ``<my_data_dir>/train/<label>``,
    ``<my_data_dir>/validation/<label>`` and ``<my_data_dir>/test/<label>``;
    the emptied label folder is then removed.

    Args:
        my_data_dir: Directory holding one sub-directory per class label.
        train_set_ratio: Fraction of each label's files moved to ``train``.
        validation_set_ratio: Fraction of each label's files moved to
            ``validation``.
        test_set_ratio: Fraction intended for ``test``. The test set receives
            whatever is left once the other two sets are filled.

    Returns:
        None. If the three ratios do not sum to 1.0 a message is printed and
        nothing is moved. If a ``test`` folder already exists the dataset is
        assumed to be split already and nothing is done.
    """
    import math  # local import keeps the function independent of other cells

    ratio_total = train_set_ratio + validation_set_ratio + test_set_ratio
    # Compare with a tolerance: in binary floating point 0.7 + 0.2 + 0.1
    # evaluates to 0.9999999999999999, which an exact `!= 1.0` check would
    # wrongly reject.
    if not math.isclose(ratio_total, 1.0, rel_tol=0.0, abs_tol=1e-9):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    # Class labels are the sub-directory names only; stray files in the
    # dataset root (e.g. hidden OS files) must not be treated as labels.
    labels = [
        entry for entry in os.listdir(my_data_dir)
        if os.path.isdir(os.path.join(my_data_dir, entry))
    ]

    # A 'test' folder means the split was already performed
    if 'test' in labels:
        return

    # Create train, validation and test folders with one sub-folder per label
    for split_name in ('train', 'validation', 'test'):
        for label in labels:
            os.makedirs(os.path.join(my_data_dir, split_name, label))

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        files = os.listdir(label_dir)
        random.shuffle(files)

        # Index boundaries: [0, train_end) -> train,
        # [train_end, validation_end) -> validation, the rest -> test
        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)

        for position, file_name in enumerate(files):
            if position < train_end:
                split_name = 'train'
            elif position < validation_end:
                split_name = 'validation'
            else:
                split_name = 'test'

            shutil.move(os.path.join(label_dir, file_name),
                        os.path.join(my_data_dir, split_name, label, file_name))

        # The label folder is empty now; remove it from the dataset root
        os.rmdir(label_dir)

Divide the dataset into:
* Training set in 0.70 ratio
* Validation set in 0.10 ratio
* Test set in 0.20 ratio

In [9]:
# Split the extracted images 70 % / 10 % / 20 % into train, validation and test.
split_train_validation_test_images(
    my_data_dir="inputs/mildew_detection_in_cherry_leaves/cherry-leaves",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2,
)