# Import packages

In [1]:
# Install the dependencies listed in the project's requirements.txt.
# NOTE(review): the path is absolute, so this cell only works when the
# repository is checked out at /workspace/pp5-mildew-detection.
%pip install -r /workspace/pp5-mildew-detection/requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy
import os

# Change working directory

* We are assuming you will store the notebooks in a subfolder; therefore, when running the notebook in the editor, you will need to change the working directory.

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [3]:
# Record the current working directory; the bare name on the last line
# makes the notebook display its value.
current_dir = os.getcwd()
current_dir

'/workspace/pp5-mildew-detection/jupyter_notebooks'

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [4]:
# Move the working directory up one level, to the parent of current_dir.
# NOTE: this cell is not idempotent. The confirmation cell further down
# reassigns current_dir to the new working directory, so re-running this cell
# afterwards moves up one more level.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [5]:
# Re-read the working directory after the chdir and display it to confirm
# the change. This overwrites the earlier value of current_dir.
current_dir = os.getcwd()
current_dir

'/workspace/pp5-mildew-detection'

---

# **Data Collection**

## Objectives

* Fetch and prepare the data for processing

## Inputs

* Kaggle JSON file (`kaggle.json`) containing the authentication token.

## Outputs

* Generate Dataset: inputs/mildew_dataset/cherry-leaves

## Additional Comments | Insights | Conclusions


* No additional comments.

---

# Install Kaggle

In [6]:
# Install the Kaggle client, pinned to version 1.5.12.
%pip install kaggle==1.5.12


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


 **Change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON**.

In [7]:
# Use the current working directory as the Kaggle configuration directory.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Shell command: restrict kaggle.json (looked up relative to the working
# directory) to owner read/write only.
! chmod 600 kaggle.json

* Get the dataset path from the [Kaggle URL](https://www.kaggle.com/datasets/codeinstitute/cherry-leaves). When you are viewing the dataset on Kaggle, check what comes after https://www.kaggle.com/ (in some cases after kaggle.com/datasets/). Copy that into KaggleDatasetPath.
* Set your destination folder

**TODO:** Add a screenshot of the Kaggle dataset page here.

Set the Kaggle Dataset and Download it.

In [8]:
# Dataset identifier: the part of the Kaggle URL after kaggle.com/datasets/.
KaggleDatasetPath = "codeinstitute/cherry-leaves"
# Folder, relative to the working directory, that receives the download.
DestinationFolder = "inputs/mildew_dataset"   
# Shell call to the Kaggle CLI; {name} interpolates the Python variables above.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Downloading cherry-leaves.zip to inputs/mildew_dataset
 98%|█████████████████████████████████████▎| 54.0M/55.0M [00:02<00:00, 36.3MB/s]
100%|██████████████████████████████████████| 55.0M/55.0M [00:02<00:00, 26.2MB/s]


Unzip the downloaded file, and delete the zip file.

In [9]:
import zipfile
# Location of the archive that the download step placed in DestinationFolder.
zip_path = DestinationFolder + '/cherry-leaves.zip'

# Unpack everything next to the archive, then discard the archive itself.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(DestinationFolder)
os.remove(zip_path)

# Data Preparation

---

## Data cleaning

### Check and remove non-image files, then randomly remove a percentage of the images

In [10]:
import os
import random

def _list_image_files(folder_path, image_extensions):
    """Return the sorted paths of the regular files in folder_path that have an image extension."""
    return sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(folder_path, name))
    )


def prepeare_image_data(my_data_dir, percentage_to_delete, seed=None):
    """Clean a labelled image dataset in place, then randomly thin it out.

    The function works on every sub-directory of ``my_data_dir`` (one per
    label) in three passes:

    1. delete every regular file whose extension is not .png/.jpg/.jpeg;
    2. delete a random ``percentage_to_delete`` percent of the remaining images;
    3. report how many images are left.

    Entries of ``my_data_dir`` that are not directories, and directories
    nested inside a label folder, are left untouched.

    Args:
        my_data_dir: Directory that contains one sub-directory per label.
        percentage_to_delete: Share of images to delete per label folder,
            as a number between 0 and 100. The count is truncated towards zero.
        seed: Optional seed for the random selection. With the default
            ``None`` the module-level ``random`` generator is used, so the
            selection differs between runs.

    Returns:
        None. Progress is reported through ``print``.

    Raises:
        ValueError: If ``percentage_to_delete`` is outside 0..100. The check
            runs before any file is deleted.
    """
    if not 0 <= percentage_to_delete <= 100:
        raise ValueError(
            f"percentage_to_delete must be between 0 and 100, got {percentage_to_delete}"
        )

    image_extensions = ('.png', '.jpg', '.jpeg')
    # A private generator keeps a seeded run from touching global random state.
    rng = random.Random(seed) if seed is not None else random

    # Only directories are label folders; stray files next to them are ignored.
    # Sorting makes the processing order (and seeded sampling) deterministic.
    folders = sorted(
        entry for entry in os.listdir(my_data_dir)
        if os.path.isdir(os.path.join(my_data_dir, entry))
    )

    # Pass 1: remove non-image files and count what is found.
    for folder in folders:
        folder_path = os.path.join(my_data_dir, folder)
        img_count = 0
        non_img_count = 0

        for file in os.listdir(folder_path):
            file_location = os.path.join(folder_path, file)
            if not os.path.isfile(file_location):
                # os.remove cannot delete directories; leave nested folders alone.
                continue
            if file.lower().endswith(image_extensions):
                img_count += 1
            else:
                os.remove(file_location)
                non_img_count += 1

        print(f"Folder: {folder} - has {img_count} image files")
        print(f"Folder: {folder} - has {non_img_count} non-image files")

    # Pass 2: delete a random sample of images per folder.
    for folder in folders:
        images = _list_image_files(os.path.join(my_data_dir, folder), image_extensions)
        total_imgs = len(images)

        if total_imgs == 0:
            print(f"Folder: {folder} - No images found.")
            continue

        num_imgs_to_delete = int(percentage_to_delete / 100 * total_imgs)
        if num_imgs_to_delete > 0:
            for image_path in rng.sample(images, num_imgs_to_delete):
                os.remove(image_path)
            print(f"Folder: {folder} - Deleted {num_imgs_to_delete} images based on {percentage_to_delete}% deletion.")
        else:
            print(f"Folder: {folder} - No images to delete based on {percentage_to_delete}% deletion.")

    # Pass 3: report what is left.
    for folder in folders:
        remaining_imgs = _list_image_files(os.path.join(my_data_dir, folder), image_extensions)
        print(f"Folder: {folder} - has {len(remaining_imgs)} images remaining")


In [11]:
# Clean the extracted dataset and randomly remove 50% of the images in each label folder.
prepeare_image_data(my_data_dir='inputs/mildew_dataset/cherry-leaves', percentage_to_delete=50)

Folder: healthy - has 2104 image files
Folder: healthy - has 0 non-image files
Folder: powdery_mildew - has 2104 image files
Folder: powdery_mildew - has 0 non-image files
Folder: healthy - Deleted 1052 images based on 50% deletion.
Folder: powdery_mildew - Deleted 1052 images based on 50% deletion.
Folder: healthy - has 1052 images remaining
Folder: powdery_mildew - has 1052 images remaining


## Split train, validation and test sets

In [12]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_ratio, validation_ratio, test_ratio, seed=None):
    """Split each label folder of a dataset into train / validation / test folders.

    Every sub-directory of ``my_data_dir`` is treated as a label. Its files
    are shuffled and moved into ``train/<label>``, ``validation/<label>`` and
    ``test/<label>`` under ``my_data_dir``; the emptied label folder is then
    removed. Entries of ``my_data_dir`` that are not directories are ignored.

    If ``my_data_dir`` already contains a ``train``, ``validation`` or
    ``test`` entry, the dataset is considered split and nothing is done.

    Args:
        my_data_dir: Directory that contains one sub-directory per label.
        train_ratio: Fraction of each label's files that goes to ``train``.
        validation_ratio: Fraction that goes to ``validation``.
        test_ratio: Fraction that goes to ``test``. Only used for the sum
            check; the test set receives whatever is left after the other two.
        seed: Optional seed for the shuffle. With the default ``None`` the
            module-level ``random`` generator is used.

    Returns:
        None. If the three ratios do not sum to 1.0 a message is printed and
        nothing is changed.
    """
    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly
    # 1.0 in binary floating point.
    if abs(train_ratio + validation_ratio + test_ratio - 1.0) > 1e-9:
        print("train_ratio + validation_ratio + test_ratio should sum to 1.0")
        return

    split_names = ('train', 'validation', 'test')
    entries = os.listdir(my_data_dir)

    # Any existing split folder means a previous run already moved files;
    # treating it as a label would nest the splits inside each other.
    if any(name in entries for name in split_names):
        return

    # Only directories are labels; sorting keeps a seeded run deterministic.
    labels = sorted(
        entry for entry in entries
        if os.path.isdir(os.path.join(my_data_dir, entry))
    )

    for split_name in split_names:
        for label in labels:
            os.makedirs(os.path.join(my_data_dir, split_name, label))

    # A private generator keeps a seeded run from touching global random state.
    rng = random.Random(seed) if seed is not None else random

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        files = sorted(os.listdir(label_dir))
        rng.shuffle(files)

        train_set_files_qty = int(len(files) * train_ratio)
        validation_set_files_qty = int(len(files) * validation_ratio)
        validation_end = train_set_files_qty + validation_set_files_qty

        for index, file_name in enumerate(files):
            if index < train_set_files_qty:
                split_name = 'train'
            elif index < validation_end:
                split_name = 'validation'
            else:
                split_name = 'test'
            shutil.move(os.path.join(label_dir, file_name),
                        os.path.join(my_data_dir, split_name, label, file_name))

        # Every file has been moved out, so the label folder is empty now.
        os.rmdir(label_dir)

Conventionally, the data is split as follows:
* The training set receives 70% of the data (ratio 0.70).
* The validation set receives 10% of the data (ratio 0.10).
* The test set receives 20% of the data (ratio 0.20).

In [13]:
# Split every label folder into train / validation / test sub-folders.
# The path is a plain string: the original f-string had no placeholders.
split_train_validation_test_images(
    my_data_dir="inputs/mildew_dataset/cherry-leaves",
    train_ratio=0.7,
    validation_ratio=0.1,
    test_ratio=0.2,
)