# Data collection

## 1. Objectives

* Collect the dataset from Kaggle and save it for further processing.

## 2. Inputs

* Kaggle JSON file (the authentication token).

## 3. Outputs

* Generate a dataset in the project's workspace: inputs/cherry_leaves_dataset/cherry_leaves_images

# Import packages

In [4]:
import numpy
import os

# Change working directory

### 1. Access the current directory with os.getcwd().

In [5]:
# Record the directory the notebook is currently running in; the bare name
# on the last line makes the notebook display its value.
import os
current_dir = os.getcwd()
current_dir

'/workspaces/mildew-detector/jupyter_notebooks'

### 2. Make the parent of the current directory the new current directory.

* os.chdir() sets the directory passed to it as the new current working directory.

In [6]:
# Move up to the parent of the notebooks folder so that the relative paths
# used by later cells (kaggle.json, inputs/...) resolve from there.
# NOTE(review): the path is hard-coded to this workspace — adjust it if the
# project is checked out somewhere else.
os.chdir('/workspaces/mildew-detector')
print("'mildew-detector' has been set as the new current directory")

'mildew-detector' has been set as the new current directory


### 3. Confirm the new current directory.

In [7]:
# Re-read the working directory to confirm that the change took effect.
current_dir = os.getcwd()
current_dir

'/workspaces/mildew-detector'

# Download the project dataset from Kaggle

### 1. Install Kaggle

In [8]:
# Install the kaggle package, which provides the `kaggle` command used by the
# download cell further down.
%pip install kaggle

Collecting kaggle
  Downloading kaggle-1.6.0.tar.gz (84 kB)
     84.5/84.5 kB 1.6 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting tqdm (from kaggle)
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
     57.6/57.6 kB 13.3 MB/s eta 0:00:00
Collecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.1-py2.py3-none-any.whl (9.7 kB)
Collecting text-unidecode>=1.3 (from python-slugify->kaggle)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
     78.2/78.2 kB 11.7 MB/s eta 0:00:00
Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
   78.3/78.3 kB 16.1 MB/s eta 0:00:00
Building ... (output truncated)

### 2. Change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON.

In [9]:
# Point KAGGLE_CONFIG_DIR at the current working directory, where
# kaggle.json is expected to be.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict the token file to owner read/write only.
! chmod 600 kaggle.json

### 3. Set the Kaggle Dataset and Download it.

In [11]:
# Kaggle dataset identifier and the local folder that receives the download.
# DestinationFolder is reused by the unzip cell below.
KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/cherry_leaves_dataset"   
# The {…} placeholders are filled in from the Python variables above.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Downloading cherry-leaves.zip to inputs/cherry_leaves_dataset
... resuming from 0 bytes (57697655 bytes left) ...
 93%|███████████████████████████████████▏  | 51.0M/55.0M [00:00<00:00, 82.6MB/s]
100%|██████████████████████████████████████| 55.0M/55.0M [00:00<00:00, 79.7MB/s]


### 4. Unzip the downloaded cherry-leaves.zip file into the destination folder, then delete the zip file:

In [13]:
import zipfile

# Path of the archive that the Kaggle download step wrote into DestinationFolder.
archive_path = DestinationFolder + '/cherry-leaves.zip'

# Unpack the whole archive next to where it was downloaded.
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=DestinationFolder)

# Only the extracted files are needed from here on, so drop the archive.
os.remove(archive_path)

# Data Preparation

### Data Cleaning

+ Check and remove non-image files

In [None]:
def remove_non_image_file(my_data_dir):
    """Delete every non-image file found in the class folders of a dataset.

    Each sub-directory of ``my_data_dir`` is scanned. Regular files whose
    name does not end (case-insensitively) in ``.png``, ``.jpg`` or
    ``.jpeg`` are deleted. For every folder two summary lines are printed:
    the number of image files kept and the number of non-image files
    removed.

    Entries of ``my_data_dir`` that are not directories (for example a
    stray ``.DS_Store``) are ignored, and nested directories inside a class
    folder are left untouched, so the function can be re-run safely on a
    dataset that has already been split into train/validation/test.

    Args:
        my_data_dir: Path to the directory that contains one sub-folder per
            class label.

    Returns:
        None. Results are reported with ``print``.
    """
    image_extension = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        # A loose file at the top level is not a class folder; listing it
        # would raise NotADirectoryError.
        if not os.path.isdir(folder_path):
            continue

        image_count = 0
        removed_count = 0
        for given_file in os.listdir(folder_path):
            file_location = os.path.join(folder_path, given_file)
            # os.remove() cannot delete directories, so only regular files
            # are inspected.
            if not os.path.isfile(file_location):
                continue

            if given_file.lower().endswith(image_extension):
                image_count += 1
            else:
                os.remove(file_location)
                removed_count += 1

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", removed_count)

In [16]:
# Clean the extracted image folders; the per-folder counts are printed below.
remove_non_image_file(my_data_dir='inputs/cherry_leaves_dataset/cherry_leaves_images')

Folder: healthy - has image file 2104
Folder: healthy - has non-image file 0
Folder: powdery_mildew - has image file 2104
Folder: powdery_mildew - has non-image file 0


# Data Splitting

### 1. Split the data into 3 sets: a train set, a validation set, and a test set.

In [17]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """Split each class folder into train, validation and test sub-sets.

    ``my_data_dir`` is expected to hold one sub-folder per class label. The
    files of every label are shuffled and then moved into
    ``<my_data_dir>/train/<label>``, ``<my_data_dir>/validation/<label>``
    and ``<my_data_dir>/test/<label>``. The train and validation sizes are
    ``int(len(files) * ratio)``; every remaining file goes to the test set.
    The emptied label folder is removed afterwards.

    If ``my_data_dir`` already contains an entry named ``test`` the data is
    treated as already split and nothing is done.

    Args:
        my_data_dir: Directory containing one sub-folder per class label.
        train_set_ratio: Fraction of each label's files for the train set.
        validation_set_ratio: Fraction for the validation set.
        test_set_ratio: Fraction for the test set. It is only used to
            validate that the three ratios sum to 1.0.

    Returns:
        None. When the ratios do not sum to 1.0 a message is printed and
        no file is touched.
    """
    # Compare with a tolerance: sums of binary floats such as
    # 0.7 + 0.2 + 0.1 evaluate to 0.9999999999999999, not exactly 1.0.
    ratio_total = train_set_ratio + validation_set_ratio + test_set_ratio
    if abs(ratio_total - 1.0) > 1e-9:
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        # Already split on a previous run; leave everything as it is.
        return

    # Only directories are class labels; loose files at the top level
    # (e.g. hidden OS files) must not be turned into classes.
    labels = [entry for entry in entries
              if os.path.isdir(os.path.join(my_data_dir, entry))]

    # Create train/validation/test folders with one sub-folder per label.
    for subset in ('train', 'validation', 'test'):
        for label in labels:
            os.makedirs(name=os.path.join(my_data_dir, subset, label))

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        files = os.listdir(label_dir)
        random.shuffle(files)

        train_set_files_qty = int(len(files) * train_set_ratio)
        validation_set_files_qty = int(len(files) * validation_set_ratio)
        validation_upper_bound = train_set_files_qty + validation_set_files_qty

        for position, file_name in enumerate(files, start=1):
            if position <= train_set_files_qty:
                subset = 'train'
            elif position <= validation_upper_bound:
                subset = 'validation'
            else:
                # Whatever is left after train and validation is the test set.
                subset = 'test'

            shutil.move(os.path.join(label_dir, file_name),
                        os.path.join(my_data_dir, subset, label, file_name))

        # Every file has been moved out, so the label folder is now empty.
        os.rmdir(label_dir)

### 2. Ratios of the 3 data sets above

The data is divided into 3 sets:
* Training set, which comprises 0.7 of the data.
* Validation set, which comprises 0.1 of the data.
* Test set, which comprises 0.2 of the data.

In [18]:
# Split every class folder into 70 % train, 10 % validation and 20 % test.
# The path is a plain string: it has no placeholders, so no f-prefix is needed.
split_train_validation_test_images(my_data_dir="inputs/cherry_leaves_dataset/cherry_leaves_images",
                                   train_set_ratio=0.7,
                                   validation_set_ratio=0.1,
                                   test_set_ratio=0.2
                                   )