# **Data Collection**

## Objectives

* Fetch images from Kaggle and split into train, test and validation folders

## Inputs

* Kaggle JSON file - the authentication token.

## Outputs

* Generate Dataset: inputs/cats_vs_dogs_dataset_small

## Additional Comments

* Unlabelled images must be deleted from the Kaggle dataset, as they cannot be used for this project.


---

# Install packages

Install requirements, import libraries, and set variable DatasetFolder

In [1]:
# Install the packages listed in the project's requirements.txt.
# stderr is discarded and pip's "Requirement already satisfied" lines are
# filtered out so the cell output stays short.
%pip install -r /workspace/pp5-cats-vs-dogs/requirements.txt 2>/dev/null | grep -v 'Requirement already satisfied'
print('Requirements installed.')

Note: you may need to restart the kernel to use updated packages.
Requirements installed.


In [2]:
import os
import numpy
import zipfile
import shutil
import random
import joblib

# Folder (relative to the working directory) that the later cells download
# the Kaggle data into and reorganise in place.
DatasetFolder = 'inputs/cats_vs_dogs_dataset_small'
# Bare expression: makes the notebook echo the configured value.
DatasetFolder

'inputs/cats_vs_dogs_dataset_small'

# Change working directory

Change working directory to root project folder

In [3]:
# Step from the notebook's folder up to its parent so the relative paths used
# by the following cells resolve from there.
# NOTE: each execution climbs one more level, so run this cell only once.
notebook_dir = os.getcwd()
print(f'Current folder: {notebook_dir}')
os.chdir(os.path.dirname(notebook_dir))
current_dir = os.getcwd()
print(f'New folder: {current_dir}')

Current folder: /workspace/pp5-cats-vs-dogs/jupyter_notebooks
New folder: /workspace/pp5-cats-vs-dogs


---

# **Download dataset**

* Install Kaggle, configure the directory, and set permissions for the Kaggle authentication JSON.
* Download the Kaggle dataset.
* Unzip the files, then delete the zip files and the unlabelled images.

In [None]:
# Install the Kaggle command-line client (version fixed to 1.5.12).
%pip install kaggle==1.5.12 2>/dev/null | grep -v 'Requirement already satisfied'
print('Requirements installed.')

# Point the Kaggle client at the current working directory for its config;
# kaggle.json is expected there and is restricted to owner read/write.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
! chmod 600 kaggle.json
print('Directory configured and permissions set.')

# Download the dogs-vs-cats competition files into DatasetFolder.
! kaggle competitions download -c dogs-vs-cats -p {DatasetFolder}

print('Extracting files...')
# The competition archive is unpacked first; train.zip, which it contains,
# is then unpacked into the same folder.
with zipfile.ZipFile(DatasetFolder + '/dogs-vs-cats.zip', 'r') as zip_ref:
    zip_ref.extractall(DatasetFolder)

with zipfile.ZipFile(DatasetFolder + '/train.zip', 'r') as zip_ref:
    zip_ref.extractall(DatasetFolder)

# Remove everything that is not needed after extraction. test1.zip is deleted
# without being unpacked (presumably the unlabelled images mentioned in the
# notebook introduction - confirm).
os.remove(DatasetFolder + '/dogs-vs-cats.zip')
os.remove(DatasetFolder + '/test1.zip')
os.remove(DatasetFolder + '/train.zip')
os.remove(DatasetFolder + '/sampleSubmission.csv')
print('Unused files deleted.')

---
# **Data Preparation and cleaning**

## Check and remove non-image files

In [None]:
def remove_non_image_file(my_data_dir):
    """Delete every non-image file found in the sub-folders of ``my_data_dir``.

    A file counts as an image when its name ends, case-insensitively, with
    ``.png``, ``.jpg`` or ``.jpeg``. For each sub-folder the number of image
    files kept and the number of non-image files removed are printed.

    Entries directly inside ``my_data_dir`` that are not directories are
    skipped and left untouched; previously they raised ``NotADirectoryError``
    when the function tried to list them.

    Args:
        my_data_dir: Path of the folder whose immediate sub-folders are cleaned.
    """
    image_extension = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            # Only sub-folders hold images; a stray top-level file is ignored.
            continue

        image_count = 0
        removed_count = 0
        for given_file in os.listdir(folder_path):
            if given_file.lower().endswith(image_extension):
                image_count += 1
            else:
                os.remove(os.path.join(folder_path, given_file))
                removed_count += 1

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", removed_count)


# Clean the sub-folders of the dataset folder configured at the top of the notebook.
remove_non_image_file(my_data_dir=DatasetFolder)

## Split into different folders for cats and dogs

In [None]:
def split_images(my_data_dir):
    """Sort the files in ``<my_data_dir>/train`` into per-class folders.

    A file whose lower-cased name contains ``cat`` is moved to
    ``<my_data_dir>/cat``; otherwise, if it contains ``dog``, it is moved to
    ``<my_data_dir>/dog``. Files matching neither are left behind and are
    deleted together with the ``train`` folder at the end.

    Args:
        my_data_dir: Folder that contains the ``train`` sub-folder.
    """
    source_dir = os.path.join(my_data_dir, 'train')

    # Ordered (label, folder) pairs: 'cat' is tested before 'dog'.
    class_dirs = [(label, os.path.join(my_data_dir, label))
                  for label in ('cat', 'dog')]

    # Make sure both destination folders are present before moving anything.
    for _, class_dir in class_dirs:
        if not os.path.exists(class_dir):
            os.makedirs(class_dir)

    for file_name in os.listdir(source_dir):
        lowered = file_name.lower()
        destination = next(
            (class_dir for label, class_dir in class_dirs if label in lowered),
            None,
        )
        if destination is None:
            # No class label in the name: leave it for the rmtree below.
            continue
        shutil.move(os.path.join(source_dir, file_name), destination)

    # Whatever is still in 'train' is unlabelled; drop the folder entirely.
    shutil.rmtree(source_dir)


# Sort the extracted training images into 'cat' and 'dog' folders inside the dataset folder.
split_images(my_data_dir=DatasetFolder)

## Delete 80% of image files

In [None]:
def delete_80_percent_of_files(cat_dir, dog_dir):
    """Randomly delete 80% of the entries in each of the two folders.

    For each folder the number of files to delete is ``int(count * 0.8)``
    (rounded down), and that many names are drawn with ``random.sample``.
    The draw for ``cat_dir`` happens before the draw for ``dog_dir``, and both
    draws happen before anything is deleted.

    Args:
        cat_dir: Path of the folder holding the cat images.
        dog_dir: Path of the folder holding the dog images.
    """
    # Decide what to delete for both folders first, then delete.
    selections = []
    for directory in (cat_dir, dog_dir):
        file_names = os.listdir(directory)
        delete_count = int(len(file_names) * 0.8)
        selections.append((directory, random.sample(file_names, delete_count)))

    # Walk the sampled names directly instead of testing every file for
    # membership in the sample list, which was quadratic in the folder size.
    for directory, names_to_delete in selections:
        for file_name in names_to_delete:
            os.remove(os.path.join(directory, file_name))

In [None]:
# Shrink both class folders inside the dataset folder to a fifth of their size.
cat_dir = os.path.join(DatasetFolder, 'cat')
dog_dir = os.path.join(DatasetFolder, 'dog')
delete_80_percent_of_files(cat_dir, dog_dir)

## Split dataset

Split dataset into train (70%), validation (10%) and test (20%) sets

In [None]:
def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """Split each class folder into train, validation and test sub-sets.

    Every directory directly inside ``my_data_dir`` is treated as a class
    label. Its files are shuffled and moved to
    ``<my_data_dir>/train/<label>``, ``<my_data_dir>/validation/<label>`` and
    ``<my_data_dir>/test/<label>``; the emptied label folder is then removed.
    The train and validation counts are rounded down, and the test set
    receives whatever remains.

    Nothing is done when ``my_data_dir`` already contains an entry named
    ``test``, or when the ratios do not sum to 1.0 (a message is printed in
    that case).

    Args:
        my_data_dir: Folder containing one sub-folder per class label.
        train_set_ratio: Fraction of each class assigned to the train set.
        validation_set_ratio: Fraction assigned to the validation set.
        test_set_ratio: Fraction assigned to the test set.
    """
    import math

    # Compare with a tolerance: valid ratios such as 0.6 + 0.3 + 0.1 add up to
    # 0.9999999999999999 in floating point and would fail an exact check.
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    if not math.isclose(ratio_sum, 1.0, abs_tol=1e-9):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        # A 'test' entry means the split has already been performed.
        return

    # Only directories are class labels; stray files are ignored.
    labels = [entry for entry in entries
              if os.path.isdir(os.path.join(my_data_dir, entry))]

    # Create the train, validation and test folders, each with one
    # sub-folder per class label.
    for folder in ('train', 'validation', 'test'):
        for label in labels:
            os.makedirs(name=os.path.join(my_data_dir, folder, label))

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        files = os.listdir(label_dir)
        random.shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)

        assignments = (
            ('train', files[:train_end]),
            ('validation', files[train_end:validation_end]),
            ('test', files[validation_end:]),
        )
        for folder, file_names in assignments:
            for file_name in file_names:
                shutil.move(os.path.join(label_dir, file_name),
                            os.path.join(my_data_dir, folder, label, file_name))

        # Every file has been moved out, so the label folder is now empty.
        os.rmdir(label_dir)


# Split the dataset folder 70% / 10% / 20% into train, validation and test sets.
split_train_validation_test_images(my_data_dir=DatasetFolder,
                                   train_set_ratio=0.7,
                                   validation_set_ratio=0.1,
                                   test_set_ratio=0.2
                                   )

## Copy random dog and cat images to output folder

In [None]:
def copy_random_images(number, dataset_folder=None, output_folder='outputs/sample_images'):
    """Copy ``number`` random validation images per class to a sample folder.

    Images are drawn with ``random.sample`` from
    ``<dataset_folder>/validation/cat`` and ``<dataset_folder>/validation/dog``
    and copied (the originals stay in place) to ``<output_folder>/cat`` and
    ``<output_folder>/dog``, which are created when missing.

    Args:
        number: How many images to copy from each class.
        dataset_folder: Root of the split dataset. When ``None`` (the default)
            the notebook-level ``DatasetFolder`` variable is used.
        output_folder: Destination root for the copied samples.

    Raises:
        ValueError: Raised by ``random.sample`` when ``number`` exceeds the
            number of images available in a class (or is negative).
    """
    if dataset_folder is None:
        dataset_folder = DatasetFolder

    # Select for both classes before copying anything, cats first.
    selections = []
    for label in ('cat', 'dog'):
        source_dir = os.path.join(dataset_folder, 'validation', label)
        sample_dir = os.path.join(output_folder, label)
        os.makedirs(sample_dir, exist_ok=True)
        chosen = random.sample(os.listdir(source_dir), number)
        selections.append((source_dir, sample_dir, chosen))

    for source_dir, sample_dir, chosen in selections:
        for image in chosen:
            shutil.copy(os.path.join(source_dir, image), sample_dir)

In [None]:
# Copy 50 random validation images per class into the sample-images output folder.
copy_random_images(50)

## Create a zip file of the sample images

In [4]:
def create_zip_from_folder(folder_path, zip_filename="sample_images.zip"):
    """Zip the files under ``folder_path`` into an archive placed next to it.

    The archive is written to the parent directory of ``folder_path``, and
    entry names are stored relative to ``folder_path``. Only files are added;
    empty directories are not recorded.

    Args:
        folder_path: Folder whose files are archived. A trailing path
            separator is tolerated.
        zip_filename: File name of the archive to create.

    Returns:
        The path of the archive that was written.
    """
    # dirname() of a path ending in a separator is the folder itself, which
    # would place the archive inside the tree being zipped. Trim it first.
    trimmed_path = folder_path.rstrip('/' + os.sep) or folder_path
    zip_path = os.path.join(os.path.dirname(trimmed_path), zip_filename)
    zip_abspath = os.path.abspath(zip_path)

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == zip_abspath:
                    # Never add the archive to itself (e.g. folder_path='.').
                    continue
                arcname = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, arcname)

    return zip_path


# Archive the sample images; the zip is written next to this folder and its
# path is echoed as the cell output.
OutputFolder = 'outputs/sample_images'
create_zip_from_folder(OutputFolder)

'outputs/sample_images.zip'

---

# Conclusions and Next Steps

Dataset has been downloaded and images prepared for analysis. <br>
Proceed to the next notebook for data visualisation, or for modelling and evaluation.