# Create the project structure

This notebook creates the folder structure necessary for HA1. Run it from the folder that contains the `dogs-vs-cats.zip` file you downloaded from Kaggle.

In [None]:
# For dealing with files
from pathlib import Path
import shutil

# For using regular expressions
import re

# For splitting the data
from sklearn.model_selection import train_test_split

In [None]:
# NOTE: This script assumes that you have the `dogs-vs-cats.zip` in the same directory as this notebook

# Fail early, *before* anything is deleted, if the notebook is run from the wrong folder.
if not Path("dogs-vs-cats.zip").is_file():
    raise FileNotFoundError(
        "dogs-vs-cats.zip was not found in {}. Run this notebook from the folder "
        "that contains the archive.".format(Path.cwd()))

# Leftovers from a previous (possibly interrupted) run. They are removed first so that
# the extraction and the renames below always start from a clean state.
# `test1` is the folder extracted from `test1.zip`; it only survives if an earlier run
# stopped before the rename to `test`.
pre_existing_item = ["test1.zip",
                     "test1",
                     "test",
                     "val",
                     "train.zip",
                     "train",
                     "train_all",
                     "sampleSubmission.csv",
                     "small_train",
                     "small_val"
                     ]

for item in pre_existing_item:
    item = Path(item)
    # Symlinks are handled first: `shutil.rmtree` refuses to operate on a symlink
    # to a directory, and a dangling symlink reports `exists() == False`.
    if item.is_symlink() or item.is_file():
        item.unlink()
    elif item.is_dir():
        shutil.rmtree(item)
    elif item.exists():
        print("Unknown item: {}, remove manually".format(item))


# Depending on your machine the following might take some seconds to run.
# The archives are extracted with `shutil.unpack_archive` (pure Python), so no
# external `unzip` program has to be installed.
# The order matters: the outer archive contains `test1.zip` and `train.zip`.
for archive in ("dogs-vs-cats.zip", "test1.zip", "train.zip"):
    shutil.unpack_archive(archive)

Path("test1").rename("test")
Path("train").rename("train_all")


# Remove sub zip files
Path("test1.zip").unlink()
Path("train.zip").unlink()

In [None]:
train_all_path = Path.cwd() / "train_all"

# Get a list of all filenames inside (these will be used for training and validation).
# `Path.glob` yields entries in an arbitrary, filesystem-dependent order, so the lists are
# sorted: the seeded splits below then select the same files on every machine.
all_cat_filenames = sorted(train_all_path.glob("cat.*.jpg"))
all_dog_filenames = sorted(train_all_path.glob("dog.*.jpg"))

print('Found {} images of cats.\nFound {} images of dogs.'.format(len(all_cat_filenames),
                                                                  len(all_dog_filenames)))

We'll create the `'small_train'` and `'small_val'` folders for a smaller subset of the original dataset (the assignment asks for 20%).

In [None]:
# Keep only the held-out 20% of each class as the "small" dataset.
# `train_test_split` returns [cats_rest, cats_subset, dogs_rest, dogs_subset];
# the slice picks the two subset parts (positions 1 and 3).
few_cat_filenames, few_dog_filenames = train_test_split(
    all_cat_filenames,
    all_dog_filenames,
    test_size=0.2,
    random_state=1,
)[1::2]

In [None]:
# Divide the small dataset into a training part and a validation part
split_ratio_small_dataset = 0.3

(
    few_cat_filenames_train,
    few_cat_filenames_val,
    few_dog_filenames_train,
    few_dog_filenames_val,
) = train_test_split(
    few_cat_filenames,
    few_dog_filenames,
    test_size=split_ratio_small_dataset,
    random_state=2,
)

# Report how many images of each class ended up in each part
print('The smaller dataset will be comprised of:')
for split_name, cat_files, dog_files in (
    ('Train:\t', few_cat_filenames_train, few_dog_filenames_train),
    ('Val:\t', few_cat_filenames_val, few_dog_filenames_val),
):
    print(split_name, len(cat_files), 'cats and', len(dog_files), 'dogs.')

In [None]:
# Create the train and val directories and subdirectories
subdirectories = {"small_train/cats": few_cat_filenames_train,
                 "small_train/dogs": few_dog_filenames_train,
                 "small_val/cats": few_cat_filenames_val,
                 "small_val/dogs": few_dog_filenames_val
                 }

for subdirectory in subdirectories.keys():
    subdirectory = Path(subdirectory)
    subdirectory.mkdir(parents=True, exist_ok=True)

# Put the training and validation data in the respective folders
def fill_sub_dir_symlink(sub_dir, file_subset):
    """Populate `sub_dir` with one symbolic link per file in `file_subset`.

    Symbolic links already present in `sub_dir` are removed first. This makes the
    cell safe to re-run (`symlink_to` raises `FileExistsError` when the link already
    exists) and prevents links from an earlier, different split from lingering.
    Entries that are not symbolic links are left untouched.

    Args:
        sub_dir: Directory (relative to the current working directory) to fill.
            It is created if it does not exist yet.
        file_subset: Iterable of `Path` objects. Each link is named after the
            file's `name` and points to the path exactly as given.

    Raises:
        FileExistsError: If a non-symlink entry with the same name already exists.
    """
    target_dir = Path.cwd() / sub_dir
    target_dir.mkdir(parents=True, exist_ok=True)

    # Materialise the listing before deleting so we do not mutate the directory
    # while iterating over it.
    for stale_entry in list(target_dir.iterdir()):
        if stale_entry.is_symlink():
            stale_entry.unlink()

    for file in file_subset:
        symbolic_path = target_dir / file.name
        # NOTE(review): `file` is assumed to be an absolute path; a relative target
        # would be resolved relative to the link's own directory.
        symbolic_path.symlink_to(file)

for sub_dir, file_subset in subdirectories.items():
    fill_sub_dir_symlink(sub_dir, file_subset)

Now we create the `'val'` and `'train'` folders for the entire dataset. You need to choose the train/val split ratio yourself; pick something reasonable.

In [None]:
# Choose the fraction of the full dataset that goes into the validation set
split_ratio_big_dataset = 0.3

# The ratio must be a real number strictly between 0 and 1. Checking only for a falsy
# value would let through e.g. 1.5, -0.2, or an integer (which `train_test_split`
# interprets as an absolute number of samples rather than a fraction).
if (isinstance(split_ratio_big_dataset, bool)
        or not isinstance(split_ratio_big_dataset, (int, float))
        or not 0 < split_ratio_big_dataset < 1):
    raise ValueError("'split_ratio_big_dataset' must have a value between 0 and 1.")

# Split it
all_cat_filenames_train, all_cat_filenames_val, all_dog_filenames_train, all_dog_filenames_val = \
train_test_split(all_cat_filenames,
                 all_dog_filenames,
                 test_size=split_ratio_big_dataset,
                 random_state=3)

print('The full dataset will be comprised of:')
print('Train:\t', len(all_cat_filenames_train), 'cats and', len(all_dog_filenames_train), 'dogs.')
print('Val:\t', len(all_cat_filenames_val), 'cats and', len(all_dog_filenames_val), 'dogs.')

In [None]:
# Create the train and val directories and subdirectories
subdirectories = {"train/cats": all_cat_filenames_train,
                 "train/dogs": all_dog_filenames_train,
                 "val/cats": all_cat_filenames_val,
                 "val/dogs": all_dog_filenames_val
                 }

for subdirectory in subdirectories.keys():
    subdirectory = Path(subdirectory)
    subdirectory.mkdir(parents=True, exist_ok=True)

# Put the training and validation data in the respective folders
def fill_sub_dir(sub_dir, file_subset):
    """Populate `sub_dir` with one symbolic link per file in `file_subset`.

    Symbolic links already present in `sub_dir` are removed first. This makes the
    cell safe to re-run (`symlink_to` raises `FileExistsError` when the link already
    exists) and prevents links from an earlier split (e.g. made with a different
    ratio) from lingering. Entries that are not symbolic links are left untouched.

    Args:
        sub_dir: Directory (relative to the current working directory) to fill.
            It is created if it does not exist yet.
        file_subset: Iterable of `Path` objects. Each link is named after the
            file's `name` and points to the path exactly as given.

    Raises:
        FileExistsError: If a non-symlink entry with the same name already exists.
    """
    target_dir = Path.cwd() / sub_dir
    target_dir.mkdir(parents=True, exist_ok=True)

    # Materialise the listing before deleting so we do not mutate the directory
    # while iterating over it.
    for stale_entry in list(target_dir.iterdir()):
        if stale_entry.is_symlink():
            stale_entry.unlink()

    for file in file_subset:
        symbolic_path = target_dir / file.name
        # NOTE(review): `file` is assumed to be an absolute path; a relative target
        # would be resolved relative to the link's own directory.
        symbolic_path.symlink_to(file)

for sub_dir, file_subset in subdirectories.items():
    fill_sub_dir(sub_dir, file_subset)