In [1]:
import os
import random
import shutil
from shutil import copyfile

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Functions

## Split data

In [2]:
def create_train_valid_test_dirs(root_path, subdir_names, train_valid_test_names=['train', 'valid', 'test']):
    """ Function for creating separate folders that contain data for training, validation and testing of the model
    Args:
        1) root_path - the path to the parent folder in which you want to create subfolders
        2) subdir_names - a list of label class names (subfolders with the specified names will be created in each of the train, valid, and test folders)
        3) train_valid_test_names - a list of names of training, validation and test samples (only read, never modified)
    Returns:
        None; but creates folders. Folders that already exist do not cause an error, so the call can be repeated safely
    """
    for dir_name in train_valid_test_names:
        directory = os.path.join(root_path, dir_name)
        # exist_ok=True replaces the separate os.path.exists() check: one call both creates
        # the folder when it is missing and accepts it when it is already there.
        # The sample folder is created explicitly so that it exists even when subdir_names is empty.
        os.makedirs(directory, exist_ok=True)
        for subdirectory in subdir_names:
            # os.path.join inserts the separator itself, no manual '/' concatenation is needed
            os.makedirs(os.path.join(directory, subdirectory), exist_ok=True)

In [3]:
def split_data(source_dir_path, train_dir_path, valid_dir_path, test_dir_path, train_test_split=0.8, train_valid_split=0.85, random_sample=True):
    """ Function to split the files of the specified folder into training, validation and test samples by copying
    the files from source_dir_path to the corresponding folders
    Args:
        1) source_dir_path - the path to the folder containing the original data to be split into train/valid/test
        2) train_dir_path - the path to the folder that will contain the training data
        3) valid_dir_path - the path to the folder that will contain the validation data
        4) test_dir_path - the path to the folder that will contain the test data
        5) train_test_split - the fraction of the usable files that goes to training + validation,
        the rest becomes the test sample ([0; 1])
        6) train_valid_split - the fraction of the training + validation files that goes to training,
        the rest becomes the validation sample ([0; 1])
        7) random_sample - whether files need to be shuffled randomly before splitting into training, validation, and test samples;
        when False the files are split in sorted file-name order
    Returns:
        None, but copies the files into the training, validation and test folders (the source files are left in place)
    """
    processed_fnames = []
    # os.listdir returns entries in arbitrary order; sorting makes the split reproducible between runs
    for file_name in sorted(os.listdir(source_dir_path)):
        file_path = os.path.join(source_dir_path, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories would make copyfile fail later, so they are skipped up front
            print(f'{file_name} is not a regular file, so ignoring.')
        elif os.path.getsize(file_path) > 0:
            processed_fnames.append(file_name)
        else:
            print(f'{file_name} is zero length, so ignoring.')

    if random_sample:
        # random.sample over the full length returns a shuffled copy of the list
        processed_fnames = random.sample(processed_fnames, len(processed_fnames))

    # First cut: (train + valid) | test
    split_index = int(train_test_split * len(processed_fnames))
    print(f"train_test_split_index = {split_index}")
    train_valid_files = processed_fnames[:split_index]
    test_files = processed_fnames[split_index:]

    # Second cut: train | valid
    split_index = int(train_valid_split * len(train_valid_files))
    print(f"train_valid_split_index = {split_index}")
    train_files = train_valid_files[:split_index]
    valid_files = train_valid_files[split_index:]

    # Copy every sample into its destination folder (training, then validation, then test)
    for files, destination_dir_path in ((train_files, train_dir_path),
                                        (valid_files, valid_dir_path),
                                        (test_files, test_dir_path)):
        for file in files:
            source = os.path.join(source_dir_path, file)
            destination = os.path.join(destination_dir_path, file)
            copyfile(source, destination)
        

In [11]:
def split_class_data(source_dir_path, train_valid_test_paths, class_dir_name, train_test_split=0.8, train_valid_split=0.85, random_sample=True):
    """ Function for dividing the data of one label class into train/valid/test by delegating to split_data()
    Args:
        1) source_dir_path - the path to the folder containing the original data of all label classes which needs to be split into train/valid/test;
        2) train_valid_test_paths - the list of paths to the folders of training, validation and test samples
        (the paths are specified in this order: train, valid, test)
        3) class_dir_name - the name of the folder that contains the label class data
        4) train_test_split - forwarded unchanged to split_data ([0; 1])
        5) train_valid_split - forwarded unchanged to split_data ([0; 1])
        6) random_sample - forwarded unchanged to split_data
    Returns:
        None, but splits the files of the label class into training, validation and test samples
    """
    # Class sub-folder inside each of the train / valid / test folders (in that order)
    class_destinations = [os.path.join(train_valid_test_paths[position], class_dir_name) for position in range(3)]
    # Class sub-folder inside the folder with the original data
    class_source = os.path.join(source_dir_path, class_dir_name)

    split_data(
        source_dir_path=class_source,
        train_dir_path=class_destinations[0],
        valid_dir_path=class_destinations[1],
        test_dir_path=class_destinations[2],
        train_test_split=train_test_split,
        train_valid_split=train_valid_split,
        random_sample=random_sample,
    )

## View data

In [55]:
def display_image(root_path, image_name, title=None):
    """ Function to read an image file from disk and show it with matplotlib
    Args:
        1) root_path - the path to the folder that contains the image
        2) image_name - the name of the image file inside root_path
        3) title - the value passed to plt.title() for the figure
    Returns:
        None; but displays an image
    """
    image_path = os.path.join(root_path, image_name)
    pixels = mpimg.imread(image_path)

    plt.imshow(pixels)
    plt.title(title)
    plt.show()

# Load data

In [5]:
source_path = 'data/garbage_classification_6_classes/garbage_classification/'

# One folder per garbage class, each located directly under source_path
(source_path_cardboard, source_path_glass, source_path_metal,
 source_path_paper, source_path_plastic, source_path_trash) = (
    os.path.join(source_path, class_dir)
    for class_dir in ('cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash')
)

In [6]:
# Names of the entries found in each class folder
(cardboard_image_names, glass_image_names, metal_image_names,
 paper_image_names, plastic_image_names, trash_image_names) = map(os.listdir, (
    source_path_cardboard, source_path_glass, source_path_metal,
    source_path_paper, source_path_plastic, source_path_trash,
))

In [7]:
# Report how many entries each class folder contains
# (counts from the recorded run: 403, 501, 410, 594, 482, 137)
for class_label, class_image_names in (
    ('cardboard', cardboard_image_names),
    ('glass', glass_image_names),
    ('metal', metal_image_names),
    ('paper', paper_image_names),
    ('plastic', plastic_image_names),
    ('trash', trash_image_names),
):
    print(f"There are {len(class_image_names)} images of {class_label}.")

There are 403 images of cardboard.
There are 501 images of glass.
There are 410 images of metal.
There are 594 images of paper.
There are 482 images of plastic.
There are 137 images of trash.


## Display representatives of each class

### Cardboard

In [None]:
# Show the first five cardboard samples; the file name doubles as the plot title
for cardboard_image_name in cardboard_image_names[:5]:
    display_image(source_path_cardboard, cardboard_image_name, title=cardboard_image_name)

### Glass

In [None]:
# Show the first five glass samples; the file name doubles as the plot title
for glass_image_name in glass_image_names[:5]:
    display_image(source_path_glass, glass_image_name, title=glass_image_name)

### Metal

In [None]:
# Show the first five metal samples; the file name doubles as the plot title
for metal_image_name in metal_image_names[:5]:
    display_image(source_path_metal, metal_image_name, title=metal_image_name)

### Paper

In [None]:
# Show the first five paper samples; the file name doubles as the plot title
for paper_image_name in paper_image_names[:5]:
    display_image(source_path_paper, paper_image_name, title=paper_image_name)

### Plastic

In [None]:
# Show the first five plastic samples; the file name doubles as the plot title
for plastic_image_name in plastic_image_names[:5]:
    display_image(source_path_plastic, plastic_image_name, title=plastic_image_name)

### Trash

In [None]:
# Show the first five trash samples; the file name doubles as the plot title
for trash_image_name in trash_image_names[:5]:
    display_image(source_path_trash, trash_image_name, title=trash_image_name)

# Split data into train, validation and test

## Create folders for train/valid/test data

In [8]:
destination_path = 'data/garbage_classification_6_classes/garbage_classification_TrainValidTest/'
garbage_class_names = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']

# Create the train / valid / test folders under destination_path, each with one subfolder per garbage class
create_train_valid_test_dirs(destination_path, garbage_class_names)

## Split the data and save it in the appropriate folders

### Check the correct operation of the split_class_data() function

In [84]:
train_dir_path = 'data/garbage_classification_6_classes/garbage_classification_TrainValidTest/train/'
valid_dir_path = 'data/garbage_classification_6_classes/garbage_classification_TrainValidTest/valid/'
test_dir_path = 'data/garbage_classification_6_classes/garbage_classification_TrainValidTest/test/'
train_valid_test_paths = [train_dir_path, valid_dir_path, test_dir_path]

# Split the cardboard class without shuffling
split_class_data(source_path, train_valid_test_paths, 'cardboard',
                 train_test_split=0.9, train_valid_split=0.85, random_sample=False)

# List what ended up in the cardboard subfolder of each sample folder (train, valid, test order)
cardboard_train_images, cardboard_valid_images, cardboard_test_images = (
    os.listdir(os.path.join(split_dir, 'cardboard')) for split_dir in train_valid_test_paths
)

# Counts from the recorded run: train 307, valid 55, test 41
for split_label, split_images in (('train', cardboard_train_images),
                                  ('valid', cardboard_valid_images),
                                  ('test', cardboard_test_images)):
    print(f"Cardboard: {split_label} = {len(split_images)}")

train_test_split_index = 362
train_valid_split_index = 307
Cardboard: train = 307
Cardboard: valid = 55
Cardboard: test = 41


In [85]:
# Split the glass class without shuffling
split_class_data(source_path, train_valid_test_paths, 'glass',
                 train_test_split=0.9, train_valid_split=0.85, random_sample=False)

# List what ended up in the glass subfolder of each sample folder
glass_train_images, glass_valid_images, glass_test_images = (
    os.listdir(os.path.join(split_dir, 'glass'))
    for split_dir in (train_dir_path, valid_dir_path, test_dir_path)
)

# Counts from the recorded run: train 382, valid 68, test 51
for split_label, split_images in (('train', glass_train_images),
                                  ('valid', glass_valid_images),
                                  ('test', glass_test_images)):
    print(f"Glass: {split_label} = {len(split_images)}")

train_test_split_index = 450
train_valid_split_index = 382
Glass: train = 382
Glass: valid = 68
Glass: test = 51


In [86]:
# Split the metal class without shuffling
split_class_data(source_path, train_valid_test_paths, 'metal',
                 train_test_split=0.9, train_valid_split=0.85, random_sample=False)

# List what ended up in the metal subfolder of each sample folder
metal_train_images, metal_valid_images, metal_test_images = (
    os.listdir(os.path.join(split_dir, 'metal'))
    for split_dir in (train_dir_path, valid_dir_path, test_dir_path)
)

# Counts from the recorded run: train 313, valid 56, test 41
for split_label, split_images in (('train', metal_train_images),
                                  ('valid', metal_valid_images),
                                  ('test', metal_test_images)):
    print(f"Metal: {split_label} = {len(split_images)}")

train_test_split_index = 369
train_valid_split_index = 313
Metal: train = 313
Metal: valid = 56
Metal: test = 41


### Split classes data

In [12]:
train_dir_path = 'data/garbage_classification_6_classes/garbage_classification_TrainValidTest/train/'
valid_dir_path = 'data/garbage_classification_6_classes/garbage_classification_TrainValidTest/valid/'
test_dir_path = 'data/garbage_classification_6_classes/garbage_classification_TrainValidTest/test/'
train_valid_test_paths = [train_dir_path, valid_dir_path, test_dir_path]

# Split every garbage class without shuffling and report the resulting folder sizes
for class_name in garbage_class_names:
    split_class_data(source_path, train_valid_test_paths, class_name,
                     train_test_split=0.9, train_valid_split=0.85, random_sample=False)

    # Contents of the class subfolder in the train, valid and test folders (in that order)
    class_train_images, class_valid_images, class_test_images = (
        os.listdir(os.path.join(split_dir, class_name)) for split_dir in train_valid_test_paths
    )

    for split_label, split_images in (('train', class_train_images),
                                      ('valid', class_valid_images),
                                      ('test', class_test_images)):
        print(f"{class_name}: {split_label} = {len(split_images)}")
    print("")

train_test_split_index = 362
train_valid_split_index = 307
cardboard: train = 307
cardboard: valid = 55
cardboard: test = 41

train_test_split_index = 450
train_valid_split_index = 382
glass: train = 382
glass: valid = 68
glass: test = 51

train_test_split_index = 369
train_valid_split_index = 313
metal: train = 313
metal: valid = 56
metal: test = 41

train_test_split_index = 534
train_valid_split_index = 453
paper: train = 453
paper: valid = 81
paper: test = 60

train_test_split_index = 433
train_valid_split_index = 368
plastic: train = 368
plastic: valid = 65
plastic: test = 49

train_test_split_index = 123
train_valid_split_index = 104
trash: train = 104
trash: valid = 19
trash: test = 14

