In [0]:
#!/usr/bin/python
import os
import sys
import shutil, errno
import zipfile as zf
import random
from glob import glob
from pathlib import Path

import h5py
from keras.utils import to_categorical
            


In [0]:

###########################################################################     
######################## data preprocessing ###############################
###########################################################################
'''
create folder: original_data; processed_data

get original data
unzip file
move data to the path of folder original_Data

copy original_data and paste to processed_data so we can process data in the right folder
split dataset into test and train subset

'''


############################### Helper functions ###############################

'''
create the folders
input: folders want to create in the format "./xxx/"
output: void
'''
def create_folders(folders):
    for folder in folders:
        if not os.path.exists(folder):
            os.makedirs(folder)
            print("\nCreated", folder)
        else:
            inp = input('Do you clear the folder ' + folder + '?, y/n: ')
            if inp.lower() == "y":
                print("The folder will be cleared")
                try:
                    shutil.rmtree(folder)
                    os.makedirs(folder)
                except OSError as e:
                    print ("Error: %s - %s." % (e.filename, e.strerror))
            elif inp.lower() == "n":
                print("The folder will not be cleared")
            else:
                print("Please type y/n")
    return


'''
unzip the folder to the same path
input: zipfile to unzip
output: void
'''
def unzip(zipfile):
    files = zf.ZipFile(zipfile, 'r')
    files.extractall()
    files.close()
    
'''
move files from one folder to another
input: src folder and dst folder in format ./xxx/
output: void
'''
def move_folder(src, dst):
    try:
        files = get_subsets(src)
        for file in files:
            shutil.move(src+file, dst)
    except OSError as e:
        print ("Error: %s - %s." % (e.filename, e.strerror))
        
'''
copy files from one folder to another
input: src folder and dst folder in format ./xxx/
output: void
'''
def copy_folder(src, dst):
    try: 
        if os.path.exists(dst):
            shutil.rmtree(dst)
        shutil.copytree(src, dst)
    except OSError as e:
        if e.errno == errno.ENOTDIR:
            shutil.copy(Src, dst)
        else:
            raise

'''
get subsets (either files or folders) of the folder
input: folder path
output: list of name of subsets
'''
def get_subsets(path):
    subsets = os.listdir(path)
    for s in subsets:
        if s.startswith('.'):
            subsets.remove(s)
    return subsets

'''
get path of subsets (either files or folders) of the folder
input: folder path
output: list of subsets path
'''
def get_subsets_path(path):
    return glob(path+"*/")

#strip the name from a path
def get_name_from_path(f):
    return f[f.rindex("/")+1: ]

'''
split all data into train set and test set
input: parent path of trainset folder and testset folder, the ratio of test/all data
output: void
'''
def split_into_train_and_test_sets(datapath, ratio):
    assert ratio <= 1 and ratio >= 0
    test_path = os.path.join(datapath, "test/")
    train_path = os.path.join(datapath, "train/")
    print(test_path)
    subset_paths = get_subsets_path(train_path)
    subsets = get_subsets(train_path)
    print(subset_paths)
    print(subsets)
    for i, path in enumerate(subset_paths):
        curr = subsets[i]

        temp = test_path + curr + "/"
        os.makedirs(os.path.dirname(temp), exist_ok=True)
        
        images = glob(path + "*.jpg")
        rand = random.sample(images, int(ratio*len(images)))
        print(curr , " -- size of test set: " , len(rand) , ", size of trainset: " , (len(images)-len(rand)))

        for image in rand:
            dst = temp + get_name_from_path(image)
            os.rename(image, dst)
    
    

            

In [12]:
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

# !ls "/content/drive/My Drive"
# os.chdir("../")
datapath = "./processed_data/"
test_path = os.path.join(datapath, "test")
train_path = os.path.join(datapath, "train")
folders = ["./original_data/", datapath, test_path, train_path, "./trained_models/"]
create_folders(folders)
unzip("/content/drive/My Drive/dataset-resized.zip")
move_folder("./dataset-resized/", "./original_data")
copy_folder("./original_data/", train_path)
waste_types = get_subsets(datapath)
split_into_train_and_test_sets(datapath, 0.2)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Do you clear the folder ./original_data/?, y/n: n
The folder will not be cleared
Do you clear the folder ./processed_data/?, y/n: n
The folder will not be cleared
Do you clear the folder ./processed_data/test?, y/n: 
Please type y/n
Do you clear the folder ./processed_data/train?, y/n: n
The folder will not be cleared
Do you clear the folder ./trained_models/?, y/n: n
The folder will not be cleared
Error: None - None.
./processed_data/test/
['./processed_data/train/plastic/', './processed_data/train/paper/', './processed_data/train/glass/', './processed_data/train/metal/', './processed_data/train/cardboard/', './processed_data/train/trash/']
['plastic', 'paper', 'glass', 'metal', 'cardboard', 'trash']
plastic  -- size of test set:  96 , size of trainset:  386
paper  -- size of test set:  118 , size of trainset:  476
glass  -- size of test set:  100 , size of 

In [0]:
'''
export the data folder to .h5 file
input: data folder path
output: h5 path
'''
def export_to_h5(pth):
    #set the file name that the dataset will be saved under
    input_fname = './'+get_end_slash(pth) + '_' + str(IMG_WIDTH) + "x" + str(IMG_HEIGHT)+"x"+str(CHANNELS)+  '.h5'
    #checks if this particular file already exists and asks the user if it should be overwritten
    if(os.path.isfile(input_fname)):
        inp = input('overwrite file ' + input_fname + '?, y/n: ')
        if inp.lower() =="y":
            print("file will be overwritten")
            os.remove(input_fname)
        elif inp.lower()=="n":
            input_fname = input("enter a new filename: ") + '.h5'
            print(input_fname)
        else:
            print("incorrect input, preprocessing failed")
            return     

#     return input_fname
    
#     folders = get_subsets_path(pth)
#     classes = get_subsets(pth)
  
#     #create h5py file to store dataset
#     hf = h5py.File(input_fname)

#     #get list of all images and number of images
#     all_images = glob(pth+"**/*.jpg",recursive=True)
#     n_images = len(all_images)
  
#     #create dataset X and label list
#     X = hf.create_dataset(
#         name= 'X',
#         shape=(n_images,IMG_WIDTH, IMG_HEIGHT, CHANNELS),
#         maxshape=(None, IMG_WIDTH, IMG_HEIGHT,None),
#         compression="gzip",
#         compression_opts=9)
#     label_lis = []

#     #set an index to iterate through
#     x_ind =0

#     #go through all the folders
#     for i, folder in enumerate(folders):
#         images = glob(folder+"*.jpg")
#         total_images = len(images)
#         print(classes[i],total_images)

#         #process each image in each folder and add the class and the processed image to the image array list
#         for j, image_pth in enumerate(images):
#             img = process_single_img(image_pth, IMG_WIDTH, IMG_HEIGHT)
#             X[x_ind] = img
#             label_lis.append(i)
#             print("{}/{} fname = {}".format(j,total_images, get_pic_name(image_pth)))
#             x_ind+=1
    
#     #store the labels under the y set
#     hf.create_dataset(
#         name= 'y',
#         compression="gzip",
#         compression_opts=9,
#         data=label_lis)
    
    
#     #convert the labels to one-hot values (i.e. 2 -> [0 0 1 0 0]) if there were 5 possible values)
#     y_one_hot = to_categorical(np.array(label_lis))
#     hf.create_dataset(
#         name= 'y_one_hot',
#         compression="gzip",
#         compression_opts=9,
#         data=y_one_hot)

#     #close the opened file
#     hf.close()
    

Using TensorFlow backend.


NameError: ignored

In [0]:
#normalize the inputs   
def scale_X(X):
    return X/255.0

In [0]:
#strip the picture name from a path
def get_pic_name(f):
    return f[f.rindex("/")+1: ]

In [0]:
#strip the folder name from a path
def get_end_slash(f):
    return f[f.rindex("/",0,f.rindex("/"))+1:f.rindex("/")]

'''decalre image properties'''
IMG_WIDTH = 64
IMG_HEIGHT = 64
CHANNELS=3

'''declare some training hyperparameters'''
BATCH_SIZE = 50
EPOCHS = 100

In [0]:
from PIL import Image
import cv2
'''
Puts the image from an image path into the appropriate size, shape, and normalizes
the image
input: image_pth - the path to the image
        width - the width to resize the image to
        height - the height to resize the image to
        channels - [optional] the number of channels the image has, generally 3 (for RGB/BGR) or 1 for grayscale
'''
def process_single_img(image_pth, width, height, channels = CHANNELS):
    #open image with open cv
    img = cv2.imread(image_pth)

    #convert to grayscale if 1 channel required for processing
    if(channels==1):
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  
    
    #resize and reshape the image
    img = cv2.resize(img, (width, height))
    img = img.reshape(width,height,channels)

    #normalize the image
    img = scale_X(img)
    return img



In [56]:
export_to_h5('./processed_data/train/')

overwrite file ./train_64x64x3.h5?, y/n: y
file will be overwritten
plastic 386
0/386 fname = plastic222.jpg
1/386 fname = plastic345.jpg
2/386 fname = plastic55.jpg
3/386 fname = plastic435.jpg
4/386 fname = plastic265.jpg
5/386 fname = plastic254.jpg
6/386 fname = plastic20.jpg
7/386 fname = plastic109.jpg
8/386 fname = plastic422.jpg
9/386 fname = plastic141.jpg
10/386 fname = plastic80.jpg
11/386 fname = plastic336.jpg
12/386 fname = plastic74.jpg
13/386 fname = plastic5.jpg
14/386 fname = plastic99.jpg
15/386 fname = plastic34.jpg
16/386 fname = plastic225.jpg
17/386 fname = plastic313.jpg
18/386 fname = plastic370.jpg
19/386 fname = plastic231.jpg
20/386 fname = plastic346.jpg
21/386 fname = plastic192.jpg
22/386 fname = plastic255.jpg
23/386 fname = plastic360.jpg
24/386 fname = plastic441.jpg
25/386 fname = plastic134.jpg
26/386 fname = plastic51.jpg
27/386 fname = plastic379.jpg
28/386 fname = plastic261.jpg
29/386 fname = plastic317.jpg
30/386 fname = plastic127.jpg
31/386 fn

In [57]:
export_to_h5('./processed_data/test/')

overwrite file ./test_64x64x3.h5?, y/n: y
file will be overwritten
plastic 248
0/248 fname = plastic222.jpg
1/248 fname = plastic79.jpg
2/248 fname = plastic349.jpg
3/248 fname = plastic351.jpg
4/248 fname = plastic109.jpg
5/248 fname = plastic422.jpg
6/248 fname = plastic80.jpg
7/248 fname = plastic336.jpg
8/248 fname = plastic362.jpg
9/248 fname = plastic74.jpg
10/248 fname = plastic384.jpg
11/248 fname = plastic201.jpg
12/248 fname = plastic34.jpg
13/248 fname = plastic313.jpg
14/248 fname = plastic153.jpg
15/248 fname = plastic117.jpg
16/248 fname = plastic255.jpg
17/248 fname = plastic360.jpg
18/248 fname = plastic441.jpg
19/248 fname = plastic462.jpg
20/248 fname = plastic145.jpg
21/248 fname = plastic91.jpg
22/248 fname = plastic397.jpg
23/248 fname = plastic429.jpg
24/248 fname = plastic434.jpg
25/248 fname = plastic48.jpg
26/248 fname = plastic310.jpg
27/248 fname = plastic32.jpg
28/248 fname = plastic242.jpg
29/248 fname = plastic482.jpg
30/248 fname = plastic115.jpg
31/248 f

In [64]:

# '''
# load a dataset given the file name w/ appropriate relative path
# input: fname - string to the path of the file that contains the dataset
# output: X - array of images in the dataset
#         y - labels
#         y_one_hot - one hot labels
#         classes - a list of classes in the train_dir at the current time
# '''
# def load_dataset(fname):
#     #open file
#     hf = h5py.File(fname, 'r')

#     #load X, y and y_one_hot from file
#     X = hf['X'][()]
#     y = hf['y'][()]
#     y_one_hot = hf['y_one_hot'][()]

#     #load the classes from the current train directory
#     classes = get_class_list()

#     #close the file
#     hf.close()
#     return (X, y, y_one_hot, classes)
  
# (X_train_orig, Y_train_orig, y_train_oh, train_classes) = load_dataset("./train_64x64x3.h5")
# (X_test_orig, Y_test_orig, y_test_oh, test_classes) = load_dataset("./test_64x64x3.h5")

NameError: ignored

dataset-resized  n.h5		 processed_data_64x64x3.h5  train_64x64x3.h5
drive		 original_data	 sample_data		    trained_models
__MACOSX	 processed_data  test_64x64x3.h5


In [0]:
import keras.backend as K
import math
import numpy as np
import h5py
import matplotlib.pyplot as plt
############### load data ##############
def load_dataset_from_h5():
    train_dataset = h5py.File('./train_64x64x3.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels

    test_dataset = h5py.File('datasets/test_64x64x3.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your test set labels

    classes = np.array(test_dataset["list_classes"][:]) # the list of classes
    
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

In [71]:
train_dataset = h5py.File('./train_64x64x3.h5', "r")
train_dataset.keys()
# Direct calling keys will get the following output
# Which can not determin what's inside
# KeysView(<HDF5 file "train_catvnoncat.h5" (mode r)>)
# While we could loop over it
for key in train_dataset.keys():
    print(key)

child_keys = [k for k in train_dataset.keys()]
all_child_keys = []
train_dataset.visit(all_child_keys.append)

train_set_x = train_dataset['train_set_x']
# train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features

# X_train_orig, Y_train_orig, X_test_orig, Y_test_orig, classes = load_dataset_from_h5()

X
y
y_one_hot


KeyError: ignored

In [0]:
######################### main functions #########################          
'''
do the whole process of data processing with all the helper functions
input: using_colab: import data from different location; 
        if using_colab is 1, we are using colab
        else is 0, we are using local machine
        else, the user input is wrong, do nothing
output: data
'''
def process_data(using_colab):

    if using_colab == 1:
        print("Importing data from google drive")
        ############## if compile on google colab #################
        # Load the Drive helper and mount
        from google.colab import drive

        # This will prompt for authorization.
        drive.mount('/content/drive')

        # !ls "/content/drive/My Drive"
        # os.chdir("../")
        datapath = "./processed_data/"
        test_path = os.path.join(datapath, "test")
        train_path = os.path.join(datapath, "train")
        folders = ["./original_data/", datapath, test_path, train_path, "./trained_models/"]
        create_folders(folders)
        unzip("/content/drive/My Drive/dataset-resized.zip")
        move_folder("./dataset-resized/", "./original_data")
        copy_folder("./original_data/", train_path)
        waste_types = get_subsets(datapath)
        split_into_train_valid_and_test_sets(datapath, 0.4, 0.5)


        path = Path(os.getcwd())/"processed_data"
        tfms = get_transforms(do_flip=True, flip_vert=True)
        data = ImageDataBunch.from_folder(path, test="test", ds_tfms=tfms, bs=16)

        return data
    
    elif using_colab == 0:
        print("Importing data from local machine")
        ########## if compile locally ###############
        datapath = "./processed_data/"
        test_path = os.path.join(datapath, "test")
        train_path = os.path.join(datapath, "train")
        folders = ["./original_data/", datapath, test_path, train_path, "./trained_models/"]
        create_folders(folders)
        unzip("dataset-resized.zip")
        move_folder("./dataset-resized/", "./original_data")
        copy_folder("./original_data/", train_path)
        waste_types = get_subsets(datapath)
        split_into_train_valid_and_test_sets(datapath, 0.4, 0.5)


        path = Path(os.getcwd())/"processed_data"
        tfms = get_transforms(do_flip=True, flip_vert=True)
        data = ImageDataBunch.from_folder(path, test="test", ds_tfms=tfms, bs=16)

        return data