In [None]:
#Basic imports
import os
import numpy as np
from shutil import copyfile
from sys import stdout
from sys import platform

#Dataset imports
import csv

#Imports for image load/unload/process
from keras.preprocessing import image as image_utils
from keras.applications.imagenet_utils import preprocess_input, decode_predictions
from keras.applications import VGG16
import cv2
from skimage import transform
from random import shuffle

#Plotting libs
from matplotlib import pyplot as plt

#Graph keras model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

#Keras imports
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Activation, MaxPool1D, Conv1D, Dropout
from keras.layers import Reshape
from keras.utils import to_categorical
from keras.optimizers import SGD

#Progress bars
from tqdm import tqdm_notebook as tqdm

#Project constants
from source.common import constants

In [2]:
### Constants ###

#Dataset locations. `platform` is sys.platform (imported in the import cell).
#On Windows the dataset is addressed relative to the parent directory,
#elsewhere relative to the current working directory.
if platform.startswith("win"):
    RAW_TRAIN_SET_LOC = "..\\Humpback Whale\\dataset\\train"
    TRAIN_SET_LOC = "..\\Humpback Whale\\dataset\\train_resized"
    PROCESSED_SET_LOC = "..\\Humpback Whale\\dataset\\processed"
    RAW_TEST_SET_LOC = "..\\Humpback Whale\\dataset\\test"
    TEST_SET_LOC = "..\\Humpback Whale\\dataset\\test_resized"
    LABEL_FILE_LOC = "..\\Humpback Whale\\dataset\\train.csv"
else:
    RAW_TRAIN_SET_LOC = "dataset/train"
    TRAIN_SET_LOC = "dataset/train_resized"
    PROCESSED_SET_LOC = "dataset/processed"
    RAW_TEST_SET_LOC = "dataset/test"
    TEST_SET_LOC = "dataset/test_resized"
    LABEL_FILE_LOC = "dataset/train.csv"

#Dataset markers
TRAIN_SET_SIZE = 20
TEST_SET_SIZE = 10
PREPROCESS_BATCH_START = 0
PREPROCESS_BATCH_SIZE = 500

#Image parameter constants
IMG_WIDTH = 700
IMG_HEIGHT = 400
#(height, width); derived so the three constants cannot drift apart.
IMG_SIZE = (IMG_HEIGHT, IMG_WIDTH)
IMG_EXTN = "jpg"

#Sampling
SAMPLE_IMG_ID = 5

#Containers (empty placeholders)
LABEL_DICT = {}
X_TRAIN = []
Y_TRAIN = []
X_TEST = []
Y_TEST = []
CLASS_NAMES = []
CLASSES = []
NUM_CLASSES = 0

['0000e88ab.jpg',
 '0001f9222.jpg',
 '00029d126.jpg',
 '00050a15a.jpg',
 '0005c1ef8.jpg',
 '0006e997e.jpg']

In [None]:
#Utility methods

def locate_img(path, img_name):
    """Return the location of the file `img_name` inside the directory `path`."""
    location = os.path.join(path, img_name)
    return location

def load_dataset(dataset_path, files):
    """Load the named image files from `dataset_path` as grayscale images.

    Args:
        dataset_path: Directory that holds the images.
        files: Iterable of image file names inside `dataset_path`.

    Returns:
        A numpy array built from the decoded images, in the order of `files`.

    Raises:
        IOError: If an image is missing or cannot be decoded.
    """
    images = []
    for file in files:
        file_path = locate_img(dataset_path, file)
        image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)

        #cv2.imread() reports failure by returning None instead of raising.
        #Fail here, with the offending path, rather than downstream.
        if image is None:
            raise IOError("Unable to read image: {path}".format(path = file_path))

        images.append(image)

    return np.asarray(images)

def resize_images(images, target_size):
    """Resize every image in `images` to `target_size` through impad().

    Returns a list holding the resized images in input order.
    """
    resized_images = []
    for image in images:
        #impad() changes the image data from [0..255] range to [0..1]
        resized_images.append(impad(image, target_size))

    return resized_images
    
def save_dataset(target_path, files, image_data):
    """Write each image in `image_data` into `target_path` under the matching name from `files`.

    Args:
        target_path: Directory to write into; created if it does not exist.
        files: Iterable of target file names.
        image_data: Indexable collection of images, aligned with `files`.
            Values are multiplied by 255 and cast to uint8 before writing
            (presumably [0..1] data as produced by resize_images()).

    Raises:
        IOError: If an image cannot be written.
    """
    #cv2.imwrite() does not create missing directories; it just reports failure.
    if target_path:
        os.makedirs(target_path, exist_ok = True)

    for idx, file in enumerate(files):
        data = image_data[idx]
        file_path = locate_img(target_path, file)

        #cv2.imwrite() returns False on failure instead of raising.
        written = cv2.imwrite(file_path, (data*255).astype('uint8'))
        if not written:
            raise IOError("Unable to write image: {path}".format(path = file_path))

def preprocess_raw_dataset(source_dataset_path, files, target_dataset_path, target_size, batch_size = constants.PREPROCESS_BATCH_SIZE, progress_bar = None):
    """Resize the images named in `files` batch by batch and store the results.

    Each batch is loaded from `source_dataset_path`, resized to `target_size`
    and saved under the same file names in `target_dataset_path`. When a
    `progress_bar` is supplied, its description and counter are updated per batch.
    """
    report_progress = progress_bar is not None

    batch_id = 0
    for batch_files in batch(files, batch_size):
        if report_progress:
            progress_bar.set_description("Processing batch: {batch_id}".format(batch_id = batch_id))

        #Load the raw images, bring them to a consistent size, then persist
        #them so they are readily available for training.
        raw_images = load_dataset(source_dataset_path, batch_files)
        save_dataset(target_dataset_path, batch_files, resize_images(raw_images, target_size))

        if report_progress:
            progress_bar.update(len(batch_files))

        batch_id += 1
        
def batch(iterable, batch_size = 1):
    """Yield consecutive slices of `iterable`, each holding at most `batch_size` items."""
    total = len(iterable)
    for start in range(0, total, batch_size):
        #Clamp the end of the final slice to the end of the input.
        stop = start + batch_size
        if stop > total:
            stop = total
        yield iterable[start:stop]

def impad(image, target_size):
    """Return `image` resized to `target_size` by transform.resize() with anti-aliasing enabled."""
    resized = transform.resize(image, target_size, anti_aliasing = True)
    return resized

def load_model_data(source_path, files, batch_size, class_name_map, label_dict, num_classes):
    """Endlessly yield shuffled (inputs, labels) batches for model training.

    Args:
        source_path: Directory that holds the image files.
        files: Image file names to draw batches from. The caller's list is not modified.
        batch_size: Maximum number of files per yielded batch.
        class_name_map: Mapping from class name to class index.
        label_dict: Mapping from image file name to class name.
        num_classes: Width of the one-hot label vectors.

    Yields:
        ([x], y), where x is the batch of loaded images divided by 255 and
        y is the output of to_categorical() for the batch labels.

    Raises:
        ValueError: On the first next() call if `files` is empty.
    """
    #Shuffle a private copy so the caller's list keeps its order.
    files = list(files)

    #With no files the loop below would spin forever without ever yielding.
    if not files:
        raise ValueError("load_model_data() needs at least one file to generate batches.")

    while True:
        #Reshuffle before every pass over the data.
        shuffle(files)
        for batch_files in batch(files, batch_size):
            #Load images
            x = load_dataset(source_path, batch_files)

            #Normalize
            x = np.array(x/255)

            #File name -> class name -> class index -> one-hot vector
            y = [class_name_map[label_dict[image]] for image in batch_files]
            y = to_categorical(y, num_classes = num_classes)

            yield [x], y
    

In [None]:
#Smoke test for the batch generator.
#NOTE: CLASS_NAME_MAP, LABEL_DICT and NUM_CLASSES are filled in by the
#label/class mapping cell, which has to be run before this one.
files = ["0000e88ab.jpg", "000a6daec.jpg"]
batch_size = 2

#load_model_data() never stops yielding, so take a single batch instead of
#iterating over it.
x, y = next(load_model_data(TRAIN_SET_LOC, files, batch_size, CLASS_NAME_MAP, LABEL_DICT, NUM_CLASSES))
print(x[0].shape)

In [None]:
### Create label and class mapping for training set. ###

#Load labels: first CSV column -> second CSV column
#(used elsewhere as image file name -> class name).
LABEL_DICT = {}

#newline='' is what the csv module asks for when it is handed a file object.
with open(LABEL_FILE_LOC, 'r', newline = '') as handle:
    label_reader = csv.reader(handle)

    #Skip the first row (presumably the header).
    next(label_reader, None)

    for row in label_reader:
        #Blank or short rows carry no label.
        if len(row) < 2:
            continue
        LABEL_DICT[row[0]] = row[1]

#Classes
#Sorted so every run gives each class the same index; the iteration order of
#a bare set of strings can change between interpreter sessions.
CLASS_NAMES = sorted(set(LABEL_DICT.values()))
CLASS_NAME_MAP = {class_name: class_idx for class_idx, class_name in enumerate(CLASS_NAMES)}

NUM_CLASSES = len(CLASS_NAMES)

print("Number of classses: {count}".format(count = NUM_CLASSES))

In [None]:
### Preprocess train dataset ###
files = list(LABEL_DICT.keys())

#Resize every labelled raw training image to IMG_SIZE and store the result
#under TRAIN_SET_LOC, 256 files per batch.
with tqdm(total = len(files), file=stdout) as progress_bar:
    preprocess_raw_dataset(RAW_TRAIN_SET_LOC, files, TRAIN_SET_LOC, IMG_SIZE, 256, progress_bar = progress_bar)

In [None]:
def display_img(source_path, label_dict, num_files = 10):
    """Show a sample of the first `num_files` labelled images from `source_path`.

    Prints the pixel data of one image, plots the first image in grayscale and
    prints one one-hot label vector.

    Args:
        source_path: Directory that holds the images.
        label_dict: Mapping from image file name to class name.
        num_files: Number of images to load, taken from the front of `label_dict`.
    """
    files = list(label_dict.keys())[:num_files]

    if not files:
        print("No images to display.")
        return

    #load_dataset() already reads the images as grayscale, so no further
    #colour conversion is needed here.
    x = load_dataset(source_path, files)

    y = [CLASS_NAME_MAP[label_dict[image]] for image in files]
    y = to_categorical(y, num_classes = NUM_CLASSES)

    #Clamp the sample positions so small samples do not run out of range.
    last = len(files) - 1

    #Print sample
    plt.figure()

    print(x[min(3, last)])
    plt.imshow(x[0], cmap='gray')

    print(y[min(4, last)])

In [None]:
#Show a sample of the first 10 resized training images and their labels.
display_img(TRAIN_SET_LOC, LABEL_DICT, num_files = 10)

In [None]:
### Create the model for gray-scale inputs ###
model = Sequential()

#Each sample is one 2-D grayscale image of shape IMG_SIZE.
input_shape = IMG_SIZE

model.add(Conv1D(32, kernel_size = 3, activation='relu', input_shape=input_shape))
model.add(Conv1D(32, kernel_size = 3, activation='relu'))
model.add(MaxPool1D(pool_size=2))
model.add(Dropout(0.25))

model.add(Conv1D(64, kernel_size = 3, activation='relu'))
model.add(Conv1D(64, kernel_size = 3, activation='relu'))
model.add(MaxPool1D(pool_size=2))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(NUM_CLASSES, activation='softmax'))

#Compile the model
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

#Print model summary
#print(model.summary())

#Training and validation sets
files = list(LABEL_DICT.keys())[:2048]
num_files = len(files)
batch_size = 128
validation_split = 0.2
split_marker = int(num_files*(1 - validation_split))
train_set = files[:split_marker]
validation_set = files[split_marker:]

#Steps are whole batches: integer ceiling division, so a partial last batch
#still counts as one step and the value handed to Keras is an int.
train_steps = (len(train_set) + batch_size - 1)//batch_size
validation_steps = (len(validation_set) + batch_size - 1)//batch_size

#Train the model
model.fit_generator(
    load_model_data(TRAIN_SET_LOC, train_set, batch_size, CLASS_NAME_MAP, LABEL_DICT, NUM_CLASSES),
    steps_per_epoch = train_steps,
    epochs = 20,
    validation_data=load_model_data(TRAIN_SET_LOC, validation_set, batch_size, CLASS_NAME_MAP, LABEL_DICT, NUM_CLASSES),
    validation_steps=validation_steps)

In [None]:
### Create the model ###
model = Sequential()

input_shape = IMG_SIZE

#The data generator yields 2-D grayscale images, while the 2-D pooling and
#convolution layers need an explicit channel axis. Add it inside the model so
#the generator output can be fed in unchanged.
model.add(Reshape(IMG_SIZE + (1,), input_shape=input_shape))

model.add(MaxPool2D((5, 5), (2, 2), 'valid'))

model.add(Conv2D(128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPool2D((5, 5), (2, 2), 'valid'))

model.add(Conv2D(64, kernel_size=3, activation='relu'))
model.add(MaxPool2D((5, 5), (2, 2), 'valid'))

model.add(Conv2D(32, kernel_size=3, activation='relu'))
model.add(MaxPool2D((5, 5), (2, 2), 'valid'))

model.add(Conv2D(8, kernel_size=3, activation='relu'))
model.add(MaxPool2D((5, 5), (2, 2), 'valid'))

model.add(Conv2D(4, kernel_size=3, activation='relu'))
model.add(MaxPool2D((5, 5), (2, 2), 'valid'))

model.add(Flatten())
model.add(Dense(NUM_CLASSES))
model.add(Activation('softmax'))

#Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

#Print model summary (summary() prints by itself and returns None)
model.summary()

#Train the model
files = list(LABEL_DICT.keys())[:5096]
batch_size = 16

#Whole batches per epoch: integer ceiling division.
steps_per_epoch = (len(files) + batch_size - 1)//batch_size

model.fit_generator(
    load_model_data(TRAIN_SET_LOC, files, batch_size, CLASS_NAME_MAP, LABEL_DICT, NUM_CLASSES),
    epochs = 20,
    steps_per_epoch = steps_per_epoch,
    use_multiprocessing = True)

#SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
#Scratch
#img = imread(locate_train_img("0000e88ab.jpg"))

In [None]:
#X_TRAIN is initialised as a plain list, which has no .shape attribute;
#np.shape() accepts lists and arrays alike.
print(np.shape(X_TRAIN))