In [42]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

import numpy as np
np.random.seed(1984)

import os
import glob
import cv2
import datetime
import pandas as pd
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.cross_validation import KFold
from keras import optimizers

from keras.models import Sequential
from keras.applications import ResNet50
from keras.applications import vgg16
from keras.preprocessing.image import ImageDataGenerator
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D, AveragePooling2D
from keras.optimizers import SGD, Adagrad
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.constraints import maxnorm
from sklearn.metrics import log_loss
from keras import __version__ as keras_version
from os.path import join, split
from os import getcwd

bottleneck_fc_model_10e.h5
bottleneck_fc_model_50e.h5
bottleneck_fc_model.h5
bottleneck_features_train.npy
bottleneck_features_validation.npy
tbottleneck_features_train.npy
tbottleneck_features_validation.npy
test_stg1
test_train_transfer
test_validation_transfer
train
train_transfer
validation_transfer
vgg_bottleneck_fc_model.h5
vgg_bottleneck_features_train.npy
vgg_bottleneck_features_validation.npy



In [55]:
def get_im_cv2(path):
    img = cv2.imread(path)
    resized = cv2.resize(img, (150, 150), cv2.INTER_LINEAR)
    return resized


def load_train():
    X_train = []
    X_train_id = []
    y_train = []
    start_time = time.time()

    print('Read train images')
    folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
    for fld in folders:
        index = folders.index(fld)
        print('Load folder {} (Index: {})'.format(fld, index))
        path = os.path.join('..', 'input', 'train', fld, '*.jpg')
        files = glob.glob(path)
        for fl in files:
            flbase = os.path.basename(fl)
            img = get_im_cv2(fl)
            X_train.append(img)
            X_train_id.append(flbase)
            y_train.append(index)

    print('Read train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return X_train, y_train, X_train_id


def load_test():
    path = os.path.join('..', 'input', 'test_stg1', '*.jpg')
    files = sorted(glob.glob(path))

    X_test = []
    X_test_id = []
    for fl in files:
        flbase = os.path.basename(fl)
        img = get_im_cv2(fl)
        X_test.append(img)
        X_test_id.append(flbase)

    return X_test, X_test_id


def create_submission(predictions, test_id, info):
    result1 = pd.DataFrame(predictions, columns=['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT'])
    result1.loc[:, 'image'] = pd.Series(test_id, index=result1.index)
    now = datetime.datetime.now()
    sub_file = 'submission_' + info + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    result1.to_csv(sub_file, index=False)


def read_and_normalize_train_data():
    train_data, train_target, train_id = load_train()

    print('Convert to numpy...')
    train_data = np.array(train_data, dtype=np.uint8)
    train_target = np.array(train_target, dtype=np.uint8)

    print('Reshape...')
    print(train_data.shape)
    train_data = train_data.transpose((0, 3, 1, 2))

    print('Convert to float...')
    train_data = train_data.astype('float32')
    train_data = train_data / 255
    train_target = np_utils.to_categorical(train_target, 8)

    print('Train shape:', train_data.shape)
    print(train_data.shape[0], 'train samples')
    return train_data, train_target, train_id


def read_and_normalize_test_data():
    start_time = time.time()
    test_data, test_id = load_test()

    test_data = np.array(test_data, dtype=np.uint8)
    test_data = test_data.transpose((0, 3, 1, 2))

    test_data = test_data.astype('float32')
    test_data = test_data / 255

    print('Test shape:', test_data.shape)
    print(test_data.shape[0], 'test samples')
    print('Read and process test data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return test_data, test_id


def dict_to_list(d):
    ret = []
    for i in d.items():
        ret.append(i[1])
    return ret


def merge_several_folds_mean(data, nfolds):
    a = np.array(data[0])
    for i in range(1, nfolds):
        a += np.array(data[i])
    a /= nfolds
    return a.tolist()


def create_model():
    model = ResNet50()
    sgd = SGD(lr=1e-2, decay=1e-4, momentum=0.89, nesterov=False)
    model.compile(optimizer=sgd, loss='categorical_crossentropy')

    return model


def get_validation_predictions(train_data, predictions_valid):
    pv = []
    for i in range(len(train_data)):
        pv.append(predictions_valid[i])
    return pv


def run_cross_validation_create_models(nfolds=10):
    # input image dimensions
    batch_size = 32
    #nb_epoch = 8
    nb_epoch = 50
    random_state = 51
    first_rl = 96

    train_data, train_target, train_id = read_and_normalize_train_data()

    yfull_train = dict()
    kf = KFold(len(train_id), n_folds=nfolds, shuffle=True, random_state=random_state)
    num_fold = 0
    sum_score = 0
    models = []
    for train_index, test_index in kf:
        model = create_model()
        X_train = train_data[train_index]
        Y_train = train_target[train_index]
        X_valid = train_data[test_index]
        Y_valid = train_target[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=3, verbose=0),
        ]
        model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              shuffle=True, verbose=2, validation_data=(X_valid, Y_valid),
              callbacks=callbacks)

        predictions_valid = model.predict(X_valid.astype('float32'), batch_size=batch_size, verbose=2)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)
        sum_score += score*len(test_index)

        # Store valid predictions
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = predictions_valid[i]

        models.append(model)

    score = sum_score/len(train_data)
    print("Log_loss train independent avg: ", score)

    info_string = '_' + str(np.round(score,3)) + '_flds_' + str(nfolds) + '_eps_' + str(nb_epoch) + '_fl_' + str(first_rl)
    return info_string, models


def run_cross_validation_process_test(info_string, models):
    batch_size = 24
    num_fold = 0
    yfull_test = []
    test_id = []
    nfolds = len(models)

    for i in range(nfolds):
        model = models[i]
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        test_data, test_id = read_and_normalize_test_data()
        test_prediction = model.predict(test_data, batch_size=batch_size, verbose=2)
        yfull_test.append(test_prediction)

    test_res = merge_several_folds_mean(yfull_test, nfolds)
    info_string = 'loss_' + info_string \
                + '_folds_' + str(nfolds)
    create_submission(test_res, test_id, info_string)


In [5]:
if __name__ == '__main__':
    print('Keras version: {}'.format(keras_version))
    num_folds = 3
    info_string, models = run_cross_validation_create_models(num_folds)
    run_cross_validation_process_test(info_string, models)

Keras version: 1.2.2
Read train images
Load folder ALB (Index: 0)
Load folder BET (Index: 1)
Load folder DOL (Index: 2)
Load folder LAG (Index: 3)
Load folder NoF (Index: 4)
Load folder OTHER (Index: 5)
Load folder SHARK (Index: 6)
Load folder YFT (Index: 7)
Read train data time: 158.39 seconds
Convert to numpy...
Reshape...
(3777, 224, 224, 3)
Convert to float...
Train shape: (3777, 3, 224, 224)
3777 train samples
Start KFold number 1 from 3
Split train:  2518 2518
Split valid:  1259 1259


ValueError: Error when checking model input: expected input_2 to have shape (None, 224, 224, 3) but got array with shape (2518, 3, 224, 224)

In [81]:
# path to the model weights file
base = join(split(split(getcwd())[0])[0],'notebooks', 'input')
weights_path = join(split(base)[0],'vgg16_weights.h5')
top_model_weights_path = join(base, 'vgg_bottleneck_fc_model.h5')

# dimensions of the imates
img_width , img_height = 150, 150

train_data_dir = join(base, 'train_transfer')
validation_data_dir = join(base, 'validation_transfer')
print(train_data_dir)
print(validation_data_dir)
#nb_train_samples = 2000
#nb_validation_sampeles = 800
nb_epoch = 10

/home/anthonyle/Projects/ClassifiyingFish/notebooks/input/train_transfer
/home/anthonyle/Projects/ClassifiyingFish/notebooks/input/validation_transfer


Note:

Several things needed to be done to get this code to work.

- For Python 3, one needs to explicitly state in which mode a file is opened (r/w). In this case, the output needed to be saved in binary mode (so add the 'b').
- For VGG150, the width and height parameters needed to be changed to 150x150
- I manually had to calculate and adjust the nb_training_samples and nb_validadtion_samples input number. These must match what's actually in the folders or the script will fail.

In [82]:
os.listdir(validation_data_dir)

['YFT', 'LAG', 'OTHER', 'NoF', 'ALB', 'DOL', 'BET', 'SHARK']

In [83]:
def save_bottleneck_feature(train_data_dir, validation_data_dir, save_directory,
                           img_width =150, img_height=150,nb_train_samples=62*32,
                            nb_validation_samples=800):
    """Predicts with the convolutional layers of VGG16 and saves the output
    Args: 
        train_data_dir(str): path to training data
        validation__data_dir(str): path to validation data
        save_directory(str): path to save the files
    Returns None
    """

    # load the vgg16 covolutional layers
    model = vgg16.VGG16(include_top=False,input_shape=(img_width, img_height,3))
    
    # set up a data generator for the training data
    datagen=ImageDataGenerator(1./255)
    generator = datagen.flow_from_directory(
            train_data_dir,
            target_size=(img_width, img_height),
            batch_size=32,
            class_mode=None,
            shuffle=False)
    
    # predict on the training data
    print('Generating train predictions...')
    bottleneck_features_train = model.predict_generator(generator, nb_train_samples)
    print('Saving training output...')
    file_name = join(save_directory, 'vgg_bottleneck_features_train.npy')
    np.save(open(file_name, 'wb'), bottleneck_features_train)
    
    # set up a generator for the validation_data
    generator = datagen.flow_from_directory(
        validation_data_dir,
        target_size = (img_width, img_height),
        batch_size=32,
        class_mode = None,
        shuffle=False)
    
    # predict on the validation data
    print('Generating validation predictions...')
    bottleneck_features_validation = model.predict_generator(generator, nb_validation_samples)
    print('Saving validation predictions...')
    file_name = join(save_directory, 'vgg_bottleneck_features_validation.npy')
    np.save(open(file_name, 'wb'), bottleneck_features_validation)
    
    return None


In [84]:
# I need to correctly specify the number of train and validation samples here to get top layer training to work!
# This code needs to be refactored. For now, let me see if I can get this to work on the larger dataset!
save_bottleneck_feature(train_data_dir=train_data_dir, validation_data_dir=validation_data_dir,
                       save_directory=base, nb_train_samples=2845, nb_validation_samples=1037, img_height=img_height,
                       img_width=img_width)

Found 2845 images belonging to 8 classes.
Generating train predictions...
Saving training output...
Found 1037 images belonging to 8 classes.
Generating validation predictions...
Saving validation predictions...


Note: The following modifications had to be done here to get the code to work.

- For the dogs/cats example, there was an even number of binary choices. Here I had to manually create a label vector corresponding to the number of photos in each category for both training and validadtion. This needs to be refactored.
- Since the tutorial was written, Keras was updated. train_labels and validation_labels have to be converted to categorical vectors using np_utils.to_categorical().
- Since there are 8 classes here, a binary sigmoidal function is not enough. I changed the top model to the fully connected model from the simple convolutional network in the example above.

POTENTIAL PITFALL: I am assuming that folder order is equal to label order. I need to find out what the actual order is.

In [85]:
def create_top_model(input_data):
    model = Sequential()
    model.add(Flatten(input_shape=input_data.shape[1:]))
    model.add(Dense(96, activation='relu',init='he_uniform'))
    model.add(Dropout(0.4))
    model.add(Dense(24, activation='relu',init='he_uniform'))
    model.add(Dropout(0.2))
    # This needs to correspond to the number of categories.
    model.add(Dense(8, activation='softmax'))
    #model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    print('model setup complete')
    return model
    
def train_top_model(save_directory,top_model_weights_path,
                    nb_train_samples=32*62, nb_validation_samples=800,
                    nb_epochs=50 ):
    """ Trains the top model weights"""
    file_name = join(save_directory, 'vgg_bottleneck_features_train.npy')
    train_data = np.load(open(file_name, 'rb'))
    print(nb_validation_samples, nb_train_samples)
    print('train_data shape', train_data.shape)
    print(train_data.shape[1:])
    # TODO:
    # Refactor to automatically infer labels here.
    # We are defining the labels for different fish type here! For cats and dogs, there was an even
    # number of items of cats (0) vs dogs (1). This will need to be modified according to the actual numbers
    # of fish in each category!
    train_labels = np.array([0] * 528 + 
                            [1] * 48 + 
                            [2] * 232 + 
                            [3] * 465 + 
                            [4] * 1216 + 
                            [5] * 88 + 
                            [6] * 132 + 
                            [7] * 136)
    train_labels = np_utils.to_categorical(train_labels)
    #['YFT', 'LAG', 'OTHER', 'NoF', 'ALB', 'DOL', 'BET', 'SHARK']
    
    file_name = join(save_directory, 'vgg_bottleneck_features_validation.npy')
    validation_data = np.load(open(file_name, 'rb'))
    print('validation_data shape', validation_data.shape)
    print(validation_data.shape[1:])
    # TODO:
    # Refactor to automatically infer labels here.
    # We are defining the labels for different fish type here! For cats and dogs, there was an even
    # number of items of cats (0) vs dogs (1). This will need to be modified according to the actual numbers
    # of fish in each category!
    ['YFT', 'LAG', 'OTHER', 'NoF', 'ALB', 'DOL', 'BET', 'SHARK']

    validation_labels = np.array([0] * 206 + 
                                 [1] * 19 + 
                                 [2] * 71 + 
                                 [3] * 101 + 
                                 [4] * 503 + 
                                 [5] * 29 + 
                                 [6] * 68 + 
                                 [7] * 40)
    validation_labels = np_utils.to_categorical(validation_labels)

    #print(train_labels)
    #print(validation_labels)

    
    # set up the top model
    model = create_top_model(train_data)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(train_data, train_labels,
              nb_epoch=nb_epochs, batch_size=32,
              validation_data=(validation_data, validation_labels), 
              verbose = 1)
    
    model.save_weights(top_model_weights_path)
    return model

In [86]:
#def train_top_model(save_directory,top_model_weights_path,
#                    nb_train_samples=32*62, nb_validation_samples=800,
#                   nb_epochs=50 ):
#
top_model = train_top_model(save_directory = base,top_model_weights_path=top_model_weights_path,
                           nb_train_samples=32, nb_validation_samples=58, nb_epochs=20)

58 32
train_data shape (2845, 4, 4, 512)
(4, 4, 512)
validation_data shape (1037, 4, 4, 512)
(4, 4, 512)
model setup complete
Train on 2845 samples, validate on 1037 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


I tried training the top level for 10, 50 and 20 epochs. 10 was what I found before. 20 seems convergent...and 50 just for the heck of it. For tonight, I just want to test how good the transfer learning is with only training the top model with 10, 20, and 50 items. Tomorrow, I will test how fine-tuning the model further improves accuracy.

In [91]:
def setup_alternate_VGG16_model(top_model=None, img_width = 150, img_height = 150,
                               freeze_layers = 25,loss=None, optimizer=None, metrics=None):
    """Sets up an alternate vgg16 model with diffent top"""

    # build the VGG16 network
    model = Sequential()
    model.add(ZeroPadding2D((1, 1), input_shape=(img_width, img_height,3)))

    model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_2'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_2'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))
    
    # Load the weights
    model.load_weights(get_weights_path())
    
    if top_model is None:
        # build a classifier model to put on top of the convolutional model
        top_model = Sequential()
        top_model.add(Flatten(input_shape=model.output_shape[1:]))
        top_model.add(Dense(96, activation='relu'))
        top_model.add(Dropout(0.4))
        top_model.add(Dense(24, activation='relu'))
        top_model.add(Dropout(0.2))
        # This needs to correspond to the number of categories.
        top_model.add(Dense(8, activation='softmax'))
        top_model.load_weights(top_model_weights_path)     
    
    model.add(top_model)
    
    # freeze the first 25 layers
    # set the first 25 layers (up to the last conv block)
    for layer in model.layers[:freeze_layers]:
        layer.trainable = False
    
    # compile the model to be optimzied
    if loss is None: 
        loss = 'binary_crossentropy'
    if optimizer is None: 
        optimizer = optimizers.SGD(lr=1e-4, momentum=0.9)
    if metrics is None:
        metrics = ['accuracy']

    model.compile(loss =loss, 
                  optimizer = optimizer,
                  metrics = metrics)
    return model 

In [92]:
def get_weights_path():
    """gets the local path to the weights"""
    TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5'
    return get_file('vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5',
                                        TF_WEIGHTS_PATH_NO_TOP,
                                        cache_subdir='models')

In [93]:
test_data_dir=join(base, 'test_stg1')
print(test_data_dir)
model = setup_alternate_VGG16_model()
print('Conv model Setup...generating convolutions...')
# set up a data generator for the training data
#datagen=ImageDataGenerator(1./255)
#generator = datagen.flow_from_directory(
#        test_data_dir,
#        target_size=(img_width, img_height),
#        batch_size=32,
#        class_mode=None,
#        shuffle=False)
    
# predict on the training data

#bottleneck_features_test = model.predict_generator(generator, 1000)
print('Loading test data...')
test_data, test_id = load_test()
#print(test_id)
print('Generating test predictions...')
test_res_list = []
test_res = model.predict(np.array(test_data))
info_string = 'VGG_transfer_learning_e20'
create_submission(test_res, test_id, info_string)

/home/anthonyle/Projects/ClassifiyingFish/notebooks/input/test_stg1
Conv model Setup...generating convolutions...
Loading test data...
Generating test predictions...


After the bug fixes, the unfine-tuned model gives me a testing score of 1.91106 as a baseline. I know now I am on the right track. Previously, the scores came back around 3-4 because of labeling and activation function mismatch during training and testing. The next step is to fine-tune the model and see what sorts of improvements are possible then. For comparison: Training a full basic convolutional model gave me a test score of 1.13914.

In [96]:
transfer_model_weights_path = join(base, 'vgg_untuned_model.h5')
model.save_weights(transfer_model_weights_path)

In [97]:
train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_height, img_width),
        batch_size=32,
        class_mode='binary')

validation_generator = test_datagen.flow_from_directory(
        validation_data_dir,
        target_size=(img_height, img_width),
        batch_size=32,
        class_mode='binary')

Found 2845 images belonging to 8 classes.
Found 1037 images belonging to 8 classes.


In [95]:
# fine-tune the model
model.fit_generator(
        train_generator,
        samples_per_epoch=nb_train_samples,
        nb_epoch=nb_epoch,
        validation_data=validation_generator,
        nb_val_samples=1037)

Epoch 1/10


ValueError: Error when checking model target: expected sequential_50 to have shape (None, 8) but got array with shape (32, 1)