In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

import numpy as np
np.random.seed(1984)

import os
import glob
import cv2
import datetime
import pandas as pd
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from keras.applications import ResNet50, InceptionV3
from keras.models import Sequential, Model
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D, AveragePooling2D
from keras.optimizers import SGD, Adagrad
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.constraints import maxnorm
from keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import log_loss
from keras import __version__ as keras_version


bottleneck_fc_model_10e.h5
bottleneck_fc_model_50e.h5
bottleneck_fc_model.h5
bottleneck_features_train.npy
bottleneck_features_validation.npy
tbottleneck_features_train.npy
tbottleneck_features_validation.npy
test_stg1
test_train_transfer
test_validation_transfer
train
train_transfer
validation_transfer
vgg_bottleneck_fc_model.h5
vgg_bottleneck_features_train.npy
vgg_bottleneck_features_validation.npy
vgg_untuned_model.h5



Using TensorFlow backend.


In [2]:
width = 139
height = 139

def get_im_cv2(path):
    img = cv2.imread(path)
    resized = cv2.resize(img, (width, height), cv2.INTER_LINEAR)
    return resized


def load_train():
    X_train = []
    X_train_id = []
    y_train = []
    start_time = time.time()

    print('Read train images')
    folders = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']
    for fld in folders:
        index = folders.index(fld)
        print('Load folder {} (Index: {})'.format(fld, index))
        path = os.path.join('..', 'input', 'train', fld, '*.jpg')
        files = glob.glob(path)
        for fl in files:
            flbase = os.path.basename(fl)
            img = get_im_cv2(fl)
            X_train.append(img)
            X_train_id.append(flbase)
            y_train.append(index)

    print('Read train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return X_train, y_train, X_train_id


def load_test():
    path = os.path.join('..', 'input', 'test_stg1', '*.jpg')
    files = sorted(glob.glob(path))

    X_test = []
    X_test_id = []
    for fl in files:
        flbase = os.path.basename(fl)
        img = get_im_cv2(fl)
        X_test.append(img)
        X_test_id.append(flbase)

    return X_test, X_test_id


def create_submission(predictions, test_id, info):
    result1 = pd.DataFrame(predictions, columns=['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT'])
    result1.loc[:, 'image'] = pd.Series(test_id, index=result1.index)
    now = datetime.datetime.now()
    sub_file = 'submission_' + info + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    result1.to_csv(sub_file, index=False)


def read_and_normalize_train_data():
    train_data, train_target, train_id = load_train()

    print('Convert to numpy...')
    train_data = np.array(train_data, dtype=np.uint8)
    train_target = np.array(train_target, dtype=np.uint8)

    print('Reshape...')
    print(train_data.shape)
    #train_data = train_data.transpose((0, 3, 1, 2))

    print('Convert to float...')
    train_data = train_data.astype('float32')
    train_data = train_data / 255
    train_target = np_utils.to_categorical(train_target, 8)

    print('Train shape:', train_data.shape)
    print(train_data.shape[0], 'train samples')
    return train_data, train_target, train_id


def read_and_normalize_test_data():
    start_time = time.time()
    test_data, test_id = load_test()

    test_data = np.array(test_data, dtype=np.uint8)
    #test_data = test_data.transpose((0, 3, 1, 2))

    test_data = test_data.astype('float32')
    test_data = test_data / 255

    print('Test shape:', test_data.shape)
    print(test_data.shape[0], 'test samples')
    print('Read and process test data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return test_data, test_id


def dict_to_list(d):
    ret = []
    for i in d.items():
        ret.append(i[1])
    return ret


def merge_several_folds_mean(data, nfolds):
    a = np.array(data[0])
    for i in range(1, nfolds):
        a += np.array(data[i])
    a /= nfolds
    return a.tolist()


def create_model():
    print('Loading Inception V3 Weights ...')
    inc_v3_notop = InceptionV3(include_top=False, weights='imagenet',
    input_tensor=None, input_shape=(width, height, 3))
    output = inc_v3_notop.get_layer(index = -1).output # Shape: (8, 8, 2048)
    output = Flatten(name='flatten')(output)
    output = Dense(8, activation='sigmoid', name='predictions')(output)
    model = Model(inc_v3_notop.input, output)
    for layer in inc_v3_notop.layers:
        layer.trainable = False
    sgd = SGD(lr=1e-3, decay=1e-4, momentum=0.89, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy')
    return model


def create_fine_tune_model(weight_file_string):
    inc_v3_notop = InceptionV3(include_top=False, weights=None,
    input_tensor=None, input_shape=(width, height, 3))
    output = inc_v3_notop.get_layer(index = -1).output # Shape: (8, 8, 2048)
    output = Flatten(name='flatten')(output)
    output = Dense(8, activation='sigmoid', name='predictions')(output)
    model = Model(inc_v3_notop.input, output)
    model.load_weights(weight_file_string)
    for layer in model.layers[:172]:
        layer.trainable = False
    for layer in model.layers[172:]:
        layer.trainable = True
    sgd = SGD(lr=1e-4, decay=1e-5, momentum=0.89, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy')
    return model



def get_validation_predictions(train_data, predictions_valid):
    pv = []
    for i in range(len(train_data)):
        pv.append(predictions_valid[i])
    return pv


def run_cross_validation_create_models(file_name, nfolds=3):
    # input image dimensions
    batch_size = 16
    #nb_epoch = 8
    nb_epoch = 50
    random_state = 51
    first_rl = 96

    train_data, train_target, train_id = read_and_normalize_train_data()

    yfull_train = dict()

    train_datagen = ImageDataGenerator(
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
    test_datagen = ImageDataGenerator()
    
    kf = KFold(len(train_id), n_folds=nfolds, shuffle=True, random_state=random_state)
    num_fold = 0
    sum_score = 0
    models = []
    for train_index, test_index in kf:
        model = create_fine_tune_model(file_name)
        X_train = train_data[train_index]
        Y_train = train_target[train_index]
        X_valid = train_data[test_index]
        Y_valid = train_target[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))

        train_generator = train_datagen.flow(
            X_train,
            Y_train,  
            batch_size=batch_size)
        validation_generator = test_datagen.flow(
            X_valid,
            Y_valid,
            batch_size=batch_size)
        
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=3, verbose=1),
        ]
        
        model.fit_generator(
            train_generator,
            steps_per_epoch=len(X_train)//batch_size,
            epochs=nb_epoch,
            validation_data=validation_generator,
            validation_steps=len(X_valid)//batch_size,
            verbose=1,
            callbacks=callbacks)

        
        predictions_valid = model.predict(X_valid.astype('float32'), batch_size=batch_size, verbose=2)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)
        sum_score += score*len(test_index)

        # Store valid predictions
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = predictions_valid[i]

        models.append(model)

    score = sum_score/len(train_data)
    print("Log_loss train independent avg: ", score)

    info_string = '_' + str(np.round(score,3)) + '_flds_' + str(nfolds) + '_eps_' + str(nb_epoch) + '_fl_' + str(first_rl)
    return info_string, models


def run_stratified_cross_validation_create_models(weight_file_string, nfolds=3):
    # input image dimensions
    batch_size = 16
    #nb_epoch = 8
    nb_epoch = 50
    random_state = 51
    first_rl = 96

    train_data, train_target, train_id = read_and_normalize_train_data()

    yfull_train = dict()

    c, r = train_target.shape
    labels = train_target.reshape(c,)
    skf = StratifiedKFold(
        labels,
        n_folds=nfolds, 
        shuffle=True, 
        random_state=random_state)
    num_fold = 0
    sum_score = 0
    models = []

    train_datagen = ImageDataGenerator(
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
    test_datagen = ImageDataGenerator()

    for train_index, test_index in skf:
        model = create_fine_tune_model(weight_file_string)
        X_train = train_data[train_index]
        Y_train = train_target[train_index]
        X_valid = train_data[test_index]
        Y_valid = train_target[test_index]
        
        train_generator = train_datagen.flow(
            X_train,
            Y_train,  
            batch_size=batch_size)
        validation_generator = test_datagen.flow(
            X_valid,
            Y_valid,
            batch_size=batch_size)

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=3, verbose=1),
        ]
        model.fit_generator(
            train_generator,
            steps_per_epoch=len(X_train)//batch_size,
            epochs=nb_epoch,
            validation_data=validation_generator,
            validation_steps=len(X_test)//batch_size,
            verbose=1)
        
        predictions_valid = model.predict(X_valid.astype('float32'), batch_size=batch_size, verbose=2)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)
        sum_score += score*len(test_index)

        # Store valid predictions
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = predictions_valid[i]

        models.append(model)

    score = sum_score/len(train_data)
    print("Log_loss train independent avg: ", score)

    info_string = '_' + str(np.round(score,3)) + '_flds_' + str(nfolds) + '_eps_' + str(nb_epoch) + '_fl_' + str(first_rl)
    return info_string, models


def run_cross_validation_process_test(info_string, models):
    batch_size = 16
    num_fold = 0
    yfull_test = []
    test_id = []
    nfolds = len(models)

    for i in range(nfolds):
        model = models[i]
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        test_data, test_id = read_and_normalize_test_data()
        test_prediction = model.predict(test_data, batch_size=batch_size, verbose=2)
        yfull_test.append(test_prediction)

    test_res = merge_several_folds_mean(yfull_test, nfolds)
    info_string = 'loss_' + info_string \
                + '_folds_' + str(nfolds)
    create_submission(test_res, test_id, info_string)


In [3]:
# top_model training
batch_size = 32
train_data, train_target, train_id = read_and_normalize_train_data()
X_train, X_test, Y_train, Y_test = train_test_split(train_data, train_target, test_size=0.25, random_state=42)
train_datagen = ImageDataGenerator(
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
test_datagen = ImageDataGenerator()
train_generator = train_datagen.flow(
        X_train,
        Y_train,  
        batch_size=batch_size)
validation_generator = test_datagen.flow(
        X_test,
        Y_test,
        batch_size=batch_size
)

model = create_model()
model.fit_generator(
        train_generator,
        steps_per_epoch=len(X_train)//batch_size,
        epochs=50,
        validation_data=validation_generator,
        validation_steps=len(X_test)//batch_size,
        verbose=1)

Read train images
Load folder ALB (Index: 0)
Load folder BET (Index: 1)
Load folder DOL (Index: 2)
Load folder LAG (Index: 3)
Load folder NoF (Index: 4)
Load folder OTHER (Index: 5)
Load folder SHARK (Index: 6)
Load folder YFT (Index: 7)
Read train data time: 179.07 seconds
Convert to numpy...
Reshape...
(3777, 139, 139, 3)
Convert to float...
Train shape: (3777, 139, 139, 3)
3777 train samples
Loading ResNet50 Weights ...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 

<keras.callbacks.History at 0x7fd6f51b92e8>

In [3]:
# I had to close my notebook in order to start jupyter notebook in a new 
# conda environment with a self-compiled tensorflow. So I am 
# loading the model and weights again before continuing.
model = create_model()
model.load_weights('./iv3_top_training_weights.h5')

Loading Inception V3 Weights ...


In [4]:
num_folds = 6
file = './iv3_top_training_weights.h5'
info_string, models = run_cross_validation_create_models(file, num_folds)
run_cross_validation_process_test(info_string, models)

Read train images
Load folder ALB (Index: 0)
Load folder BET (Index: 1)
Load folder DOL (Index: 2)
Load folder LAG (Index: 3)
Load folder NoF (Index: 4)
Load folder OTHER (Index: 5)
Load folder SHARK (Index: 6)
Load folder YFT (Index: 7)
Read train data time: 174.54 seconds
Convert to numpy...
Reshape...
(3777, 139, 139, 3)
Convert to float...
Train shape: (3777, 139, 139, 3)
3777 train samples
Start KFold number 1 from 6
Split train:  3147 3147
Split valid:  630 630
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 00010: early stopping
Score log_loss:  0.257478589761
Start KFold number 2 from 6
Split train:  3147 3147
Split valid:  630 630
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 00018: early stopping
Score log_loss:  0.21386

In [None]:
if __name__ == '__main__':
    print('Keras version: {}'.format(keras_version))
    num_folds = 10
    info_string, models = run_cross_validation_create_models(num_folds)
    run_cross_validation_process_test(info_string, models)

In [5]:
i = 0
for model in models:
    i = i + 1
    model.save_weights('i_v3_50e_ decay_0.89_nesterov_T_{0}.h5'.format(i))

In [6]:
run_cross_validation_process_test(info_string, models[1:4])

Start KFold number 1 from 3
Test shape: (1000, 139, 139, 3)
1000 test samples
Read and process test data time: 44.3 seconds
Start KFold number 2 from 3
Test shape: (1000, 139, 139, 3)
1000 test samples
Read and process test data time: 44.92 seconds
Start KFold number 3 from 3
Test shape: (1000, 139, 139, 3)
1000 test samples
Read and process test data time: 53.33 seconds


With 6-fold validation, the validation losses and losses are slightly higher than with the regular CNN. It should be noted that although I set the number of epochs at 50 this was never reached for 2 reasons:

- Pre-training lead to reaching a plateau in learning faster.
- EarlyTermination.

Kaggle gave a score of 1.888. I tried submitting again with only the top 4 predictions. The results were worse at 1.945.

There are 3 things I need to fix/try:

- I had a fruitful discussion with Tong today about batch sizes. I learned 2 things. First, small batch sizes = faster learning because there are more updates to the weights. It is also more memory efficient. But the flip-side is that variance of prediction is higher. This could have led to early termination in 2-3 cases because I set batch size at 16 although the default is 32. I should switch back to 32.
- With unbalanced datasets, I must try stratifiedkfold validation. This way I make sure that the fold has all classes represented during training.
- I want to put a multi-layer perceptron with dropout ontop of this network to prevent overfitting the data.