# National Data Science Bowl - Plankton

## Action Plan

* Build a model that overfits the training data
* Data augmentation
* Batch normalization
* Dropout
* Ensembling

## Imports and Directories

In [1]:
# Directory layout shared by the cells below, rooted at the kernel's working directory.
# Gotcha: a later cell runs `%cd $DATA_HOME_DIR`, so re-running this cell without
# restarting the kernel makes os.getcwd() return the data directory instead.
import os, sys
current_dir = os.getcwd()
SCRIPTS_HOME_DIR = current_dir
DATA_HOME_DIR = '/'.join((current_dir, 'data'))

In [2]:
#import modules
from keras import backend as K
from utils import *
%matplotlib inline

Using gpu device 0: GeForce GTX 1070 (CNMeM is enabled with initial size: 90.0% of memory, cuDNN 5105)
Using Theano backend.


In [3]:
%cd $DATA_HOME_DIR

#Set path to sample/ path if desired
path = DATA_HOME_DIR #'/sample/'
test_path = DATA_HOME_DIR + '/test/' #We use all the test data
results_path=DATA_HOME_DIR + '/results/'
train_path=path + '/train/'
valid_path=path + '/valid/'

/home/nathan/git/planktonDataScienceBowl/scripts/data


## VGG-like Model

#### Constants

In [4]:
# Number of target classes (matches the "121 classes" reported by the batch loaders).
nb_classes = 121
# Images per mini-batch.
batch_size = 64
# Every image is resized to a square of this size before entering the network.
img_rows = img_cols = 128
in_shape = (img_rows, img_cols)

#### Batches

In [5]:
# Baseline generator: no augmentation, only multiplies every pixel by 1/255
# (maps 0-255 intensities into the 0-1 range).
gen = image.ImageDataGenerator(rescale=1.0 / 255)

In [6]:
# Settings shared by the training and validation iterators: same batch size,
# same target resolution, single-channel input and the same (non-augmenting) generator.
batch_kwargs = dict(batch_size=batch_size,
                    target_size=in_shape,
                    color_mode="grayscale",
                    gen=gen)

train_batches = get_batches(train_path, **batch_kwargs)
val_batches = get_batches(valid_path, **batch_kwargs)

Found 27184 images belonging to 121 classes.
Found 3152 images belonging to 121 classes.


#### Model

In [7]:
def get_model():
    """Build and compile the baseline VGG-style classifier.

    The network takes a single-channel image in channels-first layout,
    ``(1, img_rows, img_cols)``. Four 3x3 'same'-padded ReLU convolutions
    (64, 64, 128 and 256 filters) are interleaved with three max-pooling
    layers, followed by three 2048-unit ReLU dense layers and a softmax over
    ``nb_classes``.

    Returns:
        The Sequential model, compiled with Adam (default settings),
        categorical cross-entropy loss and an accuracy metric.
    """
    # Options common to every convolution in the stack.
    conv_opts = dict(border_mode='same', activation='relu')

    model = Sequential()

    # Block 1: two 64-filter convolutions, then pool.
    model.add(Convolution2D(64, 3, 3, input_shape=(1, img_rows, img_cols), **conv_opts))
    model.add(Convolution2D(64, 3, 3, **conv_opts))
    model.add(MaxPooling2D())

    # Block 2: 128 filters, then pool.
    model.add(Convolution2D(128, 3, 3, **conv_opts))
    model.add(MaxPooling2D())

    # Block 3: 256 filters, then pool.
    model.add(Convolution2D(256, 3, 3, **conv_opts))
    model.add(MaxPooling2D())

    # Classifier head: three wide dense layers and the class softmax.
    model.add(Flatten())
    for _ in range(3):
        model.add(Dense(2048, activation='relu'))
    model.add(Dense(nb_classes, activation='softmax'))

    model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [8]:
# Instantiate a freshly initialised, compiled baseline network.
model = get_model()

In [None]:
# Print the layer-by-layer table of output shapes and parameter counts.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
convolution2d_1 (Convolution2D)  (None, 64, 128, 128)  640         convolution2d_input_1[0][0]      
____________________________________________________________________________________________________
convolution2d_2 (Convolution2D)  (None, 64, 128, 128)  36928       convolution2d_1[0][0]            
____________________________________________________________________________________________________
maxpooling2d_1 (MaxPooling2D)    (None, 64, 64, 64)    0           convolution2d_2[0][0]            
____________________________________________________________________________________________________
convolution2d_3 (Convolution2D)  (None, 128, 64, 64)   73856       maxpooling2d_1[0][0]             
___________________________________________________________________________________________

#### Train

In [None]:
# Initial run at the optimizer's default learning rate: five full passes over the
# training iterator, evaluating on the whole validation iterator after each one.
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=5,
    validation_data=val_batches,
    nb_val_samples=val_batches.nb_sample,
    verbose=True,
)

Epoch 1/5
Epoch 2/5

In [None]:
# Update the learning rate on the optimizer's backend variable.
# Plain attribute assignment (`model.optimizer.lr = 0.1`) only rebinds the Python
# attribute to a float; the training function compiled by the earlier fit call
# keeps reading the original variable, so the rate would silently stay unchanged.
# NOTE(review): 0.1 is 100x Adam's default of 0.001 — confirm this value is
# intended now that it actually takes effect.
K.set_value(model.optimizer.lr, 0.1)
model.fit_generator(train_batches, samples_per_epoch=train_batches.nb_sample,
                    nb_epoch=1,
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample,
                    verbose=True)

In [None]:
# Update the learning rate on the optimizer's backend variable.
# Plain attribute assignment (`model.optimizer.lr = 0.01`) only rebinds the Python
# attribute to a float; the already-compiled training function keeps reading the
# original variable, so the rate would silently stay unchanged.
# NOTE(review): 0.01 is 10x Adam's default of 0.001 — confirm this value is
# intended now that it actually takes effect.
K.set_value(model.optimizer.lr, 0.01)
model.fit_generator(train_batches, samples_per_epoch=train_batches.nb_sample,
                    nb_epoch=4,
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample,
                    verbose=True)

## Data Augmentation

#### Constants

In [4]:
# Same values as the constants cell in the "VGG Like Model" section, re-declared here.
# Number of target classes (matches the "121 classes" reported by the batch loaders).
nb_classes = 121
# Images per mini-batch.
batch_size = 64
# Every image is resized to a square of this size before entering the network.
img_rows = img_cols = 128
in_shape = (img_rows, img_cols)

#### Batches

In [18]:
# Training generator: random geometric augmentation plus the 1/255 pixel rescale.
batch_gen = image.ImageDataGenerator(
    rescale=1.0 / 255,
    # Any rotation angle and both mirror flips are allowed.
    rotation_range=360,
    horizontal_flip=True,
    vertical_flip=True,
    # Small random translations, given as a fraction of the image size.
    width_shift_range=0.03,
    height_shift_range=0.03,
    # Mild shear and zoom jitter.
    shear_range=0.10,
    zoom_range=0.10,
)

# Validation generator: rescale only, no augmentation.
valid_gen = image.ImageDataGenerator(rescale=1.0 / 255)

In [19]:
# Settings shared by both iterators; only the generator differs between them.
batch_kwargs = dict(batch_size=batch_size,
                    target_size=in_shape,
                    color_mode="grayscale")

# Training images pass through the augmenting generator, validation images
# through the rescale-only one.
train_batches = get_batches(train_path, gen=batch_gen, **batch_kwargs)
val_batches = get_batches(valid_path, gen=valid_gen, **batch_kwargs)

Found 27184 images belonging to 121 classes.
Found 3152 images belonging to 121 classes.


#### Model

In [20]:
def get_model():
    """Build and compile the VGG-style classifier used for the augmentation run.

    This is the same architecture as the ``get_model`` defined in the
    "VGG Like Model" section above, re-declared here. The network takes a
    single-channel image in channels-first layout, ``(1, img_rows, img_cols)``.
    Four 3x3 'same'-padded ReLU convolutions (64, 64, 128 and 256 filters) are
    interleaved with three max-pooling layers, followed by three 2048-unit
    ReLU dense layers and a softmax over ``nb_classes``.

    Returns:
        The Sequential model, compiled with Adam (default settings),
        categorical cross-entropy loss and an accuracy metric.
    """
    # Options common to every convolution in the stack.
    conv_opts = dict(border_mode='same', activation='relu')

    model = Sequential()

    # Block 1: two 64-filter convolutions, then pool.
    model.add(Convolution2D(64, 3, 3, input_shape=(1, img_rows, img_cols), **conv_opts))
    model.add(Convolution2D(64, 3, 3, **conv_opts))
    model.add(MaxPooling2D())

    # Block 2: 128 filters, then pool.
    model.add(Convolution2D(128, 3, 3, **conv_opts))
    model.add(MaxPooling2D())

    # Block 3: 256 filters, then pool.
    model.add(Convolution2D(256, 3, 3, **conv_opts))
    model.add(MaxPooling2D())

    # Classifier head: three wide dense layers and the class softmax.
    model.add(Flatten())
    for _ in range(3):
        model.add(Dense(2048, activation='relu'))
    model.add(Dense(nb_classes, activation='softmax'))

    model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [21]:
# Start the augmentation experiment from a freshly initialised, compiled network.
model = get_model()

In [22]:
# Print the layer-by-layer table of output shapes and parameter counts.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
convolution2d_9 (Convolution2D)  (None, 64, 128, 128)  640         convolution2d_input_3[0][0]      
____________________________________________________________________________________________________
convolution2d_10 (Convolution2D) (None, 64, 128, 128)  36928       convolution2d_9[0][0]            
____________________________________________________________________________________________________
maxpooling2d_7 (MaxPooling2D)    (None, 64, 64, 64)    0           convolution2d_10[0][0]           
____________________________________________________________________________________________________
convolution2d_11 (Convolution2D) (None, 128, 64, 64)   73856       maxpooling2d_7[0][0]             
___________________________________________________________________________________________

#### Train

In [None]:
# Long run on augmented data at the optimizer's default learning rate: 25 passes
# over the training iterator, evaluating on the whole validation iterator after each.
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=25,
    validation_data=val_batches,
    nb_val_samples=val_batches.nb_sample,
    verbose=True,
)

Epoch 1/25

In [None]:
# Update the learning rate on the optimizer's backend variable.
# Plain attribute assignment (`model.optimizer.lr = 0.1`) only rebinds the Python
# attribute to a float; the training function compiled by the earlier fit call
# keeps reading the original variable, so the rate would silently stay unchanged.
# NOTE(review): 0.1 is 100x Adam's default of 0.001 — confirm this value is
# intended now that it actually takes effect.
K.set_value(model.optimizer.lr, 0.1)
model.fit_generator(train_batches, samples_per_epoch=train_batches.nb_sample,
                    nb_epoch=1,
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample,
                    verbose=True)

In [None]:
# Update the learning rate on the optimizer's backend variable.
# Plain attribute assignment (`model.optimizer.lr = 0.01`) only rebinds the Python
# attribute to a float; the already-compiled training function keeps reading the
# original variable, so the rate would silently stay unchanged.
# NOTE(review): 0.01 is 10x Adam's default of 0.001 — confirm this value is
# intended now that it actually takes effect.
K.set_value(model.optimizer.lr, 0.01)
model.fit_generator(train_batches, samples_per_epoch=train_batches.nb_sample,
                    nb_epoch=4,
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample,
                    verbose=True)

## Data Augmentation (alternative settings)

In [None]:
# Disabled alternative to `batch_gen`: smaller shift/shear/zoom ranges, and no
# flips or rescaling. Nothing in this cell executes.
# image_generator = image.ImageDataGenerator(
#                 rotation_range=360,
#                 width_shift_range=0.02,
#                 height_shift_range=0.02,
#                 shear_range=0.05,
#                 zoom_range=0.05)

## Batch Normalization 

## Dropout 

## Ensembling 