<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Load-and-train" data-toc-modified-id="Load-and-train-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load and train</a></span></li><li><span><a href="#Set-global-variables-and-model-hyper-parameters" data-toc-modified-id="Set-global-variables-and-model-hyper-parameters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Set global variables and model hyper-parameters</a></span></li><li><span><a href="#Load-data-and-labels" data-toc-modified-id="Load-data-and-labels-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Load data and labels</a></span></li><li><span><a href="#Helper-functions" data-toc-modified-id="Helper-functions-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Helper functions</a></span></li><li><span><a href="#Pre-processing" data-toc-modified-id="Pre-processing-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Pre-processing</a></span></li><li><span><a href="#Class-imbalance" data-toc-modified-id="Class-imbalance-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Class imbalance</a></span></li><li><span><a href="#CNN" data-toc-modified-id="CNN-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>CNN</a></span><ul class="toc-item"><li><span><a href="#Model-1---Best-so-far" data-toc-modified-id="Model-1---Best-so-far-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Model 1 - Best so far</a></span><ul class="toc-item"><li><span><a href="#Cross-Validation" data-toc-modified-id="Cross-Validation-7.1.1"><span class="toc-item-num">7.1.1&nbsp;&nbsp;</span>Cross-Validation</a></span></li></ul></li><li><span><a href="#Model-2" data-toc-modified-id="Model-2-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Model 2</a></span></li><li><span><a href="#Model-deep" data-toc-modified-id="Model-deep-7.3"><span class="toc-item-num">7.3&nbsp;&nbsp;</span>Model deep</a></span></li></ul></li><li><span><a href="#Predictions" data-toc-modified-id="Predictions-8"><span 
class="toc-item-num">8&nbsp;&nbsp;</span>Predictions</a></span></li><li><span><a href="#Notes" data-toc-modified-id="Notes-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Notes</a></span></li></ul></div>

# Load and train

In [None]:
import csv
import h5py
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from scipy.ndimage import shift
import operator
import seaborn as sns

from skimage.transform import resize
from skimage import segmentation
from skimage.morphology import watershed
from skimage import measure
from skimage import morphology

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

from imblearn.over_sampling import RandomOverSampler

from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Activation
from keras.layers import Dropout, BatchNormalization
from keras.layers import MaxPooling2D
from keras.layers import Conv2D
from keras.layers import LeakyReLU
from keras.models import load_model
from keras.models import Sequential
from keras.utils import to_categorical
from keras.models import Model
from keras import optimizers
from keras.callbacks import EarlyStopping

%matplotlib inline

# Set global variables and model hyper-parameters

In [None]:
class CONFIG():
    """Bundle of data locations and model hyper-parameters used by the notebook.

    Parameters
    ----------
    path : str
        Project root directory. Stored unchanged in ``self.path``.
    data_location, train_images_location, test_images_location : str
        Sub-paths relative to ``path``. A leading separator is ignored, so
        ``'/data/test_images/'`` and ``'data/test_images/'`` resolve to the
        same place. A trailing separator is preserved because other cells
        append file names directly to these attributes.
    image_size : int
        Side length (in pixels) of the square images fed to the network.
    n_classes : int
        Number of output classes.
    batch_size : int
        Mini-batch size passed to ``model.fit``.
    n_epochs : int
        Number of training epochs.
    """

    def __init__(self, path, data_location='data/', train_images_location='data/train_images/',
                 test_images_location='/data/test_images/', image_size=224, n_classes=121, batch_size=64,
                 n_epochs=5):
        self.path = path
        self.data_location = self._under_root(path, data_location)
        self.train_images_location = self._under_root(path, train_images_location)
        self.test_images_location = self._under_root(path, test_images_location)
        self.image_size = image_size
        self.n_classes = n_classes
        self.batch_size = batch_size
        self.n_epochs = n_epochs

    @staticmethod
    def _under_root(root, sub_path):
        """Return ``sub_path`` placed under ``root`` with exactly one separator between them."""
        # Strip leading separators first: os.path.join would otherwise treat
        # the sub-path as absolute and discard ``root`` entirely.
        return os.path.join(root, sub_path.lstrip('/\\'))

In [None]:
# Global configuration object used by every cell below.
# NOTE: the path is machine-specific and must be edited before running
# elsewhere. Keep the trailing '/': a later cell builds a file path with
# `cfg.path + 'output_guillaume/...'`, i.e. by plain string concatenation.
cfg = CONFIG(path='/Users/guillaumecorda/Desktop/UvA/Applied Machine Learning/Kaggle/')

# Load data and labels

In [None]:
# Names of the training images found on disk (JPEG files only).
filenames = [name for name in os.listdir(cfg.train_images_location) if name.endswith('.jpg')]

# Read the label file once and reuse the rows for both lookups below.
with open(cfg.data_location + 'train_onelabel.csv', mode='r') as infile:
    label_rows = list(csv.reader(infile))

# Image file name -> class id (still a string here). The header row ends up
# as one extra entry, exactly as before; it is never looked up.
file_to_class = {row[0]: row[1] for row in label_rows}

# Number of labelled images per class, skipping the header row.
class_counts = {}
for row in label_rows:
    if row[1] != 'class':
        label = int(row[1])
        class_counts[label] = class_counts.get(label, 0) + 1

# Oversampling target per class: close one sixth of the gap between each
# class and the most frequent class.
max_nr = max(class_counts.values())
class_counts = {
    label: int(count + (max_nr - count) / 6)
    for label, count in class_counts.items()
}

# X and Y_ are completely overwritten by the pre-processing loop, so
# uninitialised buffers are fine for them.
X = np.empty([len(filenames), cfg.image_size, cfg.image_size, 1])
Y_ = np.empty([len(filenames)])
# Y is later filled by writing a single 1.0 per row, so every other entry
# must already be 0. np.empty would leave arbitrary values there.
Y = np.zeros([sum(class_counts.values()), cfg.n_classes])

# Helper functions

In [None]:
def get_padding(i):
    """Split a total amount ``i`` into a ``(before, after)`` pair.

    Both parts are integers that add up to ``i``; when ``i`` is odd the
    second part receives the extra unit.
    """
    half, extra = divmod(i, 2)
    return (int(half), int(half + extra))

In [None]:
def pad_image(img):
    """Zero-pad a 2-D image along its shorter axis so that it becomes square.

    The padding is split between both sides of the shorter axis by
    ``get_padding``. An image that is already square is returned as is.
    """
    height, width = img.shape
    excess = height - width

    if excess == 0:
        return img

    if excess > 0:
        # Taller than wide: add columns on the left and right.
        pad_width = ((0, 0), get_padding(excess))
    else:
        # Wider than tall: add rows at the top and bottom.
        pad_width = (get_padding(-excess), (0, 0))

    return np.pad(img, pad_width, 'constant')

In [None]:
def resize_image(img):
    """Rescale ``img`` to ``cfg.image_size`` x ``cfg.image_size`` using reflect edge handling."""
    target_shape = (cfg.image_size, cfg.image_size)
    return resize(img, target_shape, mode='reflect')

# Pre-processing

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): presumably meant to hide noisy library warnings during image
# loading/resizing - confirm, since this also hides deprecation warnings.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load each training image, invert it, make it square, resize it and store it
# in X together with its integer class id in Y_.
for i, filename in enumerate(filenames):
    pixels = mpimg.imread(cfg.train_images_location + filename).astype(float)
    # |p / 255 - 1| flips the intensities (assumes 8-bit pixel values - TODO confirm).
    img = np.absolute(pixels / 255 - 1.0)
    img = resize_image(pad_image(img)).reshape(cfg.image_size, cfg.image_size, 1)
    X[i] = img
    Y_[i] = int(file_to_class[filename])

# Class imbalance

In [None]:
# RandomOverSampler works on 2-D data, so flatten every image to one row.
# Use the real number of loaded images instead of a hard-coded count.
X = X.reshape(len(X), cfg.image_size * cfg.image_size)

# Duplicate random samples until each class reaches the target in class_counts.
# NOTE(review): `ratio` / `fit_sample` are the older imbalanced-learn names;
# newer releases call them `sampling_strategy` / `fit_resample`.
sm = RandomOverSampler(ratio=class_counts)
X, Y_ = sm.fit_sample(X, Y_)

# Back to image tensors of shape (n_samples, size, size, 1).
X = X.reshape(len(X), cfg.image_size, cfg.image_size, 1)

# One-hot encode the resampled labels. Y is rebuilt here from zeros and from
# the actual number of resampled rows, so no entry is left uninitialised and
# its length always matches X.
Y = np.zeros((len(Y_), cfg.n_classes))
Y[np.arange(len(Y_)), Y_.astype(int)] = 1.0

In [None]:
# Rotate the samples that were added by oversampling so they are not exact
# copies of the originals. The first len(filenames) rows are the images loaded
# from disk (this replaces the hard-coded 24204).
# NOTE(review): relies on the oversampler appending new rows after the
# originals - confirm for the installed imbalanced-learn version.
# NOTE(review): k = 1 + (i % 4) takes the values 1..4, and k=4 is a full turn,
# so every fourth added sample stays an unrotated duplicate.
for i in range(len(filenames), X.shape[0]):
    X[i] = np.rot90(X[i], 1 + (i % 4))

In [None]:
# Show one training sample next to three candidate augmentations in a 2x2 grid.
f = plt.figure()

sample = X[446]
# (title, image, extra keyword arguments for plt.title)
panels = [
    ('Original', sample, {}),
    ('Rotation', np.rot90(sample, (1)), {}),
    ('Flip Left-Right', np.fliplr(sample), {'loc': ('left')}),
    ('Flip Up-Down', np.flipud(sample), {}),
]

for position, (title, image, title_kwargs) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.imshow(image[:, :, 0], cmap='binary')
    plt.title(title, **title_kwargs)
    plt.axis('off')

# CNN

## Model 1 - Best so far

In [None]:
# Model 1: two convolutional units followed by two fully connected layers.
model = Sequential([
    # Convolutional feature extractor
    Conv2D(32, kernel_size=(3, 3), padding='same',
           input_shape=(cfg.image_size, cfg.image_size, 1)),
    BatchNormalization(),
    LeakyReLU(),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(16, kernel_size=(3, 3), padding='same'),
    BatchNormalization(),
    LeakyReLU(),
    Dropout(0.7),

    # Classifier head
    Flatten(),
    Dense(512),
    LeakyReLU(),
    Dropout(0.5),
    Dense(512),
    LeakyReLU(),
    Dropout(0.5),
    Dense(cfg.n_classes, activation='softmax'),
])

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# plot_model is not among the notebook's imports, so bring it into scope here;
# without this the cell fails with a NameError.
from keras.utils import plot_model

# Write a top-to-bottom diagram of the current model to cnn.png.
plot_model(
    model,
    to_file='cnn.png',
    show_shapes=False,
    show_layer_names=True,
    rankdir='TB'
)

In [None]:
# Train Model 1 on the full (oversampled) data set. The epoch count comes
# from the shared configuration (default 5), like the cross-validation cell,
# instead of being hard-coded here.
history = model.fit(
    X,
    Y,
    epochs=cfg.n_epochs,
    batch_size=cfg.batch_size,
    verbose=1,
)

### Cross-Validation

In [None]:
# 3-fold splitter. Shuffle before splitting: rows at the end of X are the
# ones the rotation cell treats as oversampled additions, so contiguous,
# unshuffled folds would not be representative. The seed is fixed so the
# folds are reproducible.
# NOTE(review): oversampling happens before the split, so duplicates of one
# image can land in both the training and the validation part of a fold.
kf = KFold(n_splits=3, shuffle=True, random_state=42)
kf.get_n_splits(X)

In [None]:
# Fit the model on each fold's training part and validate on the held-out part.
# NOTE(review): the same `model` object is fitted in every iteration, so the
# weights carry over from one fold to the next (and from any earlier fit on
# the full data set). The validation metrics of later folds are therefore
# measured on samples the model has already been trained on.
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    # `history` is overwritten each iteration; only the last fold's history survives.
    history = model.fit(X_train, 
                        y_train,
                        epochs=cfg.n_epochs,
                        validation_data=(X_test, y_test),
                        batch_size=cfg.batch_size,
                        verbose=1,
                        callbacks=[EarlyStopping(patience=2)])

In [None]:
# Persist the cross-validated model. os.path.join avoids the doubled
# separator that manual concatenation produced when cfg.path ends with '/'.
model.save(os.path.join(cfg.path, 'output_guillaume', 'models', 'model_cross_val_1.h5'))

## Model 2

In [None]:
# Model 2: four unpadded ReLU convolutions with 3x3 pooling after the first
# two, followed by two fully connected LeakyReLU layers.
model = Sequential([
    # Convolutional feature extractor
    Conv2D(32, (3, 3), input_shape=X[0].shape),
    Activation('relu'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(16, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(3, 3)),
    Conv2D(64, (3, 3)),
    Activation('relu'),
    Conv2D(32, (3, 3)),
    Activation('relu'),
    Dropout(0.5),

    # Classifier head
    Flatten(),
    Dense(512),
    LeakyReLU(alpha=.3),
    Dropout(0.5),
    Dense(512),
    LeakyReLU(alpha=.3),

    Dense(cfg.n_classes),
    Activation('softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Train Model 2 on the full data set held in X / Y.
# NOTE: the epoch count is fixed at 10 here rather than taken from cfg.n_epochs.
history = model.fit(
    X, 
    Y,
    epochs=10, 
    batch_size=cfg.batch_size,
    verbose=1)

In [None]:
# Persist Model 2. os.path.join avoids the doubled separator that manual
# concatenation produced when cfg.path ends with '/'.
model.save(os.path.join(cfg.path, 'output_guillaume', 'models', 'model__428.h5'))

## Model deep

In [None]:
def _conv_unit(filters, **conv_kwargs):
    """Return the Conv2D -> BatchNormalization -> LeakyReLU triple used throughout the deep model."""
    return [
        Conv2D(filters, kernel_size=(3, 3), padding='same', **conv_kwargs),
        BatchNormalization(),
        LeakyReLU(),
    ]


def _conv_stage(wide, narrow, **first_conv_kwargs):
    """Return one stage: wide unit + pool, narrow unit + pool, narrow unit + dropout."""
    return (
        _conv_unit(wide, **first_conv_kwargs) + [MaxPooling2D(pool_size=(2, 2))]
        + _conv_unit(narrow) + [MaxPooling2D(pool_size=(2, 2))]
        + _conv_unit(narrow) + [Dropout(0.5)]
    )


# Deep model: three convolutional stages with growing filter counts and the
# same two-layer classifier head as the other models. Only the very first
# layer receives input_shape; passing it again to inner layers was redundant.
model = Sequential(
    _conv_stage(32, 16, input_shape=X[0].shape)
    + _conv_stage(64, 32)
    + _conv_stage(128, 64)
    + [
        Flatten(),
        Dense(512),
        LeakyReLU(),
        Dropout(0.5),
        Dense(512),
        LeakyReLU(),
        Dropout(0.5),
        Dense(cfg.n_classes, activation='softmax'),
    ]
)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

# Predictions

In [None]:
# Names of the test images (JPEG files only). Use the configured test
# directory so this listing and the loading loop read from the same place.
# NOTE: this rebinds `filenames`, which previously held the training files.
filenames = [name for name in os.listdir(cfg.test_images_location) if name.endswith('.jpg')]

# One row per test image; -1 marks "not predicted yet".
labels = pd.DataFrame(filenames, columns=['image'])
labels['class'] = -1

In [None]:
# Reload the saved Model 2 weights for inference. os.path.join avoids the
# doubled separator that manual concatenation produced when cfg.path ends with '/'.
model = load_model(os.path.join(cfg.path, 'output_guillaume', 'models', 'model__428.h5'))

In [None]:
total = len(filenames)

# Predict one image at a time (keeps memory use flat) and collect the class
# ids in file order. `labels` was built from `filenames`, so the two line up
# row by row and no per-row boolean-mask lookup is needed.
predicted_classes = []
for filename in filenames:
    # Same pre-processing as for the training images.
    img = mpimg.imread(cfg.test_images_location + filename)
    img = np.absolute(np.divide(img.astype(float), 255) - 1.0)
    img = resize_image(pad_image(img))
    img = img.reshape(1, cfg.image_size, cfg.image_size, 1)
    # argmax over the softmax output gives the same result as the older
    # Sequential.predict_classes, which newer Keras versions no longer provide.
    probabilities = model.predict(img, verbose=0)
    predicted_classes.append(int(np.argmax(probabilities, axis=1)[0]))

labels['class'] = predicted_classes

# sort_values returns a new frame; the result was previously discarded, so
# the sort never took effect.
labels = labels.sort_values(by='class')
labels['class'] = labels['class'].astype(int)
labels.sample(n=5)

In [None]:
# Display the full prediction table (image file name and predicted class).
labels

In [None]:
# Write the predictions without the index column. os.path.join keeps this
# working even when cfg.path has no trailing separator.
labels.to_csv(
    os.path.join(cfg.path, 'output_guillaume', 'predictions', 'model__428.csv'),
    index=False,
)

# Notes

**To Do**
* Test SGD
* Test with different image sizes
* Test Leaky ReLU: https://keras.io/layers/advanced-activations/
* Test a deeper CNN
