# Load and train

In [None]:
import csv
import h5py
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier

from skimage.transform import resize
from skimage import segmentation
from skimage.morphology import watershed
from skimage import measure
from skimage import morphology

from sklearn.model_selection import train_test_split

from imblearn.over_sampling import RandomOverSampler

from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import MaxPooling2D
from keras.layers import Conv2D
from keras.models import load_model
from keras.models import Sequential
from keras.utils import to_categorical
from keras.models import Model
from keras.callbacks import EarlyStopping

%matplotlib inline

# Set global variables and model hyper-parameters

In [None]:
class CONFIG():
    """Container for dataset locations and training hyper-parameters.

    Every ``*_location`` argument is resolved against ``path`` and stored as a
    plain string. Code using this object appends file names directly to the
    stored locations, so each location should end with a path separator.

    Args:
        path: Project root directory.
        data_location: Directory with the label CSV, relative to ``path``.
        train_images_location: Training image directory, relative to ``path``.
        test_images_location: Test image directory, relative to ``path``.
        image_size: Side length, in pixels, of the square images fed to the model.
        n_classes: Number of target classes.
        batch_size: Mini-batch size used for training.
        n_epochs: Maximum number of training epochs.
    """

    def __init__(self, path, data_location='data/', train_images_location='data/train_images/',
                 test_images_location='/data/test_images/', image_size=224, n_classes=121, batch_size=64,
                 n_epochs=10):
        self.path = path
        self.data_location = self._under_root(data_location)
        self.train_images_location = self._under_root(train_images_location)
        self.test_images_location = self._under_root(test_images_location)
        self.image_size = image_size
        self.n_classes = n_classes
        self.batch_size = batch_size
        self.n_epochs = n_epochs

    def _under_root(self, location):
        """Join ``location`` onto ``self.path`` with exactly one separator between them."""
        if not self.path:
            # No root configured: keep the location exactly as given.
            return location
        # A leading '/' on the relative part (as in the test-image default)
        # would otherwise yield '//' in the joined path.
        relative = location.lstrip('/')
        if self.path.endswith(('/', '\\')):
            return self.path + relative
        # Root given without a trailing separator: insert one instead of
        # gluing the directory names together.
        return self.path + '/' + relative

In [None]:
# Global configuration for this run; the data and model paths used below are
# built from this root directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
cfg = CONFIG(path='/Users/guillaumecorda/Desktop/UvA/Applied Machine Learning/Kaggle/')

# Load data and labels

In [None]:
# Training samples are the JPEG files found in the training image directory.
filenames = [name for name in os.listdir(cfg.train_images_location) if name.endswith('.jpg')]

# Parse the label file once. Each row is (file name, class id); the header row
# carries the literal 'class' in its second column.
with open(cfg.data_location + 'train_onelabel.csv', mode='r') as infile:
    label_rows = list(csv.reader(infile))

# File name -> class id (as a string). The header row lands in this mapping as
# well, which is harmless because it is only queried with image file names.
file_to_class = {row[0]: row[1] for row in label_rows}

# Number of labelled samples per class, skipping the header row.
class_counts = {}
for row in label_rows:
    if row[1] != 'class':
        label = int(row[1])
        class_counts[label] = class_counts.get(label, 0) + 1

# Oversampling target per class: close one sixth of the gap between the class
# and the most frequent class.
max_nr = max(class_counts.values())
class_counts = {label: int(count + (max_nr - count) / 6)
                for label, count in class_counts.items()}

# X and Y_ are filled completely by the preprocessing loop, so leaving them
# uninitialised is fine.
X = np.empty([len(filenames), cfg.image_size, cfg.image_size, 1])
Y_ = np.empty([len(filenames)])
# Y becomes a one-hot matrix in which only the "hot" entries are written later;
# it must start from zeros, otherwise every other entry is arbitrary memory.
Y = np.zeros([sum(class_counts.values()), cfg.n_classes])

# Helper functions

In [None]:
def get_padding(i):
    """Split a total padding amount ``i`` into a ``(before, after)`` pair.

    An even amount is divided equally; for an odd amount the extra unit goes
    to the second element.
    """
    half = i / 2
    if i % 2:
        # Odd: shift half a unit down/up so both parts are whole numbers.
        return (int(half - .5), int(half + .5))
    return (int(half), int(half))

In [None]:
def pad_image(img):
    """Pad a 2-D image along its shorter axis so that it becomes square.

    The padding is added with ``np.pad`` in ``'constant'`` mode and is split
    over both sides of the shorter axis by ``get_padding``. An image that is
    already square is returned unchanged (the same object).
    """
    n_rows, n_cols = img.shape
    if n_rows == n_cols:
        return img

    if n_rows > n_cols:
        # Taller than wide: widen the second axis only.
        pad_width = ((0, 0), get_padding(n_rows - n_cols))
    else:
        # Wider than tall: extend the first axis only.
        pad_width = (get_padding(n_cols - n_rows), (0, 0))
    return np.pad(img, pad_width, 'constant')

In [None]:
def resize_image(img):
    """Rescale ``img`` to ``cfg.image_size`` x ``cfg.image_size`` using reflect edge handling."""
    side = cfg.image_size
    target_shape = (side, side)
    return resize(img, target_shape, mode='reflect')

# Pre-processing

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices and library
# diagnostics (e.g. callback warnings during training) — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# Load every training image, invert its intensities, square it by padding,
# rescale it to the network input size and store it together with its label.
for idx, fname in enumerate(filenames):
    raw = mpimg.imread(cfg.train_images_location + fname)
    # Scale by 255 and flip: |v/255 - 1| turns bright pixels dark and vice versa.
    inverted = np.absolute(np.divide(raw.astype(float), 255) - 1.0)
    squared = resize_image(pad_image(inverted))
    # Add the trailing single-channel axis before storing.
    X[idx] = squared.reshape(cfg.image_size, cfg.image_size, 1)
    Y_[idx] = int(file_to_class[fname])

# Class imbalance

In [None]:
# The oversampler is fed a 2-D feature matrix, so flatten every image into one
# row. The sample count comes from X itself rather than a hard-coded 24204.
X = X.reshape(len(X), cfg.image_size * cfg.image_size)

# class_counts maps each class id to the number of samples it should have
# after resampling.
# NOTE(review): `ratio=` and `fit_sample` are the spellings of older
# imbalanced-learn releases (later `sampling_strategy=` / `fit_resample`) —
# confirm against the installed version.
sm = RandomOverSampler(ratio=class_counts)
X, Y_ = sm.fit_sample(X, Y_)

# Back to image layout with a single channel.
X = X.reshape(len(X), cfg.image_size, cfg.image_size, 1)

# One-hot encode the resampled labels. The matrix is rebuilt from zeros and
# sized from Y_ so it never contains uninitialised entries and always has
# exactly one row per resampled sample.
Y = np.zeros((len(Y_), cfg.n_classes))
Y[np.arange(len(Y_)), Y_.astype(int)] = 1.0

In [None]:
# Rotate the samples added by oversampling so they are not exact copies.
# `filenames` still lists the training images here, so its length is the number
# of original samples; `total` is only defined later, in the prediction section.
# NOTE(review): assumes the oversampler appends the extra rows after the
# original ones — confirm for the installed imbalanced-learn version.
n_original = len(filenames)
for i in range(n_original, X.shape[0]):
    # k runs over 1..4 quarter turns; k == 4 is a full turn, leaving that sample unrotated.
    X[i] = np.rot90(X[i], (1 + (i % 4)))

# Define and train CNN

In [None]:
# CNN: three 3x3 'same'-padded convolutions (64, 32, 32 filters) with 2x2
# max-pooling after the first two, followed by two 512-unit dense layers and a
# softmax over cfg.n_classes. Dropout (rate 0.5) follows the last convolution
# and each hidden dense layer.
model = Sequential([
    Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu', input_shape=X[0].shape),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'),
    Dropout(0.5),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(cfg.n_classes, activation='softmax'),
])

# Targets are one-hot rows, hence categorical cross-entropy.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Train the CNN on the oversampled data.
# Early stopping watches the training loss: no validation data is passed to
# fit(), so no `val_*` quantity is ever logged, and 'val_err' is not a name
# Keras reports in any case — the previous callback could never trigger.
# NOTE(review): to stop on validation loss instead, pass validation data and
# monitor 'val_loss'; the oversampled rows sit at the end of X, so a plain
# `validation_split` would hold out mostly duplicated samples.
history = model.fit(
    X,
    Y,
    epochs=cfg.n_epochs,
    batch_size=cfg.batch_size,
    verbose=1,
    callbacks=[EarlyStopping(monitor='loss', patience=2)])

In [None]:
# Persist the trained CNN under the project root.
# NOTE(review): cfg.path already ends with '/', so this yields '//' in the
# path; the file name also differs from the one loaded in the next section
# ('model_new_cnn_224.h5') — confirm which model is meant.
model.save(cfg.path+'/output_guillaume/models/model_new_2.h5')

# Train new classifiers

In [None]:
# Replace the in-memory model with a previously saved one.
# NOTE(review): this is not the file written by the save cell above
# ('model_new_2.h5') — presumably an earlier training run; confirm.
model = load_model(cfg.path+'output_guillaume/models/model_new_cnn_224.h5')

In [None]:
# List the layer names of the loaded model; the feature extractor below picks
# its output layer by name.
for layer in model.layers :
    print(layer.name)

In [None]:
# Feature extractor: same input as the CNN, output taken at the layer named
# 'dropout_3'.
# NOTE(review): presumably the dropout right before the softmax layer; the name
# is whatever was stored with the loaded model — check it against the names
# printed above.
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer('dropout_3').output)

In [None]:
# Run all training images through the feature extractor; the result is the
# input for the classifiers trained below.
intermediate_output = intermediate_layer_model.predict(X)

In [None]:
# Display the shape of the extracted feature matrix.
intermediate_output.shape

## Train XGBoost

In [None]:
import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort; xgboost is already imported at the top of the file, so confirm the
# variable is set early enough to take effect.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
# The cells below reach the class through the `xgb` module alias instead.
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted tree classifier trained on the CNN features; 'multi:softmax'
# is the multi-class objective.
clf = xgb.XGBClassifier(objective='multi:softmax', learning_rate=0.001)
# Hyper-parameters still to be tuned: learning_rate, max_depth.

In [None]:
# Fit on the CNN features using the integer class ids directly. Y_ holds one id
# per resampled sample and is what the one-hot matrix Y was built from;
# recovering the ids with argmax over Y is only correct when every non-hot
# entry of Y is known to be zero.
clf.fit(intermediate_output, Y_.astype(int))

## Train Random Forest 

In [None]:
# Random forest with 100 trees, fitted below on the same CNN features as an
# alternative to the XGBoost classifier.
rfc = RandomForestClassifier(n_estimators=100)

In [None]:
# Fit on the CNN features using the integer class ids directly. Y_ holds one id
# per resampled sample and is what the one-hot matrix Y was built from;
# recovering the ids with argmax over Y is only correct when every non-hot
# entry of Y is known to be zero.
rfc.fit(intermediate_output, Y_.astype(int))

# Predict

In [None]:
# Choose which fitted classifier produces the submission; swap the commented
# line to use the random forest instead of XGBoost.
classifier = clf
#classifier = rfc

In [None]:
# List the test images from the configured test directory — the same location
# the loading loop reads the files from — instead of a hand-built path.
# NOTE: this rebinds `filenames`, which held the training files until now.
filenames = [name for name in os.listdir(cfg.test_images_location) if name.endswith('.jpg')]

# Submission table: one row per test image; -1 marks "not predicted yet".
labels = pd.DataFrame(filenames, columns=['image'])
labels['class'] = -1

In [None]:
total = len(filenames)
X_subm = np.empty([total, cfg.image_size, cfg.image_size, 1])

# Same preprocessing as for the training images: invert, pad to square, rescale.
for i, filename in enumerate(filenames):
    img = mpimg.imread(cfg.test_images_location + filename)
    img = np.absolute(np.divide(img.astype(float), 255) - 1.0)
    img = resize_image(pad_image(img))
    # One (H, W, 1) image per slot; no extra leading batch axis is needed.
    X_subm[i] = img.reshape(cfg.image_size, cfg.image_size, 1)

# Test features must come from the very layer the classifiers were trained on,
# so the training-time extractor is reused. The former separate extractor was
# looked up under another layer name ('dropout_6') and then never used; the
# name is kept as an alias for any code that still refers to it.
intermediate_layer_model_subm = intermediate_layer_model
intermediate_output_subm = intermediate_layer_model_subm.predict(X_subm)

# Predicted class id per test image, stored as integers. (A bare
# `labels.sort_values(by='class')` used to sit here; it discarded its result
# and therefore had no effect, so row order is unchanged by dropping it.)
labels['class'] = classifier.predict(intermediate_output_subm)
labels['class'] = labels['class'].astype(int)

# Show a few random rows as a sanity check.
labels.sample(n=5)

In [None]:
# Display the full prediction table.
labels

In [None]:
# Write the submission file without the index column.
# NOTE(review): the file is named 'model_rfc' although `classifier` is set to
# the XGBoost model above — rename if that selection is kept.
labels.to_csv(cfg.path+'output_guillaume/predictions/model_rfc.csv', index=False)