# Imports

In [None]:
import csv
import h5py
import matplotlib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import pylab
import itertools
import shutil
import seaborn

from scipy.stats import moment 
from scipy.ndimage import shift

from imblearn.over_sampling import RandomOverSampler

from keras import regularizers, optimizers
from keras.layers import Dense, Dropout, Flatten, MaxPooling2D, Conv2D, Activation
from keras.layers import Input, Embedding, concatenate, LeakyReLU
from keras.models import load_model, Sequential
from keras.utils import to_categorical
from keras.preprocessing.image import ImageDataGenerator
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.applications import VGG16
from keras.callbacks import History

import xgboost as xgb

from skimage import segmentation, morphology, measure
from skimage.transform import resize
from skimage.morphology import watershed

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

# Set paths and model hyper-parameters

In [None]:
class CONFIG():
    """Paths and training hyper-parameters shared by the notebook.

    Every ``*_location`` argument is relative to ``path`` and is appended to it
    by plain string concatenation, so ``path`` should end with a path separator
    and the relative locations should not start with one.

    Args:
        path: Project root directory (with trailing separator).
        data_location: Folder holding the label CSV files, relative to ``path``.
        train_images_location: Folder holding the training images, relative to ``path``.
        test_images_location: Folder holding the test images, relative to ``path``.
        image_size: Side length (pixels) images are resized to.
        n_classes: Number of target classes.
        batch_size: Mini-batch size used by the data generators.
        n_epochs: Default number of training epochs.
    """

    def __init__(self, path, data_location='data/', train_images_location='data/train_images/',
                 test_images_location='data/test_images/', image_size=428, n_classes=121, batch_size=32,
                 n_epochs=10):
        # Absolute locations are built once so the cells below never re-type the root.
        self.path = path
        self.data_location = self.path + data_location
        self.train_images_location = self.path + train_images_location
        # The default has no leading '/', matching the other locations, so the
        # joined path does not contain a doubled separator.
        self.test_images_location = self.path + test_images_location
        self.image_size = image_size
        self.n_classes = n_classes
        self.batch_size = batch_size
        self.n_epochs = n_epochs

In [None]:
# Notebook-wide configuration. The project root is machine-specific: edit it
# here when running on another machine.
cfg = CONFIG('/Users/guillaumecorda/Desktop/UvA/Applied Machine Learning/Kaggle/')

In [None]:
# Folders with the raw training images and the validation images. Both are
# derived from cfg so the project root is written in one place only.
train_dir = cfg.train_images_location
validation_dir = cfg.path + 'data/val_images/'

# Load and organize data

In [None]:
# Names of all JPEG files in the training image folder.
filenames = [i for i in os.listdir(cfg.train_images_location) if i.endswith('.jpg')]

# Read the label file once, building both the image -> class mapping and the
# number of images per class. The header row is skipped for both.
file_to_class = {}
class_counts = {}
with open(cfg.data_location + 'train_onelabel.csv', mode='r', newline='') as infile:
    for row in csv.reader(infile):
        if row[1] == 'class':
            continue  # header row
        file_to_class[row[0]] = row[1]
        class_counts[int(row[1])] = class_counts.get(int(row[1]), 0) + 1

# Move every class one sixth of the way towards the size of the largest class.
max_nr = max(class_counts.values())
class_counts = {key: int(count + (max_nr - count) / 6)
                for key, count in class_counts.items()}

In [None]:
# Uninitialised buffers: a single-channel image slot per file, a 1-D array with
# one slot per file, and an n_classes-wide row per unit of the adjusted class counts.
n_files = len(filenames)
X = np.empty((n_files, cfg.image_size, cfg.image_size, 1))
Y_tmp = np.empty((n_files,))
Y = np.empty((sum(class_counts.values()), cfg.n_classes))

## Organize Data

In [None]:
# Load the label table (the cells below read its 'image' and 'class' columns)
# and preview the first rows.
labels_csv_path = cfg.data_location + 'train_onelabel.csv'
label_map = pd.read_csv(labels_csv_path)
label_map.head()

In [None]:
# Index label of the first row of each class, ordered by class id. The class
# count comes from cfg instead of a literal so it cannot drift from the rest
# of the notebook.
ind_list = [label_map.loc[label_map['class'] == i].index[0]
            for i in range(cfg.n_classes)]

In [None]:
def _split_class(class_rows, train_frac=0.9, seed=42):
    """Split the rows of one class into (train image names, validation image names).

    A fixed-seed random sample of ``train_frac`` of the rows goes to training;
    the rows that were not sampled go to validation.
    """
    sampled = class_rows.sample(frac=train_frac, random_state=seed)
    held_out = class_rows[~class_rows.index.isin(sampled.index)]
    return sampled['image'].tolist(), held_out['image'].tolist()


# Stratified 90/10 split: every class is split on its own so both splits keep
# the class proportions. Rows are selected by class value, so the result does
# not depend on the CSV being sorted by class and the last class needs no
# special handling.
train_images = []
val_images = []
for i in range(cfg.n_classes):
    class_train, class_val = _split_class(label_map[label_map['class'] == i])
    train_images.extend(class_train)
    val_images.extend(class_val)

In [None]:
# Combined size of the two splits.
sum(len(split) for split in (train_images, val_images))

In [None]:
# Rebuild one label table per split from the image-name lists.
in_train = label_map['image'].isin(train_images)
in_val = label_map['image'].isin(val_images)
df_train = label_map.loc[in_train]
df_val = label_map.loc[in_val]

In [None]:
# One 'class_<id>' sub-folder per class under both split folders.
# exist_ok=True makes the cell safe to re-run.
for split_folder in ('train_images', 'val_images'):
    for i in range(cfg.n_classes):
        os.makedirs(cfg.path + 'data_formatted/' + split_folder + '/class_' + str(i),
                    exist_ok=True)

In [None]:
# Copy every image into the class sub-folder of its split. The source folder
# comes from cfg (the same folder the file listing above reads) rather than a
# second hard-coded copy of the path.
for split_folder, split_frame in (('train_images', df_train), ('val_images', df_val)):
    for image, label in zip(split_frame['image'].astype(str), split_frame['class'].astype(str)):
        shutil.copy(cfg.train_images_location + image,
                    cfg.path + 'data_formatted/' + split_folder + '/class_' + label + '/' + image)

## Organize data for submission

### Train

In [None]:
# One 'class_<id>' sub-folder per class; exist_ok=True makes the cell re-runnable.
for i in range(cfg.n_classes):
    os.makedirs(cfg.path + 'data_submission_train/class_' + str(i), exist_ok=True)

# Copy every labelled image (train and validation alike) into its class folder.
for image, label in zip(label_map['image'].astype(str), label_map['class'].astype(str)):
    shutil.copy(cfg.train_images_location + image,
                cfg.path + 'data_submission_train/class_' + label + '/' + image)

### Predict

In [None]:
# Create the 'class_<id>' sub-folders of the submission test folder.
# NOTE(review): nothing in the cells shown copies test images into these
# folders — confirm how they get populated before predicting.
for i in range(cfg.n_classes):
    os.makedirs(cfg.path + 'data_submission_test/class_' + str(i), exist_ok=True)

# Preprocessing

In [None]:
# Options shared by the training and validation pipelines.
# NOTE(review): featurewise_std_normalization relies on statistics computed by
# ImageDataGenerator.fit(), which is not called in the cells shown — confirm.
shared_options = dict(rescale=1./255, featurewise_std_normalization=True)

# Training pipeline: random rotations, shifts, shear and horizontal flips on top
# of the shared options.
train_datagen = ImageDataGenerator(
        rotation_range=360,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
        **shared_options)

# Validation / prediction pipeline: shared options only, no augmentation.
val_datagen = ImageDataGenerator(**shared_options)

In [None]:
# Options common to every directory iterator: resize to the configured square
# size, one-hot labels, single-channel images.
flow_options = dict(
        target_size=(cfg.image_size, cfg.image_size),
        class_mode='categorical',
        color_mode='grayscale')

# 90% split, augmented.
train_generator = train_datagen.flow_from_directory(
                cfg.path+'data_formatted/train_images',
                batch_size=cfg.batch_size,
                **flow_options)

# 10% split, not augmented.
val_generator = val_datagen.flow_from_directory(
                cfg.path+'data_formatted/val_images',
                batch_size=cfg.batch_size,
                **flow_options)

# All labelled images, augmented (used for the final fit before submission).
train_subm_generator = train_datagen.flow_from_directory(
                cfg.path+'data_submission_train/',
                batch_size=cfg.batch_size,
                **flow_options)

# Test images, one per batch. shuffle=False is required here: the iterator
# shuffles by default, which would return predictions in a random order that
# cannot be matched back to file names.
subm_generator = val_datagen.flow_from_directory(
                cfg.path+'data_submission_test/',
                batch_size=1,
                shuffle=False,
                **flow_options)

In [None]:
# Sanity check: the sum should be 24204 (presumably the image counts of the
# training and validation splits — TODO confirm).
21782 +2422

# Networks

## CNN best accuracy

In [None]:
# Grayscale CNN: two convolution stages (each BatchNorm + LeakyReLU), then two
# 512-unit dense layers with dropout and a softmax over the classes.
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), padding='same',
           input_shape=(cfg.image_size, cfg.image_size, 1)),
    BatchNormalization(),
    LeakyReLU(),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(16, kernel_size=(3, 3), padding='same'),
    BatchNormalization(),
    LeakyReLU(),
    Dropout(0.7),

    Flatten(),
    Dense(512),
    LeakyReLU(),
    Dropout(0.5),
    Dense(512),
    LeakyReLU(),
    Dropout(0.5),
    Dense(cfg.n_classes, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

model.summary()

### Train and validate

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Keras reads class_weight by the generator's class *index*. flow_from_directory
# numbers the 'class_<id>' folders in alphanumeric order ('class_10' comes before
# 'class_2'), so the id-keyed class_counts is translated through class_indices.
# NOTE(review): the weights grow with class size, which favours frequent
# classes — confirm that this is intended.
train_class_weight = {index: class_counts[int(folder.split('_')[1])]
                      for folder, index in train_generator.class_indices.items()}

# Fit on the 90% split only. Fitting on the all-images generator here would
# include the validation images in training and inflate the validation score.
history = model.fit_generator(
        train_generator,
        steps_per_epoch=100,
        epochs=cfg.n_epochs-5,
        validation_data=val_generator,
        validation_steps=80,
        class_weight=train_class_weight)

### Train on all images and predict

In [None]:
# Keras reads class_weight by the generator's class *index*. flow_from_directory
# numbers the 'class_<id>' folders in alphanumeric order ('class_10' comes before
# 'class_2'), so the id-keyed class_counts is translated through class_indices.
subm_class_weight = {index: class_counts[int(folder.split('_')[1])]
                     for folder, index in train_subm_generator.class_indices.items()}

# Final fit on every labelled image (no validation data).
history = model.fit_generator(
        train_subm_generator,
        steps_per_epoch=1000,
        epochs=10,
        class_weight=subm_class_weight)

In [None]:
# Test image names in sorted order: os.listdir returns entries in arbitrary
# order, which would make the row order of the submission file depend on the
# file system.
filenames = sorted(i for i in os.listdir(cfg.test_images_location) if i.endswith('.jpg'))

# Submission table; -1 marks rows that have no prediction yet.
labels = pd.DataFrame(filenames, columns=['image'])
labels['class'] = -1

In [None]:
# One prediction per test image: the iterator's length is its number of
# batches, and subm_generator yields one image per batch.
predictions = model.predict_generator(subm_generator, steps=len(subm_generator))

In [None]:
# Dimensions of the prediction array.
np.shape(predictions)

In [None]:
# argmax gives the generator's class *index*, not the class id. The model was
# fitted on train_subm_generator, whose 'class_<id>' folders are numbered in
# alphanumeric order ('class_10' comes before 'class_2'), so indices are
# translated back to the ids encoded in the folder names.
index_to_class = {index: int(folder.split('_')[1])
                  for folder, index in train_subm_generator.class_indices.items()}
predicted_index = np.argmax(predictions, axis=1)
# Assumes predictions are in the same order as labels['image'] — TODO confirm.
labels['class'] = [index_to_class[index] for index in predicted_index]

In [None]:
# Show the predicted class column.
labels.loc[:, 'class']

In [None]:
# Write the submission file, creating the output folder first: to_csv does not
# create missing directories.
predictions_dir = cfg.path + 'output_guillaume/predictions/'
os.makedirs(predictions_dir, exist_ok=True)
labels.to_csv(predictions_dir + 'keras_pp_1.csv', index=False)

## CNN

### Train

In [None]:
# Plain ReLU CNN: two blocks of two convolutions + overlapping max-pooling,
# then two 512-unit dense layers with dropout and a softmax over the classes.
model = Sequential()

# The input has a single channel: the generators that feed this model are
# created with color_mode='grayscale', so a 3-channel input would be rejected
# at fit time.
model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu', input_shape=(cfg.image_size, cfg.image_size, 1)))
model.add(Conv2D(16, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(3, 3), strides=2))
model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(3, 3), strides=2))

model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(cfg.n_classes, activation='softmax'))


# SGD with Nesterov momentum and a small learning-rate decay.
sgd = optimizers.SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Fit on the 90% split and evaluate on the 10% split after every epoch.
fit_options = dict(
        steps_per_epoch=200,
        epochs=5,
        validation_data=val_generator,
        validation_steps=80)
history = model.fit_generator(train_generator, **fit_options)

**TODO: try changing the optimizer to Adam.**

In [None]:
# Save the trained model, creating the output folder first. cfg.path already
# ends with a separator, so none is added in front of the folder name.
models_dir = cfg.path + 'output_guillaume/models/'
os.makedirs(models_dir, exist_ok=True)
model.save(models_dir + 'model_5.h5')

## NEW

In [None]:
num_test_samples = 10000

# Single channel: the generators above are created with color_mode='grayscale',
# so a 3-channel input shape would not match the batches they produce.
input_shape = (cfg.image_size, cfg.image_size, 1)

In [None]:
# CNN with unpadded convolutions: two blocks of two conv+ReLU layers, each block
# closed by 3x3 max-pooling and dropout, then two 512-unit dense layers with
# LeakyReLU and a softmax over the classes.
model = Sequential([
    Conv2D(32, (3, 3), input_shape=input_shape),
    Activation('relu'),

    Conv2D(16, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(3, 3)),
    Dropout(0.5),

    Conv2D(64, (3, 3)),
    Activation('relu'),

    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(3, 3)),
    Dropout(0.5),

    Flatten(),
    Dense(512, activation='linear'),
    LeakyReLU(alpha=.3),
    Dropout(0.5),

    Dense(512, activation='linear'),
    LeakyReLU(alpha=.3),

    Dense(cfg.n_classes),
    Activation('softmax'),
])

model.compile(optimizer='Adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Fit on the 90% split, evaluating on the 10% split after every epoch.
# The validation generator is passed by keyword: a positional argument after a
# keyword argument is a syntax error. One epoch is one full pass over each
# generator (its length is its number of batches) rather than batch_size steps.
model.fit_generator(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=cfg.n_epochs,
    validation_data=val_generator,
    validation_steps=len(val_generator))

**NOTE — ideas to try next**
* Try different pre-trained models.
* Try using pre-trained models as feature extractors combined with a random forest classifier (RFC) or XGBoost (XGB).
* Try different architectures on top of the pre-trained models.