In [None]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob 
from skimage.io import imread 
import shutil
%matplotlib inline
from random import shuffle
import cv2
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from keras.preprocessing import image
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.applications.resnet50 import ResNet50
from keras import layers as KL
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.nasnet import NASNetMobile
from keras.applications.xception import Xception
from keras.utils.vis_utils import plot_model
from keras.models import Model
!pip install livelossplot
from livelossplot import PlotLossesKeras

from keras.layers import Convolution1D, concatenate, SpatialDropout1D, GlobalMaxPool1D, GlobalAvgPool1D, Embedding, \
    Conv2D, SeparableConv1D, Add, BatchNormalization, Activation, GlobalAveragePooling2D, LeakyReLU, Flatten
from keras.layers import Dense, Input, Dropout, MaxPooling2D, Concatenate, GlobalMaxPooling2D, GlobalAveragePooling2D, \
    Lambda, Multiply, LSTM, Bidirectional, PReLU, MaxPooling1D,Average
from keras.layers.pooling import _GlobalPooling1D
from keras.losses import mae, sparse_categorical_crossentropy, binary_crossentropy
from keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from keras.optimizers import Adam, RMSprop
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau,CSVLogger

# Model Resnet

In [None]:
# Dataset locations and hyper-parameters of the ResNet50 experiment.
# Paths are relative to the notebook's working directory, like every other
# 'input/...' path used in this notebook.
# Fixed: the value used to be '.input/train/' (stray leading dot), which does
# not match the 'input/' directory the label file and test images are read from.
TRAIN_PATH = 'input/train/'
TRAIN_LABELS = 'input/train_labels.csv'
# Side length of the square (SIZE_IMG, SIZE_IMG, 3) model input.
SIZE_IMG = 96
# Number of epochs of the from-scratch training run.
EPOCHS = 2

# Checkpoint file; when it already exists the notebook fine-tunes the saved
# model instead of training a new one.
model_path = 'resnet.h5'
saved_model = os.path.isfile(model_path)

# **Data processing**

In [None]:
# Load the label table and print a quick overview of it: class counts,
# summary statistics and the first rows.
df = pd.read_csv(TRAIN_LABELS)

label_counts = df['label'].value_counts()
print(label_counts,
      '\n\n', df.describe(),
      '\n\n', df.head())

# **Init Keras data generator**

In [None]:
# flow_from_dataframe expects real file names, so the labels are re-read as
# strings and every id gets the '.tif' extension appended.
df = pd.read_csv("input/train_labels.csv", dtype=str)


def add_ext(id):
    """Return the image id with the '.tif' extension appended."""
    return id + ".tif"


def addpath(col):
    """Return the file name prefixed with the '../input/train/' directory."""
    return '../input/train/' + col


df["id"] = df["id"].map(add_ext)
df['Path'] = df['id'].map(addpath)
df.head()

In [None]:
# With an existing checkpoint nothing is held out (validation_split = 0);
# otherwise 15% of the rows are reserved for validation.
val = 0 if saved_model else 0.15

# Rescaling / normalisation plus random geometric and colour augmentation.
datagen = ImageDataGenerator(
    rescale=1./255,
    samplewise_std_normalization=True,
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=90,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.05,
    channel_shift_range=0.1,
    validation_split=val)

# Settings shared by the training and the validation iterator; only the
# `subset` argument differs between the two.
flow_options = dict(
    dataframe=df,
    directory=TRAIN_PATH,
    x_col="id",
    y_col="label",
    batch_size=64,
    shuffle=True,
    class_mode="binary",
    target_size=(96, 96))

train_generator = datagen.flow_from_dataframe(subset="training", **flow_options)

valid_generator = datagen.flow_from_dataframe(subset="validation", **flow_options)

# **Build model**
The model is a pre-trained ResNet50 followed by global average pooling, dropout and a single dense unit with a sigmoid activation for binary classification. A Keras callback reduces the learning rate when the validation accuracy stops improving over epochs.

In [None]:
def build_model():
    """Build the ResNet50-based binary classifier (uncompiled).

    The ResNet50 base (without its classification top) is applied to a
    (SIZE_IMG, SIZE_IMG, 3) input, followed by global average pooling,
    dropout (rate 0.5) and one sigmoid output unit.

    Returns:
        The keras ``Model`` mapping the image input to the sigmoid output.
    """
    shape = (SIZE_IMG, SIZE_IMG, 3)
    image_input = KL.Input(shape)
    backbone = ResNet50(include_top=False, input_shape=shape)

    features = backbone(image_input)
    pooled = KL.GlobalAveragePooling2D()(features)
    regularised = KL.Dropout(0.5)(pooled)
    prediction = KL.Dense(1, activation='sigmoid')(regularised)

    return Model(image_input, prediction)

def first_training():
    '''
    Train a freshly built model and checkpoint it whenever the monitored
    validation accuracy improves on the best value seen so far.

    Uses the module-level ``train_generator`` / ``valid_generator`` and runs
    for ``EPOCHS`` epochs. The learning rate is multiplied by 0.5 after the
    monitored value has not improved for 2 epochs (lower bound 1e-6).

    Returns:
        (history, model): the object returned by ``fit_generator`` and the
        trained model.
    '''
    model = build_model()

    model.compile(optimizer=Adam(lr=0.0001, decay=0.00001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # NOTE(review): 'val_acc' is the log key this notebook also reads in
    # analyse_results; some Keras versions report 'val_accuracy' instead --
    # confirm against the installed version.
    reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.5, patience=2,
                                  verbose=1, mode='max', min_lr=0.000001)

    # Write the checkpoint to `model_path` rather than a second hard-coded
    # file name, so the `saved_model` check and second_training() always look
    # at the file produced here.
    checkpoint = ModelCheckpoint(model_path, monitor='val_acc', verbose=1,
                                 save_best_only=True, mode='max')

    history = model.fit_generator(train_generator,
                                  steps_per_epoch=train_generator.n // train_generator.batch_size,
                                  validation_data=valid_generator,
                                  validation_steps=valid_generator.n // valid_generator.batch_size,
                                  epochs=EPOCHS,
                                  callbacks=[checkpoint, reduce_lr])

    return history, model

def second_training():
    '''
    Fine-tune the checkpointed model with a very small learning rate.

    Loads the model stored at ``model_path``, recompiles it with Adam
    (lr=1e-6) and runs 10 epochs on ``train_generator`` without any
    validation data.

    Returns:
        (history, model): the object returned by ``fit_generator`` and the
        fine-tuned model.
    '''
    tuned_model = load_model(model_path)

    fine_tune_optimizer = Adam(lr=0.000001, decay=0.00001)
    tuned_model.compile(optimizer=fine_tune_optimizer,
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

    batches_per_epoch = train_generator.n // train_generator.batch_size
    tuning_history = tuned_model.fit_generator(train_generator,
                                               steps_per_epoch=batches_per_epoch,
                                               epochs=10)

    return tuning_history, tuned_model

In [None]:
# Fine-tune the existing checkpoint when one was found, otherwise train a
# new model from scratch.
training_step = second_training if saved_model else first_training
history, model = training_step()

In [None]:
def analyse_results(epochs):
    """Plot the recorded training / validation curves in four side-by-side panels.

    The per-epoch values are read from the module-level ``history`` object
    (keys 'loss', 'acc', 'val_loss' and 'val_acc').

    Args:
        epochs: number of epochs that were run. It is used as the length of
            the x axis, so it has to match the number of recorded values.
    """
    # (history key, panel title). Previously every panel except 'loss' was
    # titled "Loss for <key>", which mislabelled the two accuracy panels.
    panels = [
        ('loss', "Total loss"),
        ('acc', "Training accuracy"),
        ('val_loss', "Validation loss"),
        ('val_acc', "Validation accuracy"),
    ]

    plt.style.use("ggplot")
    (fig, ax) = plt.subplots(1, len(panels), figsize=(30, 5))
    fig.subplots_adjust(hspace=0.1, wspace=0.3)

    epoch_axis = np.arange(0, epochs)
    for (i, (metric, title)) in enumerate(panels):
        ax[i].set_title(title)
        ax[i].set_xlabel("Epoch #")
        # 'val_loss' -> 'loss', 'val_acc' -> 'acc'
        ax[i].set_ylabel(metric.split('_')[-1])
        ax[i].plot(epoch_axis, history.history[metric], label=metric)
        ax[i].legend()

# Plot the curves only for a from-scratch run of more than one epoch.
if not saved_model and EPOCHS > 1:
    analyse_results(EPOCHS)

# **Predictions**

In [None]:
# The sample submission lists the test image ids; append the extension so
# they can be used as file names inside the test directory.
test_path = 'input/test/'
df_test = pd.read_csv('input/sample_submission.csv')
df_test["id"] = df_test["id"].map(lambda image_id: image_id + ".tif")

# No random augmentation here: only rescaling and per-sample normalisation.
test_datagen = ImageDataGenerator(
    rescale=1./255,
    samplewise_std_normalization=True)

# Unlabelled iterator in a fixed (unshuffled) order.
test_flow_options = dict(
    directory=test_path,
    x_col="id",
    y_col=None,
    target_size=(96, 96),
    color_mode="rgb",
    batch_size=64,
    class_mode=None,
    shuffle=False)
test_generator = test_datagen.flow_from_dataframe(dataframe=df_test, **test_flow_options)

In [None]:
# Restart the iterator so the predictions line up with test_generator.filenames.
test_generator.reset()
# One full pass over the test set: len(generator) is the number of batches
# per epoch. The previous `steps=test_generator.n/2` was a float and, with a
# batch size of 64, asked for far more batches than a single pass needs, so
# the iterator wrapped around and the same images were predicted repeatedly.
pred = model.predict_generator(test_generator,
                               verbose=1,
                               steps=len(test_generator)).ravel()

In [None]:
# ROC / AUC needs ground-truth labels. The test iterator has none
# (class_mode=None) and the previous code passed the iterator object itself
# to roc_curve, which cannot work. The score is therefore computed on the
# labelled validation split, read in a fixed order and without random
# augmentation so that predictions and labels line up.
eval_datagen = ImageDataGenerator(rescale=1./255,
                                  samplewise_std_normalization=True,
                                  validation_split=val)
eval_generator = eval_datagen.flow_from_dataframe(
    dataframe=df,
    directory=TRAIN_PATH,
    x_col="id",
    y_col="label",
    subset="validation",
    batch_size=64,
    shuffle=False,
    class_mode="binary",
    target_size=(96, 96))

if eval_generator.n == 0:
    # Happens when val == 0 (fine-tuning run): no image was held out.
    print('AUC score : not available (no validation images)')
else:
    test_labels = np.asarray(eval_generator.classes)
    # One pass over the validation images (len(generator) batches).
    y_preds = model.predict_generator(eval_generator,
                                      verbose=1,
                                      steps=len(eval_generator)).ravel()
    # Hard 0/1 decisions, kept for inspection only.
    y_pred_keras = y_preds.round()
    # roc_curve is given the raw scores: rounding them first would leave a
    # single operating point instead of a curve.
    fpr_keras, tpr_keras, thresholds_keras = roc_curve(test_labels, y_preds)
    auc_keras = auc(fpr_keras, tpr_keras)

    print('AUC score :', auc_keras)

# **CSV submission**
Predictions from the test generator are not necessarily in the same order as the submission file, so they are looked up by file name and rearranged into the `label` list before being passed to the submission data frame. The final result contains raw predictions without any classification threshold, because the website uses the ROC curve metric to compute the score.

In [None]:
# File name -> predicted score, as produced by the test iterator.
results = {file_name: score for file_name, score in zip(test_generator.filenames, pred)}

# Read the scores back in the row order of the submission frame.
label = [results[file_name] for file_name in df_test["id"]]

# Drop the 4-character '.tif' suffix again so the ids match the submission format.
df_test["id"] = df_test["id"].map(lambda file_name: file_name[:-4])

In [None]:
# Two-column submission table (id, raw prediction), written without the index.
submission_columns = dict(id=df_test["id"], label=label)
submission = pd.DataFrame(submission_columns)
submission.to_csv("resnet_submission.csv", index=False)
submission.head()

#

# Model MobilenetV2

In [None]:
# Show the entries of the 'input' directory.
input_entries = os.listdir("input")
print(input_entries)

In [None]:
def get_id_from_file_path(file_path):
    """Return the image id of a path: its last component with every '.tif' removed."""
    file_name = file_path.rsplit(os.path.sep, 1)[-1]
    return file_name.replace('.tif', '')

def chunker(seq, size):
    """Return a lazy iterator over consecutive slices of ``seq`` holding at most ``size`` items."""
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

def data_gen(list_files, id_label_map, batch_size, augment=False):
    """Endlessly yield (images, labels) batches, reshuffling the files on every pass.

    Args:
        list_files: paths of the image files to iterate over. The caller's
            list is left untouched.
        id_label_map: mapping from image id (as returned by
            ``get_id_from_file_path``) to its label.
        batch_size: maximum number of images per batch; the last batch of a
            pass may be smaller.
        augment: when True, a random transform is applied to every image
            before preprocessing.

    Yields:
        (X, Y): ``np.array`` of preprocessed images and ``np.array`` of the
        matching labels.
    """
    keras_gen = ImageDataGenerator(
                    rotation_range=10,
                    width_shift_range=0.1,
                    height_shift_range=0.1,
                    horizontal_flip=True,
                    vertical_flip=True,
                    zoom_range=0.2,
                    shear_range=5)
    # Shuffle a private copy: shuffling `list_files` itself silently
    # reordered the list owned by the caller.
    files = list(list_files)
    while True:
        shuffle(files)
        for batch in chunker(files, batch_size):
            X = [cv2.imread(x) for x in batch]
            Y = [id_label_map[get_id_from_file_path(x)] for x in batch]
            if augment:
                X = [keras_gen.random_transform(x) for x in X]
            X = [preprocess_input(x.astype(np.float32)) for x in X]

            yield np.array(X), np.array(Y)

# Build Model MobileNetV2

In [None]:
def create_model():
    """Build, compile and summarise the MobileNetV2-based binary classifier.

    The MobileNetV2 feature map of a (96, 96, 3) input is reduced in three
    ways (global max pooling, global average pooling, flatten); the results
    are concatenated, passed through dropout (rate 0.5) and fed to a single
    sigmoid unit. The model is compiled with Adam (1e-4) and binary
    cross-entropy, and its summary is printed.

    Returns:
        The compiled keras ``Model``.
    """
    input_shape = (96, 96, 3)
    inputs = Input(input_shape)
    backbone = MobileNetV2(include_top=False, input_shape=input_shape)  # , weights=None
    feature_map = backbone(inputs)

    reduced = [
        GlobalMaxPooling2D()(feature_map),
        GlobalAveragePooling2D()(feature_map),
        Flatten()(feature_map),
    ]
    merged = Concatenate(axis=-1)(reduced)
    merged = Dropout(0.5)(merged)
    prediction = Dense(1, activation="sigmoid", name="3_")(merged)

    model = Model(inputs, prediction)
    model.compile(optimizer=Adam(0.0001), loss=binary_crossentropy, metrics=['acc'])
    model.summary()

    return model

# Training

In [None]:
# Labels keyed by image id, as looked up by data_gen.
df_train = pd.read_csv("input/train_labels.csv")
id_label_map = dict(zip(df_train.id.values, df_train.label.values))
df_train.head()

labeled_files = glob(r'input/train/*.tif')
test_files = glob(r'input/test/*.tif')

print("labeled_files size :", len(labeled_files))
print("test_files size :", len(test_files))

# Hold out 10% of the labelled files for validation (fixed seed).
train, val = train_test_split(labeled_files, test_size=0.1, random_state=101010)

model = create_model()

h5_path = "mobilenetv2.h5"
# The same checkpoint callback is shared by all training stages below.
checkpoint = ModelCheckpoint(h5_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')


def _fit_stage(stage_batch_size):
    """Run one 2-epoch training stage of ``model`` with the given batch size."""
    return model.fit_generator(
        data_gen(train, id_label_map, stage_batch_size, augment=True),
        validation_data=data_gen(val, id_label_map, stage_batch_size),
        epochs=2, verbose=1,
        callbacks=[checkpoint],
        steps_per_epoch=len(train) // stage_batch_size,
        validation_steps=len(val) // stage_batch_size)


# Stage 1: batch size 32.
batch_size = 32
history = _fit_stage(batch_size)

# Stage 2: batch size 64.
batch_size = 64
history = _fit_stage(batch_size)

# Stage 3: same batch size, recompiled with a 10x smaller learning rate.
model.compile(optimizer=Adam(0.00001), loss=binary_crossentropy, metrics=['acc'])
history = _fit_stage(batch_size)

# Reload the weights stored in the checkpoint file before predicting.
model.load_weights(h5_path)

preds = []
ids = []

In [None]:
# Test-time augmentation: each batch is predicted four times (as is and
# reversed along axis 1, along axes 1 and 2, and along axis 2) and the four
# predictions are combined with a geometric mean.
for batch in chunker(test_files, batch_size):
    ids_batch = [get_id_from_file_path(path) for path in batch]
    X = np.array([preprocess_input(cv2.imread(path).astype(np.float32)) for path in batch])

    views = (X, X[:, ::-1, :, :], X[:, ::-1, ::-1, :], X[:, :, ::-1, :])
    view_preds = [model.predict(view).ravel() for view in views]

    combined = view_preds[0]
    for other in view_preds[1:]:
        combined = combined * other
    preds_batch = (combined ** 0.25).tolist()

    preds += preds_batch
    ids += ids_batch

# Generate submission file

In [None]:
# Pair every test id with its combined prediction and write the submission file.
submission_columns = dict(id=ids, label=preds)
df = pd.DataFrame(submission_columns)
df.to_csv("mobilenetv2_submission.csv", index=False)
df.head()

#

# Model Xception

In [None]:
# Output files of the Xception / NASNetMobile experiment.
# Per-epoch training log, written by the CSVLogger callback.
# NOTE(review): this lives inside 'input/' next to the dataset -- confirm that
# directory is writable and that this is intended.
TRAINING_LOGS_FILE = "input/training_logs.csv"
# Not referenced by any cell visible in this notebook.
MODEL_SUMMARY_FILE = "model_summary.txt"
# Checkpoint written by the ModelCheckpoint callback.
# NOTE(review): file name is spelled "xecption", not "xception".
MODEL_FILE = "xecption.h5"
# Final predictions for the Kaggle test images.
KAGGLE_SUBMISSION_FILE = "xception_submission.csv"

In [None]:
# Build class-balanced 'training/<label>/' and 'validation/<label>/' folders
# for flow_from_directory.
input_dir = 'input/'
training_dir = input_dir + 'train/'
data_frame = pd.DataFrame({'path': glob('input/train/*.tif')})
# Take the id from the file name itself. The previous `x.split('/')[5]`
# assumed a fixed directory depth and raised IndexError for paths of the
# form 'input/train/<id>.tif'.
data_frame['id'] = data_frame.path.map(
    lambda x: os.path.splitext(os.path.basename(x))[0])
labels = pd.read_csv(input_dir + 'train_labels.csv')
data_frame = data_frame.merge(labels, on='id')

# Same number of images per class: 85000, or fewer when a class is smaller.
# The seed (same value as the MobileNetV2 split) keeps the selection stable
# between runs.
all_negatives = data_frame[data_frame.label == 0]
all_positives = data_frame[data_frame.label == 1]
samples_per_class = min(85000, len(all_negatives), len(all_positives))
negatives = all_negatives.sample(samples_per_class, random_state=101010)
positives = all_positives.sample(samples_per_class, random_state=101010)
data_frame = pd.concat([negatives, positives]).reset_index()
data_frame = data_frame[['path', 'id', 'label']]
# The images are no longer loaded into an 'image' column here: nothing read
# that column (the files are copied from disk below) and it kept every
# selected image in memory.

training_path = 'training'
validation_path = 'validation'

for folder in [training_path, validation_path]:
    for subfolder in ['0', '1']:
        os.makedirs(os.path.join(folder, subfolder), exist_ok=True)

# Seeded split: files that already exist are never removed, so an unseeded
# re-run could place the same image in both folders.
# NOTE(review): folders populated by an earlier, unseeded run should be
# cleared once before relying on this.
training, validation = train_test_split(data_frame,
                                        train_size=0.9,
                                        stratify=data_frame['label'],
                                        random_state=101010)

data_frame = data_frame.set_index('id')

# Loop variables are named so they do not shadow the imported `image`
# module or the earlier `label` list.
for split_frame, split_dir in [(training, training_path), (validation, validation_path)]:
    for image_id, image_label in zip(split_frame['id'].values, split_frame['label'].values):
        file_name = image_id + '.tif'
        destination = os.path.join(split_dir, str(image_label), file_name)
        if not os.path.exists(destination):
            source = os.path.join(training_dir, file_name)
            shutil.copyfile(source, destination)

# Data augmentation

In [None]:
# Augmentation settings for the training images: rescaling by 1/255 plus
# random flips, rotation, zoom, shifts, shear and channel shift.
augmentation_options = dict(
    rescale=1./255,
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=90,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.05,
    channel_shift_range=0.1)
training_data_generator = ImageDataGenerator(**augmentation_options)

# Data generation

In [None]:
# Settings shared by the three directory iterators.
directory_flow_options = dict(target_size=(96, 96),
                              batch_size=216,
                              class_mode='binary')

# Augmented training batches.
training_generator = training_data_generator.flow_from_directory(
    training_path, **directory_flow_options)

# Validation batches: rescaling only.
validation_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    validation_path, **directory_flow_options)

# Same validation folder again, but read in a fixed (unshuffled) order.
testing_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    validation_path, shuffle=False, **directory_flow_options)

# Build Model

In [None]:
# Two backbones without pre-trained weights (weights=None) share one input;
# their globally average-pooled features are concatenated, regularised with
# dropout and mapped to a single sigmoid output.
in_shape = (96, 96, 3)
inputs = Input(in_shape)
xception = Xception(include_top=False, weights=None, input_shape=in_shape)
nas_net = NASNetMobile(include_top=False, weights=None, input_shape=in_shape)

xception_features = GlobalAveragePooling2D()(xception(inputs))
nas_net_features = GlobalAveragePooling2D()(nas_net(inputs))

outputs = Concatenate(axis=-1)([xception_features, nas_net_features])
outputs = Dropout(0.5)(outputs)
outputs = Dense(1, activation='sigmoid')(outputs)

model = Model(inputs, outputs)
ensemble_optimizer = Adam(lr=0.0001, decay=0.00001)
model.compile(optimizer=ensemble_optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

#  Training

In [None]:
# Callbacks: live loss plot, checkpoint of the best 'val_acc' model, and a
# ';'-separated CSV log that is overwritten on every run.
training_callbacks = [
    PlotLossesKeras(),
    ModelCheckpoint(MODEL_FILE,
                    monitor='val_acc',
                    verbose=1,
                    save_best_only=True,
                    mode='max'),
    CSVLogger(TRAINING_LOGS_FILE,
              append=False,
              separator=';'),
]

# One full pass over each iterator per epoch.
history = model.fit_generator(training_generator,
                              steps_per_epoch=len(training_generator),
                              validation_data=validation_generator,
                              validation_steps=len(validation_generator),
                              epochs=10,
                              verbose=1,
                              callbacks=training_callbacks)

# Kaggle testing


In [None]:
# Predict the Kaggle test images in slices of 5000 files so that only one
# slice of images is held in memory at a time.
testing_files = glob('input/test/*.tif')
submission_parts = []
for index in range(0, len(testing_files), 5000):
    data_frame = pd.DataFrame({'path': testing_files[index:index + 5000]})
    # Take the id from the file name itself. The previous `x.split('/')[5]`
    # raised IndexError for paths of the form 'input/test/<id>.tif'.
    data_frame['id'] = data_frame.path.map(
        lambda x: os.path.splitext(os.path.basename(x))[0])
    images = np.stack([imread(path) for path in data_frame['path']], axis=0)
    # Rescale like the validation generator (1/255) and predict the whole
    # slice in one call instead of one predict() call per image.
    predictions = model.predict(images.astype(np.float32) / 255.0)[:, 0]
    data_frame['label'] = predictions
    submission_parts.append(data_frame[["id", "label"]])

# Concatenate once at the end rather than growing a frame inside the loop;
# with no test files an empty table with the right columns is written.
if submission_parts:
    submission = pd.concat(submission_parts, ignore_index=True)
else:
    submission = pd.DataFrame(columns=["id", "label"])
submission.to_csv(KAGGLE_SUBMISSION_FILE, index=False, header=True)