In [None]:
import cv2
import os
import pandas as pd
import numpy as np
from glob import glob
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras import layers, optimizers, models
from imgaug import augmenters as iaa
import imgaug as ia

# The dataset is available on Kaggle: [Histopathologic Cancer Detection](https://www.kaggle.com/c/histopathologic-cancer-detection/data)

In [None]:
# Read the training label table; later cells rely on its `id` and `label` columns.
data = pd.read_csv(
    filepath_or_buffer='../input/histopathologic-cancer-detection/train_labels.csv',
)

In [None]:
# Remove two specific image ids from the label table in a single mask.
# NOTE(review): presumably these are known-bad images in the dataset - confirm.
data = data[~data['id'].isin([
    'dd6dfed324f9fcb6f93f46f32fc800f2ec196be2',
    '9369c7278ec8bcc6c880d99194de09fc2bd4efbe',
])]

In [None]:
# Build a class-balanced subset: 85,000 rows per label, each drawn with a fixed seed.
data_0 = data[data['label'] == 0].sample(85000, random_state=42)
data_1 = data[data['label'] == 1].sample(85000, random_state=42)
data = pd.concat([data_0, data_1], axis=0).reset_index(drop=True)
# Seed the shuffle too: the row order produced here is the input to
# train_test_split, so an unseeded shuffle made the train/validation split
# differ between runs even though every other step used random_state=42.
data = shuffle(data, random_state=42)

In [None]:
# Lookup table from image id to its label, built from the `id` and `label` columns.
id_label_map = dict(zip(data['id'].values, data['label'].values))

In [None]:
# Hold out 10% of the rows for validation, stratified on `label` so both
# splits keep the same class proportions; the seed is fixed at 42.
data_train, data_valid = train_test_split(
    data,
    test_size=0.1,
    stratify=data['label'],
    random_state=42,
)
data_train

In [None]:
def get_batch(data_set, batch_size):
    """Split ``data_set`` into consecutive slices of at most ``batch_size`` items.

    Args:
        data_set: Any sliceable sequence that supports ``len()``.
        batch_size: Step between slice starts and maximum slice length.

    Returns:
        A list of slices in original order; the final slice may be shorter
        than ``batch_size``. An empty ``data_set`` gives an empty list.
    """
    batches = []
    for start in range(0, len(data_set), batch_size):
        stop = start + batch_size
        batches.append(data_set[start:stop])
    return batches

In [None]:
def get_seq():
    """Build the imgaug pipeline used to augment training images.

    The pipeline holds three augmenters that imgaug applies in random order
    (``random_order=True``): a left-right flip with probability 0.5, an
    up-down flip with probability 0.2, and an affine transform that is
    applied with probability 0.5.

    Returns:
        The assembled ``iaa.Sequential`` augmenter.
    """
    horizontal_flip = iaa.Fliplr(0.5)
    vertical_flip = iaa.Flipud(0.2)
    # Mild geometric jitter: per-axis scale and shift of up to 10%, rotation
    # within +/-10 and shear within +/-5; mode=ia.ALL leaves the choice of
    # border fill mode to imgaug.
    affine = iaa.Affine(
        scale={'x': (0.9, 1.1), 'y': (0.9, 1.1)},
        translate_percent={'x': (-0.1, 0.1), 'y': (-0.1, 0.1)},
        rotate=(-10, 10),
        shear=(-5, 5),
        mode=ia.ALL,
    )
    occasional_affine = iaa.Sometimes(0.5, affine)
    return iaa.Sequential(
        [horizontal_flip, vertical_flip, occasional_affine],
        random_order=True,
    )

In [None]:
def data_gen(data_set, batch_size, id_label_map, img_path, augment=False):
    """Yield ``(images, labels)`` batches forever, cycling over ``data_set``.

    Args:
        data_set: Sliceable sequence of image ids (file names without the
            ``.tif`` extension).
        batch_size: Maximum number of ids per batch; the last batch of each
            pass over ``data_set`` may be smaller.
        id_label_map: Mapping from image id to its label.
        img_path: Directory that contains the ``<id>.tif`` files.
        augment: When true, each batch is passed through the ``get_seq()``
            augmentation pipeline before preprocessing.

    Yields:
        A tuple ``(X, Y)`` of NumPy arrays: the preprocessed images of one
        batch and the matching labels.

    Raises:
        ValueError: If ``data_set`` is empty.
        OSError: If an image file cannot be read by ``cv2.imread``.
        KeyError: If an id is missing from ``id_label_map``.
    """
    # Without this guard an empty data_set makes the ``while True`` loop spin
    # forever without ever yielding, which hangs the consumer.
    if len(data_set) == 0:
        raise ValueError('data_set is empty; there are no batches to generate')

    # Only build the augmentation pipeline when it is actually going to be used.
    seq = get_seq() if augment else None

    while True:
        for data_batch in get_batch(data_set, batch_size):
            X = []
            for image_id in data_batch:
                file_name = os.path.join(img_path, image_id + '.tif')
                # NOTE(review): cv2.imread returns channels in BGR order and the
                # image is handed to preprocess_input unchanged - confirm any
                # inference code reads images the same way.
                image = cv2.imread(file_name)
                # cv2.imread signals failure by returning None instead of
                # raising; fail here with the offending path rather than later
                # with an unrelated-looking error.
                if image is None:
                    raise OSError('cv2.imread could not read image: ' + file_name)
                X.append(image)
            Y = [id_label_map[x] for x in data_batch]
            if augment:
                X = seq.augment_images(X)
            X = [preprocess_input(x) for x in X]
            yield np.array(X), np.array(Y)

In [None]:
# Plain CNN for 96x96x3 inputs: three stages of (3 x Conv2D -> MaxPooling2D ->
# Dropout) with 32, 64 and 128 filters, followed by a dense classifier head.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(96, 96, 3)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.3),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.3),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.3),
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.3),
    # One sigmoid unit = predicted probability of label 1. The data generator
    # yields a single 0/1 label per image, so binary_crossentropy needs one
    # output per sample; the former Dense(2) head did not match that label
    # shape. Predictions therefore have shape (batch, 1).
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Directory holding the training images; data_gen joins it with '<id>.tif'.
img_path = '../input/histopathologic-cancer-detection/train'

In [None]:
batch_size = 128
train_len = data_train.shape[0]
valid_len = data_valid.shape[0]
# Model.fit accepts Python generators directly; fit_generator is the deprecated
# spelling of the same call.
# Step counts use ceiling division because data_gen also yields the final,
# shorter batch of each pass. With floor division every epoch stopped one
# batch early, so the generators drifted and each validation run scored a
# different subset of the validation data.
history = model.fit(
    data_gen(list(data_train['id']), batch_size, id_label_map, img_path, augment=True),
    epochs=30,
    steps_per_epoch=(train_len + batch_size - 1) // batch_size,
    validation_data=data_gen(list(data_valid['id']), batch_size, id_label_map, img_path, augment=False),
    validation_steps=(valid_len + batch_size - 1) // batch_size,
)

In [None]:
# Paths of all .tif files in the test directory, plus two empty lists
# (`preds`, `ids`) set up as accumulators for later use.
test_file = glob('../input/histopathologic-cancer-detection/test/*.tif')
preds, ids = [], []

In [None]:
# Write the model to 'cancer_detection.h5' (path is relative to the working directory).
model.save('cancer_detection.h5')