In [1]:
import pandas as pd
import os
import shutil
from glob import glob
from sklearn.model_selection import train_test_split
from skimage.io import imread
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D, Average, Input, Concatenate, GlobalMaxPooling2D
from keras.applications.xception import Xception
from keras.applications.nasnet import NASNetMobile
from keras.models import Model
from keras.optimizers import Adam
from keras.utils.vis_utils import plot_model
from livelossplot import PlotLossesKeras
from keras.callbacks import CSVLogger, ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
import numpy as np 

SAMPLE_COUNT=85000
TRAINING_RATIO=0.9
IMAGE_SIZE = 224
BATCH_SIZE = 18
MODEL_PLOT_FILE = "model_plot.png"
EPOCHS = 10
VERBOSITY = 1
MODEL_FILE = "model.h5"
TRAINING_LOGS_FILE = "training_logs.csv"
TRAINING_PLOT_FILE = "training.png"
VALIDATION_PLOT_FILE = "validation.png"
ROC_PLOT_FILE = "roc.png"
TESTING_BATCH_SIZE = 5000
KAGGLE_SUBMISSION_FILE = "kaggle_submission.csv"

def datasetup(SAMPLE_COUNT, TRAINING_RATIO):    
    training_dir = 'static\\train\\'
    data_frame = pd.DataFrame({'path': glob(os.path.join(training_dir,'*.tif'))})
    data_frame['id'] = data_frame.path.map(lambda x: x.split('\\')[2].split('.')[0]) 
    labels = pd.read_csv('traindata\\train_labels.csv')
    data_frame = data_frame.merge(labels, on = 'id')
    negatives = data_frame[data_frame.label == 0].sample(SAMPLE_COUNT)
    positives = data_frame[data_frame.label == 1].sample(SAMPLE_COUNT)
    data_frame = pd.concat([negatives, positives]).reset_index()
    data_frame = data_frame[['path', 'id', 'label']]
    data_frame['image'] = data_frame['path'].map(imread)
    #small_data_frame=data_frame[0:10]
    #small_frame=data_frame[85000:85010]
    #smalldata_frame=pd.concat([small_data_frame, small_frame]).reset_index()
    #smalldata_frame = smalldata_frame[['path', 'id', 'label']]
    #smalldata_frame['image'] = smalldata_frame['path'].map(imread)
    training_path = '../training'
    validation_path = '../validation'
    for folder in [training_path, validation_path]:
        for subfolder in ['0', '1']:
            path = os.path.join(folder, subfolder)
            os.makedirs(path, exist_ok=True)
    training, validation = train_test_split(data_frame, train_size=TRAINING_RATIO, stratify=data_frame['label'])
    data_frame.set_index('id', inplace=True)
    for images_and_path in [(training, training_path), (validation, validation_path)]:
        images = images_and_path[0]
        path = images_and_path[1]
        for image in images['id'].values:
            file_name = image + '.tif'
            label = str(data_frame.loc[image,'label'])
            destination = os.path.join(path, label, file_name)
            if not os.path.exists(destination):
                source = os.path.join('static/train', file_name)
                shutil.copyfile(source, destination)

    return (training_path, validation_path)


Using TensorFlow backend.


In [2]:
training_path, validation_path=datasetup(SAMPLE_COUNT, TRAINING_RATIO)



In [3]:
training_data_generator = ImageDataGenerator(rescale=1./255,
                                            horizontal_flip=True,
                                            vertical_flip=True,
                                            rotation_range=180,
                                            zoom_range=0.4, 
                                            width_shift_range=0.3,
                                            height_shift_range=0.3,
                                            shear_range=0.3,
                                            channel_shift_range=0.3)


In [4]:
def generation(IMAGE_SIZE, BATCH_SIZE, training_data_generator,training_path, validation_path):
    training_generator = training_data_generator.flow_from_directory(training_path,
                                                                    target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                                                    batch_size=BATCH_SIZE,
                                                                    class_mode='binary')
    validation_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(validation_path,
                                                                                target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                                                                batch_size=BATCH_SIZE,
                                                                                class_mode='binary')
    testing_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(validation_path,
                                                                            target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                                                            batch_size=BATCH_SIZE,
                                                                            class_mode='binary',
                                                                            shuffle=False)
    return (training_generator, validation_generator, testing_generator)   


In [5]:
training_generator, validation_generator, testing_generator=generation(IMAGE_SIZE, BATCH_SIZE, training_data_generator,training_path, validation_path)


Found 219745 images belonging to 2 classes.
Found 94055 images belonging to 2 classes.
Found 94055 images belonging to 2 classes.


In [6]:
def cancermodel(IMAGE_SIZE, MODEL_PLOT_FILE):     
    input_shape = (IMAGE_SIZE, IMAGE_SIZE, 3)
    inputs = Input(input_shape)
    xception = Xception(include_top=False, input_shape=input_shape)(inputs)
    nas_net = NASNetMobile(include_top=False, input_shape=input_shape)(inputs)
    outputs = Concatenate(axis=-1)([GlobalAveragePooling2D()(xception), GlobalAveragePooling2D()(nas_net)])
    outputs = Dropout(0.5)(outputs)
    outputs = Dense(1, activation='sigmoid')(outputs)
    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(lr=0.0001, decay=0.00001),
                loss='binary_crossentropy',
                metrics=['accuracy'])
    model.summary()
    #plot_model(model,
    #        to_file=MODEL_PLOT_FILE,
    #        show_shapes=True,
    #        show_layer_names=True)     
    return model


In [7]:
model=cancermodel(IMAGE_SIZE, MODEL_PLOT_FILE)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
xception (Model)                (None, 7, 7, 2048)   20861480    input_1[0][0]                    
__________________________________________________________________________________________________
NASNet (Model)                  (None, 7, 7, 1056)   4269716     input_1[0][0]                    
__________________________________________________________________________________________________
global_average_pooling2d_1 (Glo (None, 2048)         0           xception[1][0]                   
__________________________________________________________________________________________________
global_ave

In [8]:
def training(EPOCHS, VERBOSITY, MODEL_FILE, TRAINING_LOGS_FILE, training_generator, validation_generator, model):       
    history = model.fit_generator(training_generator,
                                steps_per_epoch=len(training_generator), 
                                validation_data=validation_generator,
                                validation_steps=len(validation_generator),
                                epochs=EPOCHS,
                                verbose=VERBOSITY,
                                callbacks=[PlotLossesKeras(),
                                            ModelCheckpoint(MODEL_FILE,
                                                            monitor='val_acc',
                                                            verbose=VERBOSITY,
                                                            save_best_only=True,
                                                            mode='max'),
                                            CSVLogger(TRAINING_LOGS_FILE,
                                                    append=False,
                                                    separator=';')])
    return history


In [None]:
history=training(EPOCHS, VERBOSITY, MODEL_FILE, TRAINING_LOGS_FILE, training_generator, validation_generator, model)


Epoch 1/10
  27/6868 [..............................] - ETA: 1045:44:46 - loss: 0.6088 - acc: 0.6562