In [None]:
"""
Trains model

Usage: python train.py [-h]
"""
from argparse import ArgumentParser
from multiprocessing import cpu_count
from os import path, environ
import pandas as pd
import numpy as np
from keras.utils import plot_model
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TerminateOnNaN
from keras.preprocessing.image import ImageDataGenerator
from utils import (TEST_DATA_PATH, TRAIN_DATA_PATH, VALIDATION_DATA_PATH,
                   MODELS_PATH, CLASSES, try_makedirs, plot_loss_acc,
                   plot_confusion_matrix)
from sklearn.metrics import confusion_matrix
from models import get_model
from config import config


def init_argparse():
    """
    Builds the command-line parser of the training script

    Options:
    - -m/--model - architecture name (default 'vgg16')
    - --gpus - comma-separated GPU device numbers as one string (default '0')

    Both options use nargs='?'; `const` mirrors `default` so that passing the
    flag without a value yields the default instead of None.

    Returns parser
    """
    parser = ArgumentParser(description='Trains image classifier')
    parser.add_argument(
        '-m',
        '--model',
        nargs='?',
        help='model architecture (vgg16, vgg19, skin_rec, lung_rec, alex_net, '
        'incresnet, incv3, xcept, resnet50, densenet, nasnet)',
        const='vgg16',
        default='vgg16',
        type=str)
    parser.add_argument(
        '--gpus',
        nargs='?',
        help="A list of GPU device numbers ('1', '1,2,5')",
        const='0',
        # argparse applies `type` only to string defaults, so an int default
        # would reach the caller unconverted; keep the default a string.
        default='0',
        type=str)
    return parser


def train_and_predict(model_type, gpus):
    """
    Trains model, saves it with its plots and makes predictions files

    Params:
    - model_type - key of `config`, also passed to `get_model` as architecture
    - gpus - GPU specification forwarded to `get_model`

    All artifacts are written into a directory under MODELS_PATH named
    '<model_type>_<last val loss>_<last val acc>': model.png, the plots made
    by `plot_loss_acc`, model.h5, the validation confusion matrix,
    predictions.csv and predictions_extd.csv.
    """
    flow_options = config[model_type]['flow_generator']

    # creating data generators
    # NOTE(review): class_mode='binary' yields a single 0/1 label per image;
    # confirm this matches the output layer of the chosen architecture.
    train_datagen = ImageDataGenerator(rescale=1. / 255, horizontal_flip=True)
    test_datagen = ImageDataGenerator(rescale=1. / 255)
    train_generator = train_datagen.flow_from_directory(
        TRAIN_DATA_PATH,
        class_mode='binary',
        seed=171717,
        **flow_options)
    validation_generator = test_datagen.flow_from_directory(
        VALIDATION_DATA_PATH,
        class_mode='binary',
        # flow_from_directory shuffles by default; the confusion matrix below
        # pairs predictions with `validation_generator.classes`, which is only
        # valid when batches are produced in file order.
        shuffle=False,
        **flow_options)
    test_generator = test_datagen.flow_from_directory(
        TEST_DATA_PATH,
        class_mode=None,
        classes=CLASSES,
        shuffle=False,
        **flow_options)

    # loading the model
    parallel_model, model = get_model(model=model_type, gpus=gpus)
    print('Training model')
    # summary() prints by itself and returns None, so it is not wrapped in print
    model.summary()
    history = parallel_model.fit_generator(
        train_generator,
        validation_data=validation_generator,
        callbacks=[
            EarlyStopping(monitor='val_loss', min_delta=0, patience=5),
            ReduceLROnPlateau(
                monitor='val_loss', factor=0.2, patience=3, min_lr=0.000001),
            TerminateOnNaN()
        ],
        max_queue_size=100,
        use_multiprocessing=True,
        workers=cpu_count(),
        **config[model_type]['fit_generator'])

    # The output directory name carries the metrics of the last epoch;
    # training metrics are used when no validation metrics were recorded.
    metrics = history.history
    final_loss = (metrics['val_loss'] if 'val_loss' in metrics else
                  metrics['loss'])[-1]
    final_acc = (metrics['val_acc'] if 'val_acc' in metrics else
                 metrics['acc'])[-1]
    model_path = path.join(MODELS_PATH, '{}_{:.4f}_{:.4f}'.format(
        model_type, final_loss, final_acc))
    try_makedirs(model_path)
    plot_model(model, path.join(model_path, 'model.png'), show_shapes=True)
    plot_loss_acc(history, model_path)

    # Saving architecture + weights + optimizer state
    print('Saving model')
    model.save(path.join(model_path, 'model.h5'))

    # Building confusion matrix for validation data
    print("Building confusion matrices")
    val_preds = model.predict_generator(
        validation_generator,
        max_queue_size=100,
        use_multiprocessing=True,
        workers=cpu_count())
    plot_confusion_matrix(
        confusion_matrix(
            list(validation_generator.classes), np.argmax(val_preds, axis=1)),
        CLASSES, model_path)

    print('Generating predictions')
    predictions = model.predict_generator(
        test_generator,
        max_queue_size=100,
        use_multiprocessing=True,
        workers=cpu_count())
    # One class index per sample: without `axis=1` argmax returns a single
    # index into the flattened array instead of a per-row result.
    # NOTE(review): for a model with a single output column this is always 0;
    # confirm how such models should be thresholded.
    pred_classes = np.argmax(predictions, axis=1)
    # Drops the first 5 and the last 4 characters of every file name
    # (presumably the 'test/' directory prefix and a '.jpg'-like extension).
    ids = [filename[5:-4] for filename in test_generator.filenames]
    # Score of the predicted class for every sample
    proba = predictions[np.arange(len(predictions)), pred_classes]
    # Generating predictions.csv for Kaggle
    pd.DataFrame({
        'id': ids,
        'predicted': pred_classes,
    }).sort_values(by='id').to_csv(
        path.join(model_path, 'predictions.csv'), index=False)
    # Generating predictions.csv with some additional data for post-processing
    pd.DataFrame({
        'id': ids,
        'predicted': pred_classes,
        'proba': proba
    }).sort_values(by='id').to_csv(
        path.join(model_path, 'predictions_extd.csv'), index=False)


def main():
    """
    Script entry point

    Parses the command line, exports the requested GPU list through the
    CUDA_VISIBLE_DEVICES environment variable and starts training.
    """
    options = init_argparse().parse_args()
    gpu_spec = options.gpus

    environ['CUDA_VISIBLE_DEVICES'] = gpu_spec

    train_and_predict(options.model, gpu_spec)


if __name__ == '__main__':
    main()


In [None]:
"""
Some useful utilities
"""

from itertools import product
from os import path, makedirs
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib
# generates images without having a window appear
matplotlib.use('Agg')
import matplotlib.pylab as plt
# Absolute path of the directory holding this utils.py file.
# It is considered as the project root path.
CWD = path.dirname(path.realpath(__file__))

# Raw data must live here, split into test / train / validation folders.
DATA_PATH = path.join(CWD, 'data')
TEST_DATA_PATH, TRAIN_DATA_PATH, VALIDATION_DATA_PATH = (
    path.join(DATA_PATH, split) for split in ('test', 'train', 'validation'))

LOG_PATH = path.join(CWD, 'log')

# Trained models must be stored here.
MODELS_PATH = path.join(CWD, 'models')

# Pickled objects must be stored here.
PICKLES_PATH = path.join(CWD, 'pickles')

# Class names, in the order used for class indices.
CLASSES = [str(label) for label in ('irreg', 'norma')]


def try_makedirs(name):
    """
    Best-effort creation of directory `name` (with missing parents)

    Nothing happens when the path already exists. Any OSError raised while
    creating it is deliberately ignored, so callers never see a failure.
    """
    if path.exists(name):
        return
    try:
        # Strange, but it may raise winerror 123
        makedirs(name)
    except OSError:
        pass


def plot_loss_acc(history, model_path):
    """
    Saves accuracy.png and loss.png with the training curves into model_path

    `history.history` must provide the 'acc', 'val_acc', 'loss' and
    'val_loss' series; each figure shows the training series and its
    validation counterpart (labelled 'train' and 'test').
    """
    curves = (
        ('acc', 'val_acc', 'model accuracy', 'accuracy', 'accuracy.png'),
        ('loss', 'val_loss', 'model loss', 'loss', 'loss.png'),
    )
    for train_key, val_key, title, y_label, file_name in curves:
        # start every plot from an empty current figure
        plt.gcf().clear()
        plt.plot(history.history[train_key])
        plt.plot(history.history[val_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.savefig(path.join(model_path, file_name))
    # leave a clean figure behind for the next caller
    plt.gcf().clear()


def plot_confusion_matrix(cm, classes, model_path, title='Confusion matrix'):
    """
    Saves a row-normalized confusion matrix plot as confusion_matrix.pdf

    Params:
    - cm - matrix of counts; rows are plotted as true labels, columns as
      predicted labels
    - classes - class names used as tick labels on both axes
    - model_path - directory the PDF is written to
    - title - plot title

    Every row is always divided by its sum, so each cell shows the fraction
    of the samples of that true class.
    """
    plt.gcf().clear()

    counts = np.asarray(cm, dtype='float')
    row_totals = counts.sum(axis=1)[:, np.newaxis]
    # A class without any true sample has an all-zero row; dividing by its
    # zero total would fill the row with NaN, so divide by 1 and keep zeros.
    row_totals[row_totals == 0] = 1
    normalized = counts / row_totals

    plt.imshow(normalized, interpolation='none')
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=2)
    plt.yticks(tick_marks, classes, fontsize=2)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.grid(False)
    plt.savefig(path.join(model_path, 'confusion_matrix.pdf'), format='pdf')

    plt.gcf().clear()


In [None]:
import keras.backend.tensorflow_backend as K
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.inception_v3 import InceptionV3
from keras.applications.xception import Xception
from keras.applications.resnet50 import ResNet50
from keras.models import Model, Sequential

from keras.layers import Flatten, Dense, Dropout, GlobalAveragePooling2D, Activation, Conv2D, MaxPooling2D, Convolution2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import RMSprop,SGD, Adagrad, Adam
from keras.utils import multi_gpu_model
from utils import CLASSES


def get_gpus(gpus):
    """
    Normalizes a GPU specification into a list of ints

    Params:
    - gpus - a comma-separated string ('0', '1,2,5'), a single int,
      or an iterable of ints / numeric strings

    Raises ValueError when a token is not an integer.
    """
    if isinstance(gpus, str):
        return [int(token) for token in gpus.split(',')]
    if isinstance(gpus, int):
        # `get_model` declares an int default (gpus=1), which has no .split
        return [gpus]
    return [int(device) for device in gpus]


def get_model(model, gpus=1, **kwargs):
    """
    Returns compiled keras parallel model ready for training
    and base model that must be used for saving weights

    Params:
    - model - model type, one of the names listed in the table below
    - gpus - GPU specification, forwarded unchanged to the model builder
    - kwargs - accepted but not used

    Raises ValueError for an unknown model type.
    """
    # (name, builder) pairs; the lambdas defer the call until a name matches
    dispatch = (
        ('vgg16', lambda: vgg(gpus, model)),
        ('vgg19', lambda: vgg(gpus, model)),
        ('skin_rec', lambda: skin_rec(gpus, model)),
        ('lung_rec', lambda: lung_rec(gpus, model)),
        ('alex_net', lambda: alex_net(gpus, model)),
        ('incresnet', lambda: inception_res_net_v2(gpus)),
        ('incv3', lambda: inception_v3(gpus)),
        ('xcept', lambda: xception(gpus)),
        ('resnet50', lambda: resnet50(gpus)),
        ('densenet', lambda: dense_net(gpus)),
        ('nasnet', lambda: nasnet(gpus)),
    )
    for name, build in dispatch:
        if model == name:
            return build()
    raise ValueError('Wrong model value!')

def alex_net(gpus, model):
    """
    Returns compiled small convolutional keras model ready for training

    The network has two 3x3 convolution + 2x2 max-pooling stages followed by
    two dense layers with dropout and a softmax over CLASSES; it expects
    141x141 RGB input. No layers are frozen.

    Params:
    - gpus - GPU specification forwarded to `_compile`
    - model - not used; accepted because `get_model` passes the model name
    """
    frozen = 0

    net = Sequential()

    # Convolutional stage 1 (Keras 2 signature: filters, kernel_size)
    net.add(Conv2D(32, (3, 3), input_shape=(141, 141, 3), activation='relu'))
    net.add(MaxPooling2D(pool_size=(2, 2)))

    # Convolutional stage 2
    net.add(Conv2D(32, (3, 3), activation='relu'))
    net.add(MaxPooling2D(pool_size=(2, 2)))

    # Fully connected part
    net.add(Flatten())
    net.add(Dense(128, activation='relu'))
    net.add(Dropout(0.5))

    net.add(Dense(64, activation='relu'))
    net.add(Dropout(0.5))

    # Classification layer
    output = Dense(
        len(CLASSES), kernel_initializer='glorot_normal',
        activation='softmax')(net.output)

    return _compile(gpus, net.input, output, frozen)

def vgg(gpus, model):
    """
    Returns compiled keras VGG16 or VGG19 model ready for training

    Params:
    - gpus - GPU specification forwarded to `_compile`
    - model - 'vgg16' (ImageNet weights) or 'vgg19' (no pretrained weights)

    The convolutional base takes 224x224 RGB input and is topped with
    Flatten -> Dense(512, relu) -> Dropout(0.5) -> softmax over CLASSES.
    Raises ValueError for any other model name.
    """
    # Result is not needed here; the call only rejects a GPU specification
    # that `get_gpus` cannot parse before any network is built.
    get_gpus(gpus)

    if model == 'vgg16':
        backbone_cls, pretrained = VGG16, 'imagenet'
    elif model == 'vgg19':
        backbone_cls, pretrained = VGG19, None
    else:
        raise ValueError('Wrong VGG model type!')
    base_model = backbone_cls(
        weights=pretrained, include_top=False, input_shape=(224, 224, 3))

    features = Flatten(name='flatten')(base_model.output)
    hidden = Dense(512, activation='relu', name='fc1')(features)
    hidden = Dropout(0.5)(hidden)
    output = Dense(len(CLASSES), activation='softmax')(hidden)

    # NOTE(review): no layers are frozen (0 is passed). Freeze depths of
    # 14 (vgg16) / 16 (vgg19) were noted for this builder but never used;
    # confirm the intent.
    return _compile(gpus, base_model.input, output, 0)

def skin_rec(gpus, model):
    """
    Returns compiled keras CNN with a single sigmoid output

    Three stages of two 3x3 convolutions + 2x2 max-pooling (filter counts
    64, 60, 56, 52, 48, 44) on 141x141 RGB input, followed by
    Flatten -> Dense(128, relu) -> Dropout(0.5) -> Dense(1, sigmoid).

    Params:
    - gpus - GPU specification forwarded to `_compile`
    - model - not used; accepted because `get_model` passes the model name
    """
    base_filters = 64
    kernel = (3, 3)
    pool = (2, 2)
    # Result is not needed here; the call only rejects a GPU specification
    # that `get_gpus` cannot parse before any layer is built.
    get_gpus(gpus)

    # every convolution has 4 filters fewer than the previous one
    widths = [base_filters - 4 * step for step in range(6)]

    net = Sequential()
    net.add(Conv2D(widths[0], kernel_size=kernel, activation='relu',
                   input_shape=(141, 141, 3)))
    net.add(Conv2D(widths[1], kernel, activation='relu'))
    net.add(MaxPooling2D(pool_size=pool))
    for first, second in ((widths[2], widths[3]), (widths[4], widths[5])):
        net.add(Conv2D(first, kernel_size=kernel, activation='relu'))
        net.add(Conv2D(second, kernel_size=kernel, activation='relu'))
        net.add(MaxPooling2D(pool_size=pool))

    head = Flatten(name='flatten')(net.output)
    head = Dense(128, activation='relu', name='fc1')(head)
    head = Dropout(0.5)(head)
    output = Dense(1, activation='sigmoid')(head)

    # debugging output kept from the original implementation
    print('Model flattened out to ', net.output_shape)
    for inspected in (gpus, net.input, output):
        print(type(inspected))
    return _compile(gpus, net.input, output, 0)

def lung_rec(gpus, model):
    """
    Returns compiled keras CNN with two convolutional stages

    Conv(50, 11x11) and Conv(120, 3x3), each followed by 2x2 max-pooling, on
    141x141 RGB input; then Flatten -> Dense(10, relu) -> softmax over
    CLASSES.

    Params:
    - gpus - GPU specification forwarded to `_compile`
    - model - not used; accepted because `get_model` passes the model name
    """
    # Result is not needed here; the call only rejects a GPU specification
    # that `get_gpus` cannot parse before any layer is built.
    get_gpus(gpus)

    # (filters, kernel size, extra keyword arguments) of each stage
    conv_stages = (
        (50, (11, 11), {'input_shape': (141, 141, 3)}),
        (120, (3, 3), {}),
    )
    net = Sequential()
    for filters, kernel, extra in conv_stages:
        net.add(Conv2D(
            filters, kernel_size=kernel, activation='relu', **extra))
        net.add(MaxPooling2D(pool_size=(2, 2)))

    head = Flatten(name='flatten')(net.output)
    head = Dense(10, activation='relu', name='fc1')(head)
    output = Dense(len(CLASSES), activation='softmax')(head)

    # debugging output kept from the original implementation
    print('Model flattened out to ', net.output_shape)
    for inspected in (gpus, net.input, output):
        print(type(inspected))
    return _compile(gpus, net.input, output, 0)



def inception_v3(gpus):
    """
    Returns compiled keras InceptionV3 model ready for training

    ImageNet-pretrained base on 141x141 RGB input, topped with global
    average pooling, Dense(1024, relu) and a softmax over CLASSES.
    The first 29 layers are frozen.
    """
    frozen_layers = 29
    backbone = InceptionV3(
        include_top=False, weights='imagenet', input_shape=(141, 141, 3))

    pooled = GlobalAveragePooling2D()(backbone.output)
    hidden = Dense(1024, activation='relu')(pooled)
    predictions = Dense(
        len(CLASSES), activation='softmax', name='predictions')(hidden)
    return _compile(gpus, backbone.input, predictions, frozen_layers)


def inception_res_net_v2(gpus):
    """
    Returns compiled keras InceptionResNetV2 model ready for training

    ImageNet-pretrained base on 299x299 RGB input, topped with global
    average pooling and a softmax over CLASSES. No layers are frozen.
    """
    backbone = InceptionResNetV2(
        include_top=False, weights='imagenet', input_shape=(299, 299, 3))

    pooled = GlobalAveragePooling2D(name='avg_pool')(backbone.output)
    predictions = Dense(
        len(CLASSES), activation='softmax', name='predictions')(pooled)

    # TODO: number of layers to freeze is still undecided, 0 for now
    return _compile(gpus, backbone.input, predictions, 0)


def xception(gpus):
    """
    Returns compiled keras Xception model ready for training

    ImageNet-pretrained base on 299x299 RGB input, topped with global
    average pooling, Dense(1024, relu) and a softmax over CLASSES.
    The first 125 layers are frozen.
    """
    frozen_layers = 125
    backbone = Xception(
        include_top=False, weights='imagenet', input_shape=(299, 299, 3))

    pooled = GlobalAveragePooling2D(name='avg_pool')(backbone.output)
    hidden = Dense(1024, activation='relu')(pooled)
    predictions = Dense(
        len(CLASSES), activation='softmax', name='predictions')(hidden)

    return _compile(gpus, backbone.input, predictions, frozen_layers)


def resnet50(gpus):
    """
    Returns compiled keras ResNet50 model ready for training

    ImageNet-pretrained base on 224x224 RGB input; its output is flattened
    and fed into a softmax over CLASSES. No layers are frozen.
    """
    backbone = ResNet50(
        include_top=False, weights='imagenet', input_shape=(224, 224, 3))

    flat = Flatten()(backbone.output)
    predictions = Dense(
        len(CLASSES), activation='softmax', name='predictions')(flat)

    return _compile(gpus, backbone.input, predictions, 0)


def dense_net(gpus):
    """
    Returns compiled keras DenseNet201 model ready for training

    ImageNet-pretrained base on 224x224 RGB input, topped with global
    average pooling and a softmax over CLASSES. No layers are frozen.

    Params:
    - gpus - GPU specification forwarded to `_compile`
    """
    # DenseNet201 is not imported at module level; importing it here makes
    # the name available without affecting Keras versions that lack the
    # densenet module unless this builder is actually requested.
    from keras.applications.densenet import DenseNet201

    frozen = 0
    base_model = DenseNet201(
        weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    x = GlobalAveragePooling2D(name='avg_pool')(base_model.output)
    output = Dense(len(CLASSES), activation='softmax', name='predictions')(x)

    return _compile(gpus, base_model.input, output, frozen)


def nasnet(gpus):
    """
    Returns compiled keras NASNetLarge model ready for training

    ImageNet-pretrained base on 331x331 RGB input, topped with global
    average pooling and a softmax over CLASSES. No layers are frozen.

    Params:
    - gpus - GPU specification forwarded to `_compile`
    """
    # NASNetLarge is not imported at module level; importing it here makes
    # the name available without affecting Keras versions that lack the
    # nasnet module unless this builder is actually requested.
    from keras.applications.nasnet import NASNetLarge

    frozen = 0
    base_model = NASNetLarge(
        weights='imagenet', include_top=False, input_shape=(331, 331, 3))

    x = GlobalAveragePooling2D(name='avg_pool')(base_model.output)
    output = Dense(len(CLASSES), activation='softmax', name='predictions')(x)

    return _compile(gpus, base_model.input, output, frozen)


def _compile(gpus, input, output, frozen):
    """
    Wraps the given tensors into a keras Model, freezes layers and compiles

    Params:
    - gpus - GPU specification parsed by `get_gpus`
    - input - input tensor(s) of the network
    - output - output tensor(s) of the network
    - frozen - the first `frozen` layers are marked as not trainable

    With exactly one GPU the model is created on that device and trained
    directly; otherwise it is created on the CPU and wrapped with
    `multi_gpu_model`.

    Returns (parallel_model, model): the compiled model to train and the
    underlying base model.
    """
    device_ids = get_gpus(gpus)
    single_device = len(device_ids) == 1
    placement = '/gpu:{}'.format(device_ids[0]) if single_device else '/cpu:0'

    with K.tf.device(placement):
        model = Model(input, output)
        for layer in model.layers[:frozen]:
            layer.trainable = False

    if single_device:
        parallel_model = model
    else:
        parallel_model = multi_gpu_model(model, gpus=device_ids)

    parallel_model.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'])
    return parallel_model, model


In [None]:
#!/usr/bin/python3.5
# -*- coding:utf-8 -*-
# Images that already exist will not be downloaded again, so the script can
# resume a partially completed download. All images will be saved in the JPG
# format with 90% compression quality.

import sys
import os
import multiprocessing
import urllib3
import csv
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import json
from utils import try_makedirs

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def ParseData(data_file):
    """
    Reads a dataset description and returns its list of (key, url) pairs.

    `data_file` is a JSON file with an 'images' list whose items carry an
    'image_id' and a one-element 'url' list. When the file name contains
    'train' or 'validation' the file must also have an 'annotations' list
    mapping 'image_id' to 'label_id'.

    The key of a pair is `(image_id, label_id)` for annotated images and the
    bare `image_id` otherwise.
    """
    # Parse the file once and close it deterministically.
    with open(data_file) as handle:
        dataset = json.load(handle)

    labels = {}
    if 'train' in data_file or 'validation' in data_file:
        for annotation in dataset['annotations']:
            labels[annotation['image_id']] = annotation['label_id']

    key_url_list = []
    for item in dataset['images']:
        assert len(item['url']) == 1, \
            'expected exactly one url for image %s' % item.get('image_id')
        url = item['url'][0]
        image_id = item['image_id']
        key = (image_id, labels[image_id]) if image_id in labels else image_id
        key_url_list.append((key, url))
    return key_url_list


def DownloadImage(key_url):
    """
    Downloads one image and stores it as a JPEG with quality 90.

    `key_url` is a `(key, url)` pair as produced by `ParseData`. The output
    directory is read from `sys.argv[2]`. For a `(image_id, label_id)` key
    the file goes to `<out_dir>/<label_id>/<image_id>.jpg`, otherwise to
    `<out_dir>/test/<key>.jpg`. An already existing file is not downloaded
    again.

    Every failure is reported with a printed warning and the function
    returns None; nothing is raised for download, decode or save errors.
    """
    out_dir = sys.argv[2]
    (key, url) = key_url
    if isinstance(key, tuple):
        filename = os.path.join(out_dir, str(key[1]), '%s.jpg' % key[0])
    else:
        filename = os.path.join(out_dir, 'test', '%s.jpg' % key)

    if os.path.exists(filename):
        print('Image %s already exists. Skipping download.' % filename)
        return
    try_makedirs(os.path.dirname(filename))

    # `except Exception` (not a bare `except`) keeps the best-effort
    # behaviour while letting KeyboardInterrupt / SystemExit stop the worker.
    try:
        http = urllib3.PoolManager()
        response = http.request('GET', url, timeout=10)
        image_data = response.data
    except Exception:
        print('Warning: Could not download image %s from %s' % (os.path.basename(filename), url))
        return

    try:
        pil_image = Image.open(BytesIO(image_data))
    except Exception:
        print('Warning: Failed to parse image %s %s' % (os.path.basename(filename), url))
        return

    try:
        pil_image_rgb = pil_image.convert('RGB')
    except Exception:
        print('Warning: Failed to convert image %s to RGB' % os.path.basename(filename))
        return

    try:
        pil_image_rgb.save(filename, format='JPEG', quality=90)
    except Exception:
        print('Warning: Failed to save image %s' % filename)
        return


def Run():
    """
    Command-line entry point: downloads every image listed in a data file.

    Expects exactly two arguments, the dataset JSON file and the output
    directory. With any other argument count the usage line is printed and
    the process exits with status 1. Downloads run in a pool of 80 worker
    processes while a progress bar counts finished images.
    """
    if len(sys.argv) != 3:
        print('Syntax: %s <train|validation|test.json> <output_dir/>' %
              sys.argv[0])
        # A usage error is a failure, so report a non-zero exit status.
        sys.exit(1)
    (data_file, out_dir) = sys.argv[1:]

    # Creates missing parent directories too and tolerates an existing one.
    os.makedirs(out_dir, exist_ok=True)

    key_url_list = ParseData(data_file)

    # The pool is used as a context manager so its workers are shut down
    # when the loop finishes or fails.
    with multiprocessing.Pool(processes=80) as pool:
        with tqdm(total=len(key_url_list)) as t:
            for _ in pool.imap_unordered(DownloadImage, key_url_list):
                t.update(1)


if __name__ == '__main__':
    Run()


In [None]:
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score


class IntervalEvaluation(Callback):  # pylint: disable=R0903
    """
    Computes the ROC AUC score on the given data after every epoch

    The scores are collected in `aucs` (one entry per finished epoch) and
    each new score is printed.
    """

    def __init__(self, validation_data=()):
        """
        Params:
        - validation_data - a pair (x_val, y_val); it is unpacked
          immediately, so the default empty tuple raises ValueError
        """
        # super(Callback, self) would start the lookup after Callback and
        # skip Callback.__init__; name this class so the base initializer
        # really runs.
        super(IntervalEvaluation, self).__init__()

        self.x_val, self.y_val = validation_data
        self.aucs = []

    def on_epoch_end(self, epoch, logs=None):
        """
        Count ROC AUC score at the end of each epoch

        `logs` is accepted for the callback interface and is not used.
        """
        y_pred = None
        if hasattr(self.model, 'predict_proba'):
            # for Sequential models
            y_pred = self.model.predict_proba(self.x_val, verbose=0)
        else:
            # for models that were created using functional API
            y_pred = self.model.predict(self.x_val, verbose=0)
        self.aucs.append(roc_auc_score(self.y_val, y_pred))
        print(
            '\repoch: {:d} - ROC AUC: {:.6f}'.format(epoch + 1, self.aucs[-1]))


In [None]:
def _model_settings(side, batch_size, epochs):
    """
    Builds the settings of one model type

    Returns a dict with the keyword arguments for the data generators
    ('flow_generator': square target size and batch size) and for training
    ('fit_generator': number of epochs).
    """
    return {
        'flow_generator': {
            'target_size': (side, side),
            'batch_size': batch_size,
        },
        'fit_generator': {
            'epochs': epochs,
        },
    }


# Per-model settings: model type -> (image side, batch size, epochs)
config = {
    'vgg16': _model_settings(224, 56, 5),
    'skin_rec': _model_settings(141, 256, 3),
    'lung_rec': _model_settings(141, 256, 15),
    'alex_net': _model_settings(141, 256, 15),
    'vgg19': _model_settings(224, 256, 15),
    'resnet50': _model_settings(224, 256, 15),
    'densenet': _model_settings(224, 256, 15),
    'incresnet': _model_settings(299, 256, 15),
    'incv3': _model_settings(141, 256, 15),
    'xcept': _model_settings(299, 256, 7),
    'nasnet': _model_settings(331, 64, 15),
}