In [1]:
# predict on test sample and submit to kaggle

In [1]:
import contextlib
import os
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tqdm import tqdm_notebook as tqdm

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# loss and eval metrics
# https://www.kaggle.com/akensert/resnet50-keras-baseline-model
from keras import backend as K

def weighted_log_loss(y_true, y_pred):
    """
    Class-weighted binary cross-entropy, usable as the loss in model.compile()
    --------------------------------------------------------------------------
    Predictions are clipped to [eps, 1 - eps] before the logs are taken, every
    per-class term is scaled by its class weight (the first of the six classes
    counts double) and the result is averaged over the last axis.
    """
    weights = np.array([2., 1., 1., 1., 1., 1.])
    tiny = K.epsilon()
    clipped = K.clip(y_pred, tiny, 1.0 - tiny)
    positive_term = y_true * K.log(clipped) * weights
    negative_term = (1.0 - y_true) * K.log(1.0 - clipped) * weights
    return K.mean(-(positive_term + negative_term), axis=-1)

def _normalized_weighted_average(arr, weights=None):
    """
    Keras-backend counterpart of numpy.average() along axis 1,
    written specifically for this competition.

    Without weights this is a plain mean over axis 1. With weights, `arr` is
    dotted with the weights (turned into a column) and the result is divided
    by the sum of the weights.
    """
    if weights is None:
        return K.mean(arr, axis=1)
    total_weight = K.sum(weights)
    weight_column = K.expand_dims(weights, axis=1)
    weighted_sums = K.dot(arr, weight_column)
    return K.sum(weighted_sums, axis=1) / total_weight

def weighted_loss(y_true, y_pred):
    """
    Metric for model.compile(): log loss with normalized class weights
    ------------------------------------------------------------------
    Same per-class cross-entropy as 'weighted_log_loss()', but the class
    weights are applied through a normalized weighted average, which should
    be very close to the official competition metric:
        https://www.kaggle.com/kambarakun/lb-probe-weights-n-of-positives-scoring
    and hence to:
        sklearn.metrics.log_loss with sample weights
    """
    weights = K.variable([2., 1., 1., 1., 1., 1.])
    tiny = K.epsilon()
    clipped = K.clip(y_pred, tiny, 1.0 - tiny)
    positive_term = y_true * K.log(clipped)
    negative_term = (1.0 - y_true) * K.log(1.0 - clipped)
    per_sample = _normalized_weighted_average(
        -(positive_term + negative_term), weights
    )
    return K.mean(per_sample)

def weighted_log_loss_metric(trues, preds):
    """
    NumPy weighted log loss, meant for scoring the validation set
    in PredictionCheckpoint()
    ------------------------------------------
    Args:
        trues: array-like of shape (n_samples, 6) holding the 0/1 labels.
        preds: array-like of the same shape holding predicted probabilities.

    Returns:
        The mean over samples of the class-weighted average log loss
        (the first class is weighted 2, the other five 1).
    """
    class_weights = [2., 1., 1., 1., 1., 1.]
    epsilon = 1e-7
    # Convert up front so plain Python lists are accepted (`1 - trues` raises
    # TypeError on a list) and so the clipping/logs run in float64.
    trues = np.asarray(trues, dtype=np.float64)
    preds = np.clip(np.asarray(preds, dtype=np.float64), epsilon, 1 - epsilon)
    loss = trues * np.log(preds) + (1 - trues) * np.log(1 - preds)
    loss_samples = np.average(loss, axis=1, weights=class_weights)
    return - loss_samples.mean()

Using TensorFlow backend.


In [3]:
# Load the three splits; the first CSV column becomes the index of each frame.
train, val, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('train.csv', 'val.csv', 'test.csv')
)

In [4]:
print(train.shape, val.shape, test.shape)

(538630, 8) (68290, 8) (67337, 8)


In [5]:
538630 + 68290 + 67337 # 674257

674257

In [6]:
train.head() # label columns should be int, not float
# filenames need the '.jpg' suffix appended to the image ID
# NOTE(review): the rows displayed below already show integer labels and '.jpg' filenames,
# so both TODOs above may be stale -- confirm and remove if so

Unnamed: 0,filename,PatientID,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_231d901c1.jpg,ID_b81a287f,1,0,0,0,1,0
2,ID_127689cce.jpg,ID_42910d3d,0,0,0,0,0,0
3,ID_25457734a.jpg,ID_329aafa7,0,0,0,0,0,0
4,ID_81c9aa125.jpg,ID_6b544c3c,0,0,0,0,0,0
5,ID_87e8b2528.jpg,ID_d6e578fb,0,0,0,0,0,0


In [7]:
# Build the wide submission frame: split each sample-submission ID into an
# image filename (first 12 characters + '.jpg') and a diagnosis name
# (everything from position 13), then pivot so each image is one row with one
# Label column per diagnosis.
submission = (
    pd.read_csv('stage_1_sample_submission.csv')
    .assign(
        Image=lambda frame: frame["ID"].str.slice(stop=12) + '.jpg',
        Diagnosis=lambda frame: frame["ID"].str.slice(start=13),
    )
    .loc[:, ["Label", "Diagnosis", "Image"]]
    .set_index(['Image', 'Diagnosis'])
    .unstack(level=-1)
)

In [8]:
submission.columns = submission.columns.droplevel(0)

In [9]:
submission = submission.reset_index()

In [10]:
submission.head()

Diagnosis,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf.jpg,0.5,0.5,0.5,0.5,0.5,0.5
1,ID_0000ca2f6.jpg,0.5,0.5,0.5,0.5,0.5,0.5
2,ID_000259ccf.jpg,0.5,0.5,0.5,0.5,0.5,0.5
3,ID_0002d438a.jpg,0.5,0.5,0.5,0.5,0.5,0.5
4,ID_00032d440.jpg,0.5,0.5,0.5,0.5,0.5,0.5


In [11]:
submission.columns

Index(['Image', 'any', 'epidural', 'intraparenchymal', 'intraventricular',
       'subarachnoid', 'subdural'],
      dtype='object', name='Diagnosis')

In [12]:
len(submission) # 78545

78545

In [13]:
# Inference settings used by the generators defined further down.
seed = 42
im_size = 224
batch_size = 128

# Label columns, one per diagnosis.
columns = [
    "any",
    "epidural",
    "intraparenchymal",
    "intraventricular",
    "subarachnoid",
    "subdural",
]

In [14]:
# No augmentation here: images only go through the InceptionV3 preprocessing.
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Labelled generator over the held-out test split, read without shuffling.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test,
    x_col="filename",
    y_col=columns,
    class_mode='other',
    directory='/home/jupyter/train_images_bsb_224/',  # train/val/test images all live in this directory
    target_size=(im_size, im_size),
    batch_size=batch_size,
    seed=seed,
    shuffle=False,
)

Found 67337 validated image filenames.


In [15]:
# Unlabelled generator over the images listed in the submission frame,
# read without shuffling.
submission_generator = test_datagen.flow_from_dataframe(
    dataframe=submission,
    x_col="Image",
    y_col=None,
    class_mode=None,
    directory='/home/jupyter/test_images_bsb_224/',  # separate directory from the train/val/test images
    target_size=(im_size, im_size),
    batch_size=batch_size,
    seed=seed,
    shuffle=False,
)

Found 78545 validated image filenames.


In [16]:
# Load the saved model; the custom loss and metric functions are handed over
# through custom_objects under the names they were defined with.
model = tf.keras.models.load_model(
    'ir2_6_epochs.keras',
    custom_objects={
        'weighted_log_loss': weighted_log_loss,
        'weighted_loss': weighted_loss,
    },
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [35]:
#model.evaluate_generator(test_generator, steps=len(test_generator), verbose=1)

In [36]:
preds = model.predict_generator(submission_generator, steps=len(submission_generator), verbose=1)



In [22]:
preds[0:15]

array([[9.27162170e-03, 2.13682652e-05, 1.24809146e-03, 2.01043487e-03,
        1.12885237e-03, 2.81718373e-03],
       [1.96695328e-06, 0.00000000e+00, 3.57627869e-07, 2.98023224e-08,
        2.38418579e-07, 6.25848770e-07],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [1.04755163e-04, 3.57627869e-07, 9.03010368e-06, 1.10268593e-06,
        1.77919865e-05, 4.71770763e-05],
       [4.61159050e-02, 5.36268950e-03, 8.14211369e-03, 2.45678425e-03,
        8.58303905e-03, 1.96909010e-02],
       [1.57630444e-03, 3.87430191e-07, 2.29775906e-05, 2.38418579e-06,
        2.00062990e-04, 9.92536545e-04],
       [2.08616257e-07, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [3.63444388e-02, 1.08456612e-03, 1.25472844e-02, 6.31809235e-04,
        1.32672191e-02, 1.52637959e-02],
       [1.02511048e-03, 7.15255737e-07, 1.08689070e-04, 2.78532505e-04,
        2.09569931e-04, 

In [37]:
len(preds)

78545

In [45]:
len(submission)

78545

In [46]:
submission.tail()

Diagnosis,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
78540,ID_fffe2edb8.jpg,0.5,0.5,0.5,0.5,0.5,0.5
78541,ID_ffff3212e.jpg,0.5,0.5,0.5,0.5,0.5,0.5
78542,ID_ffff59a97.jpg,0.5,0.5,0.5,0.5,0.5,0.5
78543,ID_ffffb670a.jpg,0.5,0.5,0.5,0.5,0.5,0.5
78544,ID_ffffcbff8.jpg,0.5,0.5,0.5,0.5,0.5,0.5


In [47]:
# Replace the 0.5 placeholders with the model outputs.
# NOTE(review): assumes preds has one row per submission row and that its columns
# follow this same class order -- confirm against the training label order.
submission[['any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']] = preds

In [48]:
# Back to long format: one row per (Image, diagnosis) pair, with the prediction
# in the melted value column.
submission = submission.melt(id_vars=['Image'])

In [49]:
# Row ID is "<image id>_<diagnosis>": drop the '.jpg' extension (literal match,
# not a regex) and append the diagnosis name.
submission['ID'] = (
    submission['Image'].str.replace('.jpg', '', regex=False)
    + '_'
    + submission['Diagnosis']
)

In [50]:
submission = submission.drop(['Image', 'Diagnosis'], axis=1)

In [51]:
# Positional rename: after dropping Image and Diagnosis the frame holds the
# melted value column followed by ID, in that order.
submission.columns = ['Label', 'ID']

In [52]:
submission = submission[['ID', 'Label']]

In [53]:
submission.to_csv('submission.csv', index=False)

In [54]:
!~/.local/bin/kaggle competitions submit rsna-intracranial-hemorrhage-detection -f submission.csv -m "ir2_6_epochs"

100%|██████████| 19.8M/19.8M [00:07<00:00, 2.74MB/s]
Successfully submitted to RSNA Intracranial Hemorrhage Detection

Test-time augmentation (TTA)

In [13]:
# Rebuild the wide submission frame for the TTA run: split each
# sample-submission ID into an image filename (first 12 characters + '.jpg')
# and a diagnosis name (everything from position 13), then pivot so each image
# is one row with one Label column per diagnosis.
submission = (
    pd.read_csv('stage_1_sample_submission.csv')
    .assign(
        Image=lambda frame: frame["ID"].str.slice(stop=12) + '.jpg',
        Diagnosis=lambda frame: frame["ID"].str.slice(start=13),
    )
    .loc[:, ["Label", "Diagnosis", "Image"]]
    .set_index(['Image', 'Diagnosis'])
    .unstack(level=-1)
)

In [14]:
submission.columns = submission.columns.droplevel(0)

In [15]:
submission = submission.reset_index()

In [17]:
submission.head()

Diagnosis,Image,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000012eaf.jpg,0.5,0.5,0.5,0.5,0.5,0.5
1,ID_0000ca2f6.jpg,0.5,0.5,0.5,0.5,0.5,0.5
2,ID_000259ccf.jpg,0.5,0.5,0.5,0.5,0.5,0.5
3,ID_0002d438a.jpg,0.5,0.5,0.5,0.5,0.5,0.5
4,ID_00032d440.jpg,0.5,0.5,0.5,0.5,0.5,0.5


In [21]:
# tta
# Settings for the test-time-augmentation run; these rebind the notebook-level
# im_size / batch_size / seed.
seed = 64
batch_size = 64
im_size = 224

# InceptionV3 preprocessing plus random rotation, zoom and horizontal flips.
tta_datagen = ImageDataGenerator(
    horizontal_flip=True,
    zoom_range=0.1,
    rotation_range=30,
    preprocessing_function=preprocess_input,
)

# Unlabelled, unshuffled generator over the submission images, now augmented.
submission_generator = tta_datagen.flow_from_dataframe(
    dataframe=submission,
    x_col="Image",
    y_col=None,
    class_mode=None,
    directory='/home/jupyter/test_images_bsb_224/',  # separate directory from the train/val/test images
    target_size=(im_size, im_size),
    batch_size=batch_size,
    seed=seed,
    shuffle=False,
)

Found 78545 validated image filenames.


In [18]:
# Load the saved model; the custom loss and metric functions are handed over
# through custom_objects under the names they were defined with.
model = tf.keras.models.load_model(
    'ir2_6_epochs.keras',
    custom_objects={
        'weighted_log_loss': weighted_log_loss,
        'weighted_loss': weighted_loss,
    },
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [19]:
def tta_prediction(datagen, model, n_examples):
    """
    Predict on augmented batches from the submission frame and return the
    index of the class with the largest summed prediction.

    Reads the notebook-level `submission` and `im_size`. `n_examples` is used
    both as the batch size of the iterator and as the number of prediction
    steps.

    NOTE(review): the iterator is rebuilt over the whole `submission` frame on
    every call and no particular image is selected, so every call appears to
    cover the same leading rows; the argmax also reduces the output to a single
    class index. Confirm this is what per-image TTA is meant to do here.
    """
    flow_options = dict(
        dataframe=submission,
        directory='/home/jupyter/test_images_bsb_224/',  # this one is different dir
        x_col="Image",
        y_col=None,
        target_size=(im_size, im_size),
        batch_size=n_examples,
        shuffle=False,
        class_mode=None,
    )
    batches = datagen.flow_from_dataframe(**flow_options)
    scores = model.predict_generator(batches, steps=n_examples, verbose=0)
    class_totals = np.sum(scores, axis=0)
    return np.argmax(class_totals)

In [24]:
n_examples = 3
preds = list()

# Open the null device once (the previous version opened a new, never-closed
# handle on every iteration) and let redirect_stdout put back whatever stream
# was active before each call -- also when the call raises or is interrupted --
# instead of swapping in sys.__stdout__, which is not necessarily that stream.
with open(os.devnull, "w") as devnull:
    for i in tqdm(range(len(submission))):
        with contextlib.redirect_stdout(devnull):  # suppress output
            pred = tta_prediction(tta_datagen, model, n_examples)
        preds.append(pred)

HBox(children=(IntProgress(value=0, max=78545), HTML(value='')))

KeyboardInterrupt: 

In [None]:
#preds.to_csv('tta_preds.csv', index=False)

In [None]:
def tta_prediction(datagen, model, image, n_examples):
    """
    Test-time-augmentation prediction for a single image.

    NOTE(review): this redefines `tta_prediction` with a different signature
    than the three-argument definition earlier in this notebook.

    Args:
        datagen: object exposing `flow(samples, batch_size=...)`.
        model: object exposing `predict_generator(iterator, steps=..., verbose=...)`.
        image: a single image array; a leading batch axis is added here.
        n_examples: used both as the batch size and as the number of
            prediction steps.

    Returns:
        Index of the class whose predictions, summed over all returned rows,
        are largest.
    """
    # The bare names expand_dims / numpy / argmax were never imported in this
    # notebook (only `np` is), so go through the np namespace.
    # convert image into dataset
    samples = np.expand_dims(image, 0)
    # prepare iterator
    it = datagen.flow(samples, batch_size=n_examples)
    # make predictions for each augmented image
    yhats = model.predict_generator(it, steps=n_examples, verbose=0)
    # sum across predictions
    summed = np.sum(yhats, axis=0)
    # argmax across classes
    return np.argmax(summed)

In [None]:
#https://machinelearningmastery.com/how-to-use-test-time-augmentation-to-improve-model-performance-for-image-classification/
def tta_evaluate_model(model, testX, testY):
    """
    Accuracy of test-time-augmented predictions over a labelled set.

    Args:
        model: passed through to `tta_prediction` (the four-argument version).
        testX: indexable collection of images.
        testY: 2-D array-like of per-class labels; the true class of each row
            is taken as its argmax along axis 1.

    Returns:
        Fraction of images whose TTA-predicted class equals the true class.
    """
    # configure image data augmentation
    datagen = ImageDataGenerator(horizontal_flip=True)
    # define the number of augmented images to generate per test set image
    n_examples_per_image = 7
    # make one augmented prediction per image
    yhats = [
        tta_prediction(datagen, model, testX[i], n_examples_per_image)
        for i in range(len(testX))
    ]
    # `argmax` and `accuracy_score` were never imported in this notebook, so
    # use numpy: accuracy is the mean of the element-wise label matches.
    testY_labels = np.argmax(testY, axis=1)
    acc = float(np.mean(np.equal(testY_labels, yhats)))
    return acc

In [None]:
preds = model.predict_generator(submission_generator, steps = len(submission_generator))

In [None]:
# tta
# Run prediction over the augmenting generator tta_steps times and keep every
# pass's output array so the passes can be averaged afterwards.
# NOTE(review): the original author was unsure whether a separate list per class
# is needed; each pass presumably already returns one column per class -- confirm.
tta_steps = 3
predictions = []

for i in tqdm(range(tta_steps)):
    preds = model.predict_generator(submission_generator, steps = len(submission_generator))
    predictions.append(preds)


  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [16:44<33:29, 1004.53s/it][A

In [None]:
# Element-wise mean over the collected TTA passes (axis 0 indexes the passes).
pred = np.mean(predictions, axis=0)

# np.mean(np.equal(np.argmax(y_val, axis=-1), np.argmax(pred, axis=-1))) 