In [1]:
import numpy  as np 
import pandas as pd 
import os
import cv2 
import gc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix

# import keras
from keras.preprocessing import image
from keras.models import Sequential
from keras.applications import DenseNet121
from keras.layers import GlobalAveragePooling2D, Dropout, Dense
from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau
from keras.activations import sigmoid
from keras.optimizers import Adam

# Global constants
# IMG_DIM: side length (pixels) images are resized to; also the model's input height/width
IMG_DIM      = 256
# BATCH_SIZE: batch size of every flow_from_dataframe generator
BATCH_SIZE   = 32
# CHANNEL_SIZE: number of colour channels in the model's input shape
CHANNEL_SIZE = 3
# NUM_CLASSES: number of sigmoid outputs of the final Dense layer (diagnosis grades 0-4)
NUM_CLASSES  = 5

# 2019 competition labels: turn every id_code into the image file name
# "modified_<id_code>.png"
df_2019 = pd.read_csv("../aptos2019/train.csv")
df_2019["id_code"] = df_2019["id_code"].apply(lambda code: "modified_" + code + ".png")
# shuffle=False keeps the row order, so the split is the same on every run
train_2019, valid_2019 = train_test_split(df_2019, shuffle=False, test_size=0.2)

# 2015 labels: expose the same "id_code" / "diagnosis" columns as the 2019 frame
df_2015 = pd.read_csv("../aptos2015/trainLabels.csv")
df_2015["image"]     = df_2015["image"].apply(lambda name: name + ".jpeg")
df_2015["id_code"]   = df_2015["image"]
df_2015["diagnosis"] = df_2015["level"]
train_2015, valid_2015 = train_test_split(df_2015, shuffle=False, test_size=0.2)

# valid_2019['diagnosis'].value_counts().plot(kind='bar')
# plt.title('Samples Per Class')

Using TensorFlow backend.


# Loading data

In this kernel, we treat the task as a multilabel problem. Instead of predicting a single label, we change the target so that a given class also encompasses all the classes before it. E.g. a class 4 retinopathy would usually be one-hot encoded as `[0, 0, 0, 0, 1]`, but in our case we will predict `[1, 1, 1, 1, 1]`. 

The idea is that if an eye has severe diabetic retinopathy, that also means that it has mild and moderate diabetic retinopathy.

In [2]:

def label_convert(y_val):
    """Collapse cumulative multi-hot rows into integer class labels.

    Each row is expected to hold one 0/1 (or boolean) flag per class, where a
    sample of class ``k`` has its first ``k + 1`` flags set.  The label is the
    number of set flags minus one.

    Parameters
    ----------
    y_val : array-like of shape (n_samples, n_classes)
        Thresholded predictions or multi-hot ground truth.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Integer class labels, never below 0.
    """
    flags_set = np.asarray(y_val).astype(int).sum(axis=1)
    # A row with no flag set (every output below the threshold) would map to
    # the non-existent class -1; clamp it to the lowest class instead.
    return np.maximum(flags_set - 1, 0)

def _encode_and_balance(split, even_distrib):
    """Add the cumulative ``labels`` column, stringify ``diagnosis`` and optionally balance classes.

    ``split`` is modified and returned; callers pass in a freshly shuffled copy.
    """
    # class k becomes the label list [0, 1, ..., k]
    split["labels"] = split["diagnosis"].apply(lambda grade: list(range(grade + 1)))
    split["diagnosis"] = split["diagnosis"].astype('str')

    if even_distrib and not split.empty:
        smallest_class = int(split["diagnosis"].value_counts().min())
        # Keep the last `smallest_class` rows of every class, i.e. drop the
        # leading surplus rows.  The surviving rows keep their order.
        split = split.groupby("diagnosis", sort=False).tail(smallest_class)

    return split


def get_train_valid_df(year="2019", even_distrib=True, sample_frac=0.8):
    """Return freshly shuffled train/validation frames for one data set.

    Parameters
    ----------
    year : str
        ``"2019"`` or ``"2015"``; selects which pre-split frames are used.
    even_distrib : bool
        When True, every class is down-sampled to the size of the smallest
        class (separately for train and validation).
    sample_frac : float
        Fraction of the remaining rows that is randomly kept at the end.
        With the default of 0.8, 20% of the rows are discarded on every call.

    Returns
    -------
    (train, valid) : tuple of pandas.DataFrame
        Both frames carry a ``labels`` column (list of class indices up to and
        including the diagnosis) and a string-typed ``diagnosis`` column.

    Raises
    ------
    ValueError
        If ``year`` is not one of the supported data sets.
    """
    if year == "2019":
        source_train, source_valid = train_2019, valid_2019
    elif year == "2015":
        source_train, source_valid = train_2015, valid_2015
    else:
        raise ValueError(f"Unsupported year {year!r}; expected '2019' or '2015'")

    # shuffle (on a copy) so that a different subset is dropped on each call
    train = source_train.sample(frac=1)
    valid = source_valid.sample(frac=1)

    train = _encode_and_balance(train, even_distrib)
    valid = _encode_and_balance(valid, even_distrib)

    # random sub-sample; this also re-shuffles the rows
    train = train.sample(frac=sample_frac)
    valid = valid.sample(frac=sample_frac)

    return train, valid

# plot example
# _, df_to_plot = get_train_valid_df(year="2019")
# df_to_plot['diagnosis'].value_counts().plot(kind='bar')
# plt.title('Samples Per Class')
# print(df_to_plot.head(5))

# df_to_plot.columns

In [3]:
# # display some data
# df_example, _ = get_train_valid_df(year="2019")

# # Display some random images from Data Set with class categories.
# figure=plt.figure(figsize=(22,20))
# for target_class in (df_example['diagnosis'].unique()):
#     for i, (idx, row ) in enumerate(df_example.loc[df_example.diagnosis == target_class]
#                                     .sample(4)
#                                     .iterrows()):
#         # open the file
#         imagefile = f"../aptos2019/train_images/{row['id_code']}" 
#         img = cv2.imread(imagefile)
        
#         # original version
#         rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
#         ax = figure.add_subplot(5,4, int(target_class)*4+i+1)
#         plt.imshow(rgb)
#         ax.set_title(target_class)
        

In [4]:
# Augmenting data generators: the images they yield are randomly flipped, rotated,
# brightness/channel shifted and zoomed before being fed into the network

def dataGenerator(jitter=0.1):
    """Build an ``ImageDataGenerator`` whose augmentation strength scales with ``jitter``.

    Pixel values are always rescaled by 1/255.  Flips are only enabled when
    ``jitter`` exceeds 0.01.
    """
    flips_enabled = jitter > 0.01
    augmentation = dict(
        rescale=1./255,
        horizontal_flip=flips_enabled,
        vertical_flip=flips_enabled,
        rotation_range=int(800*jitter),
        brightness_range=[1-jitter, 1+jitter],
        channel_shift_range=int(30*jitter),
        zoom_range=[(1-jitter), (1+jitter/2)],
        fill_mode="reflect",
    )
    return image.ImageDataGenerator(**augmentation)

def datagen_with_flow(datagen, dataframe, directory):
    """Create a batch iterator over the images listed in ``dataframe``.

    File names are read from the ``id_code`` column and resolved against
    ``directory``; targets come from the ``labels`` column.  Batches are
    produced in the row order of ``dataframe`` (no shuffling).
    """
    flow_options = {
        "x_col": "id_code",
        "y_col": 'labels',
        "class_mode": "categorical",
        "batch_size": BATCH_SIZE,
        "target_size": (IMG_DIM, IMG_DIM),
        "shuffle": False,
    }
    return datagen.flow_from_dataframe(dataframe=dataframe, directory=directory, **flow_options)

def generator(jitter=0.1, year="2019", even_distrib=True):
    """Return ``(train_gen, valid_gen)`` batch iterators for the chosen data set.

    Both iterators share one augmenting ``ImageDataGenerator`` built with
    ``jitter``, so the validation images are augmented as well.
    """
    train_df, valid_df = get_train_valid_df(year=year, even_distrib=even_distrib)
    shared_datagen = dataGenerator(jitter)
    image_dir = f"../aptos{year}/train_images/"

    return (datagen_with_flow(shared_datagen, train_df, image_dir),
            datagen_with_flow(shared_datagen, valid_df, image_dir))

# Force a garbage-collection pass; as the last expression of the cell, its
# return value (the number of unreachable objects found) is echoed as output.
gc.collect()


29

In [5]:
# train_sample_gen, valid_sample_gen = generator(jitter=0.5)

# # Display some data generation
# figure=plt.figure(figsize=(22,20))
# for batch in valid_sample_gen:
#     for j in range(16):
#         ax = figure.add_subplot(4,4, j+1)
#         batch[0][j] = np.clip(batch[0][j], 0, 1)
#         plt.imshow(batch[0][j])
#     break

In [6]:
class Metrics(Callback):
    """Keras callback that reports quadratic-weighted kappa after every epoch.

    At the end of each epoch a limited number of batches is drawn from
    ``generator``, predicted with the model being trained, and scored.  The
    kappa and the confusion matrix are printed, and the model is saved
    whenever the kappa is the best of the current run and above
    ``save_threshold``.

    Parameters
    ----------
    generator : iterable of (x, y) batches
        Source of validation batches.  It is not reset, so batches are taken
        from wherever the iterator currently stands.
    num_batches : int
        Number of batches evaluated per epoch (at least one is always used).
    save_threshold : float
        Minimum kappa required before a checkpoint is written.
    """

    def __init__(self, generator, num_batches=15, save_threshold=0.84):
        super().__init__()
        self.generator = generator
        self.num_batches = num_batches
        self.save_threshold = save_threshold
        self.val_kappas = []

    def on_train_begin(self, logs=None):
        # start every fit call with a clean history
        self.val_kappas = []

    def on_epoch_end(self, epoch, logs=None):
        y_pred = []
        y_val = []
        for batches_seen, (x, y) in enumerate(self.generator, start=1):
            # self.model is the model this callback is attached to
            predictions = self.model.predict(x)
            y_pred.extend(label_convert(predictions > 0.5))
            y_val.extend(label_convert(y))

            # the generator may loop forever, so stop explicitly
            if batches_seen >= self.num_batches:
                break

        val_kappa = cohen_kappa_score(y_val, y_pred, weights='quadratic')
        self.val_kappas.append(val_kappa)

        print(val_kappa)
        print(confusion_matrix(y_val, y_pred))

        if val_kappa == max(self.val_kappas) and val_kappa > self.save_threshold:
            gc.collect()
            print("Max of this run, saving model.")
            self.model.save(f"dense-multi-second-{val_kappa:.4f}.h5")


In [7]:
def create_model():
    """Assemble the classifier: DenseNet121 backbone, global average pooling,
    dropout and a sigmoid layer with one output per class.

    The backbone is created without weights; they are expected to be loaded
    afterwards by the caller.
    """
    # alternative: weights='../DenseNet-BC-121-32-no-top.h5' for a pre-trained backbone
    backbone = DenseNet121(weights=None,
                           include_top=False,
                           input_shape=(IMG_DIM, IMG_DIM, CHANNEL_SIZE))

    return Sequential([
        backbone,
        GlobalAveragePooling2D(),
        Dropout(0.5),
        Dense(NUM_CLASSES, activation='sigmoid'),
    ])

# Build the network and initialise it from a previously saved weights file
# (presumably written by an earlier run on the 2015 data, judging by the file name -- TODO confirm).
model = create_model()
model.load_weights("../dense-multi-2015-run.h5")



W0815 03:41:13.220278 140365948131072 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0815 03:41:13.240477 140365948131072 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0815 03:41:13.245838 140365948131072 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0815 03:41:13.268116 140365948131072 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The n

In [8]:
 # Fine-tuning schedule: three stages with decreasing augmentation strength.
 # Each stage trains 4 epochs on the raw class distribution and 4 epochs on
 # the class-balanced one.
 for jitter in [0.4, 0.2, 0.05]:

    # a new Adam optimizer per stage; its learning rate shrinks with the jitter
    stage_optimizer = Adam(lr=0.00005*jitter)
    model.compile(loss='binary_crossentropy', optimizer=stage_optimizer, metrics=['accuracy'])

    print("           -----------------------------------",
          jitter, "-----------------------------------")

    for even_distrib in [False, True]:

        for year in ["2019"]:

            print("           -   -   -   -   -   -   -   -   ", year,
                  even_distrib, "-   -   -   -   -   -   -   -   -")

            # fresh random sub-samples of the data for this round
            train_generator, valid_generator = generator(jitter=jitter, year=year, even_distrib=even_distrib)

            # callbacks: learning-rate decay on stalled val_loss, and kappa reporting
            reduce_lr = ReduceLROnPlateau(monitor='val_loss', mode='auto', patience=2,
                                          min_delta=0.0004, min_lr=1e-8, verbose=1)
            kappa_callbacks = Metrics(valid_generator)

            # only full batches are counted
            train_steps = train_generator.n // train_generator.batch_size
            valid_steps = valid_generator.n // valid_generator.batch_size

            # train the model for 4 epochs
            history = model.fit_generator(generator=train_generator,
                                          steps_per_epoch=train_steps,
                                          validation_data=valid_generator,
                                          validation_steps=valid_steps,
                                          epochs=4, workers=4, verbose=1,
                                          callbacks=[reduce_lr, kappa_callbacks],
                                         )

    gc.collect()

W0815 03:41:48.224748 140365948131072 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0815 03:41:48.234390 140365948131072 deprecation.py:323] From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


           ----------------------------------- 0.4 -----------------------------------
           -   -   -   -   -   -   -   -    2019 False -   -   -   -   -   -   -   -   -
Found 2343 validated image filenames belonging to 5 classes.
Found 586 validated image filenames belonging to 5 classes.
Epoch 1/4
0.8764564040047571
[[238   3   3   0   0]
 [  3   2  28   1   0]
 [  1   4 100  12   1]
 [  0   1  11   9   3]
 [  1   1  13  12  11]]
Max of this run, saving model.
Epoch 2/4
0.8768949575314482
[[240   3   1   0   0]
 [  3   3  22   0   2]
 [  1   6 101   7   2]
 [  0   0  15  11   2]
 [  1   0  18   5  15]]
Max of this run, saving model.
Epoch 3/4
0.9175421341290613
[[249   9   0   0   0]
 [  5   8  22   0   0]
 [  0  10 101   5   1]
 [  0   0  13   6   3]
 [  0   0   8   5  13]]
Max of this run, saving model.
Epoch 4/4

Epoch 00004: ReduceLROnPlateau reducing learning rate to 1.9999999494757505e-06.
0.8912783541410234
[[242   3   3   0   0]
 [  4   7  19   0   1]
 [  0   8 103   8 

In [9]:
# Persist the model as it stands after the last training stage
model.save("dense-multi-2019-run.h5")

# Compare the median of 7 randomised jitters to a non-jittered val

In [None]:
def compare_prediction_process(year="2019", num_rounds=7, jitter=0.03):
    """Compare test-time augmentation with plain prediction on validation data.

    The validation images are predicted ``num_rounds`` times with lightly
    augmented inputs and the per-output median is scored; then they are
    predicted once without augmentation.  For both variants the
    quadratic-weighted kappa and the confusion matrix are printed.

    Parameters
    ----------
    year : str
        Data set passed on to ``get_train_valid_df``.
    num_rounds : int
        Number of augmented prediction passes (must be at least 1).
    jitter : float
        Augmentation strength of the augmented passes.

    Raises
    ------
    ValueError
        If ``num_rounds`` is below 1, or if the number of predictions differs
        from the number of validation rows.
    """
    if num_rounds < 1:
        raise ValueError("num_rounds must be at least 1")

    _, valid_df = get_train_valid_df(year=year, even_distrib=False)
    y_val = valid_df.diagnosis.astype(int)
    image_dir = f"../aptos{year}/train_images/"

    def predict_once(jitter_amount):
        # one full pass over the validation frame, in row order
        flow = datagen_with_flow(dataGenerator(jitter_amount), valid_df, image_dir)
        probabilities = model.predict_generator(generator=flow, steps=len(flow), workers=4, verbose=1)
        if probabilities.shape[0] != len(y_val):
            raise ValueError(
                f"Got {probabilities.shape[0]} predictions for {len(y_val)} validation rows; "
                "some image files were probably not found"
            )
        return probabilities

    def report(title, probabilities):
        y_pred = label_convert(probabilities > 0.5)
        print(title, cohen_kappa_score(y_val, y_pred, weights='quadratic'))
        print(confusion_matrix(y_val, y_pred))

    # with jitter: stack the passes to (n_samples, num_rounds, n_outputs)
    jittered = np.stack([predict_once(jitter) for _ in range(num_rounds)], axis=1)
    report("With jitter: ", np.median(jittered, axis=1))

    # no jitter
    report("With no jitter: ", predict_once(0))
            
    
# Run the comparison on the 2019 validation data, then force a garbage-collection pass
compare_prediction_process("2019")
gc.collect()

Found 586 validated image filenames belonging to 5 classes.
Found 586 validated image filenames belonging to 5 classes.
Found 586 validated image filenames belonging to 5 classes.
Found 586 validated image filenames belonging to 5 classes.
Found 586 validated image filenames belonging to 5 classes.
Found 586 validated image filenames belonging to 5 classes.
Found 586 validated image filenames belonging to 5 classes.
With jitter:  0.8987738928725032
[[314   4   0   0   0]
 [  6  20  14   0   0]
 [  1  25 110   4   1]
 [  0   1  14  13   4]
 [  1   2  17  11  24]]
Found 586 validated image filenames belonging to 5 classes.
