In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import tensorflow.keras.backend as K

In [2]:
import tensorflow as tf
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy() # default distribution strategy in Tensorflow. Works on CPU and single GPU.


AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
#REPLICAS = 8
print(f'REPLICAS: {REPLICAS}')

Running on TPU  grpc://10.0.0.2:8470
REPLICAS: 8


In [3]:
df_train = pd.read_csv("/kaggle/input/siim-isic-melanoma-classification/train.csv")
df_test = pd.read_csv("/kaggle/input/siim-isic-melanoma-classification/test.csv")
sample_sub = pd.read_csv("/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv")

In [4]:
FOLDS = 5
IMG_SIZE = 512
BATCH_SIZE = 32
EPOCHS = 12

In [5]:
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
GCS_PATH2 = KaggleDatasets().get_gcs_path("isic2019-512x512")
GCS_PATH1 = KaggleDatasets().get_gcs_path("melanoma-512x512")

In [6]:
train_filenames1 = tf.io.gfile.glob(GCS_PATH1 + '/train*.tfrec')
train_filenames2 = tf.io.gfile.glob(GCS_PATH2 + '/train*.tfrec')
test_filenames = tf.io.gfile.glob(GCS_PATH1 + '/test*.tfrec')
data_filenames = train_filenames1 + train_filenames2

In [7]:
from sklearn.model_selection import train_test_split
train_filenames, valid_filenames = train_test_split(data_filenames, test_size=0.2, random_state=0, shuffle=True)

In [8]:
def read_labeled_tfrecord(example):
    LABELED_TFREC_FORMAT = {
        "image": tf.io.FixedLenFeature([], tf.string), # tf.string means bytestring
        "target": tf.io.FixedLenFeature([], tf.int64),  # shape [] means single element
    }
    example = tf.io.parse_single_example(example, LABELED_TFREC_FORMAT)
    image = example['image']
    label = example['target']
    return image, label # returns a dataset of (image, label) pairs

def read_unlabeled_tfrecord(example, return_image_name=True):
    UNLABELED_TFREC_FORMAT = {
        "image": tf.io.FixedLenFeature([], tf.string), # tf.string means bytestring
        "image_name": tf.io.FixedLenFeature([], tf.string),  # shape [] means single element
        # class is missing, this competitions's challenge is to predict flower classes for the test dataset
    }
    example = tf.io.parse_single_example(example, UNLABELED_TFREC_FORMAT)
    image = example['image']
    idnum = example['image_name']
    return image, idnum if return_image_name else 0


def load_dataset(filenames, labeled=True, ordered=False):
    # Read from TFRecords. For optimal performance, reading from multiple files at once and
    # disregarding data order. Order does not matter since we will be shuffling the data anyway.

    ignore_order = tf.data.Options()
    if not ordered:
        ignore_order.experimental_deterministic = False # disable order, increase speed

    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO) # automatically interleaves reads from multiple files
    dataset = dataset.with_options(ignore_order) # uses data as soon as it streams in, rather than in its original order
    dataset = dataset.map(read_labeled_tfrecord if labeled else read_unlabeled_tfrecord, num_parallel_calls=AUTO)
    # returns a dataset of (image, label) pairs if labeled=True or (image, id) pairs if labeled=False
    return dataset

In [9]:
 def decode_image(image_data, augment=False):
    image = tf.image.decode_jpeg(image_data, channels=3)
    image = tf.cast(image, tf.float32) / 255.0  # convert image to floats in [0, 1] range
    if augment:
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_flip_up_down(image)
        image = tf.image.random_saturation(image, 0, 2)
        image = tf.image.rot90(image)
    image = tf.reshape(image, [IMG_SIZE,IMG_SIZE, 3]) # explicit size needed for TPU
    return image

def get_training_dataset():
    dataset = tf.data.TFRecordDataset(train_filenames, num_parallel_reads=AUTO)
    dataset = dataset.repeat() 
    dataset = dataset.shuffle(1024*8)
    dataset = dataset.map(read_labeled_tfrecord, num_parallel_calls=AUTO)
    dataset = dataset.map(lambda img, imgname_or_label: (decode_image(img, augment=True), imgname_or_label), num_parallel_calls=AUTO)
    dataset = dataset.batch(BATCH_SIZE * REPLICAS)
    dataset = dataset.prefetch(AUTO) # prefetch next batch while training (autotune prefetch buffer size)
    return dataset

def get_val_dataset():
    dataset = tf.data.TFRecordDataset(valid_filenames, num_parallel_reads=AUTO)
    dataset = dataset.map(read_labeled_tfrecord, num_parallel_calls=AUTO)
    dataset = dataset.map(lambda img, imgname_or_label: (decode_image(img, augment=False), imgname_or_label), num_parallel_calls=AUTO)
    dataset = dataset.batch(BATCH_SIZE * REPLICAS)
    dataset = dataset.prefetch(AUTO) # prefetch next batch while training (autotune prefetch buffer size)
    return dataset

def get_test_dataset(ordered=False):
    dataset = tf.data.TFRecordDataset(test_filenames, num_parallel_reads=AUTO)
    dataset = dataset.repeat() 
    dataset = dataset.map(lambda example: read_unlabeled_tfrecord(example, False), num_parallel_calls=AUTO)
    dataset = dataset.map(lambda img, imgname_or_label: (decode_image(img, augment=False), imgname_or_label), num_parallel_calls=AUTO)
    dataset = dataset.batch(BATCH_SIZE * REPLICAS)
    dataset = dataset.prefetch(AUTO) # prefetch next batch while training (autotune prefetch buffer size)
    return dataset

def count_data_items(filenames):
    n = [int(re.compile(r"-([0-9]*)\.").search(filename).group(1)) for filename in filenames]
    return np.sum(n)

In [10]:
def get_lr_callback(batch_size=8):
    lr_start   = 0.000005
    lr_max     = 0.00000125 * REPLICAS * batch_size
    lr_min     = 0.000001
    lr_ramp_ep = 5
    lr_sus_ep  = 0
    lr_decay   = 0.8
   
    def lrfn(epoch):
        if epoch < lr_ramp_ep:
            lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
            
        elif epoch < lr_ramp_ep + lr_sus_ep:
            lr = lr_max
            
        else:
            lr = (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min
            
        return lr

    lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)
    return lr_callback

In [11]:
def build_model(dim=512):
    inp = tf.keras.layers.Input(shape=(dim,dim,3))
    base = efn.EfficientNetB3(input_shape=(dim,dim,3),weights='imagenet',include_top=False)
    x = base(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(1,activation='sigmoid')(x)
    model = tf.keras.Model(inputs=inp,outputs=x)
    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.05) 
    model.compile(optimizer=opt,loss=loss,metrics=['AUC'])
    return model

In [12]:
!pip install yapl==0.1.2 efficientnet > /dev/null

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [13]:
#model 3
import efficientnet.tfkeras as efn
input_shape = (512, 512, 3)
def create_model():
    model = tf.keras.Sequential([
        efn.EfficientNetB4(
                        input_shape=input_shape,
                        weights='imagenet',
                        include_top=False
                    ),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    return model

In [14]:
# with strategy.scope():
#     model = create_model()
# optimizer = tf.keras.optimizers.Adam(lr=0.001)
# model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
# model.summary()

In [15]:
import math
callbacks = [get_lr_callback(BATCH_SIZE)] 
step_per_epoch = count_data_items(train_filenames)/BATCH_SIZE//REPLICAS

In [16]:
# warmup_history = model.fit(get_training_dataset(), 
#                            steps_per_epoch=step_per_epoch, 
#                            validation_data=get_val_dataset(),
#                             epochs=1, 
#                             verbose=1, callbacks=callbacks).history

In [17]:
test_dataset = get_test_dataset(ordered=True)

In [18]:
# test_images_ds = test_dataset.map(lambda image, idnum: image)
# ct_test = count_data_items(test_filenames)
# STEPS = ct_test/BATCH_SIZE/4/REPLICAS
# probabilities = model.predict(test_images_ds, steps=STEPS).flatten()
# print(probabilities)

In [19]:
# test_imgs = test_dataset.map(lambda images, ids: images)
# img_ids_ds = test_dataset.map(lambda images, ids: ids).unbatch()

# img_ids = []
# for coutner, ids in enumerate(img_ids_ds):
#     if coutner%500 == 0:
#         print(coutner)
#     img_ids.append(ids.numpy())

# img_ids = np.array(img_ids).astype('U')

In [20]:
# sample_sub = sample_sub.set_index("image_name").transpose().reindex(columns=list(img_ids)).transpose()
# sample_sub["target"] = probabilities
# sample_sub.to_csv("submission.csv")

# KFOLD

In [21]:
def get_dataset(filename, shuffle=False, repeat=False, return_img_name=False, augment=False, labeled=False, batch_size=32, dim=512):
    dataset = tf.data.TFRecordDataset(filename, num_parallel_reads=AUTO)
    if repeat:
        dataset = dataset.repeat()
    if shuffle:
        dataset = dataset.shuffle(1024*8)
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        dataset = dataset.with_options(opt)
    if labeled:
        dataset = dataset.map(read_labeled_tfrecord, num_parallel_calls=AUTO)
    else:
         dataset = dataset.map(lambda example: read_unlabeled_tfrecord(example, return_img_name), num_parallel_calls=AUTO)
    dataset = dataset.map(lambda img, imgname_or_label: (decode_image(img, augment=augment), imgname_or_label), num_parallel_calls=AUTO)
    dataset = dataset.batch(batch_size * REPLICAS)
    dataset = dataset.prefetch(AUTO)
    return dataset

In [22]:
len(train_filenames)

36

In [23]:
FOLDS = 5
IMG_SIZE = 512
BATCH_SIZE = 32
EPOCHS = 12
weights = 1/6

In [24]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=FOLDS, shuffle=True, random_state=0)

In [25]:
TTA = 11
preds = np.zeros((count_data_items(test_filenames),1))
for fold, (train_file, valid_file) in enumerate(kfold.split(data_filenames)):
    print(fold)
    training_dataset = get_dataset([data_filenames[x] for x in train_file], augment=True, shuffle=True, repeat=True, labeled=True)
    valid_dataset = get_dataset([data_filenames[x] for x in valid_file], augment=True, shuffle=False, repeat=False, labeled=False)
    K.clear_session()
    with strategy.scope():
        model = create_model()
    if fold == 3:
        print("ok")
    optimizer = tf.keras.optimizers.Adam(lr=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    callbacks = [get_lr_callback(BATCH_SIZE)] 
    steps_per_epoch = count_data_items([data_filenames[x] for x in train_file])/BATCH_SIZE//REPLICAS
    sv = tf.keras.callbacks.ModelCheckpoint('fold-%i.h5'%fold, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='min', save_freq='epoch')
    callbacks.append(sv)
    warmup_history = model.fit(training_dataset, steps_per_epoch=steps_per_epoch, validation_data=valid_dataset, epochs=12, verbose=1, callbacks=callbacks).history
    if fold == 3:
        print("whatup")


    test_dataset = get_dataset(test_filenames, augment=True, repeat=True, batch_size=BATCH_SIZE*4)
    ct_valid = count_data_items(test_filenames)
    if fold == 3:
        print("hello")
    STEPS = TTA * ct_valid/32/4/REPLICAS
    pred = model.predict(test_dataset,steps=STEPS,verbose=1)[:TTA*ct_valid,]
    preds[:, 0] += np.mean(pred.reshape((ct_valid,TTA),order='F'),axis=1)

0
Downloading data from https://github.com/Callidior/keras-applications/releases/download/efficientnet/efficientnet-b4_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
1
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
2
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
3
ok
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
whatup
hello
4
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [26]:
preds = np.concatenate(preds)

In [27]:
preds = preds/6

In [28]:
ds = get_dataset(test_filenames, augment=False, repeat=False,
                 labeled=False, return_img_name=True)

image_names = np.array([img_name.numpy().decode("utf-8") 
                        for img, img_name in iter(ds.unbatch())])

In [29]:
submission = pd.DataFrame(dict(image_name=list(image_names), target=preds))
submission = submission.sort_values('image_name')
submission = submission.set_index("image_name")
submission.to_csv('submission.csv', index=True)
submission.head()

Unnamed: 0_level_0,target
image_name,Unnamed: 1_level_1
ISIC_0052060,7e-06
ISIC_0052349,7e-06
ISIC_0058510,2.1e-05
ISIC_0073313,7.3e-05
ISIC_0073502,7e-06


In [30]:
# from tensorflow.keras.preprocessing.image import ImageDataGenerator
# train_datagen = ImageDataGenerator(
#         shear_range=0.1,
#         zoom_range=0.1,
#         horizontal_flip=True,
#         rotation_range=10.,
#         fill_mode='reflect',
#         width_shift_range = 0.1, 
#         height_shift_range = 0.1)
# train_datagen.flow(get_dataset(train_filenames, augment=False, shuffle=False, repeat=True, labeled=True))

In [31]:
# from sklearn.model_selection import train_test_split
# train_file, valid_file = train_test_split(train_filenames, test_size=0.5, random_state=0)

In [32]:
# training_dataset = get_dataset(train_file, augment=True, shuffle=True, repeat=True, labeled=True)
# valid_dataset = get_dataset(valid_file, augment=True, shuffle=False, repeat=True, labeled=False, batch_size=32*5)

In [33]:
# TTA = 10
# oof_pred = []
# test_dataset = get_dataset(test_filenames, augment=True, repeat=True, batch_size=BATCH_SIZE*4)
# ct_valid = count_data_items(test_filenames)
# STEPS = TTA * ct_valid/32/4/REPLICAS
# pred = model.predict(test_dataset,steps=STEPS,verbose=1)       
# preds = pred[:TTA*ct_valid,]

In [34]:
# preds.shape

In [35]:
# oof_pred.append( np.mean(preds.reshape((ct_valid,TTA),order='F'),axis=1) ) 

In [36]:
np.zeros((count_data_items(test_filenames),1))

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [37]:
len(sample_sub)

10982