# Kaggle Neuroblastoma Detection
## Lachlan Dryburgh 2021

TensorFlow implementation of a U-Net image segmentation convolutional neural network.  Trained to label neurons, astrocytes and neuroblastoma cells in microscope images.

## Imports and Defines

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import os


# Image size in pixels and the number of cell classes (mask channels).
IMG_HEIGHT, IMG_WIDTH = 520, 704
NUM_CLASS = 3

# Seed reused wherever the notebook needs reproducible randomness.
SEED = 351

# Cell-type name -> class index; the single-letter keys are short aliases
# for the same three classes.
CELL = dict(shsy5y=0, astro=1, cort=2, s=0, a=1, c=2)



In [2]:
def set_strategy():
    """Choose a ``tf.distribute`` strategy for the available hardware.

    A TPU is tried first.  If the TPU resolver raises ``ValueError`` the
    logical GPUs are inspected instead: more than one GPU gets a
    ``MirroredStrategy``; a single GPU or no GPU uses TensorFlow's current
    default strategy.

    Returns:
        The selected distribution strategy.
    """
    try:
        # TPU detection
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    except ValueError:
        tpu = None

    if tpu:
        strategy = tf.distribute.TPUStrategy(tpu)
        print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
    else:
        gpu_names = [
            gpu.name
            for gpu in tf.config.experimental.list_logical_devices("GPU")
        ]
        if len(gpu_names) > 1:
            strategy = tf.distribute.MirroredStrategy(gpu_names)
            print('Running on multiple GPUs ', gpu_names)
        else:
            # default strategy that works on CPU and single GPU
            strategy = tf.distribute.get_strategy()
            if gpu_names:
                print('Running on single GPU ', gpu_names[0])
            else:
                print('Running on CPU')

    print("Number of accelerators: ", strategy.num_replicas_in_sync)

    return strategy

strategy = set_strategy()

## Importing Images

In [3]:
# Input locations of the Sartorius cell instance segmentation data.
train_img = "../input/sartorius-cell-instance-segmentation/train"
train_csv = "../input/sartorius-cell-instance-segmentation/train.csv"
test_img = "../input/sartorius-cell-instance-segmentation/test"
# Trailing slash matters: file names are appended to this string directly.
semi_supervised = "../input/sartorius-cell-instance-segmentation/train_semi_supervised/"

# Annotation table; later cells read its 'id', 'cell_type' and 'annotation'
# columns.
df = pd.read_csv(train_csv)
df.head()


In [4]:
# Distinct image ids in the annotation table, and how many there are.
ids = df['id'].unique()
len(ids)

In [5]:
# Number of annotation rows per cell type.
df.groupby('cell_type').size()

## Image pixel annotation mask 

In [7]:
def rle_decode(mask_rle, shape, color=1):
    """Decode a run-length-encoded string into a 2-D mask.

    Args:
        mask_rle: Whitespace-separated ``start length`` pairs.  Starts are
            1-based positions in the flattened image.
        shape: ``(height, width)`` of the array to return.
        color: Value written into the masked pixels.

    Returns:
        A ``float32`` array of ``shape`` holding ``color`` on the mask and
        0 on the background.
    """
    tokens = mask_rle.split()
    # Even positions are run starts (shifted to 0-based), odd ones lengths.
    starts = np.asarray(tokens[0::2], dtype=int) - 1
    ends = starts + np.asarray(tokens[1::2], dtype=int)

    flat = np.zeros(shape[0] * shape[1], dtype=np.float32)
    for begin, stop in zip(starts, ends):
        flat[begin:stop] = color
    return flat.reshape(shape)

def generate_mask(id, df, shape):
    """Build the per-class annotation mask for one image.

    Args:
        id: Image id whose rows are selected from ``df['id']``.
        df: Annotation table with 'id', 'cell_type' and 'annotation'
            columns.
        shape: ``(height, width, channels)`` of the mask to return.

    Returns:
        A ``uint8`` array of ``shape``.  Channel ``CELL[cell_type]`` is 1
        wherever any annotation of that cell type covers the pixel, else 0.
        An id with no rows yields an all-zero mask.
    """
    height, width = shape[0], shape[1]
    mask = np.zeros((height, width, shape[2]), dtype=np.uint8)

    rows = df[df['id'] == id]
    for cell_type, annotation in zip(rows['cell_type'], rows['annotation']):
        channel = CELL[cell_type]
        # Decode at the requested size rather than the global image size so
        # the decoded plane always matches the mask being filled.
        decoded = rle_decode(annotation, (height, width))
        # Union of 0/1 planes: same result as add-then-clip, without
        # copying the whole mask for every annotation.
        mask[:, :, channel] |= decoded.astype(np.uint8)

    return mask

In [8]:
import random

# Seed Python's RNG so the random sample picked below is reproducible.
random.seed(SEED)

In [9]:
# Pick one image id at random and show the image next to its mask.
random_id = random.choice(ids)

sample_path = f"{train_img}/{random_id}.png"

im = plt.imread(sample_path)
m = generate_mask(random_id, df, (IMG_HEIGHT, IMG_WIDTH, NUM_CLASS))

print(random_id)

figure, ax = plt.subplots(1,2,figsize=(15,7))
plt.suptitle(random_id,fontweight="bold", size=20)
ax[0].imshow(im, cmap = 'seismic')
# The mask is uint8; cast to float32 for display (one class per channel).
ax[1].imshow(np.array(m, dtype=np.float32))

In [10]:
# Give the sample image an explicit single-channel last axis.
im = im.reshape((IMG_HEIGHT, IMG_WIDTH, 1))

np.shape(im)

In [11]:
# Fixed example image ids, one per cell type.
shsy5y = '1c4f14cce8ee'
astro = '129f894abe35'
cort = '95de75855f80'

s_path = f"{train_img}/{shsy5y}.png"
a_path = f"{train_img}/{astro}.png"
c_path = f"{train_img}/{cort}.png"

s_im = plt.imread(s_path)
a_im = plt.imread(a_path)
c_im = plt.imread(c_path)

figure, ax = plt.subplots(2,3,figsize=(15,9))
plt.suptitle("Images and Masks",fontweight="bold", size=20)

# One column per cell type: image on the top row, its mask underneath.
mask_shape = (IMG_HEIGHT, IMG_WIDTH, NUM_CLASS)
examples = (
    ("SH-SY5Y  - ", shsy5y, s_im),
    ("Astrocyte - ", astro, a_im),
    ("Cortical Neuron  - ", cort, c_im),
)
for col, (label, sample_id, image) in enumerate(examples):
    ax[0, col].imshow(image, cmap='seismic')
    ax[0, col].set_title(f"{label}{sample_id}")
    ax[1, col].imshow(np.array(generate_mask(sample_id, df, mask_shape), dtype=np.float32))
    ax[1, col].set_title(f"{sample_id} annotation mask")

# Plain loop: the calls are made for their side effect only.
for axi in ax.ravel():
    axi.set_axis_off()
figure.tight_layout()
figure.show()


## Split the data into training and validation sets

## Generate Training dataset

In [13]:
def load_images(ids, folder, shape):
    """Read ``{folder}/{id}.png`` for every id into one array.

    Args:
        ids: Sequence of image ids (file names without the extension).
        folder: Directory that contains the PNG files.
        shape: Sequence whose first two entries are the image height and
            width.

    Returns:
        Array of shape ``(len(ids), height, width, 1)`` with NumPy's default
        float dtype; entry ``i`` is the image for ``ids[i]``.
    """
    batch = np.zeros((len(ids), shape[0], shape[1], 1))

    for slot, image_id in enumerate(ids):
        pixels = plt.imread(f"{folder}/{image_id}.png")
        # Add the trailing channel axis before storing.
        batch[slot] = pixels.reshape((shape[0], shape[1], 1))

    return batch

def load_class(ids, df):
    """One-hot encode the cell type of each image.

    The cell type is taken from the first row of ``df`` whose 'id' matches.

    Args:
        ids: Sequence of image ids.
        df: Annotation table with 'id' and 'cell_type' columns.

    Returns:
        Array of shape ``(len(ids), NUM_CLASS)`` with a single 1 per row at
        column ``CELL[cell_type]``.
    """
    one_hot = np.zeros((len(ids), NUM_CLASS))

    for row, image_id in enumerate(ids):
        first_match = df[df['id'] == image_id].iloc[0]
        one_hot[row, CELL[first_match["cell_type"]]] = 1

    return one_hot
    
    
def load_class2(ids, df):
    """Return the integer class index of each image as a plain list.

    The cell type is taken from the first row of ``df`` whose 'id' matches
    and mapped through ``CELL``.

    Args:
        ids: Iterable of image ids.
        df: Annotation table with 'id' and 'cell_type' columns.

    Returns:
        List of class indices, in the order of ``ids``.
    """
    return [
        CELL[df[df['id'] == image_id].iloc[0]["cell_type"]]
        for image_id in ids
    ]
    
def load_masks(ids, df, shape):
    """Stack the annotation mask of every image into one array.

    Args:
        ids: Sequence of image ids.
        df: Annotation table passed through to ``generate_mask``.
        shape: ``(height, width, channels)`` of a single mask.

    Returns:
        Array of shape ``(len(ids), height, width, channels)`` with NumPy's
        default float dtype; entry ``i`` is the mask for ``ids[i]``.
    """
    stack = np.zeros((len(ids), shape[0], shape[1], shape[2]))

    for slot, image_id in enumerate(ids):
        stack[slot] = generate_mask(image_id, df, shape)

    return stack

In [14]:
from os import walk

# File names directly inside the semi-supervised folder: only the first
# entry yielded by walk() is used, and the default gives an empty list when
# walk() yields nothing.
filenames = next(walk(semi_supervised), (None, None, []))[2] 

In [15]:
import shutil


# Create the working folder and one sub-folder per class letter.  Only an
# already-existing directory is tolerated; any other failure (missing
# parent, permissions, ...) is allowed to surface instead of being reported
# as "Already exists".
for class_dir in (
    "../semi_supervised",
    "../semi_supervised/a",
    "../semi_supervised/c",
    "../semi_supervised/s",
):
    try:
        os.mkdir(class_dir)
    except FileExistsError:
        print("Already exists")

# Copy each file into the sub-folder named after the first character of its
# file name.
for f in filenames:
    shutil.copyfile(f"{semi_supervised}{f}", f"../semi_supervised/{f[0]}/{f}")

In [16]:
# Training and validation splits of the class-sorted folder.  Both calls use
# the same seed and validation fraction; only `subset` differs.
# NOTE(review): labels are inferred from the sub-folder names, so the label
# order may not match the CELL mapping - confirm before mixing the two.
class_training, class_validation = (
    tf.keras.utils.image_dataset_from_directory(
        "../semi_supervised/",
        color_mode='grayscale',
        validation_split=0.2,
        subset=subset,
        seed=SEED,
        image_size=(IMG_HEIGHT, IMG_WIDTH),
        batch_size=32,
    )
    for subset in ("training", "validation")
)


In [17]:
# Decode every annotation mask up front.  load_masks allocates with
# np.zeros' default dtype, so this is a float array of shape
# (len(ids), IMG_HEIGHT, IMG_WIDTH, NUM_CLASS) held entirely in memory.
mask_array = load_masks(ids, df, (IMG_HEIGHT,IMG_WIDTH,NUM_CLASS))


print(mask_array.shape)


In [29]:
# Dataset over the raw training image folder.
# NOTE(review): earlier cells read images as f"{train_img}/<id>.png", i.e.
# directly inside train_img with no class sub-folders, so labels='inferred'
# has nothing to infer from - confirm this cell is still needed.
# NOTE(review): no color_mode is passed here while the model input has a
# single channel; the seed and image size are also literals rather than
# SEED / IMG_HEIGHT / IMG_WIDTH - confirm.
train_ds = tf.keras.utils.image_dataset_from_directory(
  train_img,
  labels='inferred',
  seed=123,
  image_size=(520, 704),
  batch_size=16)

## Define  model
We are actually defining 2 models.

The downstack of the U-Net is used as an image classifier so that it can be trained where we only have labels for the entire image rather than for individual pixels.  This will allow transfer learning.

The Downstack feeds back into the upstack for our unet pixel classifier.


In [17]:
def _conv_block(tensor, filters):
    """Apply two 3x3 ReLU convolutions with 20% dropout between them."""
    tensor = layers.Conv2D(filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(tensor)
    tensor = layers.Dropout(0.2)(tensor)
    return layers.Conv2D(filters, (3, 3), activation='relu', kernel_initializer='he_normal', padding='same')(tensor)


# Everything that creates variables (all layers, both models and their
# compile() calls) is kept inside the strategy scope so it is built under
# the selected distribution strategy.
with strategy.scope():
    in1 = keras.Input(shape=(IMG_HEIGHT, IMG_WIDTH,1))

    # Downstack (encoder): three conv blocks, each followed by 2x2 pooling,
    # then a bottleneck block.
    conv1 = _conv_block(in1, 16)
    pool1 = layers.MaxPooling2D((2, 2))(conv1)

    conv2 = _conv_block(pool1, 32)
    pool2 = layers.MaxPooling2D((2, 2))(conv2)

    conv3 = _conv_block(pool2, 32)
    pool3 = layers.MaxPooling2D((2, 2))(conv3)

    conv4 = _conv_block(pool3, 64)

    # Upstack (decoder): upsample, concatenate the matching encoder output
    # (skip connection), then another conv block.
    up1 = layers.concatenate([layers.UpSampling2D((2, 2))(conv4), conv3], axis=-1)
    conv5 = _conv_block(up1, 32)

    up2 = layers.concatenate([layers.UpSampling2D((2, 2))(conv5), conv2], axis=-1)
    conv6 = _conv_block(up2, 32)

    up3 = layers.concatenate([layers.UpSampling2D((2, 2))(conv6), conv1], axis=-1)
    conv7 = _conv_block(up3, 16)

    # Per-pixel head: one independent sigmoid per class channel.
    segmentation = layers.Conv2D(NUM_CLASS, (1, 1), activation='sigmoid', name='seg')(conv7)

    # Whole-image classifier head on the bottleneck; it outputs raw logits.
    class_box = layers.Flatten()(conv4)
    class_box = layers.Dense(128, activation = 'relu')(class_box)
    class_box = layers.Dense(NUM_CLASS)(class_box)

    class_model = keras.Model(inputs=[in1], outputs=[class_box])

    model = keras.Model(inputs=[in1], outputs=[segmentation])

    # Integer labels + logits -> sparse categorical cross-entropy.
    class_model.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['acc'])

    # The masks are multi-hot (a background pixel is all zeros) and the head
    # is sigmoid, so each channel is an independent binary problem.
    # Categorical cross-entropy expects one-hot targets and gives background
    # pixels zero loss.
    model.compile(optimizer="adam", loss = {'seg': 'binary_crossentropy'}, metrics={'seg': ['acc']})

## Plot the class-box model


In [18]:
# Write a diagram of the classifier (downstack) model, including tensor
# shapes, to downstack.png.
keras.utils.plot_model(class_model, "downstack.png", show_shapes=True)

## Plot the full model

In [73]:
# Write a diagram of the full segmentation model, including tensor shapes,
# to full_model.png.
keras.utils.plot_model(model, "full_model.png", show_shapes=True)

## Train the downstack

In [19]:
AUTOTUNE = tf.data.AUTOTUNE

# Input pipelines for the classifier: cache, shuffle (training only) and
# prefetch.  Note that this rebinds `train_ds`, replacing the dataset the
# earlier cell built from train_img.
train_ds = class_training.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
val_ds = class_validation.cache().prefetch(buffer_size=AUTOTUNE)

In [20]:
# Train the whole-image classifier; `his` keeps the per-epoch history that
# the next two cells plot.
his = class_model.fit(train_ds,
                epochs=20,
                validation_data=val_ds
)

In [21]:
# Classifier loss per epoch, training vs. validation.
plt.plot(his.history['loss'])
plt.plot(his.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [22]:
# Classifier accuracy per epoch, training vs. validation.
plt.plot(his.history['acc'])
plt.plot(his.history['val_acc'])
# This figure shows accuracy, so it is titled accordingly.
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Settings for the segmentation training cells below.
BATCH_SIZE = 16
BUFFER_SIZE = 16  # shuffle buffer size used when building t_ds

AUTO = tf.data.AUTOTUNE

EPOCHS = 1

In [None]:
# Shuffled, batched, endlessly repeating pipelines for model.fit.
# NOTE(review): the most recent visible binding of `train_ds` comes from
# image_dataset_from_directory(..., batch_size=32), which is already
# batched and yields whole-image labels rather than masks; .batch() here
# would batch it a second time - confirm which dataset is intended.
t_ds = (
    train_ds
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .repeat()
    .prefetch(AUTO))

# NOTE(review): `valid_ds` is not defined in the visible part of the
# notebook - confirm where it is created.
v_ds = (
    valid_ds
    .batch(BATCH_SIZE)
    .repeat()
    .prefetch(AUTO))

In [None]:
# The datasets repeat forever, so the epoch length is set explicitly.
# NOTE(review): `train_ids` / `valid_ids` are not defined in the visible
# part of the notebook (the train/validation split section is empty) -
# confirm where they come from.
STEPS_PER_EPOCH = len(train_ids) // BATCH_SIZE


VALIDATION_STEPS = len(valid_ids) // BATCH_SIZE

model_history = model.fit(t_ds, epochs=EPOCHS,
                          steps_per_epoch=STEPS_PER_EPOCH,
                          validation_steps=VALIDATION_STEPS,
                          validation_data=v_ds)

In [None]:
# Show one randomly chosen validation image next to its annotation mask.
v = random.choice(valid_ids)

v_path = f"../input/sartorius-cell-instance-segmentation/train/{v}.png"

# Three axes are created but only the first two are drawn on in this cell.
figure, ax = plt.subplots(1,3,figsize=(15,9))
plt.suptitle("Images and Masks",fontweight="bold", size=20)

v_im = plt.imread(v_path)
ax[0].imshow(v_im,cmap = 'seismic')
ax[1].imshow(np.array(generate_mask(v, df, (IMG_HEIGHT, IMG_WIDTH, NUM_CLASS)), dtype=np.float32))


In [None]:
# Rebinds `v` (previously a single validation id) to the validation dataset.
v = valid_ds



In [None]:
# Training (red line) and validation (blue dots) loss of the segmentation
# model per epoch.
loss = model_history.history['loss']
val_loss = model_history.history['val_loss']
plt.figure()
plt.plot(model_history.epoch, loss, 'r', label='Training loss')
plt.plot(model_history.epoch, val_loss, 'bo', label='Validation loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss Value')
# The y axis is fixed to [0, 1]; larger losses fall outside the plot.
plt.ylim([0, 1])
plt.legend()
plt.show()

In [None]:
# Full path of every PNG file in the test image folder.
test_paths = [
    os.path.join(test_img, file_name)
    for file_name in os.listdir(test_img)
    if file_name.endswith('.png')
]

In [None]:
def rle_encode(img):
    """Run-length encode a binary mask.

    Args:
        img: NumPy array, 1 - mask, 0 - background.

    Returns:
        Space-separated ``start length`` pairs as a string, where starts are
        1-based positions in the flattened array.  An all-background mask
        gives an empty string.

    ref: https://www.kaggle.com/dragonzhang/positive-score-with-detectron-3-3-inference
    """
    # Zero-pad both ends so runs touching either edge still produce a
    # change point.
    padded = np.concatenate(([0], img.flatten(), [0]))
    edges = np.where(padded[1:] != padded[:-1])[0] + 1
    # Edges alternate run start / run end; turn each end into a length.
    edges[1::2] -= edges[::2]
    return ' '.join(map(str, edges))