In [10]:
#NOTE: pathlib must be installed on the DCS computer; it may not be available there by default
#References: largely modified from https://www.kaggle.com/stkbailey/teaching-notebook-for-total-imaging-newbies
#This script uses k-means clustering of colours, dropping the background colour cluster and merging the non-backgrounds to get a mask
#We allow the option to include watershed segmentation
#We allow the option to scale image channels


#IMPORTING MODULES
%matplotlib inline
import pandas as pd
from skimage.filters import threshold_otsu
from scipy import ndimage
import pathlib
import imageio
import numpy as np
import matplotlib.pyplot as plt
from skimage.color import rgb2gray
from sklearn.cluster import KMeans
import skimage
from scipy import ndimage as ndi
from skimage.morphology import watershed
from skimage.feature import peak_local_max




#VARIABLES
#Module-level settings read by the functions and the main loop below
k = 3                 #Number of colour clusters passed to k-means
ws = False            #Whether RLE_mask runs watershed segmentation on the labelled mask
scale = False          #Whether image channels are rescaled before clustering
IMG_CHANNELS = 3      #Number of leading channels rescaled by scale_img_channels
MIN_SIZE = 17         #Labelled objects with fewer pixels than this are discarded



def kmeanscluster(image,k,merge=False):
    """Cluster the pixel colours of an image with k-means.

    Only the first three channels are clustered; any further channel is
    dropped before fitting.

    image: array indexed as (row, column, channel)
    k: number of clusters to fit
    merge: when True, collapse every cluster except the most populous one
           into a single foreground mask

    returns: a (rows, columns) array of 0/1 when merging (0 marks the most
             populous cluster), otherwise the flat per-pixel cluster labels
    """
    rgb = image[:, :, :3]
    n_rows, n_cols = rgb.shape[0], rgb.shape[1]

    # One row per pixel, one column per colour channel
    pixels = rgb.reshape((n_rows * n_cols, 3))
    cluster_ids = KMeans(n_clusters = k).fit(pixels).labels_

    # Same comparison against True as before, expressed as a guard clause
    if not (merge == True):
        return cluster_ids

    # The cluster holding the most pixels is treated as background
    background = np.argmax(np.bincount(cluster_ids))
    cluster_grid = cluster_ids.reshape(n_rows, n_cols)
    return np.where(cluster_grid != background, 1, 0)

    
def scale_img_channels(img, channels=None):
    """Linearly stretch the leading channels of an image to the range 0..255.

    Each channel is shifted so its minimum becomes 0 and then multiplied so
    its maximum becomes 255. A channel whose values are all equal ends up as
    all zeros. Channels beyond the first `channels` are returned unchanged.

    img: array indexed as (row, column, channel)
    channels: how many leading channels to rescale; defaults to the
              module-level IMG_CHANNELS setting

    returns: a new array with the same shape and dtype as img; img itself is
             left untouched
    """
    if channels is None:
        channels = IMG_CHANNELS

    # Work on a copy so the caller's array (or a view of it) is not overwritten
    scaled = img.copy()
    for i in range(channels):
        # Float arithmetic avoids wrap-around in narrow integer dtypes
        channel = img[:, :, i].astype(np.float64)
        channel -= channel.min()
        peak = channel.max()
        if peak > 0:
            # Round rather than truncate so float error cannot turn 255 into 254
            channel = np.rint(channel * (255 / peak))
        scaled[:, :, i] = channel
    return scaled


def watershed_segmentation_dist(labels,display=False):
    """Split touching objects with a distance-transform watershed.

    The distance of every labelled pixel to the nearest background pixel is
    computed, its local maxima are used as markers, and the watershed is run
    on the negated distance restricted to the labelled area.

    labels: array of integers, 0 for background and i for the i-th object
    display: when True, plot the input labels, the negated distance map and
             the separated labels side by side

    returns: the new label array produced by the watershed
    """
    distance = ndi.distance_transform_edt(labels)
    local_maxi = peak_local_max(distance, indices=False, labels=labels)
    markers = ndi.label(local_maxi)[0]
    # Keep the input intact so the "before" panel can still show it
    separated = watershed(-distance, markers, mask=labels)
    if display:
        # NOTE(review): 'box-forced' is only accepted by older Matplotlib
        # releases - confirm against the installed version
        fig, axes = plt.subplots(ncols=3, figsize=(9, 3), sharex=True, sharey=True,
                                 subplot_kw={'adjustable': 'box-forced'})
        ax = axes.ravel()
        ax[0].imshow(labels, cmap=plt.cm.gray, interpolation='nearest')
        ax[0].set_title('Overlapping objects')
        ax[1].imshow(-distance, cmap=plt.cm.gray, interpolation='nearest')
        ax[1].set_title('Distances')
        # nipy_spectral is the current name of the former "spectral" colormap
        ax[2].imshow(separated, cmap=plt.cm.nipy_spectral, interpolation='nearest')
        ax[2].set_title('Separated objects')
        for a in ax:
            a.set_axis_off()
        fig.tight_layout()
        plt.show()
    return separated


#Reference: https://www.kaggle.com/rakhlin/fast-run-length-encoding-python
def rle_encoding(x):
    """Run-length encode the pixels of a mask that are equal to 1.

    Pixels are numbered from 1 in column-major order (down, then right).

    x: numpy array of shape (height, width), 1 - mask, 0 - background

    returns: a string of space-separated "start length" pairs, or an empty
             string when no pixel equals 1
    """
    # Transposing before flattening gives the down-then-right pixel order
    pixels = np.flatnonzero(x.T.flatten() == 1)
    if pixels.size == 0:
        return ""

    # A new run begins wherever consecutive mask positions are not adjacent
    breaks = np.flatnonzero(np.diff(pixels) > 1) + 1
    run_first = np.concatenate(([0], breaks))
    run_stop = np.concatenate((breaks, [pixels.size]))

    tokens = []
    for first, stop in zip(run_first, run_stop):
        tokens.append(str(pixels[first] + 1))   # 1-based start position
        tokens.append(str(stop - first))        # run length
    return " ".join(tokens)



def RLE_mask(image,image_id):
    """Segment one image and run-length encode every object found in it.

    The image is clustered by colour into a foreground mask, the mask is
    split into individual objects (optionally refined by watershed), objects
    smaller than MIN_SIZE pixels are dropped, and each remaining object is
    encoded on its own row.

    Reads the module-level settings scale, k, ws and MIN_SIZE.

    image: array indexed as (row, column, channel); channels after the
           third are ignored
    image_id: value written to the ImageID column of every returned row

    returns: DataFrame with columns ImageID and EncodedPixels, one row per
             kept object (empty when nothing is kept)
    """
    #Dropping alpha channel and (maybe) scaling other channels
    image = image[:,:,:3]
    if scale:
        image = scale_img_channels(image)

    #k-means clustering and merging to get mask
    mask = kmeanscluster(image,k,merge=True)

    # Make sure the larger portion of the mask is considered background
    if np.sum(mask==0) < np.sum(mask==1):
        mask = np.where(mask, 0, 1)

    # Deriving individual mask for each object
    labels, numlabels = ndimage.label(mask)           # Labels each component with different number
    if ws:
        labels = watershed_segmentation_dist(labels,display=False)     # Watershed
        numlabels = int(labels.max())                 # Labels run from 1 to the maximum

    # Encode straight from the per-object labels: re-labelling the merged
    # mask would join touching objects again and undo the watershed split
    rows = []
    for n in range(1, numlabels+1):
        label_mask = np.where(labels == n, 1, 0)
        if label_mask.sum() >= MIN_SIZE:              # Checks if (predicted) nuclei is big enough to count
            rows.append({'ImageID': image_id, 'EncodedPixels': rle_encoding(label_mask)})

    # Build the frame once instead of appending row by row
    return pd.DataFrame(rows, columns=["ImageID","EncodedPixels"])



# MAIN

# Glob the image files and load single image paths one at a time
training_paths = pathlib.Path('/modules/cs342/Assignment2/FullTesting/').glob('*/images/*.png')
training_sorted = sorted(training_paths)

# Per-image results are collected here and joined once at the end, which
# avoids repeatedly copying a growing dataframe
image_dfs = []
for image_path in training_sorted:
    image_id = image_path.parts[-3]                   # Name of the directory two levels above the file
    image = imageio.imread(str(image_path))
    # No scaling here: RLE_mask already applies it when scale is enabled
    im_df = RLE_mask(image,image_id)
    image_dfs.append(im_df)

# This will be our final dataframe
if image_dfs:
    df = pd.concat(image_dfs, ignore_index=True)
else:
    # No images found: still produce a file with just the header row
    df = pd.DataFrame(columns=["ImageID","EncodedPixels"])

#Submission
df.to_csv('Q1submission.csv', index=False)