# Imports

In [1]:
import openslide as op
from PIL import Image
import numpy as np
import random
import glob
import os
import util
import h5py
import cv2
from skimage import measure
from matplotlib import pyplot as plt
from scipy import misc, ndimage
from skimage import morphology
from skimage import color
from skimage import io

Using TensorFlow backend.


# Functions definition

In [2]:
def initArrays(files):
    extracting = {}
    none_extracting = {}
    n_files = len(files)
    for key, val in img_classes.items():
        data[key] = {}
        offset[key] = 0
        if(val==None):
            none_extracting[key]=0
        elif(val<=0):
            extracting[key]=0
        else:
            data[key + '_imgs'] = np.zeros((n_files*val,patchSize,patchSize,3)).astype(np.uint8)
            data[key + '_masks'] = np.zeros((n_files*val,patchSize,patchSize, 1)).astype(np.uint8)
    for file in files:
        print("Working on " + file)
        name = os.path.splitext(os.path.basename(file))[0]
        maskname = glob.glob(os.path.dirname(file)+"/"+name+maskPattern)[0]
        im = op.OpenSlide(file)
        imload = im.read_region((0,0), 1, im.level_dimensions[1])
        mask = Image.open(maskname)
        if(imload.size != mask.size):
           mask = mask.resize(imload.size)
        imArray = np.array(imload)
        maskArray = np.array(mask)
        maskArray= addBackground(imArray, maskArray)
        del imArray
        del im
        del imload
        for key, val in extracting.items():
            nb_patches = (0 - img_classes[key]) + 1
            maskClass = np.array(maskArray)
            np.putmask(maskClass,maskClass!=classes[key],0)
            maskClass = measure.label(maskClass)
            nb_extract = 0
            try:
                if(max_classes[key]<maskClass.max()):
                    print("Found ", maskClass.max(), " components for ", key, " but max number is ", max_classes[key])
                else:
                    print("Found ", maskClass.max(), " components for ", key)
                    max_classes[key] = maskClass.max()
            except KeyError:
                print("Found ", maskClass.max(), " components for ", key)
                max_classes[key] = maskClass.max()   
            extracting[key] += max_classes[key] * nb_patches
            print("New number of extraction for " , key, " : ", extracting[key])
        max_extraction = 0
        for key, val in extracting.items():
            print("Size for ", key, " is ", val)
            data[key + '_imgs'] = np.zeros((val,patchSize,patchSize,3)).astype(np.uint8)
            data[key + '_masks'] = np.zeros((val,patchSize,patchSize, 1)).astype(np.uint8)
            if(val>max_extraction):
                max_extraction=val
        for key, val in none_extracting.items():
            img_classes[key]=int(max_extraction/n_files)
            print("Number of extraction per file for ", key, " is ", img_classes[key])
            data[key + '_imgs'] = np.zeros((max_extraction,patchSize,patchSize,3)).astype(np.uint8)
            data[key + '_masks'] = np.zeros((max_extraction,patchSize,patchSize, 1)).astype(np.uint8)

In [3]:
def addBackground(imArray, maskArray):
    """
        Find the background on the array and put the value 2 on the mask
    """
    im_in = cv2.cvtColor(imArray,cv2.COLOR_BGR2GRAY)
    # Threshold.
    # Set values equal to or above 220 to 0.
    # Set values below 220 to 255.
    th, im_th = cv2.threshold(im_in, 220, 255, cv2.THRESH_BINARY_INV);
    # Copy the thresholded image.
    im_floodfill = im_th.copy()
    # Mask used to flood filling.
    # Notice the size needs to be 2 pixels than the image.
    h, w = im_th.shape[:2]
    mask = np.zeros((h+2, w+2), np.uint8)
    # Floodfill from point (0, 0)
    cv2.floodFill(im_floodfill, mask, (0,0), 255);
    # Invert floodfilled image
    im_floodfill_inv = cv2.bitwise_not(im_floodfill)
    # Combine the two images to get the foreground.
    im_out = im_th | im_floodfill_inv
    # Remove the small parts
    kernel = np.ones((int(maskArray.shape[0]/115),int(maskArray.shape[1]/115)),np.uint8)
    opening = cv2.morphologyEx(im_out, cv2.MORPH_OPEN, kernel)
    # Invert the mask
    np.putmask(opening,opening==0,2)
    np.putmask(opening,opening==255,0)
    opening += maskArray
    return opening

In [4]:
def extractPatches(output,filename,maskname, save=True):
    """
        Extract the patches for the given file and maskname
    """
    global j
    global data
    global offset
    # Opening the files
    im = op.OpenSlide(filename)
    imload = im.read_region((0,0), 1, im.level_dimensions[1])
    mask = Image.open(maskname)
    if(imload.size != mask.size):
       mask = mask.resize(imload.size)
    imArray = np.array(imload)
    maskArray = np.array(mask)
    halfPatch = patchSize//2
 
    #Preprocess
    maskArray_back = addBackground(imArray, maskArray)
    imArray = np.lib.pad(imArray, ((halfPatch, halfPatch), (halfPatch, halfPatch),(0,0)), 'reflect')
    maskArrayPad = np.lib.pad(maskArray, ((halfPatch, halfPatch), (halfPatch, halfPatch)), 'reflect')
    np.putmask(maskArrayPad, maskArrayPad==1, 255)
    # Extraction
    for key, val in classes.items():
        print("Extracting patches for ", key)
        if(img_classes[key]>0):
            indices = np.where(maskArray_back==val)
            sample = random.sample(range(len(indices[0])), img_classes[key])
            maskClass = np.array(maskArrayPad) #TODO : remove this ?  
            for i in sample:
                x=indices[0][i]
                y=indices[1][i]
                x2 = x+patchSize
                y2 = y+patchSize
                croppedIm = imArray[x:x2,y:y2,0:3]
                croppedMask = maskClass[x:x2,y:y2]              
                # create the images if needed
                if(save):
                    imageName = output +  "/image_" + str(j) + "_" + key + ".png"
                    imageNameMask = output  + "/image_" + str(j) + "_" + key +"_mask.png"
                    misc.imsave(imageName,croppedIm)
                    misc.imsave(imageNameMask,croppedMask)
                    os.chmod(imageName , 0o777)
                    os.chmod(imageNameMask, 0o777)
                # concatenate to the arrays
                data[key + '_imgs'][offset[key],:,:,:]=croppedIm
                data[key + '_masks'][offset[key],:,:,:]=np.expand_dims(croppedMask,-1)
                offset[key] += 1
                j+=1
                if(j%100==0):
                    print("",j," patches extracted")
        else:
            nb_patches = (0 - img_classes[key]) + 1
            maskClass = np.array(maskArray_back)
            np.putmask(maskClass,maskClass!=val,0)
            maskClass = measure.label(maskClass)
            print("Found ", maskClass.max(), " components for ", key)
            print("Extracting ",  max_classes[key], "")
            patches = np.zeros((maskClass.max() * nb_patches,patchSize,patchSize,3))
            mask_patches = np.zeros((maskClass.max() * nb_patches,patchSize,patchSize,1))
            in_patches = 0
            for i in (range(1,max_classes[key]+1)):
                indices = np.where(maskClass==i)
                x_center = int((indices[0].min() + indices[0].max()) / 2) + patchSize/2
                y_center = int((indices[1].min() + indices[1].max()) / 2) + patchSize/2
                division = 2
                for h in range(0,nb_patches):
                    x = x_center
                    y = y_center
                    if(h==0):
                        None
                    elif(h==1):
                        y -= patchSize/division
                    elif(h==2):
                        x -= patchSize/division
                    elif(h==3):
                        y += patchSize/division
                    elif(h==4):
                        x += patchSize/division
                    elif(h==5):
                        x -= patchSize/division
                        y -= patchSize/division
                    elif(h==6):
                        x -= patchSize/division
                        y += patchSize/division
                    elif(h==7):
                        x += patchSize/division
                        y += patchSize/division
                    elif(h==8):
                        x += patchSize/division
                        y -= patchSize/division
                    else:
                        x += random.randint(-patchSize/division,patchSize/division)     
                        y += random.randint(-patchSize/division,patchSize/division)
                    x1 = int(x-patchSize/2)
                    x2 = int(x+patchSize/2)
                    y1 = int(y-patchSize/2)
                    y2 = int(y+patchSize/2)
                    croppedIm = imArray[x1:x2,y1:y2,0:3]
                    croppedMask = maskArrayPad[x1:x2,y1:y2]
                    if(save):
                        imageName = output +  "/image_" + str(j) + "_" + key + ".png"
                        imageNameMask = output  + "/image_" + str(j) + "_" + key +"_mask.png"
                        misc.imsave(imageName,croppedIm)
                        misc.imsave(imageNameMask,croppedMask)
                        os.chmod(imageName , 0o777)
                        os.chmod(imageNameMask, 0o777)
                    patches[in_patches,:,:,:]=croppedIm
                    mask_patches[in_patches,:,:,:]=np.expand_dims(croppedMask,-1)
                    in_patches+=1
                    j+=1
                    if(j%100==0):
                        print("",j," patches extracted")

In [5]:
def extractFiles(files, outputFolder, save=False):
    """
        Extract all the files of a folder
    """
    global data
    global offset
    j = 0
    #initialize arrays
    initArrays(files)
    for oneFile in files:
        name = os.path.splitext(os.path.basename(oneFile))[0]
        for key, val in classes.items():
            folder = outputFolder
            if not os.path.exists(folder):
                try:
                  original_umask = os.umask(0)
                  os.makedirs(folder,0o777)
                finally:
                  os.umask(original_umask)
        print("Extracting " + name)
        maskFile = glob.glob(os.path.dirname(oneFile)+"/"+name+maskPattern)[0]
        extractPatches(outputFolder, oneFile,maskFile, save)
        print("Extraction for ", name, " finished")

In [6]:
def normalizeAndSave(outputFile):
    """
        Normalize the data and save it in a file
    """
    # TODO : GROS FDP PASSE TOUT EN FLOAT 32
    global data
    all_data = np.zeros((j,patchSize,patchSize,3))
    i = 0
    # concatenate all the classes in one array to get the mean and std
    for key, val in img_classes.items():
        data[key+"_imgs"] = data[key+"_imgs"].astype(np.float32)/255
        data[key+"_masks"] = data[key+"_masks"].astype(np.float32)
        n = data[key+"_imgs"].shape[0] + i
        print("Writing from ", i, " to ", n, " for ", key)
        all_data[i:n,:,:,:]= data[key+"_imgs"]
        i = n
    mean = all_data.mean()
    std = all_data.std()
    stats = np.zeros(2)
    stats[0] = mean
    stats[1] = std
    # Normalize the data
    for key, val in img_classes.items():
        data[key+"_imgs"] -= mean
        data[key+"_imgs"] /= std
    print("Mean : ", stats[0])
    print("Std : ", stats[1])
    # Create the file
    f = h5py.File(outputFile,"w")
    f.create_dataset("stats", data=stats)
    for key, val in classes.items():
        f.create_dataset(key+"_imgs", data=data[key+"_imgs"])
        f.create_dataset(key+"_masks", data=data[key+"_masks"])
    f.close()

# Configuration

In [7]:
patchSize = 256 # Size of the patches
filenames_train = ['/root/workspace/data/SVS_train/*.svs'] # Files we will extract for training. You can use pattern suchs as *.svs
outputFolder_train = '/root/workspace/data/new_extract/' # Output folder for training
maskPattern= '*.png' # Pattern for the maskfilesoutputFile,
classes = {'neg': 0, 'back' : 2, 'pos' : 1} # Classes and their value
img_classes = {'neg' : None ,'back':5, 'pos' : -4} # Number of patches to extract for each class
max_classes = {'pos': 100}
save = True # if true the images will be save

# Variable initialization

In [8]:
j=0 # Counting variable
data = {} # Dict with all data
offset = {} # Dict with the offset per class

# Code Execution

In [9]:
# Get the files for training
files_train = []
for filename in filenames_train:
    files_train.extend(glob.glob(filename))
print(files_train)
files_train=files_train[0:1]

['/root/workspace/data/SVS_train/IFTA_14_02.svs', '/root/workspace/data/SVS_train/IFTA_15_02.svs', '/root/workspace/data/SVS_train/IFTA_11_02.svs', '/root/workspace/data/SVS_train/IFTA_13_02.svs', '/root/workspace/data/SVS_train/IFTA_10_02.svs']


In [10]:
# Training extraction
extractFiles(files_train,outputFolder_train, save)

Working on /root/workspace/data/SVS_train/IFTA_14_02.svs
Found  168  components for  pos  but max number is  100
New number of extraction for  pos  :  500
Size for  pos  is  500
Number of extraction per file for  neg  is  500
Extracting IFTA_14_02
Found  168  components for  pos
Extracting  100 
 100  patches extracted
 200  patches extracted
 300  patches extracted
 400  patches extracted
 500  patches extracted
 600  patches extracted
 700  patches extracted
 800  patches extracted
 900  patches extracted
 1000  patches extracted
Extraction for  IFTA_14_02  finished


In [11]:
# Normalize and save the data
normalizeAndSave(outputFolder_train+"matrice_train.h5")

Writing from  0  to  5  for  back
Writing from  5  to  505  for  pos
Writing from  505  to  1005  for  neg
Mean :  0.348024833708
Std :  0.368327307502
