# Imports

In [None]:
import openslide as op
from PIL import Image
import numpy as np
import random
import glob
import os
import util
import h5py
from scipy import misc, ndimage
from skimage import morphology
from skimage import color
from skimage import io

# Function definitions

In [None]:
def addBackground(imArray, maskArray):
    """
    Label the background pixels of a mask with the value 2.

    A pixel is treated as foreground when its green channel is below 200;
    the foreground map is then cleaned with a closing, an erosion and a
    hole fill. Every pixel left outside the cleaned foreground is set to 2
    in a copy of the mask.

    Parameters
    ----------
    imArray : array indexed as [row, col, channel]; channel 1 is read as
        the green channel.
    maskArray : 2-D label array; presumably the same height/width as
        imArray (TODO confirm against callers).

    Returns
    -------
    A copy of maskArray in which background pixels are 2. The input mask
    is not modified.
    """
    green = imArray[:,:,1]
    # NOTE: despite its name, this array is 1 on the *foreground*
    # (green < 200) and 0 on what will become the background.
    background = np.zeros(maskArray.shape)
    background[np.where(green < 200)] = 1
    # Morphological clean-up with a disk of radius 10
    slem = morphology.disk(10)
    background = morphology.closing(background, slem)
    background = morphology.erosion(background, slem)
    # ndimage.morphology is a deprecated namespace; the function is
    # available directly on scipy.ndimage.
    background = ndimage.binary_fill_holes(background)
    # Write the background label into a copy of the original mask
    backArray = np.array(maskArray)
    backArray[background == 0] = 2
    return backArray

In [None]:
def extractPatches(output,filename,maskname, save=False):
    """
    Extract randomly located patches of every class from one slide.

    The slide is read at pyramid level 1, the mask is resized to match it
    if needed, and both are reflect-padded by half a patch so that a patch
    can be centred on any pixel. For each class in the module-level
    ``classes`` dict, ``img_classes[key]`` pixels carrying that class value
    are drawn at random and a ``patchSize`` x ``patchSize`` patch centred on
    each is written into the module-level ``data`` arrays at position
    ``offset[key]``.

    Parameters
    ----------
    output : folder in which the PNG patches are written when ``save``.
    filename : path of the slide, opened with OpenSlide.
    maskname : path of the mask image, opened with PIL.
    save : when true, also write each patch and its mask as PNG files.

    Raises
    ------
    ValueError
        If the mask holds fewer pixels of a class than the number of
        patches requested for it.

    Side effects: updates the globals ``data``, ``offset`` and ``j``.
    """
    global j
    global data
    global offset
    # Opening the files; the slide handle is released as soon as the
    # region has been read into memory.
    im = op.OpenSlide(filename)
    try:
        imload = im.read_region((0,0), 1, im.level_dimensions[1])
    finally:
        im.close()
    mask = Image.open(maskname)
    if imload.size != mask.size:
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        # NOTE(review): an interpolating filter can blend label values;
        # Image.NEAREST may be more appropriate for a label mask - confirm.
        mask = mask.resize(imload.size, Image.LANCZOS)
    imArray = np.array(imload)
    maskArray = np.array(mask)
    halfPatch = patchSize//2

    # Preprocess: pad so that slicing [x:x+patchSize] in the padded arrays
    # yields a patch centred on pixel x of the unpadded arrays.
    #maskArray = addBackground(imArray, maskArray)
    imArray = np.pad(imArray, ((halfPatch, halfPatch), (halfPatch, halfPatch), (0, 0)), 'reflect')
    maskArrayPad = np.pad(maskArray, ((halfPatch, halfPatch), (halfPatch, halfPatch)), 'reflect')
    # Masks are stored divided by 255 below, so map label 1 to 255 first.
    np.putmask(maskArrayPad, maskArrayPad==1, 255)

    # Extraction
    for key, val in classes.items():
        rows, cols = np.where(maskArray==val)
        wanted = img_classes[key]
        available = len(rows)
        if available < wanted:
            raise ValueError(
                "Mask %s has only %d pixel(s) of class '%s' but %d patches were requested"
                % (maskname, available, key, wanted))
        for i in random.sample(range(available), wanted):
            x = rows[i]
            y = cols[i]
            x2 = x+patchSize
            y2 = y+patchSize
            # Drop the alpha channel returned by read_region
            croppedIm = imArray[x:x2,y:y2,0:3]
            croppedMask = maskArrayPad[x:x2,y:y2]
            # create the images if needed
            if save:
                imageName = output +  "/image_" + str(j) + ".png"
                imageNameMask = output  + "/image_" + str(j) +"_mask.png"
                # scipy.misc.imsave was removed in SciPy 1.2; PIL writes the
                # same file for uint8 arrays.
                Image.fromarray(croppedIm).save(imageName)
                Image.fromarray(croppedMask).save(imageNameMask)
                os.chmod(imageName , 0o777)
                os.chmod(imageNameMask, 0o777)
            # store the patch, scaled to [0, 1], in the per-class arrays
            data[key + '_imgs'][offset[key],:,:,:]=croppedIm.astype(np.float32)/255
            data[key + '_masks'][offset[key],:,:,:]=np.expand_dims(croppedMask,-1).astype(np.float32)/255
            offset[key] += 1
            j+=1
            if j%100==0:
                print("",j," patches extracted")

In [None]:
def extractFiles(files, outputFolder, save=False):
    """
    Extract patches from every slide in ``files``.

    Resets the global patch counter and per-class offsets, allocates the
    per-class image and mask arrays in the global ``data`` dict (one slot
    per requested patch per file), makes sure ``outputFolder`` exists and
    then calls ``extractPatches`` on each slide together with its mask.

    Parameters
    ----------
    files : list of slide paths.
    outputFolder : folder handed to ``extractPatches`` for saved patches.
    save : forwarded to ``extractPatches``.

    Raises
    ------
    FileNotFoundError
        If no file matching ``<slide name> + maskPattern`` exists next to
        a slide.
    """
    global data
    global offset
    global j
    n_files = len(files)
    # Reset the module-level counter; without the global declaration this
    # assignment would only create an unused local.
    j = 0
    # initialize arrays directly as float32 to avoid a float64 temporary
    for key, val in img_classes.items():
        data[key] = {}
        offset[key] = 0
        data[key + '_imgs'] = np.zeros((n_files*val,patchSize,patchSize,3), dtype=np.float32)
        data[key + '_masks'] = np.zeros((n_files*val,patchSize,patchSize, 1), dtype=np.float32)
    # Create the output folder once, world-writable regardless of the umask
    if not os.path.exists(outputFolder):
        original_umask = os.umask(0)
        try:
            os.makedirs(outputFolder,0o777)
        finally:
            os.umask(original_umask)
    for oneFile in files:
        name = os.path.splitext(os.path.basename(oneFile))[0]
        print("Extracting " + name)
        # Escape the literal part so special characters in the slide name
        # are not interpreted as glob syntax; sort for a deterministic pick.
        prefix = glob.escape(os.path.join(os.path.dirname(oneFile), name))
        maskFiles = sorted(glob.glob(prefix + maskPattern))
        if not maskFiles:
            raise FileNotFoundError(
                "No mask matching '%s' found for slide %s" % (name + maskPattern, oneFile))
        extractPatches(outputFolder, oneFile, maskFiles[0], save)
        print("Extraction for ", name, " finished")

In [None]:
def normalizeAndSave(outputFile):
    """
    Standardize the extracted image patches and write everything to HDF5.

    The mean and standard deviation are computed over all image arrays of
    the classes listed in ``img_classes``; each of those arrays is then
    normalized in place as ``(x - mean) / std``. The output file contains a
    ``stats`` dataset (``[mean, std]``) plus ``<key>_imgs`` and
    ``<key>_masks`` datasets for every key of ``classes``.

    Parameters
    ----------
    outputFile : path of the HDF5 file to create (overwritten if present).

    Raises
    ------
    ValueError
        If the image arrays are empty, since no statistics can be computed.
    """
    global data
    imgKeys = [key + "_imgs" for key in img_classes]
    # The statistics are accumulated per class in float64, which avoids
    # building one float64 copy of the whole data set and does not depend
    # on the global patch counter being in sync with the arrays.
    count = sum(data[name].size for name in imgKeys)
    if count == 0:
        raise ValueError("No patches to normalize: run the extraction first")
    mean = sum(data[name].sum(dtype=np.float64) for name in imgKeys) / count
    sqDev = sum(
        np.square(np.subtract(data[name], mean, dtype=np.float64)).sum()
        for name in imgKeys)
    std = np.sqrt(sqDev / count)
    stats = np.array([mean, std])
    # Normalize the data in place
    for name in imgKeys:
        data[name] -= mean
        data[name] /= std
    print("Mean : ", stats[0])
    print("Std : ", stats[1])
    # Create the file; the context manager closes it even on error
    with h5py.File(outputFile,"w") as f:
        f.create_dataset("stats", data=stats)
        for key in classes:
            f.create_dataset(key+"_imgs", data=data[key+"_imgs"])
            f.create_dataset(key+"_masks", data=data[key+"_masks"])

# Configuration

In [None]:
# Side length, in pixels, of the square patches
patchSize = 256
# Slides to extract for training; glob patterns such as *.svs are allowed
filenames_train = ['/root/workspace/data/SVS_train/*.svs']
# Output folder for training
outputFolder_train = '/root/workspace/data/'
# Glob pattern appended to a slide's base name to find its mask file
maskPattern = '*.png'
# Class names and the mask value that identifies each of them
classes = dict(neg=0, pos=1)
# Number of patches to extract per slide for each class
img_classes = dict(neg=50, pos=200)
# If true, every patch is also saved as an image
save = False
# Running count of extracted patches
j = 0
# Per-class image and mask arrays, filled during extraction
data = dict()
# Next free index in the arrays of each class
offset = dict()

# Code Execution

In [None]:
# Expand every configured glob pattern into the list of training slides
files_train = []
for filename in filenames_train:
    files_train += glob.glob(filename)
print(files_train)

In [None]:
# Extract the training patches from all collected slides
extractFiles(files_train, outputFolder_train, save=save)

In [None]:
# Normalize the data and save it; os.path.join keeps the path valid whether
# or not outputFolder_train ends with a path separator.
normalizeAndSave(os.path.join(outputFolder_train, "matrice_train.h5"))