# Imports

In [None]:
import openslide as op
from PIL import Image
import numpy as np
import random
import glob
import os
from scipy import misc, ndimage
from skimage import morphology
from skimage import color
from skimage import io

# Function definitions

In [None]:
def addBackground(imArray, maskArray):
    """
        Find the background of the image and mark it with the value 2 in a
        copy of the mask.

        Pixels whose green channel is below 200 are taken as foreground. That
        foreground map is cleaned up with a morphological closing, an erosion
        (both with a disk of radius 10) and a hole fill; every pixel left
        outside of it is labelled 2.

        Parameters:
            imArray: image array indexed as [row, column, channel]; only
                channel 1 (green) is read.
            maskArray: label mask to annotate. It is not modified.
                # NOTE(review): assumed to be 2-D with the same height/width
                # as imArray - confirm against the caller.

        Returns:
            A copy of maskArray in which background pixels are set to 2.
    """
    green = imArray[:, :, 1]

    # Foreground map: 1 where the green channel is darker than the threshold.
    # Index arrays are used (instead of a per-pixel Python loop) so the
    # assignment runs inside NumPy.
    rows, cols = np.nonzero(green < 200)
    foreground = np.zeros(maskArray.shape)
    foreground[rows, cols] = 1

    # Morphological operations
    slem = morphology.disk(10)
    foreground = morphology.closing(foreground, slem)
    foreground = morphology.erosion(foreground, slem)
    # binary_fill_holes is called from scipy.ndimage directly; the
    # scipy.ndimage.morphology namespace is deprecated.
    foreground = ndimage.binary_fill_holes(foreground)

    # Label everything outside the cleaned foreground on a copy of the mask
    backArray = np.array(maskArray)
    backArray[np.logical_not(foreground)] = 2
    return backArray

In [None]:
def _savePatch(path, array):
    """
        Write array to path as an image file and make it world-accessible.

        scipy.misc.imsave is used when the installed SciPy still provides it
        (it was removed in SciPy 1.2); otherwise the array is written through
        Pillow.
    """
    legacyImsave = getattr(misc, "imsave", None)
    if legacyImsave is not None:
        legacyImsave(path, array)
    else:
        Image.fromarray(np.ascontiguousarray(array)).save(path)
    os.chmod(path, 0o777)


def extractPatches(output,filename,maskname):
    """
        Extract the patches for the given file and maskname.

        The slide is read at pyramid level 1 and the mask is brought to the
        same size. For every entry of the module-level ``classes`` dict, up to
        ``img_classes[key]`` mask pixels holding that class value are drawn at
        random; a ``patchSize`` x ``patchSize`` window around each drawn pixel
        is cut from the (reflect-padded) image and mask and written to
        ``output`` as ``image_<j>.png`` and ``image_<j>_mask.png``. ``j`` is
        the module-level counter and is incremented after every patch.

        Parameters:
            output: directory receiving the PNG files (must already exist).
            filename: path of the slide, opened with OpenSlide.
            maskname: path of the label mask image.
                # NOTE(review): the mask is assumed to be single-channel (2-D
                # once converted to an array) - confirm for your data.

        Returns:
            None. The result is the files written and the updated counter j.
    """
    global j
    # Opening the files. read_region returns an image that no longer needs
    # the slide handle, so the slide is closed as soon as the pixels are read.
    im = op.OpenSlide(filename)
    try:
        imload = im.read_region((0,0), 1, im.level_dimensions[1])
    finally:
        im.close()
    mask = Image.open(maskname)
    if imload.size != mask.size:
        # Nearest-neighbour keeps the class labels intact; an interpolating
        # filter would blend neighbouring labels into values that belong to
        # no class.
        mask = mask.resize(imload.size, Image.NEAREST)
    imArray = np.array(imload)
    maskArray = np.array(mask)
    halfPatch = patchSize//2

    # Preprocess
    # Background labelling is currently disabled:
    #maskArray = addBackground(imArray, maskArray)
    # Padding by half a patch on every side means the window starting at the
    # unpadded coordinates (x, y) is centred on that pixel and never leaves
    # the array.
    imArray = np.pad(imArray, ((halfPatch, halfPatch), (halfPatch, halfPatch), (0, 0)), 'reflect')
    maskArrayPad = np.pad(maskArray, ((halfPatch, halfPatch), (halfPatch, halfPatch)), 'reflect')

    # Extraction
    for key, val in classes.items():
        rows, cols = np.where(maskArray == val)
        available = len(rows)
        wanted = img_classes[key]
        if available < wanted:
            # random.sample raises ValueError when asked for more items than
            # exist, so take what is there and report it.
            print("Only", available, "pixels of class", key, "in", maskname,
                  "- requested", wanted)
        sample = random.sample(range(available), min(wanted, available))

        maskClass = np.array(maskArrayPad)
        if val == 1:
            # Class 1 is stored as 255 in the saved mask patches
            np.putmask(maskClass, maskClass == val, 255)

        for i in sample:
            x = rows[i]
            y = cols[i]
            x2 = x + patchSize
            y2 = y + patchSize
            croppedIm = imArray[x:x2, y:y2, 0:3]  # drop any channel past the third
            croppedMask = maskClass[x:x2, y:y2]
            imageName = os.path.join(output, "image_" + str(j) + ".png")
            imageNameMask = os.path.join(output, "image_" + str(j) + "_mask.png")
            _savePatch(imageName, croppedIm)
            _savePatch(imageNameMask, croppedMask)
            j += 1

In [None]:
def extractFiles(files, outputFolder):
    """
        Extract patches from every slide file in files into outputFolder.

        For each slide, the mask is looked up next to it as
        ``<slide name without extension><maskPattern>`` (``maskPattern`` is
        the module-level glob suffix) and both are handed to extractPatches.
        The output folder is created with mode 0o777 if it does not exist.
        A slide without a matching mask is reported and skipped.

        Parameters:
            files: iterable of slide file paths.
            outputFolder: directory receiving the extracted patches.

        Returns:
            None.
    """
    for oneFile in files:
        name = os.path.splitext(os.path.basename(oneFile))[0]

        if not os.path.exists(outputFolder):
            # Clear the umask so the requested 0o777 mode is really applied,
            # then restore it whatever happens.
            original_umask = os.umask(0)
            try:
                os.makedirs(outputFolder, 0o777)
            finally:
                os.umask(original_umask)

        print("Extracting " + name)

        # os.path.join keeps the search relative when the slide path has no
        # directory part; glob.escape stops characters such as '[' in the
        # slide path from being read as wildcards.
        pattern = os.path.join(glob.escape(os.path.dirname(oneFile)),
                               glob.escape(name) + maskPattern)
        # glob returns matches in arbitrary order; sort so the same mask is
        # picked on every run.
        maskFiles = sorted(glob.glob(pattern))
        if not maskFiles:
            print("No mask matching " + pattern + " found, skipping " + name)
            continue

        extractPatches(outputFolder, oneFile, maskFiles[0])
        print("Extraction for ", name, " finished")

# Configuration

In [None]:
# Side length, in pixels, of the square patches
patchSize = 256

# Slide files to extract, given as glob patterns such as *.svs
filenames_train = ['/root/workspace/data/SVS_train/*.svs']  # training set
filenames_test = ['/root/workspace/data/SVS_test/*.svs']  # test set

# Folders receiving the extracted patches
outputFolder_train = '/root/workspace/data/mylungrgb/'  # training set
outputFolder_test = '/root/workspace/data/mylungrgbtest/'  # test set

# Glob suffix appended to a slide's base name to find its mask file
maskPattern = '*.png'

# Class names and the mask value that identifies each of them
classes = dict(neg=0, pos=1)

# Number of patches to extract per slide for each class
img_classes = dict(neg=100, pos=2000)

# Running index used to number the output files
j = 0

# Code Execution

In [None]:
# Expand every training glob pattern into one flat list of slide paths
files_train = [path for pattern in filenames_train for path in glob.glob(pattern)]
print(files_train)

In [None]:
# Expand every test glob pattern into one flat list of slide paths
files_test = [path for pattern in filenames_test for path in glob.glob(pattern)]
print(files_test)

In [None]:
# Extract patches from the test slides into the test output folder
extractFiles(files=files_test, outputFolder=outputFolder_test)

In [None]:
# Extract patches from the training slides into the training output folder
extractFiles(files=files_train, outputFolder=outputFolder_train)