In [None]:
# Install the OpenSlide C library and Python bindings
# After installing these libraries, use `Runtime -> restart and run all` on the menu
!apt-get install openslide-tools
!pip install openslide-python

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libopenslide0
Suggested packages:
  libtiff-tools
The following NEW packages will be installed:
  libopenslide0 openslide-tools
0 upgraded, 2 newly installed, 0 to remove and 20 not upgraded.
Need to get 92.5 kB of archives.
After this operation, 268 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libopenslide0 amd64 3.4.1+dfsg-2 [79.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 openslide-tools amd64 3.4.1+dfsg-2 [12.7 kB]
Fetched 92.5 kB in 1s (166 kB/s)
Selecting previously unselected package libopenslide0.
(Reading database ... 124016 files and directories currently installed.)
Preparing to unpack .../libopenslide0_3.4.1+dfsg-2

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from openslide import open_slide, __library_version__ as openslide_version
import os
from PIL import Image
from skimage.color import rgb2gray

In [None]:
# Mount Google Drive at /content/drive; every slide and output path below lives under it.
from google.colab import drive
drive.mount('/content/drive')

## Non-overlapping patch extraction via the thumbnail API

In [None]:
import pandas as pd
from pathlib import Path
from skimage.filters import threshold_otsu
import glob

def thumbnail_filter(slide_path, mask_path = None, tile_size = 299):
  """
  Build a table of tissue tiles for one slide from a downsampled thumbnail.

  The requested thumbnail size is the level-0 size divided by ``tile_size``,
  so one thumbnail pixel stands for one tile. Otsu's threshold is computed on
  the grayscale thumbnail; pixels that are not brighter than the threshold
  are kept as tissue. When a mask is given, its thumbnail is built the same
  way and any non-zero pixel marks the tile as tumor.

  Args:
    slide_path: path of the slide, opened with ``open_slide``.
    mask_path: path of the matching mask. ``None`` (the default) means there
      is no mask, and every tile is labelled ``is_tumor = False``.
    tile_size: edge length, in level-0 pixels, represented by one thumbnail
      pixel.

  Returns:
    DataFrame with one row per tissue tile and the columns ``is_tissue``,
    ``slide_path``, ``is_tumor`` and ``tile_loc`` (the (row, column) position
    of the tile in the thumbnail grid).
  """
  def _tile_grid_thumbnail(wsi):
    # One thumbnail pixel per tile_size x tile_size tile of level 0.
    return wsi.get_thumbnail((wsi.dimensions[0] / tile_size, wsi.dimensions[1] / tile_size))

  with open_slide(slide_path) as slide:
    thumbnail = _tile_grid_thumbnail(slide)

  thumbnail_grey = np.array(thumbnail.convert('L'))  # convert to grayscale
  thresh = threshold_otsu(thumbnail_grey)
  is_background = thumbnail_grey > thresh

  # stack() flattens the 2-D grid into a Series indexed by (row, column).
  patches = pd.DataFrame({'is_tissue': ~pd.DataFrame(is_background).stack()})
  patches['slide_path'] = slide_path

  if mask_path is None:
    # Previously str(None) was handed to open_slide, which cannot succeed.
    patches['is_tumor'] = False
    samples = patches
  else:
    with open_slide(str(mask_path)) as truth:
      thumbnail_truth = _tile_grid_thumbnail(truth)
    truth_pixels = pd.DataFrame(np.array(thumbnail_truth.convert("L"))).stack()
    patches_y = pd.DataFrame({'is_tumor': truth_pixels > 0})
    samples = pd.concat([patches, patches_y], axis=1)

  # Keep tissue tiles only; copy so the column added next lands on an
  # independent frame instead of a view of the unfiltered table.
  samples = samples[samples['is_tissue'] == True].copy()
  samples['tile_loc'] = list(samples.index)
  samples.reset_index(inplace=True, drop=True)
  return samples
            

In [None]:
# Every .tif in the slide folder, split into mask files and image files.
# Slide 038 is left out of the image list.
tifs = glob.glob("/content/drive/MyDrive/slides/*.tif")
masks = [tif_path for tif_path in tifs if 'mask' in tif_path]
images = [tif_path for tif_path in tifs if 'mask' not in tif_path and '038' not in tif_path]

In [None]:
# Generate the tile table: one frame per (slide, mask) pair, stacked once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every iteration.
per_slide_tiles = []
for mask in masks:
  slide_path = mask.replace('_mask', '')
  per_slide_tiles.append(thumbnail_filter(slide_path, mask))
# pd.concat rejects an empty list, so fall back to an empty frame when no mask was found.
total_samples = pd.concat(per_slide_tiles, ignore_index=True) if per_slide_tiles else pd.DataFrame()

In [None]:
# Downsample the non-tumor rows so both classes have the same size
def downsampler(all_samples, random_state=None):
  """
  Randomly drop non-tumor rows until they are as many as the tumor rows.

  Args:
    all_samples: DataFrame with a boolean ``is_tumor`` column.
    random_state: optional seed for a reproducible choice of dropped rows.
      ``None`` (the default) draws from NumPy's global random state.

  Returns:
    A new DataFrame with a fresh 0..n-1 index. When the non-tumor rows do
    not outnumber the tumor rows nothing is dropped.
  """
  negative_rows = all_samples['is_tumor'] == False
  positive_rows = all_samples['is_tumor'] == True
  num_to_drop = int(negative_rows.sum()) - int(positive_rows.sum())

  # A negative count used to reach np.random.choice and raise ValueError.
  if num_to_drop <= 0:
    return all_samples.reset_index(drop=True)

  rng = np.random if random_state is None else np.random.RandomState(random_state)
  negative_labels = all_samples.index[negative_rows].tolist()
  drop_indices = rng.choice(negative_labels, num_to_drop, replace=False)
  return all_samples.drop(drop_indices).reset_index(drop=True)

In [None]:
# Balance the two classes by dropping randomly chosen non-tumor tiles.
balanced_samples = downsampler(total_samples)

In [None]:
from matplotlib.ticker import TickHelper
from openslide.deepzoom import DeepZoomGenerator

def generate_images(img_paths, sample_df, level = 0, tile_size = 299):
  """
  Save image tiles and the matching mask tiles for a random subset of rows.

  For each slide, up to 10 rows of ``sample_df`` belonging to that slide are
  drawn with a fixed seed. The DeepZoom tile at each row's ``tile_loc`` is
  written as a PNG, once from the slide and once from its ``_mask.tif``
  companion, under ``/content/drive/MyDrive/patches/level_<level>_img`` and
  ``.../level_<level>_mask``.

  Args:
    img_paths: paths of the slide images (not the masks).
    sample_df: DataFrame with ``slide_path``, ``tile_loc`` ((row, column))
      and ``is_tumor`` columns.
    level: resolution level counted down from the deepest DeepZoom level.
    tile_size: DeepZoom tile edge length, used for the slide and the mask.

  NOTE(review): ``tile_loc`` is used unchanged for every ``level``; if it was
  computed on the level-0 tile grid it may not address the same region at
  other levels — confirm before using level > 0.
  """
  patch_root = '/content/drive/MyDrive/patches'
  tiles_per_slide = 10
  img_dir = '%s/level_%d_img' % (patch_root, level)
  mask_dir = '%s/level_%d_mask' % (patch_root, level)
  # makedirs also creates a missing parent folder and tolerates existing ones.
  os.makedirs(img_dir, exist_ok=True)
  os.makedirs(mask_dir, exist_ok=True)

  def _export_tiles(slide_file, rows, out_dir, img_name):
    # Cut the tile addressed by each row out of slide_file and save it as PNG.
    with open_slide(str(slide_file)) as wsi:
      tiles = DeepZoomGenerator(wsi, tile_size=tile_size, overlap=0, limit_bounds=False)
      dz_level = tiles.level_count - level - 1
      for _, cur_tile in rows.iterrows():
        # tile_loc is (row, column); DeepZoom addresses are (column, row).
        loc_x, loc_y = cur_tile.tile_loc[::-1]
        im = np.array(tiles.get_tile(dz_level, (loc_x, loc_y)))
        is_tumor = 'tumor' if cur_tile.is_tumor else 'normal'
        plt.imsave('%s/%s_%s_%d_%d_%d.png' % (out_dir, is_tumor, img_name, loc_x, loc_y, level), im)

  for img in img_paths:
    img_name = img.split('/')[-1].split('.')[0]
    print('Current processing: ', img_name)
    img_df = sample_df[sample_df['slide_path']==img]
    # Slides with fewer rows than requested used to make sample() raise.
    img_df = img_df.sample(min(tiles_per_slide, len(img_df)), replace = False, random_state=1)

    _export_tiles(img, img_df, img_dir, img_name)
    # The mask generator previously hard-coded 299 instead of tile_size.
    _export_tiles(img.replace('.tif', '_mask.tif'), img_df, mask_dir, img_name)
      


In [None]:
# Save level-0 image and mask tiles for the sampled rows of every slide in `images`.
generate_images(images, balanced_samples, level=0)

## Sliding-Window Approach to Extract Tiles from Different Levels

In [None]:
import glob
# Re-list the slide folder: mask files on one side, image files (minus slide 038) on the other.
tifs = glob.glob("/content/drive/MyDrive/slides/*.tif")
masks = list(filter(lambda tif_path: 'mask' in tif_path, tifs))
images = list(filter(lambda tif_path: not ('mask' in tif_path or '038' in tif_path), tifs))

In [None]:
def read_slide(slide, x, y, level, width, height, as_float=False):
    """
    Read a rectangular region of a slide as an RGB array.

    Args:
        slide: object exposing ``read_region((x, y), level, (width, height))``.
        x, y: location passed to ``read_region``.
        level: slide level to read from.
        width, height: size of the region in pixels at ``level``.
        as_float: return float32 values instead of the image's own dtype.

    Returns:
        Array of shape (height, width, 3).
    """
    # Converting to RGB drops the alpha channel.
    region = slide.read_region((x, y), level, (width, height)).convert('RGB')
    pixels = np.asarray(region, dtype=np.float32) if as_float else np.asarray(region)
    assert pixels.shape == (height, width, 3)
    return pixels

def find_tissue_pixels(image, intensity=0.8):
    """
    Locate the pixels whose grayscale value is at most ``intensity``.

    Args:
        image: RGB image array; converted with ``rgb2gray``.
        intensity: grayscale cut-off; pixels at or below it are returned.

    Returns:
        List of (row, column) pairs. A list rather than a bare ``zip`` so the
        result can be measured with ``len`` and iterated more than once; a
        ``zip`` object is exhausted after the first pass and then looks empty.
    """
    im_gray = rgb2gray(image)
    assert im_gray.shape == (image.shape[0], image.shape[1])
    rows, cols = np.where(im_gray <= intensity)
    return list(zip(rows, cols))
  
def apply_mask(im, mask, color=(1,0,0)):
    """
    Paint the listed pixels onto a blank canvas shaped like ``im``.

    Args:
        im: array whose shape the result copies; its values are not read.
        mask: iterable of (row, column) pairs to colour.
        color: value written at each listed position.

    Returns:
        Array of zeros with ``color`` at every listed position.
    """
    canvas = np.zeros(im.shape)
    for row, col in mask:
        canvas[row, col] = color
    return canvas

In [None]:
## Use level 2 and level 0 Image as input
def generate_samples(image_path, mask_path, high_lvl = 0, low_lvl = 2, window_size = 299):
  """
  Slide a window over a slide and collect paired tile locations.

  Each accepted position yields a pair of level-0 coordinates: the top-left
  corner of a ``window_size`` tile read at ``high_lvl`` and the top-left
  corner of a ``window_size`` tile at ``low_lvl`` centred on it. A position
  is kept when at least half of the high-resolution tile is tissue, and it
  counts as positive when the mask tile at the same place has any non-zero
  value in its first channel.

  Args:
    image_path: path of the slide image.
    mask_path: path of the matching mask.
    high_lvl: level of the high-resolution tile.
    low_lvl: level of the low-resolution context tile.
    window_size: tile edge length in pixels at either level.

  Returns:
    ``(positive, negative)``: two lists of ``[(x, y), (x_low, y_low)]``
    entries, all integer level-0 coordinates.
  """
  tissue_threshold = 0.5

  # Level-0 footprint of one window at each level.
  # Assumes each level halves the resolution of the previous one — TODO confirm
  # against slide.level_downsamples.
  high_span = window_size * (2**high_lvl)
  low_span = window_size * (2**low_lvl)
  stride = high_span * 2

  # Shift from the high-res corner to the corner of the centred low-res tile.
  # Floor division keeps it a whole pixel; the former "/2" produced values
  # such as x - 448.5, which is not a valid read_region location.
  offset = (window_size * (2**low_lvl - 2**high_lvl)) // 2

  image_pair_positive = []
  image_pair_negative = []
  with open_slide(image_path) as slide, open_slide(mask_path) as mask:
    slide_w, slide_h = slide.level_dimensions[0][0], slide.level_dimensions[0][1]
    for x in range(0, slide_w, stride):
      x_low = x - offset
      x_low_end = x_low + low_span  # right edge of the low-res tile
      for y in range(0, slide_h, stride):
        y_low = y - offset
        y_low_end = y_low + low_span  # bottom edge of the low-res tile

        # Skip pairs whose low-res tile would leave the slide.
        if x_low < 0 or y_low < 0 or x_low_end > slide_w or y_low_end > slide_h:
          continue

        cur_tile = read_slide(slide,
                              x = x,
                              y = y,
                              level = high_lvl,
                              width = window_size,
                              height = window_size,
                              )
        # Count lazily so the per-pixel coordinate pairs are never stored.
        tissue_count = sum(1 for _ in find_tissue_pixels(cur_tile))
        tissue_ratio = tissue_count / (window_size**2)
        if tissue_ratio < tissue_threshold:
          continue

        cur_mask = read_slide(mask,
                              x = x,
                              y = y,
                              level = high_lvl,
                              width = window_size,
                              height = window_size,
                              )
        is_tumor = np.sum(cur_mask[:, :, 0]) > 0
        pair = [(x, y), (x_low, y_low)]
        if is_tumor:
          image_pair_positive.append(pair)
        else:
          image_pair_negative.append(pair)

  return image_pair_positive, image_pair_negative

In [None]:
# Load the sample table stored as CSV on Drive instead of regenerating it.
total_samples = pd.read_csv('/content/drive/MyDrive/high_res_calibrated.csv')

In [None]:
total_samples.head()

Unnamed: 0,slide,is_tumor,high_reso_loc,low_reso_loc
0,tumor_101.tif,True,"(95082, 16744)","(94633, 16295)"
1,tumor_101.tif,True,"(40664, 28106)","(40215, 27657)"
2,tumor_101.tif,True,"(41262, 29302)","(40813, 28853)"
3,tumor_101.tif,True,"(118404, 20930)","(117955, 20481)"
4,tumor_101.tif,True,"(21528, 16146)","(21079, 15697)"


In [None]:
total_samples.slide.value_counts()

tumor_016.tif    1000
tumor_031.tif    1000
tumor_064.tif    1000
tumor_075.tif    1000
tumor_084.tif    1000
tumor_091.tif    1000
tumor_096.tif    1000
tumor_110.tif    1000
tumor_078.tif    1000
tumor_101.tif    1000
Name: slide, dtype: int64

In [None]:
# Mask paths are built from slide ids so each group is easy to compare.
_MASK_PATH_TEMPLATE = '/content/drive/MyDrive/tumor_%s_mask.tif'

selected_slides_full = [_MASK_PATH_TEMPLATE % slide_id
                        for slide_id in ('016', '031', '110', '078', '101')]

selected_slides_augmented = [_MASK_PATH_TEMPLATE % slide_id
                             for slide_id in ('064', '075', '091', '096', '084')]

# The sampling cells further down iterate over `selected_slides`, which no
# cell defined, so a fresh run stopped with NameError. The order follows the
# "current_processing" lines recorded in this notebook's outputs.
# NOTE(review): confirm this is the intended list and order.
selected_slides = [_MASK_PATH_TEMPLATE % slide_id
                   for slide_id in ('016', '031', '064', '075', '084',
                                    '091', '096', '110', '078', '101')]


In [None]:
def slide_sampler(sample_size, df, random_state=None):
  """
  Draw ``sample_size`` rows from ``df``.

  Rows are drawn without replacement when ``df`` has enough of them and with
  replacement (oversampling) when ``sample_size`` exceeds ``len(df)``.

  Args:
    sample_size: number of rows to return.
    df: DataFrame to sample from.
    random_state: optional seed forwarded to ``DataFrame.sample``.

  Returns:
    The sampled DataFrame. An empty ``df`` yields an empty copy; it used to
    raise ZeroDivisionError while computing the sampling fraction.
  """
  if len(df) == 0:
    return df.copy()
  oversample = sample_size > len(df)
  return df.sample(n=sample_size, replace=oversample, random_state=random_state)

In [None]:
# Start from an empty frame. This replaces whatever total_samples held before,
# including the table read from CSV in an earlier cell.
total_samples = pd.DataFrame()

In [None]:
import pandas as pd

lvl_1 = 0
lvl_2 = 2
sample_size = 500  # rows kept per class and slide

# Collect the per-slide frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and copied the growing frame on every call.
new_sample_frames = []
for mask_path in selected_slides:
  img_path = mask_path.replace('_mask', '')
  print('current_processing: ', img_path)
  positive_samples, negative_samples = generate_samples(img_path,
                                                        mask_path,
                                                        high_lvl=lvl_1,
                                                        low_lvl=lvl_2,
                                                        window_size=299)
  slide_name = img_path.split('/')[-1]
  # Positives first, then negatives, matching the original row order.
  for is_tumor, pairs in ((True, positive_samples), (False, negative_samples)):
    if not pairs:
      # Nothing to sample from for this class on this slide.
      continue
    class_df = pd.DataFrame({'slide': slide_name,
                             'is_tumor': is_tumor,
                             'high_reso_loc': [pair[0] for pair in pairs],
                             'low_reso_loc': [pair[1] for pair in pairs]})
    new_sample_frames.append(slide_sampler(sample_size, class_df))

if new_sample_frames:
  existing = [total_samples] if len(total_samples) else []
  total_samples = pd.concat(existing + new_sample_frames, ignore_index=True)


current_processing:  /content/drive/MyDrive/tumor_101.tif


In [None]:
import matplotlib.pyplot as plt

def generate_images(mask_paths, sample_df, lvl_1, lvl_2, dir):
  """
  Save the tile pairs listed in ``sample_df`` as PNG files.

  For every row belonging to a slide, four 299x299 crops are written: the
  slide at ``lvl_1`` and ``lvl_2`` under ``<dir>/img`` and the mask at the
  same two levels under ``<dir>/mask``. File names are
  ``<label>_<slide id>_<row number>_<i_idx>_<j_idx>_<level>.png``.

  Args:
    mask_paths: mask file paths; the slide path is derived by replacing
      ``_mask.tif`` with ``.tif``.
    sample_df: DataFrame with ``slide`` (file name), ``is_tumor``,
      ``high_reso_loc`` and ``low_reso_loc`` columns. Locations may be
      (x, y) tuples or their text form such as ``"(95082, 16744)"``.
    lvl_1: level of the high-resolution crop.
    lvl_2: level of the low-resolution crop.
    dir: output root; its ``img`` and ``mask`` subfolders are created when
      missing.
  """
  import ast
  import os

  tile_px = 299

  def _as_xy(loc):
    # Locations are tuples in a freshly built frame and strings after a CSV
    # round trip. literal_eval parses the text without executing code, unlike
    # eval, which also rejected the tuple case outright.
    if isinstance(loc, str):
      loc = ast.literal_eval(loc)
    return int(loc[0]), int(loc[1])

  # plt.imsave does not create folders.
  os.makedirs('%s/img' % dir, exist_ok=True)
  os.makedirs('%s/mask' % dir, exist_ok=True)

  for mask_path in mask_paths:
    img = mask_path.replace('_mask.tif', '.tif')
    with open_slide(img) as slide, open_slide(mask_path) as mask:
      file_name = img.split('/')[-1]
      img_name = file_name.split('.')[0]
      print('Current processing: ', img_name)
      img_name = img_name.replace('tumor_', '')
      img_df = sample_df[sample_df['slide']==file_name]
      print('current imgs count', len(img_df))

      for i, (_, cur_tile) in enumerate(img_df.iterrows()):
        high_x, high_y = _as_xy(cur_tile.high_reso_loc)
        low_x, low_y = _as_xy(cur_tile.low_reso_loc)

        # NOTE(review): 299**lvl_1 is 1 for lvl_1 == 0, so these are plain
        # pixel coordinates; 299 * 2**lvl_1 may have been intended. Left
        # unchanged because the values are part of the file names.
        i_idx = high_x//(tile_px**lvl_1)
        j_idx = high_y//(tile_px**lvl_1)

        is_tumor = 'tumor' if cur_tile.is_tumor else 'normal'
        stem = '%s_%s_%d_%d_%d' % (is_tumor, img_name, i, i_idx, j_idx)

        # Same two crops from the slide and from the mask.
        for source, folder in ((slide, 'img'), (mask, 'mask')):
          for (x, y), lvl in (((high_x, high_y), lvl_1), ((low_x, low_y), lvl_2)):
            crop = read_slide(source,
                              x = x,
                              y = y,
                              level = lvl,
                              width = tile_px,
                              height = tile_px,
                              )
            plt.imsave('%s/%s/%s_%d.png' % (dir, folder, stem, lvl), crop)


In [None]:
# Write the level-0 / level-2 image and mask crops for the slides in
# selected_slides_augmented under patches_v3.
generate_images(selected_slides_augmented, total_samples, lvl_1=0, lvl_2=2, dir = '/content/drive/MyDrive/patches_v3')

Current processing:  tumor_064
current imgs count 1000
Current processing:  tumor_075
current imgs count 1000
Current processing:  tumor_091
current imgs count 1000
Current processing:  tumor_096
current imgs count 1000
Current processing:  tumor_084
current imgs count 1000


In [None]:
# Same sampling procedure as the earlier cell, limited to the first nine slides.
lvl_1 = 0
lvl_2 = 2
sample_size = 500  # rows kept per class and slide

# DataFrame.append no longer exists in pandas 2.0; gather the pieces and
# concatenate them in one step instead.
sampled_parts = []
for mask_path in selected_slides[:9]:
  img_path = mask_path.replace('_mask', '')
  print('current_processing: ', img_path)
  positive_samples, negative_samples = generate_samples(img_path,
                                                        mask_path,
                                                        high_lvl=lvl_1,
                                                        low_lvl=lvl_2,
                                                        window_size=299)
  slide_name = img_path.split('/')[-1]
  labelled_pairs = [(True, positive_samples), (False, negative_samples)]
  for is_tumor, pairs in labelled_pairs:
    if len(pairs) == 0:
      # A class with no candidates on this slide has nothing to sample.
      continue
    high_locs = [pair[0] for pair in pairs]
    low_locs = [pair[1] for pair in pairs]
    class_df = pd.DataFrame({'slide': slide_name,
                             'is_tumor': is_tumor,
                             'high_reso_loc': high_locs,
                             'low_reso_loc': low_locs})
    sampled_parts.append(slide_sampler(sample_size, class_df))

if sampled_parts:
  if len(total_samples):
    sampled_parts.insert(0, total_samples)
  total_samples = pd.concat(sampled_parts, ignore_index=True)


current_processing:  /content/drive/MyDrive/tumor_016.tif
current_processing:  /content/drive/MyDrive/tumor_031.tif
current_processing:  /content/drive/MyDrive/tumor_064.tif
current_processing:  /content/drive/MyDrive/tumor_075.tif
current_processing:  /content/drive/MyDrive/tumor_084.tif
current_processing:  /content/drive/MyDrive/tumor_091.tif
current_processing:  /content/drive/MyDrive/tumor_096.tif
current_processing:  /content/drive/MyDrive/tumor_110.tif
current_processing:  /content/drive/MyDrive/tumor_078.tif


In [None]:
# NOTE(review): the generate_images defined last in this notebook requires
# lvl_1, lvl_2 and dir, so this two-argument call raises TypeError on a fresh
# run. The recorded output presumably came from an earlier version of the
# function — confirm, then pass the missing arguments explicitly.
generate_images(selected_slides[:9], total_samples)

Current processing:  tumor_016
current imgs count 1000
Current processing:  tumor_031
current imgs count 1000
Current processing:  tumor_064
current imgs count 1000
Current processing:  tumor_075
current imgs count 1000
Current processing:  tumor_084
current imgs count 1000
Current processing:  tumor_091
current imgs count 1000
Current processing:  tumor_096
current imgs count 1000
Current processing:  tumor_110
current imgs count 1000
Current processing:  tumor_078
current imgs count 1000


In [None]:
total_samples.head()

Unnamed: 0,slide,is_tumor,high_reso_loc,low_reso_loc
0,tumor_016.tif,True,"(28704, 163852)","(26910, 162058)"
1,tumor_016.tif,True,"(40664, 150696)","(38870, 148902)"
2,tumor_016.tif,True,"(44252, 154284)","(42458, 152490)"
3,tumor_016.tif,True,"(37076, 162656)","(35282, 160862)"
4,tumor_016.tif,True,"(43056, 161460)","(41262, 159666)"
