In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from openslide import open_slide, __library_version__ as openslide_version
import os
from PIL import Image
from skimage.color import rgb2gray
import random

In [14]:
# Project root: the cells below build their input ('/Project Data/...') and
# output ('/Crop_Data/...') paths by appending to this string.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
my_path = 'C:/course/4995 Applied Deeping Learning/homework/Project'

In [15]:
def read_slide(slide, x, y, level, width, height, as_float=False, rotation = 0):
    """Read a rectangular region of a slide and return it as an RGB array.

    Args:
        slide: handle exposing ``read_region(location, level, size)``.
        x, y: top-left corner handed to ``read_region`` as the location
            (callers in this notebook pass coordinates scaled to level 0).
        level: pyramid level to read from.
        width, height: size of the region in pixels.
        as_float: when True the result is float32, otherwise the dtype the
            image converts to by default.
        rotation: angle passed to the image's ``rotate`` method.

    Returns:
        Array of shape ``(height, width, 3)``; the shape is asserted.
    """
    region = slide.read_region((x, y), level, (width, height))
    # Converting to RGB drops the alpha channel before the rotation is applied.
    rgb = region.convert('RGB').rotate(rotation)
    target_dtype = np.float32 if as_float else None
    pixels = np.asarray(rgb, dtype=target_dtype)
    assert pixels.shape == (height, width, 3)
    return pixels

def find_tissue_pixels(image, intensity_low=0.1, intensity_high=0.8):
    """List the (row, col) positions whose grayscale intensity is in range.

    The image is converted with ``rgb2gray``; a pixel is kept when its gray
    value lies in the closed interval ``[intensity_low, intensity_high]``.

    Returns:
        List of ``(row, col)`` index pairs in row-major order.
    """
    gray = rgb2gray(image)
    assert gray.shape == (image.shape[0], image.shape[1])
    in_range = (gray >= intensity_low) & (gray <= intensity_high)
    hits = np.nonzero(in_range)
    return list(zip(hits[0], hits[1]))


In [16]:
def _save_crop_pyramid(slide, tumor_mask, image_crop, mask_crop, x0, y0, level_img, width_crop, height_crop, category, index):
  """Save one image/mask crop plus its two coarser companions as JPEG files.

  The pair already read at ``level_img`` is written as given. The same
  location ``(x0, y0)`` is then re-read at ``level_img+2`` and ``level_img+4``
  with the crop size divided by 4 and 16, and those pairs are written to the
  folders of their own level. All files use the zero-padded ``index`` as name.
  """
  def save_pair(image_arr, mask_arr, level):
    folder = my_path+'/Crop_Data/level'+str(level)+'/'+category
    filename = str(index).zfill(5)+'.jpg'
    Image.fromarray(image_arr, 'RGB').save(folder+'/image/'+filename, 'JPEG')
    # The mask is multiplied by 255 before it is stored as a grayscale image.
    Image.fromarray(mask_arr*255, 'L').save(folder+'/mask/'+filename, 'JPEG')

  save_pair(image_crop, mask_crop, level_img)
  # (level offset, size divisor): two levels up -> 1/4 size, four levels up -> 1/16.
  for offset, shrink in ((2, 4), (4, 16)):
    coarse_image = read_slide(slide, x=x0, y=y0, level=level_img+offset, width=width_crop//shrink, height=height_crop//shrink)
    coarse_mask = read_slide(tumor_mask, x=x0, y=y0, level=level_img+offset, width=width_crop//shrink, height=height_crop//shrink)
    save_pair(coarse_image, coarse_mask[:,:,0], level_img+offset)


def crop_image_data(slide_path, tumor_mask_path, level_img = 2, width_crop = 256, height_crop = 256, width_move = 100, height_move = 100, category = 'train', tumer_num = '000'):
  """Cut one slide and its tumor mask into jittered crops and save them.

  A regular grid with step ``(width_move, height_move)`` is laid over the
  slide at ``level_img``; every grid point is shifted by a random offset of up
  to a quarter of the crop size. A crop is considered only if more than 10%
  of its pixels pass ``find_tissue_pixels``. Crops whose mask contains any
  non-zero pixel are always saved; the others are saved with probability 0.5.
  Each saved crop is written for ``level_img``, ``level_img+2`` and
  ``level_img+4``, so those levels must be readable in both files.

  Args:
    slide_path: path of the whole-slide image.
    tumor_mask_path: path of the matching tumor mask.
    level_img: pyramid level of the base crops.
    width_crop, height_crop: crop size in pixels at ``level_img``.
    width_move, height_move: grid step in pixels at ``level_img``.
    category: sub-folder name used in the output paths (e.g. 'train').
    tumer_num: slide number as a numeric string; stored as int in each
      position record.

  Returns:
    ``(image_start_pos, weights)``: a list of ``(slide number, row, column)``
    tuples for the saved crops, and an int64 array holding the number of
    background pixels (index 0) and tumor pixels (index 1) in them.

  Side effects:
    Writes JPEG files below ``my_path`` and advances the global
    ``image_index`` by the number of saved crops. The global must already
    exist when this function is called.
  """
  global image_index
  image_index_start = image_index

  slide = open_slide(slide_path)
  tumor_mask = None
  try:
    print ("Read WSI from %s with width: %d, height: %d" % (slide_path, slide.level_dimensions[0][0], slide.level_dimensions[0][1]))

    tumor_mask = open_slide(tumor_mask_path)
    print ("Read tumor mask from %s" % (tumor_mask_path))

    print("Slide includes %d levels" % len(slide.level_dimensions), "Mask includes %d levels" % len(tumor_mask.level_dimensions))
    for i in range(min(len(slide.level_dimensions),len(tumor_mask.level_dimensions))):
      print("Level %d, dimensions: %s downsample factor %d" % (i, slide.level_dimensions[i], slide.level_downsamples[i]))
      assert tumor_mask.level_dimensions[i][0] == slide.level_dimensions[i][0]
      assert tumor_mask.level_dimensions[i][1] == slide.level_dimensions[i][1]

    # Verify downsampling works as expected
    downsample = slide.level_downsamples[level_img]
    width, height = slide.level_dimensions[level_img]
    assert width * downsample == slide.level_dimensions[0][0]
    assert height * downsample == slide.level_dimensions[0][1]

    # The full-level image and mask are deliberately NOT loaded here: every
    # crop is read straight from the slide, and a whole level does not fit
    # comfortably in memory at low level numbers.

    # Grid of nominal top-left corners; the 1.5 * crop margin leaves room for
    # the random jitter added below.
    height_num = (height-int(height_crop*1.5))//height_move
    height_range = np.arange(height_num-1) * height_move + height_move
    width_num = (width-int(width_crop*1.5))//width_move
    width_range = np.arange(width_num-1) * width_move + width_move

    crop_number = len(height_range)*len(width_range)
    print('Crop to %d images.' % crop_number)

    # weights[0]: mask==0, weights[1]: mask == 1
    weights = np.asarray([0,0],dtype=np.int64)

    image_area = width_crop*height_crop
    image_start_pos = []
    progress_step = max((crop_number//20),1)
    count = 0
    for h in height_range:
      for w in width_range:
        if count % progress_step == 0:
          print('.',end='')
        count += 1

        hrand = int(np.random.uniform(low = -height_crop/4, high= height_crop/4))
        wrand = int(np.random.uniform(low = -width_crop/4, high= width_crop/4))
        # Grid positions are in level_img pixels; scale them by the level's
        # downsample factor before passing them to read_slide.
        now_w = int((w+wrand)*downsample)
        now_h = int((h+hrand)*downsample)
        image_crop = read_slide(slide, x=now_w, y=now_h, level=level_img, width=width_crop, height=height_crop)
        tissue_pixels = find_tissue_pixels(image_crop)
        percent_tissue = len(tissue_pixels) / float(image_area)
        if percent_tissue <= 0.1:
          continue

        mask_crop = read_slide(tumor_mask, x=now_w, y=now_h, level=level_img, width=width_crop, height=height_crop)
        mask_crop = mask_crop[:,:,0]
        has_tumor = np.mean(mask_crop) > 0
        # Tumor-free crops are kept with probability 0.5; the random draw is
        # only made for them.
        if not has_tumor and np.random.uniform() >= 0.5:
          continue

        image_start_pos += [(int(tumer_num),h+hrand,w+wrand)]
        masked = np.sum(mask_crop>=0.5)  # 0 for a tumor-free crop
        _save_crop_pyramid(slide, tumor_mask, image_crop, mask_crop, now_w, now_h, level_img, width_crop, height_crop, category, image_index)
        weights[0] += image_area - masked
        weights[1] += masked
        image_index += 1
  finally:
    # Release the slide handles even when a read or save fails.
    slide.close()
    if tumor_mask is not None:
      tumor_mask.close()

  print('')
  print('Weight proportion： %d:%d' %(weights[0], weights[1]))
  print('Generate %d valid images' %int(image_index - image_index_start))

  return image_start_pos, weights
  

In [17]:
def crop_train_test_data(tumor_num=['000'], level_img = 5, width_crop = 256, height_crop = 256, width_move = 100, height_move = 100, category = 'train'):
  """Crop every listed slide for one data split and save the bookkeeping files.

  Resets the global ``image_index`` to 0, runs ``crop_image_data`` on each
  slide number in ``tumor_num`` and accumulates the returned crop positions
  and pixel counts. The combined positions and counts are saved as
  ``image_start_pos.npy`` and ``weights.npy`` in the ``level_img`` folder of
  ``category``.

  Args:
    tumor_num: slide numbers as zero-padded strings (e.g. '001'); they are
      inserted into the 'tumor_<num>.tif' / 'tumor_<num>_mask.tif' file names.
    level_img, width_crop, height_crop, width_move, height_move: forwarded
      unchanged to ``crop_image_data``.
    category: split name used in the output paths.

  Returns:
    The list of crop positions collected from all slides.
  """
  global image_index

  image_index = 0
  image_start_pos = []
  # int64 explicitly: the platform default integer can be 32-bit, which the
  # summed pixel counts of many slides would overflow.
  weights = np.zeros(2, dtype=np.int64)
  for t in tumor_num:
    slide_path = my_path+'/Project Data/tumor_' + t + '.tif'
    tumor_mask_path = my_path+'/Project Data/tumor_' + t + '_mask.tif'
    image_start_pos_t, new_weights = crop_image_data(slide_path, tumor_mask_path, level_img=level_img, width_crop=width_crop, height_crop=height_crop, width_move=width_move, height_move=height_move, category=category, tumer_num=t)
    image_start_pos += image_start_pos_t
    weights += np.asarray(new_weights, dtype=np.int64)
    print('Finish tumor image:', t)
    print('')
  
  print('Weight proportion on %s data： %d:%d' %(category, weights[0], weights[1]))
  print('A total of %d %s data is generated.' % (image_index, category))
  print('')
  # Use the level that was actually cropped (the parameter), not a notebook
  # global that merely happens to hold the same value.
  out_dir = my_path+'/Crop_Data/level'+str(level_img)+'/'+category
  np.save(out_dir+'/image_start_pos.npy',image_start_pos)
  np.save(out_dir+'/weights.npy',weights)
  return image_start_pos

In [18]:
def delete_image(level_img = 5, category = 'train'):
  """Delete the saved crops and bookkeeping files of one level and split.

  Removes every file in the 'image' and 'mask' folders below
  ``Crop_Data/level<level_img>/<category>`` and then, if present, the
  ``image_start_pos.npy`` and ``weights.npy`` files next to them.

  Args:
    level_img: pyramid level whose folder is cleaned.
    category: split name ('train', 'valid' or 'test' in this notebook).

  Raises:
    OSError: if the image or mask folder cannot be listed or one of the
      files inside cannot be removed (e.g. the folder does not exist).
  """
  base_path = my_path+'/Crop_Data/level'+str(level_img)+'/'+category
  image_path = base_path+'/image/'
  mask_path = base_path+'/mask/'
  image_files = os.listdir(image_path)
  mask_files = os.listdir(mask_path)

  print('Start deleting ' + category + ' images and masks, number of images:'+str(len(image_files))+' ,number of masks:'+str(len(mask_files )))

  for file in image_files:
    os.remove(image_path + file)
  for file in mask_files:
    os.remove(mask_path + file)
  
  print('Finish deleting ' + category + ' images and masks, number of images:'+str(len(os.listdir(image_path)))+' ,number of masks:'+str(len(os.listdir(mask_path))))
  # Each bookkeeping file is removed on its own, so a missing one does not
  # stop the other from being cleaned up. The path uses level_img, i.e. the
  # level being cleaned, not a notebook-level global.
  for name in ('image_start_pos.npy', 'weights.npy'):
    npy_path = base_path+'/'+name
    try:
      os.remove(npy_path)
    except FileNotFoundError:
      pass  # nothing to clean up
    except OSError as err:
      # Stay best-effort, but make the failure visible.
      print('Could not delete ' + npy_path + ': ' + str(err))

def delete_image_all(level_img = 5):
  """Run ``delete_image`` for the train, valid and test splits of one level."""
  for split in ('train', 'valid', 'test'):
    delete_image(level_img, split)
  print('Finish deleting level', level_img)

In [19]:
# Cropping configuration used by the cells below.
level = 2           # pyramid level of the base crops
width_crop = 256    # crop size in pixels
height_crop = 256
width_move = 192    # grid step between neighbouring crops
height_move = 192

# Clear previously generated crops for levels 0-7 before regenerating them.
for i in range(8):
  try:
    delete_image_all(i)
  except OSError as err:
    # Best effort: a level whose folders are missing or locked is skipped,
    # but say so instead of hiding it. Other errors (e.g. a NameError or a
    # keyboard interrupt) are no longer swallowed.
    print('Skip deleting level', i, '-', err)

Start deleting train images and masks, number of images:0 ,number of masks:0
Finish deleting train images and masks, number of images:0 ,number of masks:0
Start deleting valid images and masks, number of images:0 ,number of masks:0
Finish deleting valid images and masks, number of images:0 ,number of masks:0
Start deleting test images and masks, number of images:0 ,number of masks:0
Finish deleting test images and masks, number of images:0 ,number of masks:0
Finish deleting level 0
Start deleting train images and masks, number of images:0 ,number of masks:0
Finish deleting train images and masks, number of images:0 ,number of masks:0
Start deleting valid images and masks, number of images:0 ,number of masks:0
Finish deleting valid images and masks, number of images:0 ,number of masks:0
Start deleting test images and masks, number of images:0 ,number of masks:0
Finish deleting test images and masks, number of images:0 ,number of masks:0
Finish deleting level 1
Start deleting train image

In [20]:
# Slide numbers of each split, turned into zero-padded three-character ids
# ('001', '002', ...) as used in the slide file names.
tumor_num_train = [str(n).zfill(3) for n in (1,2,5,12,16,19,23,31,35,57,59,64,75,78,81)]
tumor_num_valid = [str(n).zfill(3) for n in (84,91,94)]
tumor_num_test = [str(n).zfill(3) for n in (96,101,110)]

# All three splits are cropped with the same level and crop geometry.
_crop_settings = dict(level_img=level, width_crop=width_crop, height_crop=height_crop, width_move=width_move, height_move=height_move)
train_start_pos = crop_train_test_data(tumor_num_train, category='train', **_crop_settings)
valid_start_pos = crop_train_test_data(tumor_num_valid, category='valid', **_crop_settings)
test_start_pos = crop_train_test_data(tumor_num_test, category='test', **_crop_settings)

Read WSI from C:/course/4995 Applied Deeping Learning/homework/Project/Project Data/tumor_001.tif with width: 97792, height: 221184
Read tumor mask from C:/course/4995 Applied Deeping Learning/homework/Project/Project Data/tumor_001_mask.tif
Slide includes 10 levels Mask includes 9 levels
Level 0, dimensions: (97792, 221184) downsample factor 1
Level 1, dimensions: (48896, 110592) downsample factor 2
Level 2, dimensions: (24448, 55296) downsample factor 4
Level 3, dimensions: (12224, 27648) downsample factor 8
Level 4, dimensions: (6112, 13824) downsample factor 16
Level 5, dimensions: (3056, 6912) downsample factor 32
Level 6, dimensions: (1528, 3456) downsample factor 64
Level 7, dimensions: (764, 1728) downsample factor 128
Level 8, dimensions: (382, 864) downsample factor 256
Crop to 35340 images.
....................
Weight proportion： 177589965:1192243
Generate 2728 valid images
Finish tumor image: 001

Read WSI from C:/course/4995 Applied Deeping Learning/homework/Project/Projec

Crop to 19177 images.
.....................
Weight proportion： 237277953:6909183
Generate 3726 valid images
Finish tumor image: 094

Weight proportion on valid data： 425954885:15823291
A total of 6741 valid data is generated.

Read WSI from C:/course/4995 Applied Deeping Learning/homework/Project/Project Data/tumor_096.tif with width: 131072, height: 71680
Read tumor mask from C:/course/4995 Applied Deeping Learning/homework/Project/Project Data/tumor_096_mask.tif
Slide includes 9 levels Mask includes 9 levels
Level 0, dimensions: (131072, 71680) downsample factor 1
Level 1, dimensions: (65536, 35840) downsample factor 2
Level 2, dimensions: (32768, 17920) downsample factor 4
Level 3, dimensions: (16384, 8960) downsample factor 8
Level 4, dimensions: (8192, 4480) downsample factor 16
Level 5, dimensions: (4096, 2240) downsample factor 32
Level 6, dimensions: (2048, 1120) downsample factor 64
Level 7, dimensions: (1024, 560) downsample factor 128
Level 8, dimensions: (512, 280) downsamp