### 2018-03-23 Random Splitting by Tile and Size Filtering

Candidate plaques are grouped by their WSI-source, as well as their tile. This ensures that all plaques within a uniform 256 x 256 image are labeled (or multi-labeled) for the subsequent classification task. 

We randomly sample half of the tiles, keep only the candidate plaques in those tiles whose blob size exceeds a threshold of 1500 pixels, and copy the corresponding images to a new directory for the dataset.

In [1]:
import os
import glob
import shutil

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import cv2

from tqdm import tqdm

In [2]:
# CSV with the per-blob image details, and the directory that holds it
CSV_DIR = 'data/seg/'
BLOB_CSV = 'image_details.csv'

# Directories the selected images are copied from
RAW_SRC = 'data/seg/blobs/'
NORM_SRC = 'data/seg/blobs_bboxes/'

# Directories the size-filtered copies are written to
RAW_DEST = 'data/seg/size_filtered/blobs/'
NORM_DEST = 'data/seg/size_filtered/blobs_bboxes/'

We will use a blob-size threshold of 1500 pixels. A square region of 1500 pixels is roughly 40 pixels wide (√1500 ≈ 39), and at a conversion rate of 0.5 microns per pixel, 40 pixels wide = 20 microns wide. 

In [3]:
# Minimum blob size, in pixels: only rows whose 'blob size' is strictly
# greater than this value are kept by the filtering step.
THRESHOLD = 1500

In [4]:
# Read the per-blob details table (one row per blob image) into a DataFrame
image_df = pd.read_csv(os.path.join(CSV_DIR, BLOB_CSV))

In [5]:
# Preview the first rows to check the columns used below
# ('imagename', 'source', 'tile_column', 'tile_row', 'blob size').
image_df.head()

Unnamed: 0,imagename,source,tile_column,tile_row,image coordinates (xywh),blob coordinates (xywh),blob size
0,NA4160-02_AB_20_30_0.jpg,NA4160-02_AB,30,20,[539 0 256 256],[632 0 70 24],1384
1,NA4160-02_AB_20_30_1.jpg,NA4160-02_AB,30,20,[796 134 256 256],[882 212 84 100],5876
2,NA4160-02_AB_20_5_0.jpg,NA4160-02_AB,5,20,[347 0 256 256],[450 0 50 38],1296
3,NA4160-02_AB_20_5_1.jpg,NA4160-02_AB,5,20,[1280 0 256 256],[1400 0 24 34],532
4,NA4160-02_AB_20_5_2.jpg,NA4160-02_AB,5,20,[542 0 256 256],[658 2 24 26],416


In [6]:
# One group per tile: a tile is identified by its source plus its
# column/row position, so all blobs from the same tile stay together.
grouped = image_df.groupby(
    by=['source', 'tile_column', 'tile_row'],
)

In [7]:
# Materialise the (source, tile_column, tile_row) group keys as a plain list
# so they can be shuffled in place.
# NOTE(review): the seeded shuffle that follows is only reproducible if this
# key order is the same on every run -- confirm, or sort the keys first.
tiles = list(grouped.groups.keys())

In [8]:
# Seed NumPy's global random number generator so the shuffle below produces
# the same permutation on every run (given the same starting order of `tiles`).
np.random.seed(42)

# Shuffle the tile keys in place; `tiles` itself is reordered and the call
# returns None, so there is nothing to assign.
np.random.shuffle(tiles)

In [9]:
# Number of tiles to sample: half of all tiles, rounded down
SPLIT = len(tiles) // 2

In [10]:
# Walk the first SPLIT shuffled tiles and collect, for every blob whose
# 'blob size' exceeds THRESHOLD, its image file name and its source.
# `imageset` and `sources` are parallel lists: sources[i] belongs to imageset[i].
imageset = []
sources = []
for tile_key in tiles[:SPLIT]:
    row_labels = grouped.groups[tile_key]      # index labels of this tile's rows
    tile_blobs = image_df.loc[row_labels]
    large_blobs = tile_blobs[tile_blobs['blob size'] > THRESHOLD]
    imageset.extend(large_blobs['imagename'])
    sources.extend(large_blobs['source'])

In [11]:
# Number of images left to label: blobs in the sampled tiles whose size
# is greater than THRESHOLD pixels.
len(imageset)

75647

In [12]:
# Copy every selected image from RAW_SRC and NORM_SRC into RAW_DEST and
# NORM_DEST, keeping the same per-source sub-directory layout.

# Create each destination sub-directory once up front rather than re-checking
# it for every file; exist_ok=True makes this safe to re-run.
for source in set(sources):
    os.makedirs(RAW_DEST + source, exist_ok=True)
    os.makedirs(NORM_DEST + source, exist_ok=True)

# zip() has no length, so pass total= explicitly to let tqdm show
# percentage and remaining time instead of a bare iteration counter.
for image, source in tqdm(zip(imageset, sources), total=len(imageset)):
    filename = source + "/" + image
    shutil.copy(RAW_SRC + filename, RAW_DEST + filename)
    shutil.copy(NORM_SRC + filename, NORM_DEST + filename)

75647it [05:17, 238.48it/s]
