<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"></ul></div>

In [3]:
import pandas as pd
import numpy as np  
from pathlib import Path  
import h5py
import hashlib
from tqdm import tqdm_notebook as tqdm

# Base directory of the notebook, and the data directory that the cells below
# read their inputs from and write their CSV outputs to.
PATH = Path('.')
DATA_PATH = PATH / '..' / 'data'

In [4]:
def create_pcam16_hash_table(dset_img):
    """Map the SHA-1 hex digest of every image in ``dset_img`` to its position.

    Args:
        dset_img: iterable of bytes-like image buffers (``postprocess`` passes
            the ``'x'`` dataset of an HDF5 file).

    Returns:
        dict of ``{sha1_hexdigest: index}``. When two entries hash to the same
        digest, the later index is kept.
    """
    return {
        hashlib.sha1(image).hexdigest(): position
        for position, image in enumerate(dset_img)
    }

def create_labels_dict(f):
    """Read a labels CSV and return ``{img_id: label}``.

    The first column of the CSV is used as the image id and the second column
    as the label, which is converted with ``int()``.

    Args:
        f: path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        dict mapping each id in the first column to the integer label in the
        second column.
    """
    table = pd.read_csv(f)
    # Plain tuples are addressed by position, so the column names do not matter.
    return {
        row[0]: int(row[1])
        for row in table.itertuples(index=False, name=None)
    }

def create_wsi_dict(f):
    """Read a patch metadata CSV and return one record per row, keyed by row index.

    Args:
        f: path (or file-like object) accepted by ``pd.read_csv``. The CSV must
            provide the columns ``coord_y``, ``coord_x``, ``tumor_patch``,
            ``center_tumor_patch`` and ``wsi``.

    Returns:
        dict of ``{row_index: {'coord_y': ..., 'coord_x': ..., 'tumor_patch': ...,
        'center_tumor_patch': ..., 'wsi': ...}}``. ``postprocess`` looks rows up
        by the index of the matching image in the HDF5 file.
    """
    meta = pd.read_csv(f)

    field_names = ('coord_y', 'coord_x', 'tumor_patch', 'center_tumor_patch', 'wsi')
    per_column = (
        meta.coord_y.tolist(),
        meta.coord_x.tolist(),
        meta.tumor_patch.tolist(),
        meta.center_tumor_patch.tolist(),
        meta.wsi.tolist(),
    )
    # zip(*per_column) walks the columns in lock-step, yielding one row at a time.
    return {
        row_idx: dict(zip(field_names, row_values))
        for row_idx, row_values in zip(meta.index.tolist(), zip(*per_column))
    }

In [5]:
def postprocess(h5_file, labels_file, meta_file, name='new'):
    """Attach patch metadata to the ids of a labels CSV and save the result.

    Every image in ``h5_file`` is hashed with SHA-1; ids from ``labels_file``
    that equal one of those digests are joined with the metadata row of the
    matching image and written to
    ``DATA_PATH/<labels_file with '.csv' replaced by '_wsi_<name>.csv'>``.
    Ids without a matching image are counted and left out of the output.

    Args:
        h5_file: name of the HDF5 image file under ``DATA_PATH``; images are
            read from its ``'x'`` dataset.
        labels_file: name (``str``) of the labels CSV under ``DATA_PATH``.
        meta_file: name of the metadata CSV under ``DATA_PATH``; its row index
            is looked up with the image's index in the HDF5 dataset.
        name: suffix used in the output file name.

    Raises:
        KeyError: if a matched image index has no row in the metadata CSV.
    """
    # Only the hash table needs the HDF5 file, so open it in a context manager
    # and release the handle as soon as hashing is done. Previously the file
    # stayed open for the lifetime of the kernel.
    with h5py.File(DATA_PATH/h5_file, 'r') as fimg:
        hash_table = create_pcam16_hash_table(fimg['x'])
    labels_dict = create_labels_dict(DATA_PATH/labels_file)
    wsi_dict = create_wsi_dict(DATA_PATH/meta_file)

    columns = ['id', 'coord_y', 'coord_x', 'tumor_patch', 'center_tumor_patch', 'wsi', 'is_tumor']
    records = []

    error_count = 0
    pbar = tqdm(labels_dict.items())
    for img_id, is_tumor in pbar:
        if img_id in hash_table:
            h5_idx = hash_table[img_id]
            meta = wsi_dict[h5_idx]  # one lookup instead of one per field
            wsi_id = meta['wsi']

            records.append({
                'id': img_id,
                'coord_y': meta['coord_y'],
                'coord_x': meta['coord_x'],
                'tumor_patch': meta['tumor_patch'],
                'center_tumor_patch': meta['center_tumor_patch'],
                'wsi': wsi_id,
                'is_tumor': is_tumor,
            })

            pbar.set_description("[ERROR:{}] h5:{}, wsi_id:{}, id:{}, is_tumor:{}".format(error_count, h5_idx, wsi_id, img_id, is_tumor))
        else:
            error_count = error_count+1
            pbar.set_description("[ERROR:{}] Image {} does not exist in external file".format(error_count, img_id))

    # Passing ``columns`` keeps the header and column order even when nothing matched.
    df_full_wsi = pd.DataFrame(records, columns=columns)
    df_full_wsi.to_csv(DATA_PATH/labels_file.replace('.csv', '_wsi_{}.csv'.format(name)),index=False)

    print("Out of {} images, {} is unknown".format(len(labels_dict), error_count))

In [8]:
# Match the sample-submission ids against the validation split; the result is
# written to DATA_PATH/sample_submission_wsi_valid.csv.
postprocess(
    h5_file='camelyonpatch_level_2_split_valid_x.h5',
    labels_file="sample_submission.csv",
    meta_file='camelyonpatch_level_2_split_valid_meta.csv',
    name='valid',
)

HBox(children=(IntProgress(value=0, max=57458), HTML(value='')))


Out of 57458 images, 29350 is unknown


In [9]:
# Match the sample-submission ids against the test split; the result is
# written to DATA_PATH/sample_submission_wsi_test.csv.
postprocess(
    h5_file='camelyonpatch_level_2_split_test_x.h5',
    labels_file="sample_submission.csv",
    meta_file='camelyonpatch_level_2_split_test_meta.csv',
    name='test',
)

HBox(children=(IntProgress(value=0, max=57458), HTML(value='')))


Out of 57458 images, 28108 is unknown


In [14]:
# Load the three tables, each indexed by image id: the predictions in hyp.csv
# (the cells below read its 'label' column) and the two metadata files that
# postprocess() wrote for the validation and test splits.
origin = pd.read_csv(DATA_PATH / 'hyp.csv').set_index('id')
wsi_valid = pd.read_csv(DATA_PATH / 'sample_submission_wsi_valid.csv').set_index('id')
wsi_test = pd.read_csv(DATA_PATH / 'sample_submission_wsi_test.csv').set_index('id')

# to_dict() with its default orientation gives {column: {id: value}}, which
# is how the cells below look values up.
origin_dict = origin.to_dict()
wsi_valid_dict = wsi_valid.to_dict()
wsi_test_dict = wsi_test.to_dict()

In [11]:
# Display the validation metadata lookup: a dict keyed by column name whose
# values map image id -> that column's value for the id.
# NOTE(review): this renders every entry of the dict; consider showing a small
# slice (e.g. wsi_valid.head()) to keep the notebook output readable.
wsi_valid_dict

{'center_tumor_patch': {'db3278887b143632c75e5301608f3283607991e8': False,
  '77656c206f3b8a1c1238332afbe43bf3ca09f6d1': False,
  '2326bdb271a0e71f23b1b2c89df824e4df4be529': True,
  'b2a303e1c3d82457ab15e997b3418f78fe7a43e1': False,
  '1a9471385c773df28414453eca7d19e36b20df7d': True,
  '23badba6556c0fb74d1757a2915b4da7895c05b1': True,
  '1f3929e8256c2f385c1db4dbe9ee49b8ed161c61': True,
  '9d9e87f1e9773004f5cfea84fc0cae7eb8b0d6d4': False,
  '4fefce14b10451cdc84dc50d999bcc9f011d8aed': False,
  '43a413819ec4bec526808a9c9f528c3cdb3a9e5b': False,
  '9b919b2831501646e572e65eabc3235c0dcff287': False,
  'b5451b3d2f6230d579fd0b5219f03f5778ee7cf7': False,
  '526875c88144298578f95a12f1940d811c3cc23c': True,
  'dd7fd9e0d8d419f052a73029fb4623f3f8f896c8': True,
  '9d4b495eb7ed76a9455e08ea7ab35935007ebc80': True,
  '42eb18db8373bd8c4e9ed1a9a2ac64d0251e56b3': True,
  '0443737b84530547ec567c008ecf65ef0b4f216e': False,
  '6dcd2f81e0bd16648e4ac65ab18dc892a99971d0': True,
  'c0e8b7aac1982fdade46d5bc257c67

In [17]:
# Rewrite the predictions: ids that the validation metadata marks with
# tumor_patch == False get label 0., every other id keeps its original label.
# The result is written to DATA_PATH/output.csv.
origin_labels = origin_dict['label']
valid_tumor_patch = wsi_valid_dict['tumor_patch']

img_id_new = []
label_new = []
error_count = 0

pbar = tqdm(origin_labels.keys())
for img_id in pbar:
    # Every id is kept; only the label that goes with it varies.
    img_id_new.append(img_id)

    if img_id not in valid_tumor_patch:
        # Id is not part of the validation metadata: keep the prediction and count it.
        label_new.append(origin_labels[img_id])
        error_count += 1
        pbar.set_description("[ERROR={}] No id found.".format(error_count))
    elif valid_tumor_patch[img_id] == False:
        # Equality (not identity or truthiness) so that only an actual False
        # flag triggers the override.
        label_new.append(0.)
        pbar.set_description("[ERROR={}] {} = 0".format(error_count, img_id))
    else:
        label_new.append(origin_labels[img_id])

out = pd.DataFrame({'id': img_id_new, 'label': label_new}, columns=['id', 'label'])
out.to_csv(DATA_PATH/'output.csv',index=False)

HBox(children=(IntProgress(value=0, max=57458), HTML(value='')))

In [19]:
# Same rewrite as for the validation metadata, but driven by the test
# metadata: ids whose tumor_patch flag is False get label 0., all other ids
# keep their original label.
# Note that this writes DATA_PATH/output.csv, replacing any file of that name.
predicted = origin_dict['label']
test_tumor_patch = wsi_test_dict['tumor_patch']

pbar = tqdm(predicted.keys())

img_id_new, label_new = [], []
error_count = 0

for img_id in pbar:
    known = img_id in test_tumor_patch
    # Equality (not identity or truthiness) so that only an actual False flag
    # triggers the override.
    force_zero = known and test_tumor_patch[img_id] == False

    img_id_new.append(img_id)
    label_new.append(0. if force_zero else predicted[img_id])

    if force_zero:
        pbar.set_description("[ERROR={}] {} = 0".format(error_count, img_id))
    elif not known:
        # Id is not part of the test metadata: prediction kept, miss counted.
        error_count = error_count + 1
        pbar.set_description("[ERROR={}] No id found.".format(error_count))

out = pd.DataFrame({'id': img_id_new, 'label': label_new}, columns=['id', 'label'])
out.to_csv(DATA_PATH/'output.csv',index=False)

HBox(children=(IntProgress(value=0, max=57458), HTML(value='')))

In [21]:
# Replace each prediction with the center_tumor_patch flag from the metadata
# (True -> 1, False -> 0) whenever the id is found in the test or validation
# metadata; ids found in neither keep their original label. The result is
# written to DATA_PATH/output.csv.
#
# Fix: the test-metadata branch used to map False -> 1 and True -> 0, the
# opposite of the validation branch in the same loop, and the progress text
# always claimed "= 0". Both sources now share one mapping and the message
# reports the label actually written.
origin_labels = origin_dict['label']
test_center = wsi_test_dict['center_tumor_patch']
valid_center = wsi_valid_dict['center_tumor_patch']

pbar = tqdm(origin_labels.keys())

img_id_new = []
label_new = []

error_count = 0
for img_id in pbar:
    img_id_new.append(img_id)

    # The test metadata takes precedence over the validation metadata.
    if img_id in test_center:
        is_center_tumor = test_center[img_id]
    elif img_id in valid_center:
        is_center_tumor = valid_center[img_id]
    else:
        label_new.append(origin_labels[img_id])
        error_count = error_count+1
        pbar.set_description("[ERROR={}] No id found.".format(error_count))
        continue

    # Equality rather than truthiness: a flag that is neither True nor False
    # (e.g. a missing value) must fall through to the original prediction.
    if is_center_tumor == True:
        new_label = 1
    elif is_center_tumor == False:
        new_label = 0
    else:
        label_new.append(origin_labels[img_id])
        error_count = error_count+1
        continue

    label_new.append(new_label)
    pbar.set_description("[ERROR={}] {} = {}".format(error_count, img_id, new_label))

out = pd.DataFrame({'id': img_id_new, 'label': label_new}, columns=['id', 'label'])
out.to_csv(DATA_PATH/'output.csv',index=False)

HBox(children=(IntProgress(value=0, max=57458), HTML(value='')))