In [2]:
import os
import numpy as np
import pandas as pd
import pydicom
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

# Making Mixed Labels

In [2]:
PATH = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/train-rle.csv"
NEW_PATH = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/train-rle-mix.csv"
dataset = pd.read_csv(PATH)
dataset.head()

Unnamed: 0,ImageId,EncodedPixels
0,1.2.276.0.7230010.3.1.4.8323329.5597.151787518...,-1
1,1.2.276.0.7230010.3.1.4.8323329.12515.15178752...,-1
2,1.2.276.0.7230010.3.1.4.8323329.4904.151787518...,175349 7 1013 12 1009 17 1005 19 1003 20 1002 ...
3,1.2.276.0.7230010.3.1.4.8323329.32579.15178751...,407576 2 1021 7 1015 10 1013 12 1011 14 1008 1...
4,1.2.276.0.7230010.3.1.4.8323329.32579.15178751...,252069 1 1021 3 1020 4 1018 5 1018 6 1016 7 10...


In [3]:
def mask2rle(img, width=1024, height=1024, max_color=1):
    rle = []
    lastColor = 0
    currentPixel = 0
    runStart = -1
    runLength = 0

    for x in range(width):
        for y in range(height):
            currentColor = img[x][y]
            if currentColor != lastColor:
                if currentColor == max_color:
                    runStart = currentPixel
                    runLength = 1
                else:
                    rle.append(str(runStart))
                    rle.append(str(runLength))
                    runStart = -1
                    runLength = 0
                    currentPixel = 0
            elif runStart > -1:
                runLength += 1
            lastColor = currentColor
            currentPixel+=1

    return " ".join(rle)

def rle2mask(rle, width=1024, height=1024):
    mask = np.zeros(width * height)
    """WARNING: This function should only be used in SIIM dataset because it constains .T() transformation here"""
    if rle == '-1': return mask.reshape(width, height)
    array = np.asarray([int(x) for x in rle.split()])
    starts = array[0::2]
    lengths = array[1::2]

    current_position = 0
    for index, start in enumerate(starts):
        current_position += start
        mask[current_position:current_position+lengths[index]] = 255
        current_position += lengths[index]
    """WARNING: This function should only be used in SIIM dataset because it constains .T() transformation here"""
    return mask.reshape(width, height)

In [4]:
dataset_dict = dict()
dataset_id = dataset['ImageId']
dataset_set = set(dataset_id.to_list())
print("Unique Size: {} Actual Size: {}".format(len(dataset_set), len(dataset_id)))
for id in dataset_set:
    dataset_dict[id] = dataset.loc[dataset_id == id]['EncodedPixels']
dataset_dict

Unique Size: 10675 Actual Size: 11582


{'1.2.276.0.7230010.3.1.4.8323329.13726.1517875247.412197.dcm': 6826    -1
 Name: EncodedPixels, dtype: object,
 '1.2.276.0.7230010.3.1.4.8323329.14006.1517875249.93352.dcm': 3591    -1
 Name: EncodedPixels, dtype: object,
 '1.2.276.0.7230010.3.1.4.8323329.5587.1517875188.903880.dcm': 11469    -1
 Name: EncodedPixels, dtype: object,
 '1.2.276.0.7230010.3.1.4.8323329.1469.1517875167.950672.dcm': 9228    -1
 Name: EncodedPixels, dtype: object,
 '1.2.276.0.7230010.3.1.4.8323329.372.1517875162.608228.dcm': 8445    213226 10 1011 14 1008 18 1003 22 1000 23 999 ...
 8446    239133 1 1019 6 1017 7 1015 10 1013 11 1012 12...
 8447    589951 2 1019 8 1014 11 1011 13 1009 16 1007 1...
 8448    126496 27 992 37 982 52 968 61 961 73 949 88 9...
 8449    214256 1 1020 8 1012 11 1011 13 1009 14 1007 1...
 Name: EncodedPixels, dtype: object,
 '1.2.276.0.7230010.3.1.4.8323329.31942.1517875157.591754.dcm': 10000    -1
 Name: EncodedPixels, dtype: object,
 '1.2.276.0.7230010.3.1.4.8323329.12980.15178752

In [5]:
def get_mask_sum(encodes):
#     print("This has length: {}".format(length))
    masks = []
    for encode in encodes:
        mask = rle2mask(encode)
#         plt.imshow(mask)
        masks.append(mask)
#     print("Original Pixels: {}".format(np.array(masks).sum()/256))
    masks = (np.array(masks).sum(axis=0) > 0).astype(np.byte)
#     print("Transformed Pixels: {}".format(masks.sum()/1))
    
    """Testing if encode is correct"""
#     plt.imshow(masks)
#     plt.imshow(rle2mask(mask2rle(masks)))

    masks = mask2rle(masks)
    return masks
    
new_dataset = dict()
pbar = tqdm(dataset_dict.items())
for i, (id, encodes) in enumerate(pbar):
    encodes = encodes.to_list()
    length = len(encodes)
    mask = encodes
    if length > 1:
        mask = [get_mask_sum(mask)]
    assert len(mask) == 1
    new_dataset[i] = [id, mask[0]]

HBox(children=(IntProgress(value=0, max=10675), HTML(value='')))




In [6]:
df = pd.DataFrame.from_dict(new_dataset, orient='index', columns=['ImageId', 'EncodedPixels'])
df.head()

Unnamed: 0,ImageId,EncodedPixels
0,1.2.276.0.7230010.3.1.4.8323329.13726.15178752...,-1
1,1.2.276.0.7230010.3.1.4.8323329.14006.15178752...,-1
2,1.2.276.0.7230010.3.1.4.8323329.5587.151787518...,-1
3,1.2.276.0.7230010.3.1.4.8323329.1469.151787516...,-1
4,1.2.276.0.7230010.3.1.4.8323329.372.1517875162...,126496 27 992 37 982 52 968 61 961 73 949 88 9...


In [7]:
df.to_csv(NEW_PATH, index=False)

# Adding Metadata & Saving to `.npy`

In [14]:
SAVE = False
IMG_PATH = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/siim-original/dicom-images-train"
PATH = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/train-rle-mix.csv"
NEW_PATH = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/train-rle-mix-added.csv"
dataset = pd.read_csv(PATH)
dataset.head()

Unnamed: 0,ImageId,EncodedPixels
0,1.2.276.0.7230010.3.1.4.8323329.13726.15178752...,-1
1,1.2.276.0.7230010.3.1.4.8323329.14006.15178752...,-1
2,1.2.276.0.7230010.3.1.4.8323329.5587.151787518...,-1
3,1.2.276.0.7230010.3.1.4.8323329.1469.151787516...,-1
4,1.2.276.0.7230010.3.1.4.8323329.372.1517875162...,126496 27 992 37 982 52 968 61 961 73 949 88 9...


In [9]:
def get_metadata_by_id(path):
    ds = pydicom.dcmread(path)
    return ds.PatientAge, ds.PatientSex, ds.PixelSpacing, ds.ReferringPhysicianName, ds.SeriesDescription, ds.ViewPosition

def get_load_image_by_id(path):
    ds = pydicom.read_file(path)
    return np.array(ds.pixel_array)

In [10]:
for id in dataset['ImageId'].to_list():
    print(dataset.loc[dataset['ImageId'] == id].to_dict())
    break

{'ImageId': {0: '1.2.276.0.7230010.3.1.4.8323329.13726.1517875247.412197.dcm'}, 'EncodedPixels': {0: '-1'}}


In [15]:
# imageid = []
# encodedpixels = []
# PatientAge = []
# PatientSex = []
# PixelSpacing = []
# ReferringPhysicianName = []
# SeriesDescription = []
# ViewPosition = []

new_dataset = dict()
num_saved = 0

pbar = tqdm(dataset.iterrows())
for i, (index, row) in enumerate(pbar):
#     imageid.append(row['ImageId'])
#     encodedpixels.append(row['EncodedPixels'])
    path = os.path.join(IMG_PATH, row['ImageId'])
    a,b,c,d,e,f = get_metadata_by_id(path)
    
    save_path = path.replace(".dcm", ".npy")
    if SAVE and not os.path.isfile(save_path):
        np.save(save_path, get_load_image_by_id(path))
        num_saved = num_saved + 1
    pbar.set_description("num_saved = {}".format(num_saved))
    
#     PatientAge.append(a)
#     PatientSex.append(b)
#     PixelSpacing.append(c)
#     ReferringPhysicianName.append(d)
#     SeriesDescription.append(e)
#     ViewPosition.append(f)
    if SAVE:
        new_dataset[i] = [row['ImageId'].replace(".dcm", ".npy"), row['EncodedPixels'], a,b,c,d,e,f]
    else:
        new_dataset[i] = [row['ImageId'], row['EncodedPixels'], a,b,c,d,e,f]


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [16]:
df = pd.DataFrame.from_dict(new_dataset, orient='index', columns=['ImageId', 'EncodedPixels', 'PatientAge', 'PatientSex', 'PixelSpacing', 'ReferringPhysicianName', 'SeriesDescription', 'ViewPosition'])
df.head()

Unnamed: 0,ImageId,EncodedPixels,PatientAge,PatientSex,PixelSpacing,ReferringPhysicianName,SeriesDescription,ViewPosition
0,1.2.276.0.7230010.3.1.4.8323329.13726.15178752...,-1,69,M,"[0.14300000000000002, 0.14300000000000002]",,view: PA,PA
1,1.2.276.0.7230010.3.1.4.8323329.14006.15178752...,-1,46,F,"[0.168, 0.168]",,view: AP,AP
2,1.2.276.0.7230010.3.1.4.8323329.5587.151787518...,-1,54,F,"[0.14300000000000002, 0.14300000000000002]",,view: PA,PA
3,1.2.276.0.7230010.3.1.4.8323329.1469.151787516...,-1,48,F,"[0.168, 0.168]",,view: AP,AP
4,1.2.276.0.7230010.3.1.4.8323329.372.1517875162...,126496 27 992 37 982 52 968 61 961 73 949 88 9...,34,M,"[0.19431099999999998, 0.19431099999999998]",,view: PA,PA


In [17]:
df.to_csv(NEW_PATH, index=False)

# Leakage Usage

In [3]:
PATH = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/old_sample_submission_leakage.csv"
NEW_PATH = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/train-rle-mix-added-leak.csv"
dataset = pd.read_csv(PATH)
dataset.head()

Unnamed: 0,ImageId,EncodedPixels
0,1.2.276.0.7230010.3.1.4.8323329.9488.151787521...,-1
1,1.2.276.0.7230010.3.1.4.8323329.7388.151787520...,-1
2,1.2.276.0.7230010.3.1.4.8323329.8104.151787520...,-1
3,1.2.276.0.7230010.3.1.4.8323329.952.1517875165...,-1
4,1.2.276.0.7230010.3.1.4.8323329.680.1517875164...,-1


In [7]:
dic = dict()

# count each appearance
for id in dataset['ImageId'].tolist():
    if id not in dic.keys():
        dic[id] = 1
    else:
        dic[id] = dic[id] + 1

# classify each appearance
appear = dict()
for id, appearance in dic.items():
    if dic[id] not in appear.keys():
        appear[dic[id]] = [id]
    else:
        appear[dic[id]].append(id)

print(appear) # we know that multiple appearance is not empty

{1: ['1.2.276.0.7230010.3.1.4.8323329.9488.1517875218.175563', '1.2.276.0.7230010.3.1.4.8323329.7388.1517875204.201431', '1.2.276.0.7230010.3.1.4.8323329.8104.1517875209.54822', '1.2.276.0.7230010.3.1.4.8323329.952.1517875165.756716', '1.2.276.0.7230010.3.1.4.8323329.680.1517875164.49743', '1.2.276.0.7230010.3.1.4.8323329.586.1517875163.569244', '1.2.276.0.7230010.3.1.4.8323329.7050.1517875202.507168', '1.2.276.0.7230010.3.1.4.8323329.600.1517875163.632969', '1.2.276.0.7230010.3.1.4.8323329.8303.1517875210.164933', '1.2.276.0.7230010.3.1.4.8323329.6707.1517875199.622316', '1.2.276.0.7230010.3.1.4.8323329.9338.1517875217.483777', '1.2.276.0.7230010.3.1.4.8323329.6652.1517875199.400704', '1.2.276.0.7230010.3.1.4.8323329.8599.1517875212.789106', '1.2.276.0.7230010.3.1.4.8323329.8129.1517875209.182736', '1.2.276.0.7230010.3.1.4.8323329.6115.1517875196.616758', '1.2.276.0.7230010.3.1.4.8323329.6682.1517875199.532073', '1.2.276.0.7230010.3.1.4.8323329.8507.1517875212.328410', '1.2.276.0.7230

In [9]:
l = []
for key, item in appear.items():
    if key != 1:
        l = l + item
print("There are {} images we can use in the test set.".format(len(l)))

There are 248 images we can use in the test set.
