In [1]:
import os
import numpy as np
import pandas as pd
import pydicom
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm_notebook as tqdm
from datetime import datetime

In [2]:
def mask2rle(img, width=1024, height=1024, max_color=1):
    rle = []
    lastColor = 0
    currentPixel = 0
    runStart = -1
    runLength = 0

    for x in range(width):
        for y in range(height):
            currentColor = img[x][y]
            if currentColor != lastColor:
                if currentColor == max_color:
                    runStart = currentPixel
                    runLength = 1
                else:
                    rle.append(str(runStart))
                    rle.append(str(runLength))
                    runStart = -1
                    runLength = 0
                    currentPixel = 0
            elif runStart > -1:
                runLength += 1
            lastColor = currentColor
            currentPixel+=1

    return " ".join(rle)

def rle2mask(rle, width=1024, height=1024, max_color=255):
    mask = np.zeros(width * height)
    """WARNING: This function should only be used in SIIM dataset because it constains .T() transformation here"""
    if rle == '-1': return mask.reshape(width, height)
    array = np.asarray([int(x) for x in rle.split()])
    starts = array[0::2]
    lengths = array[1::2]

    current_position = 0
    for index, start in enumerate(starts):
        current_position += start
        mask[current_position:current_position+lengths[index]] = max_color
        current_position += lengths[index]
    """WARNING: This function should only be used in SIIM dataset because it constains .T() transformation here"""
    return mask.reshape(width, height)

In [3]:
def draw(image, vmin=0, vmax=1):
    F = plt.figure()
    plt.imshow(np.squeeze(image), cmap='plasma', vmin=vmin, vmax=vmax)
    plt.grid(False)
def draw_gray(image, vmin=0, vmax=255):
    F = plt.figure()
    plt.imshow(np.squeeze(image), cmap='Greys', vmin=vmin, vmax=vmax)
    plt.grid(False)
def get_metadata_by_id(path):
    ds = pydicom.dcmread(path)
    return ds.PatientAge, ds.PatientSex, ds.PixelSpacing, ds.ReferringPhysicianName, ds.SeriesDescription, ds.ViewPosition
def get_load_image_by_id(path):
    ds = pydicom.read_file(path)
    return np.array(ds.pixel_array)
def get_time(ts): #1517875163.537053 -> 2018-02-05 23:59:23
    return datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')

# Settings

In [72]:
SIIM_DATASET = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/siim_dataset.csv"
External_PATH1 = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/Data_Entry_2017.csv"
External_PATH2 = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/BBox_List_2017.csv"
OUTPUT_FILE = "/home/koke_cacao/Documents/Koke_Cacao/Python/WorkSpace/RedstoneTorch/data/siim_dataset/siim_dataset_external.csv"

In [None]:
siim_csv = pd.read_csv(SIIM_DATASET).set_index("ImageId")
siim_csv.head()

# External Dataset Preprocess

In [7]:
# concat two external datasets
external_csv = pd.concat([pd.read_csv(External_PATH1).set_index("Image Index")["Finding Labels"], pd.read_csv(External_PATH2).set_index("Image Index")["Finding Label"]], sort=False)
external_csv.head()

Image Index
00000001_000.png              Cardiomegaly
00000001_001.png    Cardiomegaly|Emphysema
00000001_002.png     Cardiomegaly|Effusion
00000002_000.png                No Finding
00000003_000.png                    Hernia
dtype: object

In [18]:
# filter out non-sick images
sick = external_csv[external_csv.str.contains("Pneumothorax")]
sick.head()

# filter out non-sick images and images in siim stage-1 tran & test
sick_external = sick[~sick.index.isin(siim_csv["NIHId"])]
sick_external.head()

# Set everything to 1
sick_external[:] = 1
sick_external.head()

Image Index
00000013_011.png    1
00000013_032.png    1
00000013_033.png    1
00000013_037.png    1
00000103_002.png    1
dtype: int64

In [19]:
# filter out sick images
non_sick = external_csv[~external_csv.str.contains("Pneumothorax")]
non_sick.head()

# filter out non-sick images and images in siim stage-1 tran & test
non_sick_external = non_sick[~non_sick.index.isin(siim_csv["NIHId"])]
non_sick_external.head()

# Set everything to 0
non_sick_external[:] = 0
non_sick_external.head()

Image Index
00000001_000.png    0
00000001_001.png    0
00000001_002.png    0
00000002_000.png    0
00000003_000.png    0
dtype: int64

##### Some Junk Code

In [None]:
# get out patient who never had pneumothorax before
def get_no_sick_patient(df):
    # print(df[df == "Pneumothorax"])
    index = df.index.tolist()
    at_least_one = []

    for i in tqdm(index):
        if "Pneumothorax" in df[i]:
            at_least_one.append(i.split("_")[0])
    no_sick = [int(i) for i in set([i.split("_")[0] for i in df.index.tolist()]) - set(at_least_one)]
    print("Index: {}; No sick: {}; At least one: {}".format(len(index), len(no_sick), len(at_least_one)))
    return no_sick

# Concat External to SIIM Big File

In [68]:
siim_csv = pd.read_csv(SIIM_DATASET).set_index("NIHId")
print(len(siim_csv))
siim_csv.head()

12047


Unnamed: 0_level_0,ImageId,PatientId,Test,Rle,Label,Time,TimeFloat
NIHId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00028864_007.png,1.2.276.0.7230010.3.1.4.8323329.5597.151787518...,28864,train,-1,0,2018-02-05 23:59:48,1517875000.0
00015606_016.png,1.2.276.0.7230010.3.1.4.8323329.12515.15178752...,15606,train,-1,0,2018-02-06 00:00:39,1517875000.0
00018865_006.png,1.2.276.0.7230010.3.1.4.8323329.4904.151787518...,18865,train,175349 7 1013 12 1009 17 1005 19 1003 20 1002 ...,1,2018-02-05 23:59:45,1517875000.0
00011831_006.png,1.2.276.0.7230010.3.1.4.8323329.32579.15178751...,11831,train,162376 12 1007 18 1003 22 999 26 996 28 993 30...,1,2018-02-05 23:59:21,1517875000.0
00011460_024.png,1.2.276.0.7230010.3.1.4.8323329.1314.151787516...,11460,train,119331 47 972 82 937 107 912 122 897 137 882 1...,1,2018-02-05 23:59:27,1517875000.0


In [69]:
siim_csv = siim_csv.append(sick_external.to_frame(name="Label"), sort=False)
print(len(siim_csv))

for row_index in tqdm(siim_csv.loc[sick_external.index.tolist()].index.tolist()):
    siim_csv.loc[row_index, "PatientId"] = int(row_index.split("_")[0])


siim_csv.tail()
# siim_csv[siim_csv["NIHId"].isin(sick_external)].head()

13197


HBox(children=(IntProgress(value=0, max=1194), HTML(value='')))




Unnamed: 0,ImageId,PatientId,Test,Rle,Label,Time,TimeFloat
00028924_009.png,,28924.0,,,1,,
00018055_038.png,,18055.0,,,1,,
00016937_014.png,,16937.0,,,1,,
00020671_010.png,,20671.0,,,1,,
00018055_045.png,,18055.0,,,1,,


In [70]:
siim_csv = siim_csv.append(non_sick_external[:-1].to_frame(name="Label"), sort=False)
print(len(siim_csv))


for row_index in tqdm(siim_csv.loc[non_sick_external[:-1].index.tolist()].index.tolist()):
    siim_csv.loc[row_index, "PatientId"] = int(row_index.split("_")[0])

siim_csv.tail()
# siim_csv[siim_csv["NIHId"].isin(sick_external)].head()

112889


HBox(children=(IntProgress(value=0, max=101337), HTML(value='')))




Unnamed: 0,ImageId,PatientId,Test,Rle,Label,Time,TimeFloat
00013187_002.png,,13187.0,,,0,,
00029464_015.png,,29464.0,,,0,,
00025769_001.png,,25769.0,,,0,,
00016837_002.png,,16837.0,,,0,,
00026920_000.png,,26920.0,,,0,,


In [71]:
siim_csv.loc[sick_external.index, "Test"] = "external"
siim_csv.loc[non_sick_external[:-1].index, "Test"] = "external"
siim_csv.tail()

Unnamed: 0,ImageId,PatientId,Test,Rle,Label,Time,TimeFloat
00013187_002.png,,13187.0,external,,0,,
00029464_015.png,,29464.0,external,,0,,
00025769_001.png,,25769.0,external,,0,,
00016837_002.png,,16837.0,external,,0,,
00026920_000.png,,26920.0,external,,0,,


In [None]:
siim_csv.to_csv(OUTPUT_FILE.