In [1]:
import pandas as pd
import os
from tqdm import tqdm
import pydicom as dcm
from PIL import Image
import numpy as np
import csv
import tensorflow as tf
from matplotlib import pyplot as plt
import time
from joblib import Parallel, delayed
import sys

In [2]:
# Directory holding the source DICOM files; write_to_png reads `orig_train_img_dir + img_name`.
orig_train_img_dir = '/home/kwhom/rsna-intracranial-hemorrhage-detection/stage_2_train/'
# Directory the converted PNGs are written to; it is also the directory listed to find
# already-converted files and, later, to find unreadable PNGs.
# Both paths are used by plain string concatenation, so the trailing '/' is required.
# NOTE(review): this path is under /home/jupyter while the source is under /home/kwhom —
# confirm both locations are intended.
train_img_dir = '/home/jupyter/rsna-intracranial-hemorrhage-detection/stage_2_train_imgs/'

In [3]:
def get_center_and_width(dicom):
    """Return ``(window_center, window_width)`` of a DICOM dataset as ints.

    ``dicom.WindowCenter`` and ``dicom.WindowWidth`` may each be either a
    single value or a pydicom ``MultiValue``; for a ``MultiValue`` only the
    first entry is used.

    Args:
        dicom: object exposing ``WindowCenter`` and ``WindowWidth`` attributes.

    Returns:
        A 2-tuple ``(center, width)`` of Python ints.
    """
    def _first_as_int(value):
        # isinstance (rather than an exact type comparison) so that subclasses
        # of MultiValue are unpacked as well.
        if isinstance(value, dcm.multival.MultiValue):
            value = value[0]
        return int(value)

    return _first_as_int(dicom.WindowCenter), _first_as_int(dicom.WindowWidth)
def normalize_minmax(img):
    """Linearly rescale ``img`` so its minimum maps to 0 and its maximum to 1.

    A constant image (minimum equal to maximum) cannot be stretched; in that
    case the image minus its minimum — i.e. all zeros — is returned instead,
    which avoids a division by zero.
    """
    lowest = img.min()
    highest = img.max()
    shifted = img - lowest
    if lowest == highest:
        return shifted
    return shifted / (highest - lowest)

def window_filter(img, center, width, slope, intercept):
    """Rescale pixel values, clamp them to an intensity window, map to 0-255.

    Steps:
      1. ``img * slope + intercept`` (computed on a copy; ``img`` is untouched).
      2. Values outside ``[center - width//2, center + width//2]`` are clamped
         to the nearest bound.
      3. The clamped data is min-max normalised and multiplied by 255.

    Note that step 3 stretches between the minimum and maximum of the clamped
    data itself, not between the window bounds.

    Returns:
        Array of the same shape as ``img`` with values in ``[0, 255]``.
    """
    half_width = width // 2
    floor = center - half_width
    ceiling = center + half_width

    rescaled = np.copy(img) * slope + intercept
    rescaled[rescaled < floor] = floor
    rescaled[rescaled > ceiling] = ceiling

    return normalize_minmax(rescaled) * 255

def get_img_tensor(img_path):
    """Read a DICOM file and render it as a 3-channel 8-bit image array.

    The pixel data is passed through ``window_filter`` three times with fixed
    (center, width) windows — brain (40, 80), subdural (80, 200) and tissue
    (40, 380) — using the file's ``RescaleSlope`` / ``RescaleIntercept``. The
    three results are stacked along a new last axis.

    The window stored in the file (``WindowCenter`` / ``WindowWidth``) is not
    used, so it is no longer read here.

    Args:
        img_path: path of the DICOM file.

    Returns:
        ``np.uint8`` array with the three windows stacked on axis 2
        (``(rows, cols, 3)`` for a 2-D pixel array).

    Raises:
        Whatever ``pydicom`` raises for an unreadable file or missing
        attributes; nothing is caught here.
    """
    dicom = dcm.dcmread(img_path, force=True)
    img = dicom.pixel_array
    slope, intercept = dicom.RescaleSlope, dicom.RescaleIntercept

    # (center, width) per output channel, in order: brain, subdural, tissue.
    windows = ((40, 80), (80, 200), (40, 380))
    channels = [window_filter(img, center, width, slope, intercept)
                for center, width in windows]

    # window_filter yields values in [0, 255]; uint8 holds that range exactly,
    # whereas a signed 8-bit cast is out of range for anything above 127.
    return np.stack(channels, axis=2).astype(np.uint8)

def write_to_png(img_name, png_dir):
    """Convert one DICOM from ``orig_train_img_dir`` into a PNG in ``png_dir``.

    The output file is named ``<img_id>.png`` where ``img_id`` is the part of
    ``img_name`` before its first '.'. Only images whose rendered array has
    shape ``(512, 512, 3)`` are written; other shapes are skipped.

    Conversion is best-effort: any ordinary error while reading, rendering or
    saving is reported on stderr and the file is skipped. ``KeyboardInterrupt``
    and ``SystemExit`` are not caught, so a long run can still be stopped.

    Args:
        img_name: file name (not path) of the DICOM inside ``orig_train_img_dir``.
        png_dir: directory to write the PNG into.
    """
    img_id = img_name.split('.')[0]
    src_path = os.path.join(orig_train_img_dir, img_name)
    dst_path = os.path.join(png_dir, img_id + '.png')

    try:
        img_array = get_img_tensor(src_path)
        if img_array.shape != (512, 512, 3):
            return
        Image.fromarray(img_array, 'RGB').save(dst_path)
    except Exception as err:
        # Keep going on bad inputs, but leave a trace of what was skipped.
        print(f'skipping {img_name}: {err!r}', file=sys.stderr)
# Module-level progress counters maintained by write_if_not_present:
#   i - file names examined, f - conversions attempted.
i = f = 0
# One-off snapshot of the file names already in the PNG output directory.
present = set(os.listdir(train_img_dir))


def write_if_not_present(img_name):
    """Convert ``img_name`` to PNG unless its PNG was already listed in ``present``.

    The expected PNG name is the part of ``img_name`` before the first '.'
    plus '.png'. ``f`` is incremented for every conversion that is attempted
    (whether or not a file ends up being written) and ``i`` for every call.
    """
    global f, i
    png_name = img_name.split('.')[0] + '.png'
    already_converted = png_name in present
    if not already_converted:
        write_to_png(img_name, train_img_dir)
        f += 1
    i += 1

In [4]:
# Parallel(n_jobs=10)(delayed(write_if_not_present)(img_name) for img_name in tqdm(os.listdir(orig_train_img_dir)))
# list(present)[:20]
# for img_name in tqdm(os.listdir(orig_train_img_dir)):
    # write_if_not_present(img_name)

In [5]:
# Number of unreadable PNGs deleted by remove_if_error.
j = 0


def remove_if_error(img_name):
    """Delete ``img_name`` from ``train_img_dir`` if PIL cannot open it.

    ``Image.open`` only identifies the file (it does not decode the pixel
    data), so this catches files PIL cannot recognise at all. Each deletion
    increments the module-level counter ``j``.

    Only ordinary exceptions trigger a deletion: ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, so interrupting a run never removes the file
    that happened to be under inspection.

    Args:
        img_name: file name (not path) inside ``train_img_dir``.
    """
    global j
    img_path = os.path.join(train_img_dir, img_name)
    try:
        # Context manager closes the file handle that Image.open leaves open.
        with Image.open(img_path):
            pass
    except Exception:
        os.remove(img_path)
        j += 1
    
# Check every file in the PNG directory on 10 threads (batches of 10);
# remove_if_error deletes the ones PIL cannot open and counts them in `j`.
png_names = os.listdir(train_img_dir)
thread_pool = Parallel(n_jobs=10, backend='threading', batch_size=10)
thread_pool(delayed(remove_if_error)(png_name) for png_name in tqdm(png_names))
print(f'removed {j}')

100%|██████████| 752526/752526 [21:38<00:00, 579.52it/s]  


removed 28
