In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import tensorflow as tf
import glob
import pydicom
import PIL

from PIL import ImageOps
from tqdm import tqdm_notebook
from multiprocessing import Pool
from multiprocessing import cpu_count

sys.path.append('../')
from rsna import utils

## config

In [3]:
! ls /mirco-kaggle/rsna/

GCP Credits Request Link - RSNA.txt  stage_1_test_images_jpg
all.zip				     stage_1_test_images_jpg_dev
label_map.pbtxt			     stage_1_test_images_jpg_eqhist
stage_1_detailed_class_info.csv      stage_1_train_images
stage_1_sample_submission.csv	     stage_1_train_images.zip
stage_1_test_images		     stage_1_train_images_jpg
stage_1_test_images.zip		     stage_1_train_images_jpg_eqhist
stage_1_test_images_dcm_dev	     stage_1_train_labels.csv


In [4]:
# source_images_path = "/mirco-kaggle/rsna/stage_1_test_images/"
source_images_path = "/mirco-kaggle/rsna/stage_1_train_images/"


In [5]:
# destination_images_path = "/mirco-kaggle/rsna/stage_1_test_images_jpg/"
# destination_images_path = "/mirco-kaggle/rsna/stage_1_train_images_jpg/"

# destination_images_path = "/mirco-kaggle/rsna/stage_1_test_images_jpg_eqhist/"
# destination_images_path = "/mirco-kaggle/rsna/stage_1_train_images_jpg_eqhist/"
destination_images_path = "/home/ubuntu/stage_1_train_images_jpg_eqhist/"

In [6]:
if not os.path.exists(destination_images_path):
    os.makedirs(destination_images_path)

In [7]:
dcm_image_filepaths = glob.glob(os.path.join(source_images_path, "*.dcm"))
print(f"total number of DCM files: {len(dcm_image_filepaths)}")

total number of DCM files: 25684


## extract image from dcm

In [8]:
def dcm_to_jpg(dcm_image_filepath):
    image_filename = os.path.basename(dcm_image_filepath).split(".")[0]
    image_array = utils.Annot._read_dcm(dcm_image_filepath)
    image_pil = PIL.Image.fromarray(image_array)
    
    image_pil = ImageOps.equalize(image_pil)
    
    image_pil.save(os.path.join(destination_images_path, image_filename + ".jpg"), "JPEG", quality=100)

In [9]:
with Pool(cpu_count()) as p:
    tf_examples = list(tqdm_notebook(p.imap(dcm_to_jpg, dcm_image_filepaths), total=len(destination_images_path)))

HBox(children=(IntProgress(value=0, max=45), HTML(value='')))




## potentially fix jpgs

In [10]:
# from: https://github.com/tensorflow/models/issues/2194

def fix_jpg():
    path_images = destination_images_path
    filenames_src = tf.gfile.ListDirectory(path_images)
    for filename_src in tqdm_notebook(filenames_src):
        stem, extension = os.path.splitext(filename_src)
        if (extension.lower() != '.jpg'): continue


        pathname_jpg = '{}/{}'.format(path_images, filename_src)
        with tf.gfile.GFile(pathname_jpg, 'rb') as fid:
            encoded_jpg = fid.read(4)
        # png
        if(encoded_jpg[0] == 0x89 and encoded_jpg[1] == 0x50 and encoded_jpg[2] == 0x4e and encoded_jpg[3] == 0x47):
            # copy jpg->png then encode png->jpg
            print('png:{}'.format(filename_src))
            pathname_png = '{}/{}.png'.format(path_images, stem)
            tf.gfile.Copy(pathname_jpg, pathname_png, True)
            PIL.Image.open(pathname_png).convert('RGB').save(pathname_jpg, "jpeg")   
        # gif
        elif(encoded_jpg[0] == 0x47 and encoded_jpg[1] == 0x49 and encoded_jpg[2] == 0x46):
            # copy jpg->gif then encode gif->jpg
            print('gif:{}'.format(filename_src))
            pathname_gif = '{}/{}.gif'.format(path_images, stem)
            tf.gfile.Copy(pathname_jpg, pathname_gif, True)
            PIL.Image.open(pathname_gif).convert('RGB').save(pathname_jpg, "jpeg")   
        elif(filename_src == 'beagle_116.jpg' or filename_src == 'chihuahua_121.jpg'):
            # copy jpg->jpeg then encode jpeg->jpg
            print('jpeg:{}'.format(filename_src))
            pathname_jpeg = '{}/{}.jpeg'.format(path_images, stem)
            tf.gfile.Copy(pathname_jpg, pathname_jpeg, True)
            PIL.Image.open(pathname_jpeg).convert('RGB').save(pathname_jpg, "jpeg")   
        elif(encoded_jpg[0] != 0xff or encoded_jpg[1] != 0xd8 or encoded_jpg[2] != 0xff):
            print('not jpg:{}'.format(filename_src))


In [11]:
fix_jpg()

HBox(children=(IntProgress(value=0, max=25684), HTML(value='')))


