In [1]:
from pydicom import dcmread
from PIL import Image
from pandas import read_csv
from os import walk, mkdir
from random import seed, shuffle

In [2]:
# Fix the state of Python's global `random` generator so that the shuffles
# performed later in the notebook produce the same split on every full run.
seed(a=10)

In [3]:
# Root folder of the downloaded dataset; the label CSV and the DICOM image
# folder are both resolved relative to this path.
dataset_path = '/Users/snoopy/Desktop/Other/Harvard/Final Project/RSNA/rsna-pneumonia-detection-challenge'

# Alternative split sizes (not in use), kept for reference:
#   train: normal 1341, pneumonia 3875
#   test:  normal 234,  pneumonia 390
#   val:   normal 8,    pneumonia 8

# Number of images of each class that goes into each split.
split_lengths = {
    'train': {
        'normal': 12403,
        'pneumonia': 3607,
    },
    'test': {
        'normal': 4135,
        'pneumonia': 1203,
    },
    'val': {
        'normal': 4134,
        'pneumonia': 1202,
    },
}

In [4]:
# Load only the patient ID and the binary target column, then keep a single
# row per patient (drop_duplicates keeps the first occurrence by default).
# Malformed CSV lines are skipped rather than raising.
labels = read_csv(
    dataset_path+'/stage_2_train_labels.csv',
    sep=',',
    usecols=['patientId', 'Target'],
    skip_blank_lines=True,
    on_bad_lines='skip',
).drop_duplicates(subset=['patientId'])

In [5]:
# os.walk yields (dirpath, dirnames, filenames) tuples; the first tuple
# describes the image folder itself, so element [2] is its list of files.
# The listing order is filesystem-dependent, so sort it: together with the
# fixed random seed this makes the later shuffle (and therefore the
# train/test/val assignment) reproducible across machines.
files_in_order = sorted(next(walk(dataset_path+'/stage_2_train_images'))[2])

In [6]:
# Build a patientId -> Target lookup once instead of scanning the whole
# DataFrame for every file. Duplicated IDs resolve to their first row, which
# matches the previous `.iloc[0]` behaviour.
target_by_patient = (labels.drop_duplicates(subset=['patientId'])
                           .set_index('patientId')['Target']
                           .to_dict())

# Bucket every DICOM filename by its label: Target == 1 is 'pneumonia',
# anything else is 'normal'.
filenames_by_class = {'normal': [], 'pneumonia': []}
for filename in files_in_order:
    # Ignore stray non-DICOM entries in the image folder (e.g. OS metadata
    # files) instead of failing on the label lookup.
    if not filename.lower().endswith('.dcm'):
        continue
    patient_ID = filename[:-4]  # strip the '.dcm' extension
    if patient_ID not in target_by_patient:
        raise KeyError(f'no label found for image {filename!r}')
    if target_by_patient[patient_ID] == 1:
        filenames_by_class['pneumonia'].append(filename)
    else:
        filenames_by_class['normal'].append(filename)

In [7]:
# Shuffle each class list in place ('normal' first, then 'pneumonia') so the
# splits taken from consecutive slices of these lists are random.
for class_name in ('normal', 'pneumonia'):
    shuffle(filenames_by_class[class_name])

In [8]:
def create_dir(dir_path):
    """Create the directory ``dir_path`` unless it already exists.

    Only the last path component is created; parent directories are not.
    Calling this for a directory that is already present is a no-op.

    Args:
        dir_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_path`` exists but is not a directory
            (e.g. a regular file is in the way).
        FileNotFoundError: Propagated from ``mkdir`` when a parent directory
            is missing.
    """
    # Local import: only `walk` and `mkdir` are imported from `os` at the top.
    from os.path import isdir

    try:
        mkdir(path=dir_path)
    except FileExistsError:
        # An existing directory is fine; anything else at this path would
        # make later writes into it fail, so surface the error right away.
        if not isdir(dir_path):
            raise

In [9]:
# Class sub-folders created inside every split folder.
img_classes = ['normal', 'pneumonia']

# Split folders, listed in the order in which they are processed.
splits = ['train', 'test', 'val']

# Root output folder for the converted JPEG dataset (relative path).
# Alternative output root, currently disabled: 'RSNA-jpeg'
jpeg_dataset_path = 'RSNA-full-jpeg'

In [10]:
# Build the output tree <root>/<split>/<class>, parents before children
# because create_dir only creates the last path component.
create_dir(jpeg_dataset_path)
for split_name in splits:
    split_dir = '/'.join((jpeg_dataset_path, split_name))
    create_dir(split_dir)
    for class_name in img_classes:
        create_dir('/'.join((split_dir, class_name)))

In [11]:
# Convert the shuffled DICOM files to JPEG, filling the splits one after the
# other. For each class, `start_index` tracks where the next split begins in
# that class's shuffled file list, so the splits never overlap.
start_index = {'normal': 0, 'pneumonia': 0}
for split in splits:
    for img_class in img_classes:
        class_files = filenames_by_class[img_class]
        first = start_index[img_class]
        # Compute the end of this slice explicitly instead of relying on the
        # loop variable afterwards: with an empty split the loop variable
        # would be stale (or undefined) and corrupt the next offset.
        stop = first + split_lengths[split][img_class]
        if stop > len(class_files):
            # Fail before writing anything for this split rather than
            # crashing half-way through with an IndexError.
            raise ValueError(
                f"split '{split}' needs {img_class} images up to index {stop}, "
                f"but only {len(class_files)} are available")
        for dcm_filename in class_files[first:stop]:
            # Replace the 3-character 'dcm' extension with 'jpeg'.
            jpeg_filename = dcm_filename[:-3]+'jpeg'
            dcm_img = dcmread(dataset_path+'/stage_2_train_images/'+dcm_filename)
            # NOTE(review): assumes pixel_array has a dtype/mode that PIL can
            # save as JPEG (e.g. 8-bit grayscale) — confirm for this dataset.
            jpeg_img = Image.fromarray(dcm_img.pixel_array)
            # quality=100 with subsampling=0 keeps JPEG compression loss low.
            jpeg_img.save(fp=jpeg_dataset_path+'/'+split+'/'+img_class+'/'+jpeg_filename,
                          format='jpeg',
                          quality=100,
                          subsampling=0)
        start_index[img_class] = stop