Adapted from : https://www.kaggle.com/code/theoviel/dicom-resized-png-jpg

**Dataset Links :**
- Part 1 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-png-pt1
- Part 2 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-png-pt2
- Part 3 : https://www.kaggle.com/datasets/theoviel/rsna-2023-abdominal-trauma-detection-pngs-3-8
- Part 4 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-png-pt4
- Part 5 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-png-pt5
- Part 6 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-png-pt6
- Part 7 : https://www.kaggle.com/datasets/theoviel/rsna-abdominal-trauma-detection-pngs-pt7
- Part 8 : https://www.kaggle.com/datasets/theoviel/rsna-2023-abdominal-trauma-detection-pngs-18

**Changes :**
- Apply `standardize_pixel_array` function
- Update links

**TODO :**
- DICOM processing on GPU
- Figure out why the example DICOM is too dark

In [1]:
#!pip install -qU python-gdcm pydicom pylibjpeg
#!pip install opencv-python
#!pip install seaborn

In [2]:
import os
import cv2
import glob
import gc
import gdcm
import pydicom
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from joblib import Parallel, delayed
from pydicom.pixel_data_handlers.util import apply_voi_lut
from multiprocessing import Pool

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
def standardize_pixel_array(dcm: pydicom.dataset.FileDataset) -> np.ndarray:
    """
    Return the pixel data of one DICOM slice after sign correction and modality rescaling.

    Source : https://www.kaggle.com/competitions/rsna-2023-abdominal-trauma-detection/discussion/427217

    Args:
        dcm: Dataset read with `pydicom.dcmread`. It must expose `pixel_array`,
            `PixelRepresentation`, `BitsAllocated` and `BitsStored`.

    Returns:
        The slice as returned by pydicom's `apply_modality_lut`.
    """
    pixel_array = dcm.pixel_array

    # Correct DICOM pixel_array if PixelRepresentation == 1: shift the stored
    # bits up to the top of the allocated word and back down again.
    # NOTE(review): presumably this sign-extends values stored in fewer bits
    # than allocated -- confirm against the linked discussion.
    if dcm.PixelRepresentation == 1:
        bit_shift = dcm.BitsAllocated - dcm.BitsStored
        dtype = pixel_array.dtype
        pixel_array = (pixel_array << bit_shift).astype(dtype) >> bit_shift

    # The modality LUT used to be applied only in the branch above, so slices
    # with PixelRepresentation == 0 came back as raw stored values while the
    # others came back rescaled. Apply it to every slice so the return value
    # always has the same meaning.
    return pydicom.pixel_data_handlers.util.apply_modality_lut(pixel_array, dcm)

In [4]:
# Root folder of the competition data, and the folder inside it that holds one
# sub-directory per training patient (note the trailing slash on TRAIN_PATH).
BASE_PATH = '/home/junseonglee/01_codes/input/rsna-2023-abdominal-trauma-detection'
TRAIN_PATH = "/home/junseonglee/01_codes/input/rsna-2023-abdominal-trauma-detection/train_images/"

# Every entry of TRAIN_PATH is counted as one patient.
n_train_entries = len(os.listdir(TRAIN_PATH))
print('Number of training patients :', n_train_entries)

Number of training patients : 3147


### Save the processed data

In [5]:
# Number of cross-validation folds used later when the training table is built.
N_FOLDS = 5

# Number of contiguous chunks the patient list is cut into.
n_chunk = 8

# Patient directory names, in whatever order the filesystem returns them.
patients = os.listdir(TRAIN_PATH)
n_patients = len(patients)

# n_chunk + 1 integer boundaries from 0 to n_patients; chunk k covers
# patients[rng_patients[k]:rng_patients[k + 1]].
rng_patients = np.linspace(0, n_patients, num=n_chunk + 1, dtype=int)

# Last expression of the cell: displays the patient list in the notebook.
patients

['43082',
 '29731',
 '65083',
 '9860',
 '38822',
 '61255',
 '30424',
 '24322',
 '19054',
 '50330',
 '28927',
 '51007',
 '58435',
 '42159',
 '46358',
 '23434',
 '19461',
 '14178',
 '65450',
 '17897',
 '56637',
 '521',
 '32819',
 '908',
 '9549',
 '31767',
 '56344',
 '64777',
 '29181',
 '28371',
 '56836',
 '2629',
 '16645',
 '5392',
 '33007',
 '61128',
 '16859',
 '4406',
 '26501',
 '17185',
 '7202',
 '63319',
 '4384',
 '7392',
 '716',
 '15783',
 '48778',
 '12487',
 '3607',
 '2475',
 '38109',
 '50324',
 '43057',
 '31853',
 '24439',
 '36251',
 '33967',
 '51959',
 '11908',
 '54776',
 '3934',
 '36738',
 '41855',
 '55490',
 '15968',
 '65355',
 '18362',
 '49219',
 '15809',
 '54835',
 '12011',
 '18947',
 '55882',
 '33100',
 '496',
 '31645',
 '45870',
 '33133',
 '54037',
 '51927',
 '30877',
 '48830',
 '36327',
 '8044',
 '4852',
 '36542',
 '30091',
 '52714',
 '62066',
 '1526',
 '36105',
 '62397',
 '56082',
 '49096',
 '63948',
 '22954',
 '12875',
 '32568',
 '15697',
 '54669',
 '30954',
 '21147',
 '

In [6]:
# Edge length (in pixels) that slices are resized to by process_3d.
RESOL = 256

# Input folder with the raw DICOM series and output folder for the PNG volumes.
data_path = TRAIN_PATH
save_folder = f'{BASE_PATH}/3d_preprocessed'

def process_3d(ind_chunk, rng_patients = rng_patients, patients = patients, data_path = data_path, save_folder = save_folder):
    """
    Convert every study of one chunk of patients into a RESOL x RESOL x RESOL uint8 volume.

    For each study the DICOM slices are ordered by the last component of tag
    (0020, 0032), at most RESOL of them are kept, each is resized to
    RESOL x RESOL, and the stack is then resized along the slice axis so that
    the volume is a cube. The cube is min-max scaled to [0, 255] and written as
    a single PNG of shape (RESOL, RESOL * RESOL) named
    `{save_folder}/{patient}_{study}.png`.

    Args:
        ind_chunk: Index of the chunk to process; patients
            `rng_patients[ind_chunk]` to `rng_patients[ind_chunk + 1] - 1` are handled.
        rng_patients: Chunk boundaries (indices into `patients`).
        patients: Patient directory names found under `data_path`.
        data_path: Folder containing `<patient>/<study>/*.dcm`.
        save_folder: Output folder; created if it does not exist.

    Raises:
        OSError: If a PNG could not be written.
    """
    # cv2.imwrite reports failure only through its return value, so a missing
    # output folder would otherwise make every write fail without any error.
    os.makedirs(save_folder, exist_ok=True)

    for ind_patient in tqdm(range(rng_patients[ind_chunk], rng_patients[ind_chunk+1])):
        patient = patients[ind_patient]
        patient_dir = os.path.join(data_path, patient)

        for study in sorted(os.listdir(patient_dir)):
            # Slices keyed by z position; slices sharing a position overwrite
            # each other, exactly as before.
            imgs = {}
            dicom = None
            for f in sorted(glob.glob(os.path.join(patient_dir, study, "*.dcm"))):
                dicom = pydicom.dcmread(f)
                pos_z = dicom[(0x20, 0x32)].value[-1]
                imgs[pos_z] = standardize_pixel_array(dicom)

            # Nothing to stack: skip instead of failing in np.vstack or
            # reading attributes of a dataset that belongs to another study.
            if not imgs:
                continue

            # Evenly spaced slice indices; duplicates (fewer than RESOL slices
            # available) collapse so that each slice is taken at most once.
            z_sorted = sorted(imgs)
            keep = np.unique(np.linspace(0, len(z_sorted) - 1, RESOL, dtype=int))
            stack = np.vstack(
                [cv2.resize(imgs[z_sorted[ind_z]], (RESOL, RESOL))[None] for ind_z in keep]
            )

            # Resize along the slice axis, one (n_slices, RESOL) plane at a
            # time, so the volume ends up with RESOL entries on every axis.
            volume = np.zeros((RESOL, RESOL, RESOL))
            for ind_col in range(RESOL):
                volume[:, :, ind_col] = cv2.resize(stack[:, :, ind_col], (RESOL, RESOL))

            # Min-max scale to [0, 1]; a constant volume becomes all zeros
            # rather than NaN from a division by zero.
            v_min = volume.min()
            v_range = volume.max() - v_min
            if v_range > 0:
                volume = (volume - v_min) / v_range
            else:
                volume = np.zeros_like(volume)

            # The photometric interpretation is taken from the last slice read.
            if dicom.PhotometricInterpretation == "MONOCHROME1":
                volume = 1 - volume
            volume *= 255
            volume = volume.astype(np.uint8)

            # Flatten the cube to 2D so it can be stored as one PNG.
            volume = volume.reshape(RESOL, RESOL*RESOL)
            out_path = f'{save_folder}/{patient}_{study}.png'
            if not cv2.imwrite(out_path, volume):
                raise OSError(f'Failed to write {out_path}')

            # Release the large arrays before the next study is loaded.
            del volume, stack, imgs
            gc.collect()
   

In [7]:
%%time

# Process all chunks in parallel, one worker process per chunk.
# The conversion used to be launched twice (once in a `with Pool(...)` block
# and once more under the `__main__` guard), which redid the whole dataset;
# it now runs exactly once. `map` blocks until every chunk has finished, and
# leaving the `with` block shuts the pool down.
if __name__ == '__main__':
    with Pool(n_chunk) as pool:
        pool.map(process_3d, range(n_chunk))

  1%|▏         | 5/393 [00:10<13:24,  2.07s/it]

KeyboardInterrupt: 

In [None]:
# Number of entries currently present in the output folder.
len(glob.glob(save_folder + '/*'))

In [None]:
# Build the training table: one row per (patient, series) directory, joined
# with the patient-level labels, a stratified fold id and the path of the
# preprocessed PNG volume. The result is written to {BASE_PATH}/train_meta.csv.
train_df = pd.read_csv(f'{BASE_PATH}/train.csv')
train_meta = pd.read_csv(f'{BASE_PATH}/train_series_meta.csv')
train_df = train_df.sort_values(by=['patient_id'])

TRAIN_PATH = BASE_PATH + "/train_images/"
n_chunk = 8
patients = os.listdir(TRAIN_PATH)
n_patients = len(patients)
# The last boundary is n_patients (not n_patients + 1): boundaries are used as
# indices into `patients`, so anything larger would point past its end.
rng_patients = np.linspace(0, n_patients, n_chunk+1, dtype = int)

# One entry per <patient>/<series> directory under TRAIN_PATH.
patients_cts = glob.glob(f'{TRAIN_PATH}/*/*')
n_cts = len(patients_cts)
patients_cts_arr = np.zeros((n_cts, 2), int)
data_paths=[]
for i, ct_dir in enumerate(patients_cts):
    # The last two path components are the patient id and the series id.
    patient, ct = ct_dir.split('/')[-2:]
    patients_cts_arr[i] = int(patient), int(ct)
    data_paths.append(f'{BASE_PATH}/3d_preprocessed/{patients_cts_arr[i,0]}_{patients_cts_arr[i,1]}.png')
TRAIN_IMG_PATH = BASE_PATH + '/processed' 

#Generate tables for training
train_meta_df = pd.DataFrame(patients_cts_arr, columns = ['patient_id', 'series'])

#5-fold splitting
train_df['fold'] = 0
labels = train_df[['bowel_healthy','bowel_injury',
                    'extravasation_healthy','extravasation_injury',
                    'kidney_healthy','kidney_low','kidney_high',
                    'liver_healthy','liver_low','liver_high',
                    'spleen_healthy','spleen_low','spleen_high',
                    'any_injury']].to_numpy()

mskf = MultilabelStratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=0)
# The splitter returns positions into `labels`, i.e. into the sorted frame.
# train_df still carries its pre-sort index labels, so the fold must be
# written by position (`iloc`). The previous `train_df['fold'][idx] = ...`
# looked rows up by label and relied on chained assignment.
fold_col = train_df.columns.get_loc('fold')
for fold, (train_index, test_index) in enumerate(mskf.split(np.ones(len(train_df)), labels)):
    train_df.iloc[test_index, fold_col] = fold

# Attach labels and fold to every series of the patient, then add the PNG path
# (data_paths is in the same order as the rows of train_meta_df).
train_meta_df = train_meta_df.join(train_df.set_index('patient_id'), on='patient_id')
train_meta_df['path']=data_paths
train_meta_df.to_csv(f'{BASE_PATH}/train_meta.csv', index = False)

# Last expression of the cell: number of patients assigned to each fold.
np.unique(train_df['fold'].to_numpy(), return_counts = True)


Done ! 