This notebook preprocesses an existing dataset of DICOM images and saves the preprocessed images as new files in a destination folder.

Imports

In [1]:
import os
import random
from PIL import Image
import numpy as np
import pydicom
from skimage.measure import block_reduce

from datasets.Sarcopenie.utils import transform_to_hu, normalize_pixel, crop_image, add_pad, crop_image_with_coord

Variables

In [2]:
# Root folder of the raw dataset: every entry listed in it is treated as one
# patient folder, expected to contain a 'sliceTDM/' and a 'Mask/' sub-folder.
dataset_source_folder = "C:/Users/tmayet/Documents/datasets/fusion/"
# Folder in which save_data() writes the .npy arrays and the .jpeg previews.
# dataset_destination_folder = "C:/Users/tmayet/Documents/datasets/fusion_preprocessed128/"
dataset_destination_folder = "C:/Users/tmayet/Documents/datasets/fusion_preprocessed128_4/"

# [height, width] every cropped image/mask is padded to in preprocessing_image(),
# before the 4x4 block reduction (so 512x512 ends up as 128x128 on disk).
image_size = [512, 512]

# Windowing parameters passed to normalize_pixel().
# If None, the processing loop reads WindowCenter / WindowWidth from the DICOM file.
window_level= None #  -320
window_width = None # 800
# When True, the patient folders are shuffled before being split between the modes below.
shuffle = True

# Relative weights used to split the dataset between the three save modes
# ('XY', 'X' and 'Y'); only their ratio to proportion_total matters.
# NOTE(review): presumably XY = image and mask both usable, X = image only,
# Y = mask only -- confirm against the code that consumes the saved mode.
XY_proportion = 33
XOnly_proportion = 40
YOnly_proportion = 27
proportion_total = XY_proportion + XOnly_proportion + YOnly_proportion

How one (image, mask) pair of DICOM files is transformed into a pair of np.ndarray:

In [3]:
def preprocessing_image(x_dicom, y_dicom, intercept, slope, window_level, window_width, global_var) -> tuple[np.ndarray, np.ndarray, dict]:
    """Convert one (image, mask) pair of DICOM datasets into two numpy arrays.

    Steps, in order: rescale the image with ``transform_to_hu``, normalize it
    with ``normalize_pixel``, crop image and mask to the same box, pad both to
    the module-level ``image_size``, remap the mask labels to 0..3 and finally
    reduce both arrays with 4x4 blocks.

    Args:
        x_dicom: DICOM dataset of the image (its ``pixel_array`` is read).
        y_dicom: DICOM dataset of the mask (its ``pixel_array`` is read).
        intercept: forwarded to ``transform_to_hu``.
        slope: forwarded to ``transform_to_hu``.
        window_level: forwarded to ``normalize_pixel``.
        window_width: forwarded to ``normalize_pixel``.
        global_var: dict holding the running 'max_height', 'max_width',
            'min_height' and 'min_width' of the cropped images; updated in place.

    Returns:
        The processed image, the processed mask and the (same) ``global_var`` dict.
    """
    # region do not touch
    # pixel data of both files, as floats
    image: np.ndarray = x_dicom.pixel_array.astype(float)
    mask: np.ndarray = y_dicom.pixel_array.astype(float)

    # HU conversion followed by window normalization
    image = normalize_pixel(transform_to_hu(image, intercept, slope), window_level, window_width)

    # crop the image, then cut the mask with the very same coordinates
    image, top_left, bottom_right = crop_image(image, return_coord=True)
    mask = crop_image_with_coord(mask, top_left, bottom_right)

    # keep track of the extreme sizes seen after cropping
    cropped_height, cropped_width = image.shape[0], image.shape[1]
    global_var['max_height'] = max(cropped_height, global_var['max_height'])
    global_var['max_width'] = max(cropped_width, global_var['max_width'])
    global_var['min_height'] = min(cropped_height, global_var['min_height'])
    global_var['min_width'] = min(cropped_width, global_var['min_width'])
    # endregion

    # bring image and mask to the same fixed size
    pad_options = dict(
        new_height=image_size[0],
        new_width=image_size[1],
        padding_value=0.,
        vertical_padding='even',
        horizontal_padding='even',
    )
    image = add_pad(image, **pad_options)
    mask = add_pad(mask, **pad_options)

    # optional: to keep only a sub-part of the image, crop again here with
    # crop_image_with_coord(..., top_left=[0, 0], bottom_right=image_size)

    # The mask holds the raw labels 0, 1, 5 and 10; the classes must form a
    # continuous integer range, so any other value is first reset to 0 and the
    # raw labels are then mapped one after the other to 0, 1, 2 and 3.
    label_to_class = {0: 0, 1: 1, 5: 2, 10: 3}
    is_known_label = np.zeros(mask.shape, dtype=bool)
    for raw_label in label_to_class:
        is_known_label |= (mask == raw_label)
    mask[~is_known_label] = 0

    for raw_label, class_index in label_to_class.items():
        mask[mask == raw_label] = class_index

    # optional: to keep a single class, add `mask[mask != 1] = 0` here

    # down-sampling by 4 in both directions (skip these lines to keep 512x512):
    # the image is averaged, the mask keeps the highest class of each block
    pooling_block = (4, 4)
    image = block_reduce(image, block_size=pooling_block, func=np.mean)
    mask = block_reduce(mask, block_size=pooling_block, func=np.max)

    return image, mask, global_var

In [4]:
# Collect the patient folders located directly under dataset_source_folder.
# Only directories are kept: os.listdir also returns plain files, and a stray
# file would otherwise be handled as a patient and crash the processing loop.
# The names are sorted because os.listdir returns entries in arbitrary order.
folders = sorted(
    entry_name
    for entry_name in os.listdir(dataset_source_folder)
    if os.path.isdir(os.path.join(dataset_source_folder, entry_name))
)
print(f"number of patient folder: {len(folders)}")

number of patient folder: 527


In [5]:
def get_file(base_path: str, rest_path: str) -> str:
    """Return the name of the file expected in ``base_path/rest_path``.

    Exactly one regular file is expected in the folder. When several are
    found, a warning is printed and the first one in alphabetical order is
    used, so the choice does not depend on the listing order of the OS.

    Args:
        base_path: folder of the patient.
        rest_path: sub-folder to look into, relative to ``base_path``.

    Returns:
        The file name (not the full path) of the selected file.

    Raises:
        FileNotFoundError: if the folder contains no regular file.
    """
    path = os.path.join(base_path, rest_path)

    # sub-directories are ignored, only regular files are candidates
    files = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))

    if not files:
        # fail with an explicit message instead of an IndexError on files[0]
        raise FileNotFoundError(f'No file found in {path}')

    if len(files) != 1:
        # deliberately tolerated: warn and carry on with the first file
        print(f'Wrong number of files in {path}: {len(files)} files found')

    return files[0]

def save_data(iteration: int, x_array: np.ndarray, y_array: np.ndarray, mode: str) -> None:
    """Write one processed sample into ``dataset_destination_folder``.

    Five files are produced, all prefixed with ``iteration``:
    ``<i>_x_numpy.npy``, ``<i>_y_numpy.npy`` and ``<i>_mode_numpy.npy`` hold
    the data, ``<i>_x.jpeg`` and ``<i>_y.jpeg`` are previews.

    Args:
        iteration: index of the sample, used as file name prefix.
        x_array: processed image.
        y_array: processed mask.
        mode: one of 'XY', 'X' or 'Y'; stored as 1, 2 or 3 respectively.

    Raises:
        KeyError: if ``mode`` is not one of the three known values.
    """
    # resolve the mode first so that nothing is written for an invalid mode
    mode_code = {'XY': 1, 'X': 2, 'Y': 3}[mode]

    # np.save does not create missing folders
    os.makedirs(dataset_destination_folder, exist_ok=True)

    x_save = os.path.join(dataset_destination_folder, f'{iteration}_x')
    y_save = os.path.join(dataset_destination_folder, f'{iteration}_y')
    mode_save = os.path.join(dataset_destination_folder, f'{iteration}_mode')

    # Save as numpy array.
    # The function arguments are saved here, not the notebook globals `x` and
    # `y`, which only happened to hold the same arrays at call time.
    np.save(x_save+"_numpy", x_array)
    np.save(y_save+"_numpy", y_array)
    np.save(mode_save+"_numpy", mode_code)

    # Save as image (preview only; the .npy files are the actual data).
    # NOTE(review): the mask holds class indices up to 3, so mask*255 exceeds
    # 255 for classes 2 and 3 -- presumably those saturate in the preview; verify.
    x_image = Image.fromarray(x_array*255).convert('RGB')
    y_image = Image.fromarray(y_array*255).convert('RGB')
    x_image.save(x_save+".jpeg")
    y_image.save(y_save+".jpeg")

# Statistics gathered over the whole dataset while processing:
# the distinct (window level, window width) and (intercept, slope) pairs met,
# and the extreme heights/widths of the images after cropping.
global_var = dict(
    window_level_window_width=set(),
    intercept_slope=set(),
    max_width=-1,
    max_height=-1,
    min_height=99999,
    min_width=99999,
)
if shuffle:
    random.shuffle(folders)

# Number of samples per mode, from the configured proportions.
# The last mode takes the remainder so that the three counts sum to dataset_size.
dataset_size = len(folders)
number_of_xy = int(XY_proportion/proportion_total * dataset_size)
number_of_x_only = int(XOnly_proportion/proportion_total * dataset_size)
number_of_y_only = dataset_size - number_of_xy - number_of_x_only

# counters of the samples actually saved in each mode
cpt_xy = 0
cpt_x = 0
cpt_y = 0

for i, patient_folder in enumerate(folders):
    path_to_patient = os.path.join(dataset_source_folder, patient_folder)

    x_file = get_file(path_to_patient, 'sliceTDM/')
    y_file = get_file(path_to_patient, 'Mask/')

    x_dicom = pydicom.dcmread(os.path.join(path_to_patient, 'sliceTDM/', x_file))
    y_dicom = pydicom.dcmread(os.path.join(path_to_patient, 'Mask/', y_file))

    # fetch dicom data
    intercept = x_dicom.RescaleIntercept
    slope = x_dicom.RescaleSlope

    # The window is resolved into per-patient variables. Assigning to the
    # configuration variables window_level / window_width here would make them
    # non-None after the first patient, and every following patient would then
    # silently reuse the window of the first one instead of its own DICOM values.
    patient_window_level = x_dicom.WindowCenter if window_level is None else window_level
    patient_window_width = x_dicom.WindowWidth if window_width is None else window_width
    # the DICOM attribute may hold several values: keep the first one
    if not isinstance(patient_window_level, (int, float)):
        patient_window_level = patient_window_level[0]
    if not isinstance(patient_window_width, (int, float)):
        patient_window_width = patient_window_width[0]

    # process the data
    global_var['intercept_slope'].add((intercept, slope))
    global_var['window_level_window_width'].add((patient_window_level, patient_window_width))
    x, y, global_var = preprocessing_image(
        x_dicom, y_dicom, intercept, slope, patient_window_level, patient_window_width, global_var
    )

    # save the data: the first number_of_xy samples are 'XY', the next
    # number_of_x_only are 'X', all the remaining ones are 'Y'
    if i<number_of_xy:
        mode='XY'
        cpt_xy+=1
    elif i<number_of_x_only+number_of_xy:
        mode='X'
        cpt_x +=1
    else:
        cpt_y += 1
        mode='Y'
    save_data(i, x, y, mode)

print("END")

Wrong number of files in C:/Users/tmayet/Documents/datasets/fusion/0522c0251-2762152755250732-115557\sliceTDM/: 2 files found
Wrong number of files in C:/Users/tmayet/Documents/datasets/fusion/0522c0251-2762152755250732-115557\Mask/: 2 files found
Wrong number of files in C:/Users/tmayet/Documents/datasets/fusion/R01-066-1488278016399684-104343\sliceTDM/: 2 files found
Wrong number of files in C:/Users/tmayet/Documents/datasets/fusion/SENIOR_15013101151001\sliceTDM/: 2 files found
Wrong number of files in C:/Users/tmayet/Documents/datasets/fusion/0522c0419-2762152755250732-114727\sliceTDM/: 2 files found
Wrong number of files in C:/Users/tmayet/Documents/datasets/fusion/0522c0419-2762152755250732-114727\Mask/: 2 files found
Wrong number of files in C:/Users/tmayet/Documents/datasets/fusion/0522c0276-2819497684894126-153338\Mask/: 2 files found
Wrong number of files in C:/Users/tmayet/Documents/datasets/fusion/0522c0248-2762152755250732-112254\sliceTDM/: 2 files found
Wrong number of fi

In [6]:
# Summary of the run, one item per line: dataset size, number of samples saved
# in each mode, then the statistics gathered while preprocessing.
print(
    f"number of patient folder: {len(folders)}",
    f"{cpt_xy=}",
    f"{cpt_x=}",
    f"{cpt_y=}",
    global_var,
    sep="\n",
)
# 511, 511 without the cropping

number of patient folder: 527
cpt_xy=173
cpt_x=210
cpt_y=144
{'window_level_window_width': {('35.0', '350.0')}, 'intercept_slope': {('0.0', '1.0'), ('-1000.0', '1.0'), ('-1024.0', '1.0')}, 'max_width': 511, 'max_height': 504, 'min_height': 173, 'min_width': 294}
