# Experimentation: Data Pre-Processing
---

In [1]:
# Import libraries
from monai.data import DataLoader
from monai.transforms import (EnsureChannelFirstd,
Compose, LoadImaged, ResampleToMatchd, MapTransform, SaveImaged, LoadImage)

from monai.apps import TciaDataset
from monai.apps.auto3dseg import AutoRunner
from monai.bundle import ConfigParser

from monai.config import print_config
import json

import dicom2nifti
import nibabel as nib
import numpy as np
import pydicom

print_config()


  from .autonotebook import tqdm as notebook_tqdm


MONAI version: 1.3.0
Numpy version: 1.26.4
Pytorch version: 2.2.1
MONAI flags: HAS_EXT = False, USE_COMPILED = False, USE_META_DICT = False
MONAI rev id: 865972f7a791bf7b42efbcd87c8402bd865b329e
MONAI __file__: c:\Users\<username>\AppData\Local\anaconda3\envs\ITP\Lib\site-packages\monai\__init__.py

Optional dependencies:
Pytorch Ignite version: 0.4.11
ITK version: 5.3.0
Nibabel version: 5.2.1
scikit-image version: 0.22.0
scipy version: 1.12.0
Pillow version: 10.2.0
Tensorboard version: 2.16.2
gdown version: 4.7.3
TorchVision version: 0.17.1
tqdm version: 4.66.2
lmdb version: 1.4.1
psutil version: 5.9.8
pandas version: 2.2.1
einops version: 0.7.0
transformers version: 4.38.2
mlflow version: 2.11.1
pynrrd version: 1.0.0
clearml version: 1.14.5rc0

For details about installing the optional dependencies, please visit:
    https://docs.monai.io/en/latest/installation.html#installing-the-recommended-dependencies



In [2]:
# Specify the collection and segmentation type
collection, seg_type = "HCC-TACE-Seg", "SEG"

# Create a dictionary to map the labels in the segmentation to the labels in the image
label_dict = {'Liver': 0,
  'Mass': 1,
  'Necrosis': 2,
  'Portal vein': 3,
  'Abdominal aorta': 4}

class UndoOneHotEncoding(MapTransform):
    def __init__(self, keys):
        super().__init__(keys)

    def __call__(self, data):
        for key in self.keys:
            data[key] = data[key].argmax(dim=0).unsqueeze(0)
        return data
    
# Create a composed transform that loads the image and segmentation, resamples the image to match the segmentation,
# and undoes the one-hot encoding of the segmentation
transform = Compose(
    [
        LoadImaged(reader="PydicomReader", keys=["image", "seg"], label_dict=label_dict),
        EnsureChannelFirstd(keys=["image", "seg"]),
        #ResampleToMatchd(keys="image", key_dst="seg"),
        #UndoOneHotEncoding(keys="seg"),
        #SaveImaged(keys="seg", output_dir="/segmentations", output_postfix="seg", output_ext=".dcm", output_dtype="torch.float32", data_root_dir="../data/HCC-TACE-Seg"),
    ]
)

# Create a dataset for the training with a validation split
train_dataset = TciaDataset(
    root_dir="../data",
    collection=collection,
    section="training",
    transform=transform,
    download=True,
    download_len=2,
    seg_type=seg_type,
    progress=True,
    cache_rate=0.0,
    val_frac=0.0,
)

2024-03-18 12:54:50,329 - INFO - Expected md5 is None, skip md5 check for file ..\data\HCC-TACE-Seg\1.2.276.0.7230010.3.1.3.8323329.41.1604860085.518229.zip.
2024-03-18 12:54:50,329 - INFO - File exists: ..\data\HCC-TACE-Seg\1.2.276.0.7230010.3.1.3.8323329.41.1604860085.518229.zip, skipped downloading.
2024-03-18 12:54:50,335 - INFO - Writing into directory: ..\data\HCC-TACE-Seg\raw\1.2.276.0.7230010.3.1.3.8323329.41.1604860085.518229.
2024-03-18 12:54:50,965 - INFO - Expected md5 is None, skip md5 check for file ..\data\HCC-TACE-Seg\1.3.6.1.4.1.14519.5.2.1.1706.8374.172517341095680731665822868712.zip.
2024-03-18 12:54:50,965 - INFO - File exists: ..\data\HCC-TACE-Seg\1.3.6.1.4.1.14519.5.2.1.1706.8374.172517341095680731665822868712.zip, skipped downloading.
2024-03-18 12:54:50,981 - INFO - Writing into directory: ..\data\HCC-TACE-Seg\HCC_017\300\image.
2024-03-18 12:54:51,265 - INFO - Expected md5 is None, skip md5 check for file ..\data\HCC-TACE-Seg\1.2.276.0.7230010.3.1.3.8323329.208

In [18]:
from monai.transforms import LoadImage, Transform, SaveImage
import pydicom
import os
import re

class UndoOneHotEncoding(Transform):
    def __call__(self, img):
        return img.argmax(dim=0).unsqueeze(0)

# Define the transform
loader = LoadImage(reader= "PydicomReader", image_only=True, ensure_channel_first=True)
undo_one_hot = UndoOneHotEncoding()



# Regular expression to match the patient directories
patient_dir_pattern = re.compile(r"HCC_\d{3}")
data_root = "../data/HCC-TACE-Seg"

# Iterate over all directories in the dataset
for directory in os.listdir(data_root):
    # If the directory is a patient directory
    if patient_dir_pattern.match(directory):
        
        patient_seg_dir = os.path.join(data_root, directory, "300", "seg")
        
        # Convert the One-Hot-Encoded DICOM segmentations to a single DICOM segmentation
        for seg_file in os.listdir(patient_seg_dir):
            if seg_file.endswith(".dcm"):
                seg_path = os.path.join(patient_seg_dir, seg_file)
                
                # Load the DICOM file
                dicom = loader(seg_path)

                # Apply the transform to the segmentation
                seg = undo_one_hot(dicom)

                print(seg_path)
                
                save_image = SaveImage(output_dir=patient_seg_dir, output_postfix="", output_ext=".dcm", output_dtype="torch.float32", separate_folder=False)
                save_image(seg, seg_path)
                
                # Remove the One-Hot-Encoded segmentation
                # if not seg_file.endswith("_seg.dcm"):
                #     os.remove(seg_path)
                

../data/HCC-TACE-Seg\HCC_017\300\seg\00000001.dcm
2024-03-18 12:31:44,924 INFO image_writer.py:197 - writing: ..\data\HCC-TACE-Seg\HCC_017\300\seg\00000001.dcm
../data/HCC-TACE-Seg\HCC_077\300\seg\00000001.dcm
2024-03-18 12:31:47,934 INFO image_writer.py:197 - writing: ..\data\HCC-TACE-Seg\HCC_077\300\seg\00000001.dcm


In [None]:
# Create a dataloader
train_loader = DataLoader(train_dataset, batch_size=1, num_workers=0)

In [None]:
# Sample a batch of data from the dataloader
batch = next(iter(train_loader))


In [None]:
batch2 = next(iter(train_loader))

In [None]:
# Print the batch data keys
print(batch.keys())

# Print the batch data shapes
print(batch["image"].shape, batch["seg"].shape)

# Print the batch data types
print(batch["image"].dtype, batch["seg"].dtype)

In [52]:
# Separate the image and segmentation from the batch
image, seg = batch["image"], batch["seg"]

# Undo the one-hot encoding of the segmentation
# seg = seg.argmax(dim=1)
# seg = seg.unsqueeze(1)


print(image.shape, seg.shape, seg.unique())


NameError: name 'batch' is not defined

In [None]:
import torch
import matplotlib.pyplot as plt

slice_idx = 60

# Sample a slice from the image
CT_slice = image[0, 0, :, :, slice_idx]

# Get the maximum segmentation class for each pixel in the slice
CT_seg_slice = seg[0, 0, :, :, slice_idx]

print(CT_slice.shape, CT_seg_slice.shape)

# Plot the image and segmentation slice as a subplot
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
axes[0].imshow(CT_slice, cmap="gray")
axes[0].set_title("CT Image")
axes[1].imshow(CT_seg_slice, cmap="jet")
axes[1].set_title("CT Segmentation")
plt.colorbar(mappable=axes[1].imshow(CT_seg_slice, cmap='jet'), ax=axes[1])
plt.show()

## Setup AutoRunner for automatic segmentation model training and hyperparameter finetuning
---

In [4]:
# Add a fold key to all the training data
train_dataset.datalist = [{**item, 'fold': 0} for item in train_dataset.datalist]

# Change "seg" to "label" in the datalist
for item in train_dataset.datalist:
    item["label"] = item.pop("seg")

# Concatenate the training and test datalists
data_list = {"training": train_dataset.datalist}

datalist_file = "../auto3dseg_datalist.json"
with open(datalist_file, "w") as f:
    json.dump(data_list, f)

In [5]:
# Create input configuration .yaml file
input_config = {
    "name": "HCC-TACE-Seg",
    "task": "segmentation",  
    "modality": "CT", 
    "datalist": "../auto3dseg_datalist.json", 
    "dataroot": "../data", 
}

config_yaml = "./auto3dseg_config.yaml"
ConfigParser.export_config_file(input_config, config_yaml)

In [1]:
import torch
print(torch.cuda.is_available())

True


In [3]:
runner = AutoRunner(work_dir = "../data/auto3dseg", input=input_config)
runner.run()

NameError: name 'AutoRunner' is not defined