<a href="https://colab.research.google.com/github/ImagingDataCommons/ai_medima_misc/blob/main/nnunet/notebooks/inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **IDC nnUNet Use-case: Data Inference**

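This notebook runs a pre-trained nnU-Net model (`Task055_SegTHOR`) on chest CT scans of a cohort selected from the Imaging Data Commons (IDC): the DICOM data are downloaded and sorted, converted to NRRD/NIfTI, segmented with nnU-Net, and the resulting masks are converted back to NRRD and DICOM SEG.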

## **Environment Setup**

In [1]:
import os
import sys

import yaml

import time
import tqdm

from IPython.display import clear_output

# useful information
curr_dir = !pwd
curr_droid = !hostname
curr_pilot = !whoami

print(time.asctime(time.localtime()))

print("\nCurrent directory :", curr_dir[-1])
print("Hostname          :", curr_droid[-1])
print("Username          :", curr_pilot[-1])

print("Python version    :", sys.version.split('\n')[0])

Mon Mar 21 10:11:22 2022

Current directory : /content
Hostname          : 1ae7446a8879
Username          : root
Python version    : 3.7.12 (default, Jan 15 2022, 18:48:18) 


In [2]:
# Authenticate the Colab user with Google Cloud; presumably required by the
# BigQuery, Cloud Storage and `gsutil` calls made later in the notebook.
from google.colab import auth
auth.authenticate_user()

In [3]:
from google.cloud import storage
# Google Cloud Storage bucket and GCP project used by the notebook
bucket_name = "idc-medima-paper"
project_name = "idc-sandbox-000"

# base URI of the bucket, i.e., the location where to store the data (and
# where to check if a patient was processed already): if a patient was
# processed already, copy over the segmentation and run only the
# post-processing (split the masks, etc.)
bucket_base_uri = f"gs://{bucket_name}/"

In [4]:
# folder storing the third-party source code cloned below
!mkdir -p src

# DICOMSort (run later as `src/dicomsort/dicomsort.py`) and PyPlastimatch
# (imported later as `pypla`)
# NOTE(review): both repositories are cloned at their latest commit, without
# pinning a tag or a commit hash
!git clone https://github.com/pieper/dicomsort src/dicomsort
!git clone https://github.com/AIM-Harvard/pyplastimatch src/pyplastimatch

Cloning into 'src/dicomsort'...
remote: Enumerating objects: 126, done.[K
remote: Total 126 (delta 0), reused 0 (delta 0), pack-reused 126[K
Receiving objects: 100% (126/126), 37.03 KiB | 6.17 MiB/s, done.
Resolving deltas: 100% (63/63), done.
Cloning into 'src/pyplastimatch'...
remote: Enumerating objects: 333, done.[K
remote: Counting objects: 100% (333/333), done.[K
remote: Compressing objects: 100% (314/314), done.[K
remote: Total 333 (delta 31), reused 302 (delta 12), pack-reused 0[K
Receiving objects: 100% (333/333), 55.56 MiB | 26.22 MiB/s, done.
Resolving deltas: 100% (31/31), done.


Install Plastimatch [...] and check the process was successful.

In [5]:
%%capture
# install Plastimatch from the system package manager; `%%capture` (which has
# to be the first line of the cell) hides the installation log
!apt install plastimatch

In [6]:
# check plastimatch was correctly installed: the command prints the version
# of the executable found on the PATH
!plastimatch --version

plastimatch version 1.7.0


Download and unpack DCMQI:

In [7]:
# FIXME: always parse the latest?
# dcmqi release to download, path where the archive is saved, and folder the
# archive is expected to be unpacked to
dcmqi_release_url = "https://github.com/QIICR/dcmqi/releases/download/v1.2.4/dcmqi-1.2.4-linux.tar.gz"
dcmqi_download_path = "/content/dcmqi-1.2.4-linux.tar.gz"
dcmqi_path = "/content/dcmqi-1.2.4-linux"

# download the release archive
!wget -O $dcmqi_download_path $dcmqi_release_url

# unpack the archive in the current working directory (no `-C` option is
# given): the notebook must be running from /content for `dcmqi_path` to
# point to the extracted folder
!tar -xvf $dcmqi_download_path

# move the dcmqi executables to /bin, so that they can be called by name
!mv $dcmqi_path/bin/* /bin

--2022-03-21 10:12:30--  https://github.com/QIICR/dcmqi/releases/download/v1.2.4/dcmqi-1.2.4-linux.tar.gz
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/50675718/04f07880-81ee-11eb-92ec-30c7426dae5d?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20220321%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220321T101231Z&X-Amz-Expires=300&X-Amz-Signature=f45133540b1c1ea10740a51cfa858f7ec5a44061049e866d4490ea3fafb237bb&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=50675718&response-content-disposition=attachment%3B%20filename%3Ddcmqi-1.2.4-linux.tar.gz&response-content-type=application%2Foctet-stream [following]
--2022-03-21 10:12:31--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/50675718/04f07880-81ee-11eb-92ec-30c7426

---

In [8]:
%%capture
# Python dependencies of the notebook; `%%capture` hides the pip log
# NOTE(review): the versions are not pinned, so the environment may change
# between runs
!pip install pydicom SimpleITK nnunet

In [9]:
import numpy as np
import pandas as pd
import SimpleITK as sitk

import src.pyplastimatch.pyplastimatch.pyplastimatch as pypla

from google.cloud import bigquery as bq

In [10]:
# FIXME: for development purposes - we will switch to a proper query soon!

# GCP project hosting the BigQuery dataset
PROJECT_NAME = "idc-sandbox-000"

# BigQuery dataset the cohort table is copied to
DATASET_NAME = "dennis_cohorts"

# BigQuery table copied from IDC to the user's own project
TABLE_NAME = "nsclc-nnunet-he"

# fully-qualified table ID (project.dataset.table) to use with the BQ command
TABLE_ID = ".".join([PROJECT_NAME, DATASET_NAME, TABLE_NAME])

In [11]:
# the query we are going to execute to gather data about the selected cohort
# (every column of every row of the cohort table)
query_str = "SELECT * FROM `%s`"%(TABLE_ID)

# init the BQ client on the same project the cohort table belongs to:
# `PROJECT_NAME` is the single place where the project is configured, so the
# client and `TABLE_ID` cannot get out of sync
client = bq.Client(project = PROJECT_NAME)

# run the query
query_job = client.query(query_str)

# convert the results to a Pandas dataframe
cohort_df = query_job.to_dataframe()

In [12]:
# create the directory tree
# (`mkdir -p` creates the missing parent folders and does not fail if a
# folder exists already, so this cell can be safely re-run)

# top-level folders (`models` is where the nnU-Net archive is downloaded)
!mkdir -p data models output

# raw DICOM data
!mkdir -p data/raw 
!mkdir -p data/raw/tmp data/raw/nsclc-radiomics
!mkdir -p data/raw/nsclc-radiomics/dicom

# processed data, one subfolder per file format
!mkdir -p data/processed
!mkdir -p data/processed/nsclc-radiomics
!mkdir -p data/processed/nsclc-radiomics/nrrd
!mkdir -p data/processed/nsclc-radiomics/nii
!mkdir -p data/processed/nsclc-radiomics/dicomseg

# model input folder, and folder the nnU-Net environment variables point to
!mkdir -p data/model_input/
!mkdir -p data/nnunet_output/

Copy the JSON metadata file (generated using [...])

In [13]:
# URI of the data folder in the bucket, URI of the JSON metadata file stored
# there, and local path the file is copied to
# (`bucket_base_uri` ends with a slash, so `os.path.join` simply appends the
# relative path to it)
bucket_data_base_uri = os.path.join(bucket_base_uri, "nnunet/data")
dicomseg_json_uri = os.path.join(bucket_data_base_uri, "dicomseg_metadata.json")
dicomseg_json_path = "/content/data/dicomseg_metadata.json"

# copy the JSON file from the bucket to the local file system
!gsutil cp $dicomseg_json_uri $dicomseg_json_path

Copying gs://idc-medima-paper/nnunet/data/dicomseg_metadata.json...
/ [1 files][  2.6 KiB/  2.6 KiB]                                                
Operation completed over 1 objects/2.6 KiB.                                      


Download the segmentation models:

In [14]:
# FIXME: download from pvt Dropbox to speed up the development
#        the final notebook should use the official resources only (Zenodo)
# URL of the zipped pre-trained model, and local path the archive is saved to
seg_model_url = "https://www.dropbox.com/s/m7es2ojn8h0ybhv/Task055_SegTHOR.zip?dl=0"
model_download_path = "/content/models/Task055_SegTHOR.zip"

# download the archive (installed later from `model_download_path`)
!wget -O $model_download_path $seg_model_url

--2022-03-21 10:13:07--  https://www.dropbox.com/s/m7es2ojn8h0ybhv/Task055_SegTHOR.zip?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/m7es2ojn8h0ybhv/Task055_SegTHOR.zip [following]
--2022-03-21 10:13:08--  https://www.dropbox.com/s/raw/m7es2ojn8h0ybhv/Task055_SegTHOR.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc956005e362f2400eb0bcd7b531.dl.dropboxusercontent.com/cd/0/inline/Bh5Jpco9bWMiRJLoXHxGOQ7hVEfkecnyja49xYtrafhAJ1mfBkf3GKq1DCtWSjsqH787JdRk_QfxdOULcJQJ6T32xa9olqC4c3zPuWx42ImrWhOsWZq8pXeCet5LoTae92ied9Qvhk2vuY7yiDKO3es5vez5eDl-d6V8seMjW0OnMg/file# [following]
--2022-03-21 10:13:08--  https://uc956005e362f2400eb0bcd7b531.dl.dropboxusercontent.com/cd/0/inline/Bh5Jpco9bWMiRJLoXHxGOQ7hVEfkecnyja49xYtraf

Initialize a few environment variables [...]

In [15]:
# folders the nnU-Net tools are pointed to through environment variables
# NOTE(review): `WEIGHTS_FOLDER` is presumably only needed by some nnU-Net
#               versions - confirm
os.environ.update({
  "RESULTS_FOLDER" : "/content/data/nnunet_output/",
  "WEIGHTS_FOLDER" : "/content/data/nnunet_output/nnUNet",
})

In [16]:
%%capture
# install the pre-trained model from the zip archive downloaded above
# (`%%capture` hides the output of the command)
!nnUNet_install_pretrained_model_from_zip $model_download_path

---

# **Function Definition**

## **Data Download and Preparation**

The following function handles the download of a single patient's data from the IDC buckets using `gsutil cp`. Furthermore, to organise the data in a more human-understandable and, above all, standardized fashion, the function makes use of [DICOMSort](https://github.com/pieper/dicomsort).

DICOMSort is an open source tool for custom sorting and renaming of dicom files based on their specific DICOM tags. In our case, we will exploit DICOMSort to organise the DICOM data by `PatientID` and `Modality` - so that the final directory will look like the following:

```
raw/nsclc-radiomics/dicom/$PatientID
 └─── CT
       ├─── $SOPInstanceUID_slice0.dcm
       ├─── $SOPInstanceUID_slice1.dcm
       ├───  ...
       │
      RTSTRUCT 
       ├─── $SOPInstanceUID_RTSTRUCT.dcm
      SEG
       └─── $SOPInstanceUID_RTSEG.dcm

```

In [17]:
def download_patient_data(raw_base_path, sorted_base_path,
                          patient_df, remove_raw = True):

  """
  Download raw DICOM data and run dicomsort to standardise the input format.

  Arguments:
    raw_base_path    : required - path to the folder where the raw data will be stored.
    sorted_base_path : required - path to the folder where the sorted data will be stored.
    patient_df       : required - Pandas dataframe (returned from BQ) storing all the
                                  patient information required to pull data from the IDC buckets.
    remove_raw       : optional - whether to remove or not the raw non-sorted data
                                  (after sorting with dicomsort). Defaults to True.
  
  Outputs:
    This function [...]
  """

  # FIXME: this gets overwritten every single time; use `tempfile` library?
  gs_file_path = "gcs_paths.txt"
  patient_df["gcs_url"].to_csv(gs_file_path, header = False, index = False)

  pat_id = patient_df["PatientID"].values[0]
  download_path = os.path.join(raw_base_path, pat_id)

  if not os.path.exists(download_path):
    os.mkdir(download_path)

  # FIXME: ok for a notebook; for scripting, change this to `subprocess`

  start_time = time.time()
  print("Copying files from IDC buckets to %s..."%(download_path))
  !cat $gs_file_path | gsutil -q -m cp -Ir $download_path
  elapsed = time.time() - start_time
  print("Done in %g seconds."%elapsed)

  start_time = time.time()
  print("\nSorting DICOM files..." )
  !python src/dicomsort/dicomsort.py -u $download_path $sorted_base_path/%PatientID/%Modality/%SOPInstanceUID.dcm
  elapsed = time.time() - start_time
  print("Done in %g seconds."%elapsed)

  print("Sorted DICOM data saved at: %s"%(os.path.join(sorted_base_path, pat_id)))

  # get rid of the temporary folder, storing the unsorted DICOM data 
  if remove_raw:
    print("Removing un-sorted data at %s..."%(download_path))
    !rm -r $download_path
    print("... Done.")

---

## **Data Preprocessing**

Brief description here.



In [18]:
def pypla_dicom_ct_to_nrrd(sorted_base_path, processed_nrrd_path,
                           pat_id, verbose = True):
  
  """
  Sorted DICOM patient data to NRRD file (CT volume).

  Arguments:
    sorted_base_path    : required - path to the folder where the sorted data are stored
                                     (the CT series is read from `sorted_base_path/pat_id/CT`).
    processed_nrrd_path : required - path to the folder where the preprocessed NRRD data are stored.
    pat_id              : required - patient ID (used for naming purposes).
    verbose             : optional - whether to run pyplastimatch in verbose mode. Defaults to true.
  
  Outputs:
    This function saves the CT volume at `processed_nrrd_path/pat_id/pat_id_CT.nrrd`
    and the log of the conversion at `processed_nrrd_path/pat_id/pat_id_pypla.log`.
    If the NRRD file exists already, nothing is converted.

  Raises:
    AssertionError if the folder storing the sorted DICOM CT does not exist.
  """

  # given that everything is standardised already, compute the paths
  path_to_dicom_ct_folder = os.path.join(sorted_base_path, pat_id, "CT")
  
  # sanity check
  assert(os.path.exists(path_to_dicom_ct_folder))
  
  # patient folder: create it together with any missing parent folder
  # (nothing happens if the folder exists already)
  pat_dir_nrrd_path = os.path.join(processed_nrrd_path, pat_id)
  os.makedirs(pat_dir_nrrd_path, exist_ok = True)

  # output NRRD CT
  ct_nrrd_path = os.path.join(pat_dir_nrrd_path, pat_id + "_CT.nrrd")

  # logfile for the plastimatch conversion
  log_file_path = os.path.join(pat_dir_nrrd_path, pat_id + '_pypla.log')

  # DICOM CT to NRRD conversion (if the file doesn't exist yet)
  if not os.path.exists(ct_nrrd_path):
    convert_args_ct = {"input" : path_to_dicom_ct_folder,
                       "output-img" : ct_nrrd_path}

    # clean old log file if it exists
    if os.path.exists(log_file_path): os.remove(log_file_path)
    
    pypla.convert(verbose = verbose,
                  path_to_log_file = log_file_path,
                  **convert_args_ct)

---

Brief description here.

In [19]:
def pypla_dicom_ct_to_nifti(sorted_base_path, processed_nifti_path,
                            pat_id, verbose = True):
  
  """
  Sorted DICOM patient data to NIfTI file (CT volume).

  Arguments:
    sorted_base_path     : required - path to the folder where the sorted data are stored
                                      (the CT series is read from `sorted_base_path/pat_id/CT`).
    processed_nifti_path : required - path to the folder where the preprocessed NIfTI data are stored.
    pat_id               : required - patient ID (used for naming purposes).
    verbose              : optional - whether to run pyplastimatch in verbose mode. Defaults to true.
  
  Outputs:
    This function saves the CT volume at `processed_nifti_path/pat_id/pat_id_CT.nii.gz`
    and the log of the conversion at `processed_nifti_path/pat_id/pat_id_pypla.log`.
    If the NIfTI file exists already, nothing is converted.

  Raises:
    AssertionError if the folder storing the sorted DICOM CT does not exist.
  """

  # given that everything is standardised already, compute the paths
  path_to_dicom_ct_folder = os.path.join(sorted_base_path, pat_id, "CT")
  
  # sanity check
  assert(os.path.exists(path_to_dicom_ct_folder))
  
  # patient folder: create it together with any missing parent folder
  # (nothing happens if the folder exists already)
  pat_dir_nifti_path = os.path.join(processed_nifti_path, pat_id)
  os.makedirs(pat_dir_nifti_path, exist_ok = True)

  # output NIfTI CT
  ct_nifti_path = os.path.join(pat_dir_nifti_path, pat_id + "_CT.nii.gz")

  # logfile for the plastimatch conversion
  log_file_path = os.path.join(pat_dir_nifti_path, pat_id + '_pypla.log')

  # DICOM CT to NIfTI conversion (if the file doesn't exist yet)
  if not os.path.exists(ct_nifti_path):
    convert_args_ct = {"input" : path_to_dicom_ct_folder,
                       "output-img" : ct_nifti_path}

    # clean old log file if it exists
    if os.path.exists(log_file_path): os.remove(log_file_path)
    
    pypla.convert(verbose = verbose,
                  path_to_log_file = log_file_path,
                  **convert_args_ct)

---

Brief description here.

In [20]:
def pypla_dicom_rtstruct_to_nrrd(sorted_base_path, processed_nrrd_path,
                                 pat_id, verbose = True):
  
  """
  Sorted DICOM patient data to NRRD files (RTSTRUCT segmentation masks).

  Arguments:
    sorted_base_path    : required - path to the folder where the sorted data are stored.
    processed_nrrd_path : required - path to the folder where the preprocessed NRRD data are stored.
    pat_id              : required - patient ID (used for naming purposes).
    verbose             : optional - whether to run pyplastimatch in verbose mode. Defaults to true.
  
  Outputs:
    This function runs the plastimatch conversion using the `rt_segmasks`
    subfolder of the patient NRRD folder as output prefix, and
    `rt_segmasks/pat_id_rt_list.txt` as the file listing the exported
    structures. The conversion is skipped if `rt_segmasks` exists already.
  """

  # the data are standardised already, so all the paths can be computed
  ct_dicom_dir = os.path.join(sorted_base_path, pat_id, "CT")
  rt_dicom_dir = os.path.join(sorted_base_path, pat_id, "RTSTRUCT")
  patient_nrrd_dir = os.path.join(processed_nrrd_path, pat_id)

  # sanity check: both the RTSTRUCT folder and the patient NRRD folder
  # must exist already
  for required_dir in (rt_dicom_dir, patient_nrrd_dir):
    assert(os.path.exists(required_dir))

  # folder storing the exported segmentation masks
  masks_dir = os.path.join(patient_nrrd_dir, "rt_segmasks")

  # the masks were exported already - nothing to do
  if os.path.exists(masks_dir):
    return

  # file storing the names of the exported segmentation masks
  # (from the DICOM RTSTRUCT)
  masks_list_path = os.path.join(masks_dir, pat_id + "_rt_list.txt")

  # DICOM RTSTRUCT to NRRD conversion; the log file is the same
  # used for the conversion of the CT volume
  pypla.convert(verbose = verbose,
                path_to_log_file = os.path.join(patient_nrrd_dir, pat_id + '_pypla.log'),
                **{"input" : rt_dicom_dir,
                   "referenced-ct" : ct_dicom_dir,
                   "output-prefix" : masks_dir,
                   "prefix-format" : 'nrrd',
                   "output-ss-list" : masks_list_path})

---

Brief description here.

In [21]:
def prep_input_data(processed_nifti_path, model_input_folder, pat_id):
  
  """
  Copy the NIfTI CT volume of a patient to the nnU-Net input folder.

  Arguments:
    processed_nifti_path : required - path to the folder where the preprocessed NIfTI data are stored
                                      (the CT volume is read from
                                      `processed_nifti_path/pat_id/pat_id_CT.nii.gz`).
    model_input_folder   : required - path to the folder where the data to be inferred should be stored.
    pat_id               : required - patient ID (used for naming purposes).
  
  Outputs:
    This function copies the CT volume to `model_input_folder/pat_id_0000.nii.gz`.
    If the destination file exists already, nothing is copied.

  Raises:
    FileNotFoundError if the NIfTI CT volume, or the model input folder, does not exist.
  """

  # imported here so that the function does not depend on other cells
  import shutil

  pat_dir_nifti_path = os.path.join(processed_nifti_path, pat_id)
  ct_nifti_path = os.path.join(pat_dir_nifti_path, pat_id + "_CT.nii.gz")
  
  copy_to_path = os.path.join(model_input_folder, pat_id + "_0000.nii.gz")
    
  # copy NIfTI to the right dir for nnU-Net processing
  if not os.path.exists(copy_to_path):
    print("Copying %s\nto %s..."%(ct_nifti_path, copy_to_path))
    # unlike a `cp` shell call, this handles paths with spaces and raises
    # if the copy fails (instead of silently printing "... Done.")
    shutil.copy(ct_nifti_path, copy_to_path)
    print("... Done.")

---

## **Data Processing**

Brief description here.

In [22]:
def process_patient_nnunet(model_input_folder, model_output_folder, 
                           nnunet_model, use_tta = False, export_prob_maps = False,
                           verbose = False):

  """
  Infer the thoracic organs at risk segmentation maps using one of the nnU-Net models.

  Arguments:
    model_input_folder  : required - path to the folder where the data to be inferred should be stored.
    model_output_folder : required - path to the folder where the inferred segmentation masks will be stored.
    nnunet_model        : required - pre-trained nnU-Net model to use during the inference phase.
    use_tta             : optional - whether to use or not test time augmentation (TTA). Defaults to False.
    export_prob_maps    : optional - whether to export or not softmax probabilities. Defaults to False.
    verbose             : optional - whether to output text from `nnUNet_predict` or not. Defaults to False.

  Outputs:
    This function [...]
  """
  
  export_prob_maps = "--save_npz" if export_prob_maps == True else ""
  direct_to = "" if verbose == True else "> /dev/null"
  use_tta = "" if use_tta == True else "--disable_tta"

  assert(nnunet_model in ["2d", "3d_lowres", "3d_fullres", "3d_cascade_fullres"])

  start_time = time.time()

  print("Running `nnUNet_predict` with `%s` model..."%(nnunet_model))

  pat_fn_list = sorted([f for f in os.listdir(model_input_folder) if ".nii.gz" in f])
  pat_fn_path = os.path.join(model_input_folder, pat_fn_list[-1])

  print("Processing file at %s..."%(pat_fn_path))

  # run the inference phase
  # accepted options for --model are: 2d, 3d_lowres, 3d_fullres or 3d_cascade_fullres
  !nnUNet_predict --input_folder $model_input_folder \
                  --output_folder $model_output_folder \
                  --task_name "Task055_SegTHOR" \
                  --model $nnunet_model $use_tta $direct_to $export_prob_maps

  elapsed = time.time() - start_time

  print("Done in %g seconds."%elapsed)

---

## **Data Postprocessing**

Description here.

In [23]:
def pypla_nifti_to_nrrd(pred_nifti_path, processed_nrrd_path,
                        pat_id, verbose = True):
  
  """
  Inferred segmentation mask (NIfTI) to NRRD file.

  Arguments:
    pred_nifti_path     : required - path to the NIfTI file storing the inferred segmentation mask.
    processed_nrrd_path : required - path to the folder where the preprocessed NRRD data are stored.
    pat_id              : required - patient ID (used for naming purposes).
    verbose             : optional - whether to run pyplastimatch in verbose mode. Defaults to true.
  
  Returns:
    pred_nrrd_path - path to the output NRRD file, i.e.,
                     `processed_nrrd_path/pat_id/pat_id_pred_segthor.nrrd`.

  Outputs:
    This function runs the plastimatch conversion, logging to
    `processed_nrrd_path/pat_id/pat_id_pypla.log`.
  """

  # both the NRRD file and the log file go under the patient folder
  patient_nrrd_dir = os.path.join(processed_nrrd_path, pat_id)
  pred_nrrd_path = os.path.join(patient_nrrd_dir, pat_id + "_pred_segthor.nrrd")

  # inferred NIfTI segmask to NRRD
  pypla.convert(verbose = verbose,
                path_to_log_file = os.path.join(patient_nrrd_dir, pat_id + "_pypla.log"),
                **{"input" : pred_nifti_path,
                   "output-img" : pred_nrrd_path})
  
  return pred_nrrd_path

---

Description here.

In [24]:
def pypla_postprocess(processed_nrrd_path, model_output_folder, pat_id):

  """
  Convert the segmentation mask inferred for a patient from NIfTI to NRRD.

  Arguments:
    processed_nrrd_path  : required - path to the folder where the preprocessed NRRD data are stored.
    model_output_folder  : required - path to the folder where the inferred segmentation masks are stored
                                      (the mask is read from `model_output_folder/pat_id.nii.gz`).
    pat_id               : required - patient ID (used for naming purposes). 

  Outputs:
    This function runs `pypla_nifti_to_nrrd` (in verbose mode) on the
    inferred segmentation mask of the patient.
  """

  pred_nifti_fn = pat_id + ".nii.gz"
  pred_nifti_path = os.path.join(model_output_folder, pred_nifti_fn)

  # the CT volume is not needed for the conversion, so it is not loaded
  # here (reading a whole volume only to discard it is expensive)
  pypla_nifti_to_nrrd(pred_nifti_path = pred_nifti_path,
                      processed_nrrd_path = processed_nrrd_path,
                      pat_id = pat_id, verbose = True)

---

Description here.

In [25]:
def numpy_to_nrrd(model_output_folder, processed_nrrd_path, pat_id,
                  output_folder_name = "pred_softmax", output_dtype = "uint8",
                  structure_list = ["Background", "Esophagus",
                                    "Heart", "Trachea", "Aorta"]):

  """
  Convert softmax probability maps to NRRD. For simplicity, the probability maps
  are converted by default to UInt8

  Arguments:
    model_output_folder : required - path to the folder where the inferred segmentation masks should be stored.
    processed_nrrd_path : required - path to the folder where the preprocessed NRRD data are stored.
    pat_id              : required - patient ID (used for naming purposes).
    output_folder_name  : optional - name of the subfolder under the patient directory 
                                     (under `processed_nrrd_path`) where the softmax NRRD
                                     files will be saved. Defaults to "pred_softmax".
    output_dtype        : optional - output data type. Float16 is not supported by the NRRD standard,
                                     so the choice should be between uint8, uint16 or float32.
                                     Please note this will greatly impact the size of the DICOM PM
                                     file that will be generated.
    structure_list      : optional - list of the structures whose probability maps are stored along the 
                                     first axis of the `softmax` array in the `.npz` file (output from
                                     the nnU-Net pipeline when `export_prob_maps` is set to True).
                                     Defaults to the structure list for the SegTHOR challenge
                                     (background = 0 included).

  Outputs:
    This function saves one compressed NRRD file per structure at
    `processed_nrrd_path/pat_id/output_folder_name/$structure.nrrd`. The image
    information of the patient CT (`processed_nrrd_path/pat_id/pat_id_CT.nrrd`)
    is copied to every file.

  Raises:
    AssertionError if `output_dtype` is not one of uint8, uint16 or float32.
  """

  # output data type -> (rescaling factor, SimpleITK pixel type)
  # the probabilities are between 0 and 1, so they are rescaled to the maximum
  # value of the integer type (255 and 65535: 65536 would not fit in a UInt16,
  # and a probability of 1 would wrap around to 0); no rescale for float32
  supported_dtypes = {"float32" : (None, sitk.sitkFloat32),
                      "uint8" : (255, sitk.sitkUInt8),
                      "uint16" : (65535, sitk.sitkUInt16)}

  # validate the arguments once, before reading or writing anything
  assert(output_dtype in supported_dtypes)
  scale_factor, sitk_dtype = supported_dtypes[output_dtype]

  pred_softmax_fn = pat_id + ".npz"
  pred_softmax_path = os.path.join(model_output_folder, pred_softmax_fn)

  # parse NRRD file - we will make use of it to populate the header of the
  # NRRD files we are going to get from the probability maps
  ct_nrrd_path = os.path.join(processed_nrrd_path, pat_id, pat_id + "_CT.nrrd")
  sitk_ct = sitk.ReadImage(ct_nrrd_path)

  output_folder_path = os.path.join(processed_nrrd_path, pat_id, output_folder_name)
  os.makedirs(output_folder_path, exist_ok = True)

  # the context manager closes the `.npz` archive once the array is loaded
  with np.load(pred_softmax_path) as pred_softmax_npz:
    pred_softmax_all = pred_softmax_npz["softmax"]

  for channel, structure in enumerate(structure_list):

    # FIXME: NRRD does not support float16 tensors. For now, convert to a float32. 
    #        Then replace with a direct conversion to DICOM?

    pred_softmax_segmask = pred_softmax_all[channel].astype(dtype = np.float32)

    if scale_factor is not None:
      # rescale and quantize (the decimal part is truncated); the builtin
      # `int` is used as the `np.int` alias was removed from NumPy
      pred_softmax_segmask = (scale_factor*pred_softmax_segmask).astype(int)
    
    pred_softmax_segmask_sitk = sitk.GetImageFromArray(pred_softmax_segmask)
    pred_softmax_segmask_sitk.CopyInformation(sitk_ct)
    pred_softmax_segmask_sitk = sitk.Cast(pred_softmax_segmask_sitk, sitk_dtype)

    output_fn = "%s.nrrd"%(structure)
    output_path = os.path.join(output_folder_path, output_fn)

    writer = sitk.ImageFileWriter()

    writer.UseCompressionOn()
    writer.SetFileName(output_path)
    writer.Execute(pred_softmax_segmask_sitk)

---

Description here.

In [26]:
def nrrd_to_dicomseg(sorted_base_path, processed_base_path,
                     dicomseg_json_path, pat_id, skip_empty_slices = True):

  """
  Export DICOM SEG object from segmentation masks stored in NRRD files.

  Arguments:
    sorted_base_path    : required - path to the folder where the sorted data should be stored.
    processed_base_path : required - path to the folder where the preprocessed NRRD data are stored
    dicomseg_json_path  : required - ...
    pat_id              : required - patient ID (used for naming purposes). 

  Outputs:
    This function [...]
  """

  path_to_ct_dir = os.path.join(sorted_base_path, pat_id, "CT")

  processed_dicomseg_path = os.path.join(processed_base_path, "dicomseg")
  pat_dir_dicomseg_path = os.path.join(processed_dicomseg_path, pat_id)

  if not os.path.exists(pat_dir_dicomseg_path):
    os.mkdir(pat_dir_dicomseg_path)

  pred_segmasks_nrrd = os.path.join(processed_nrrd_path, pat_id, pat_id + "_pred_segthor.nrrd")

  dicom_seg_out_path = os.path.join(pat_dir_dicomseg_path, pat_id + "_SEG.dcm")

  # transform from bool to int according to `itkimage2segimage` requirements
  skip_flag = "--skip" if skip_empty_slices == True else ""

  !itkimage2segimage --inputImageList $pred_segmasks_nrrd \
                     --inputDICOMDirectory $path_to_ct_dir \
                     --outputDICOM $dicom_seg_out_path \
                     --inputMetadata $dicomseg_json_path $skip_flag

---

---

## **General Utilities**

In [27]:
def file_exists_in_bucket(project_name, bucket_name, file_gs_uri):
  
  """
  Check whether a file exists in the specified Google Cloud Storage Bucket.

  Arguments:
    project_name : required - name of the GCP project.
    bucket_name  : required - name of the bucket (without gs://)
    file_gs_uri  : required - file GS URI
  
  Returns:
    file_exists : boolean variable, True if the file exists in the specified
                  bucket, at the specified location; False if it doesn't.

  Outputs:
    This function prints the name of the bucket and the path (relative to the
    bucket) of the file that is searched for.
  """

  gcs_client = storage.Client(project = project_name)
  gcs_bucket = gcs_client.get_bucket(bucket_name)
  
  # strip the `gs://$bucket_name/` prefix to get the name of the blob
  bucket_prefix = "gs://%s/"%(bucket_name)
  blob_name = file_gs_uri.split(bucket_prefix)[-1]

  print("Searching `%s` for: \n%s\n"%(bucket_prefix, blob_name))

  return gcs_bucket.blob(blob_name).exists(gcs_client)

---


In [28]:
def listdir_bucket(project_name, bucket_name, dir_gs_uri):
  
  """
  List the files stored at the specified location of a Google Cloud Storage Bucket.

  Arguments:
    project_name : required - name of the GCP project.
    bucket_name  : required - name of the bucket (without gs://)
    dir_gs_uri   : required - directory GS URI
  
  Returns:
    file_list : list of the base names of the blobs whose name starts with
                the path of the directory (relative to the bucket).

  Outputs:
    This function prints the URI of the directory that is listed.
  """

  gcs_client = storage.Client(project = project_name)
  # the bucket object itself is not used, the bucket is only looked up
  gcs_client.get_bucket(bucket_name)
  
  # strip the `gs://$bucket_name/` prefix to get the blob name prefix
  blob_prefix = dir_gs_uri.split("gs://%s/"%(bucket_name))[-1]

  print("Getting the list of files at `%s`..."%(dir_gs_uri))

  # keep only the base name of every blob found under the prefix
  return [os.path.basename(blob.name)
          for blob in gcs_client.list_blobs(bucket_name,  prefix = blob_prefix)]

---

In [29]:
def format_dict(input_dict, value_name = "inference_time"):
  
  """
  Convert a {PatientID : value} dictionary into a two-column DataFrame.

  Arguments:
    input_dict : required - dictionary whose keys are used as patient IDs
                            and whose values are the quantities to log.
    value_name : optional - name of the column storing the values.
                            Defaults to "inference_time".
    
  Returns:
    output_df : DataFrame with one row per dictionary entry, a "PatientID"
                column (the keys) and a `value_name` column (the values).
                An empty dictionary yields an empty DataFrame that still
                exposes both columns.

  """

  output_df = pd.DataFrame.from_dict(data = input_dict, orient = "index")

  # the dictionary keys end up in the index: move them to a regular column
  output_df = output_df.reset_index()
  output_df = output_df.rename(columns = {"index" : "PatientID", 
                                          0 : value_name}) 

  # an empty dictionary produces no value column at all: add it explicitly so
  # that the schema does not depend on the input being non-empty
  if value_name not in output_df.columns:
    output_df[value_name] = pd.Series(dtype = "float64")
  
  return output_df

---

# **Putting Everything Together**

## **Parsing Cohort Information from BigQuery Tables**

In [30]:
# unique patient IDs found in the cohort, sorted so that the processing order
# is the same at every run
# FIXME: no subsetting is applied here, so the whole cohort gets processed;
#        slice `pat_id_list` in this cell to debug on a handful of patients
pat_id_list = sorted(set(cohort_df["PatientID"].values))

print("Total number of unique Patient IDs:", len(pat_id_list))

# `DataFrame.info()` prints its summary and returns None: call it directly
# rather than through `display()`, which would also render a stray `None`
cohort_df.info()

display(cohort_df.head())

Total number of unique Patient IDs: 357
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44157 entries, 0 to 44156
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   PatientID           44157 non-null  object
 1   collection_id       44157 non-null  object
 2   source_DOI          44157 non-null  object
 3   StudyInstanceUID    44157 non-null  object
 4   SeriesInstanceUID   44157 non-null  object
 5   SOPInstanceUID      44157 non-null  object
 6   crdc_study_uuid     44157 non-null  object
 7   crdc_series_uuid    44157 non-null  object
 8   crdc_instance_uuid  44157 non-null  object
 9   gcs_url             44157 non-null  object
 10  idc_version         44157 non-null  object
dtypes: object(11)
memory usage: 3.7+ MB


None

Unnamed: 0,PatientID,collection_id,source_DOI,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,crdc_study_uuid,crdc_series_uuid,crdc_instance_uuid,gcs_url,idc_version
0,LUNG1-389,nsclc_radiomics,10.7937/K9/TCIA.2015.PF0M9REI,1.3.6.1.4.1.32722.99.99.3103908869703256044664...,1.3.6.1.4.1.32722.99.99.2352658665619663766339...,1.3.6.1.4.1.32722.99.99.2414242025218459804872...,0184ddf7-bf1c-4812-ae58-37f0f62eef51,b3324cee-7e85-4b2c-ae2d-4b89a8da50ca,034324d3-0089-4726-bfbb-cd754febde6c,gs://idc-dev-cr/034324d3-0089-4726-bfbb-cd754f...,7.0
1,LUNG1-389,nsclc_radiomics,10.7937/K9/TCIA.2015.PF0M9REI,1.3.6.1.4.1.32722.99.99.3103908869703256044664...,1.3.6.1.4.1.32722.99.99.2352658665619663766339...,1.3.6.1.4.1.32722.99.99.2648221267343378996555...,0184ddf7-bf1c-4812-ae58-37f0f62eef51,b3324cee-7e85-4b2c-ae2d-4b89a8da50ca,053d8545-4adf-44b9-9776-2f149b255796,gs://idc-dev-cr/053d8545-4adf-44b9-9776-2f149b...,7.0
2,LUNG1-389,nsclc_radiomics,10.7937/K9/TCIA.2015.PF0M9REI,1.3.6.1.4.1.32722.99.99.3103908869703256044664...,1.3.6.1.4.1.32722.99.99.2352658665619663766339...,1.3.6.1.4.1.32722.99.99.2483895798395217229931...,0184ddf7-bf1c-4812-ae58-37f0f62eef51,b3324cee-7e85-4b2c-ae2d-4b89a8da50ca,065427e7-c0de-41f1-9cdb-2cd28ee2009c,gs://idc-dev-cr/065427e7-c0de-41f1-9cdb-2cd28e...,7.0
3,LUNG1-389,nsclc_radiomics,10.7937/K9/TCIA.2015.PF0M9REI,1.3.6.1.4.1.32722.99.99.3103908869703256044664...,1.3.6.1.4.1.32722.99.99.2352658665619663766339...,1.3.6.1.4.1.32722.99.99.2829070204870971800854...,0184ddf7-bf1c-4812-ae58-37f0f62eef51,b3324cee-7e85-4b2c-ae2d-4b89a8da50ca,0799aa73-2587-4214-a22a-95d6bb323466,gs://idc-dev-cr/0799aa73-2587-4214-a22a-95d6bb...,7.0
4,LUNG1-389,nsclc_radiomics,10.7937/K9/TCIA.2015.PF0M9REI,1.3.6.1.4.1.32722.99.99.3103908869703256044664...,1.3.6.1.4.1.32722.99.99.2352658665619663766339...,1.3.6.1.4.1.32722.99.99.3684779892051361103024...,0184ddf7-bf1c-4812-ae58-37f0f62eef51,b3324cee-7e85-4b2c-ae2d-4b89a8da50ca,0caa0508-e7f0-4ab6-b514-1c035d1c34d9,gs://idc-dev-cr/0caa0508-e7f0-4ab6-b514-1c035d...,7.0


---

## **Set Run Parameters**


In [31]:
data_base_path = "/content/data"
raw_base_path = "/content/data/raw/tmp"
sorted_base_path = "/content/data/raw/nsclc-radiomics/dicom"

processed_base_path = "/content/data/processed/nsclc-radiomics/"
processed_nrrd_path = os.path.join(processed_base_path, "nrrd")
processed_nifti_path = os.path.join(processed_base_path, "nii")

processed_dicomseg_path = os.path.join(processed_base_path, "dicomseg")
processed_dicompm_path = os.path.join(processed_base_path, "dicompm")

model_input_folder = "/content/data/model_input/"
model_output_folder = "/content/data/nnunet_output/"

bucket_output_base_uri = os.path.join(bucket_base_uri, "nnunet/nnunet_output")

# -----------------
# nnU-Net pipeline parameters

# choose from: "2d", "3d_lowres", "3d_fullres", "3d_cascade_fullres"
nnunet_model = "3d_fullres"
use_tta = True
export_prob_maps = True

experiment_folder_name = nnunet_model + "-tta" if use_tta == True else + nnunet_model + "-no_tta"
bucket_experiment_folder_uri = os.path.join(bucket_output_base_uri, experiment_folder_name)

bucket_log_folder_uri = os.path.join(bucket_experiment_folder_uri, 'log')

bucket_nifti_folder_uri = os.path.join(bucket_experiment_folder_uri, 'nii')
bucket_softmax_pred_folder_uri = os.path.join(bucket_experiment_folder_uri, 'softmax_pred')

bucket_dicomseg_folder_uri = os.path.join(bucket_experiment_folder_uri, 'dicomseg')

# -----------------
# save run information

yaml_fn = "run_params.yaml"
yaml_out_path = os.path.join(data_base_path, yaml_fn)

settings_dict = dict()
settings_dict["bucket"] = dict()
settings_dict["bucket"]["name"] = bucket_name
settings_dict["bucket"]["base_uri"] = bucket_base_uri
settings_dict["bucket"]["output_base_uri"] = bucket_output_base_uri
settings_dict["bucket"]["experiment_folder_uri"] = bucket_experiment_folder_uri
settings_dict["bucket"]["nifti_folder_uri"] = bucket_nifti_folder_uri
settings_dict["bucket"]["softmax_pred_folder_uri"] = bucket_softmax_pred_folder_uri
settings_dict["bucket"]["dicomseg_folder_uri"] = bucket_dicomseg_folder_uri
settings_dict["bucket"]["log_folder_uri"] = bucket_log_folder_uri

settings_dict["inference"] = dict()
settings_dict["inference"]["model"] = nnunet_model
settings_dict["inference"]["use_tta"] = use_tta
settings_dict["inference"]["export_prob_maps"] = export_prob_maps

with open(yaml_out_path, 'w') as fp:
  yaml.dump(settings_dict, fp, default_flow_style = False)

gs_uri_yaml_file = os.path.join(bucket_log_folder_uri, yaml_fn)

!gsutil -m cp $yaml_out_path $gs_uri_yaml_file

Copying file:///content/data/run_params.yaml [Content-Type=application/octet-stream]...
/ [1/1 files][  635.0 B/  635.0 B] 100% Done                                    
Operation completed over 1 objects/635.0 B.                                      


In [32]:
# exclude from processing all the patients for which a DICOM SEG object was exported already
# (stored in the specified Google Cloud Storage Bucket)
dicomseg_bucket_list = listdir_bucket(project_name = project_name,
                                      bucket_name = bucket_name,
                                      dir_gs_uri = bucket_dicomseg_folder_uri)

# DICOM SEG objects are uploaded as `<PatientID>_SEG.dcm`: anything else found
# at that location (folder placeholders, stray files) is not a processed patient
dicomseg_suffix = "_SEG.dcm"

already_processed_id_list = sorted({fn[:-len(dicomseg_suffix)]
                                    for fn in dicomseg_bucket_list
                                    if fn.endswith(dicomseg_suffix)})

print("\nFound %g patients already processed."%(len(already_processed_id_list)))

pat_to_process_id_list = sorted(set(pat_id_list) - set(already_processed_id_list))

print("Moving on with the remaining %g..."%(len(pat_to_process_id_list)))

Getting the list of files at `gs://idc-medima-paper/nnunet/nnunet_output/3d_fullres-tta/dicomseg`...

Found 5 patients already processed.
Moving on with the remaining 352...


## **Running the Per-patient Analysis**

In [None]:
for idx, pat_id in enumerate(pat_to_process_id_list):

  # -----------------
  # init

  start_total = time.time()

  # init every single time, as the most recent logs are loaded from the bucket
  inference_time_dict = dict()
  total_time_dict = dict()

  clear_output(wait = True)

  print("(%g/%g) Processing patient %s"%(idx + 1, len(pat_to_process_id_list), pat_id))

  patient_df = cohort_df[cohort_df["PatientID"] == pat_id]

  has_segmask_already = False

  dicomseg_fn = pat_id + "_SEG.dcm"

  input_nifti_fn = pat_id + "_0000.nii.gz"
  input_nifti_path = os.path.join(model_input_folder, input_nifti_fn)

  pred_nifti_fn = pat_id + ".nii.gz"
  pred_nifti_path = os.path.join(model_output_folder, pred_nifti_fn)

  pred_softmax_folder_name = "pred_softmax"
  pred_softmax_folder_path = os.path.join(processed_nrrd_path, pat_id, pred_softmax_folder_name)
  
  # -----------------
  # GS URI definition

  # gs URI at which the *nii.gz object is or will be stored in the bucket
  gs_uri_nifti_file = os.path.join(bucket_nifti_folder_uri, pred_nifti_fn)

  # gs URI at which the folder storing the *.nrrd softmax probabilities is or will be stored in the bucket
  gs_uri_softmax_pred_folder = os.path.join(bucket_softmax_pred_folder_uri, pat_id)

  # gs URI at which the DICOM SEG object is or will be stored in the bucket
  gs_uri_dicomseg_file = os.path.join(bucket_dicomseg_folder_uri, dicomseg_fn)


  # -----------------
  # cross-load the CT data from the IDC buckets, run the preprocessing

  # check whether the NIfTI seg mask exists already
  has_segmask_already = file_exists_in_bucket(project_name = project_name,
                                              bucket_name = bucket_name,
                                              file_gs_uri = gs_uri_nifti_file)

  # if the raw segmentation file exists in the output directory but the DICOM SEG
  # doesn't, skip the inference phase. Data still need to be downloaded because
  # the DICOM folder is essential in the DICOM SEG generation process
  download_patient_data(raw_base_path = raw_base_path,
                        sorted_base_path = sorted_base_path,
                        patient_df = patient_df,
                        remove_raw = True)
    
  # DICOM CT to NRRD - good to have for a number of reasons
  pypla_dicom_ct_to_nrrd(sorted_base_path = sorted_base_path,
                         processed_nrrd_path = processed_nrrd_path,
                         pat_id = pat_id, verbose = True)

  # -----------------
  # DL-inference
  
  if has_segmask_already == True:
    # copy the mask in the correct folder etc.
    print("Retrieving the segmentation mask from the specified bucket...")
    print("Copying from %s"%(gs_uri_nifti_file))
    !gsutil -m cp $gs_uri_nifti_file $pred_nifti_path

  else:

    # DICOM CT to NIfTI - required for the processing
    pypla_dicom_ct_to_nifti(sorted_base_path = sorted_base_path,
                            processed_nifti_path = processed_nifti_path,
                            pat_id = pat_id, verbose = True)

    # FIXME: could we get rid of these at least in the inference notebook?
    # DICOM RTSTRUCT to NRRD - good to have for a number of reasons
    """
    pypla_dicom_rtstruct_to_nrrd(sorted_base_path = sorted_base_path,
                                 processed_nrrd_path = processed_nrrd_path,
                                 pat_id = pat_id, verbose = True)
    """

    # prepare the `model_input` folder for the inference phase
    prep_input_data(processed_nifti_path = processed_nifti_path,
                    model_input_folder = model_input_folder,
                    pat_id = pat_id)

    start_inference = time.time()
    # run the DL-based prediction
    process_patient_nnunet(model_input_folder = model_input_folder,
                           model_output_folder = model_output_folder, 
                           nnunet_model = nnunet_model, use_tta = use_tta,
                           export_prob_maps = export_prob_maps, verbose = False)

    elapsed_inference = time.time() - start_inference
    inference_time_dict[pat_id] = elapsed_inference

    # convert the softmax predictions to NRRD files
    numpy_to_nrrd(model_output_folder = model_output_folder,
                  processed_nrrd_path = processed_nrrd_path,
                  pat_id = pat_id,
                  output_folder_name = pred_softmax_folder_name)

    # copy the nnU-Net *.npz softmax probabilities in the chosen bucket
    !gsutil -m cp $pred_softmax_folder_path/* $gs_uri_softmax_pred_folder

    # copy the nnU-Net *.nii.gz binary masks in the chosen bucket
    !gsutil -m cp $pred_nifti_path $gs_uri_nifti_file

    # remove the NIfTI file the prediction was computed from
    !rm $input_nifti_path
    

  # -----------------
  # post-processing
  pypla_postprocess(processed_nrrd_path = processed_nrrd_path,
                    model_output_folder = model_output_folder,
                    pat_id = pat_id)
  
  nrrd_to_dicomseg(sorted_base_path = sorted_base_path,
                   processed_base_path = processed_base_path,
                   dicomseg_json_path = dicomseg_json_path,
                   pat_id = pat_id)

  pred_dicomseg_path = os.path.join(processed_dicomseg_path, pat_id, dicomseg_fn)

  !gsutil -m cp $pred_dicomseg_path $gs_uri_dicomseg_file

  elapsed_total = time.time() - start_total

  if has_segmask_already == False:
    total_time_dict[pat_id] = elapsed_total

  print("End-to-end processing of %s completed in %g seconds.\n"%(pat_id, elapsed_total))

  # -----------------
  # save inference time information - upload after every processing step

  csv_fn = "inference_time.csv"

  csv_path = os.path.join(data_base_path, csv_fn)
  gs_uri_csv_file = os.path.join(bucket_log_folder_uri, csv_fn)

  has_csv_already = file_exists_in_bucket(project_name = project_name,
                                          bucket_name = bucket_name,
                                          file_gs_uri = gs_uri_csv_file)

  # if the log CSV is found already, append to it
  if has_csv_already == True:

    # copy the log CSV from bucket and load it as a DataFrame, append to it
    !gsutil -m cp $gs_uri_csv_file $csv_path

    inference_time_df = pd.read_csv(csv_path, index_col = [0]) 
    add_to_inference_time_df = format_dict(inference_time_dict) 
    new_inference_time_df = pd.concat([inference_time_df, add_to_inference_time_df],
                                      ignore_index = True)
    
    # push the updated version to the bucket
    new_inference_time_df.to_csv(csv_path)
    !gsutil -m cp $csv_path $gs_uri_csv_file

  # in the case the log CSV does not exist yet, create a new one from scratch
  else:
    inference_time_df = format_dict(inference_time_dict) 
    inference_time_df.to_csv(csv_path)

    !gsutil -m cp $csv_path $gs_uri_csv_file

  # -----------------
  # save total processing time information - upload after every processing step

  csv_fn = "total_processing_time.csv"

  csv_path = os.path.join(data_base_path, csv_fn)
  gs_uri_csv_file = os.path.join(bucket_log_folder_uri, csv_fn)

  has_csv_already = file_exists_in_bucket(project_name = project_name,
                                          bucket_name = bucket_name,
                                          file_gs_uri = gs_uri_csv_file)

  # if the log CSV is found already, append to it
  if has_csv_already == True:

    # copy the log CSV from bucket and load it as a DataFrame, append to it
    !gsutil -m cp $gs_uri_csv_file $csv_path

    total_time_df = pd.read_csv(csv_path, index_col = [0]) 
    add_to_total_time_df = format_dict(total_time_dict) 
    new_total_time_df = pd.concat([total_time_df, add_to_total_time_df],
                                  ignore_index = True)
    
    # push the updated version to the bucket
    new_total_time_df.to_csv(csv_path)
    !gsutil -m cp $csv_path $gs_uri_csv_file

  # in the case the log CSV does not exist yet, create a new one from scratch
  else:
    total_time_df = format_dict(total_time_dict) 
    total_time_df.to_csv(csv_path)

    !gsutil -m cp $csv_path $gs_uri_csv_file


(1/352) Processing patient LUNG1-007
Searching `gs://idc-medima-paper/` for: 
nnunet/nnunet_output/3d_fullres-tta/nii/LUNG1-007.nii.gz

Copying files from IDC buckets to /content/data/raw/tmp/LUNG1-007...
Done in 11.7594 seconds.

Sorting DICOM files...
100% 131/131 [00:01<00:00, 69.42it/s] 
Files sorted
Done in 2.45961 seconds.
Sorted DICOM data saved at: /content/data/raw/nsclc-radiomics/dicom/LUNG1-007
Removing un-sorted data at /content/data/raw/tmp/LUNG1-007...
... Done.

Running 'plastimatch convert' with the specified arguments:
  --input /content/data/raw/nsclc-radiomics/dicom/LUNG1-007/CT
  --output-img /content/data/processed/nsclc-radiomics/nrrd/LUNG1-007/LUNG1-007_CT.nrrd
... Done.

Running 'plastimatch convert' with the specified arguments:
  --input /content/data/raw/nsclc-radiomics/dicom/LUNG1-007/CT
  --output-img /content/data/processed/nsclc-radiomics/nii/LUNG1-007/LUNG1-007_CT.nii.gz
... Done.
Copying /content/data/processed/nsclc-radiomics/nii/LUNG1-007/LUNG1-007_CT

---
