# Prostate segmentation on IDC collection -- MRI US Prostate Biopsy
*   Dataset : [MRI US Prostate Biopsy]()
*   Goal : Prostate segmentation using Task24 Prostate nnU-net pre-trained model, T2 input

The nnU-Net predictions are resampled onto the IDC image data using the LPS orientation (as in .nrrd files) instead of RAS, which is necessary for the DICOM conversion of the AI segmentations.

# Global variables

## Variables used for resampling -- inference -- setup labelID for ground truth segs

In [None]:
# Name of the IDC collection being processed (user choice).
# NOTE(review): not referenced elsewhere in the visible part of this notebook;
# the BigQuery selection below hard-codes 'prostatex'.
collection_name = "prostatex" #up to user
## nnU-Net settings
# Model configuration passed to `nnUNet_predict --model` in the main loop.
model_type = '3d_fullres'#other options are '2d',..
# Label ID of interest in the nnU-Net output.
# NOTE(review): the original remark described the task005 model (1 = PZ, 2 = TZ);
# this notebook runs Task024, presumably with 1 = whole prostate -- confirm.
nnunet_labelID = 1
# ##labelID seg retrieval
# ground_truth_cat_CodeMeaning = 'Anatomical Structure'
# ground_truth_type_CodeMeaning = 'Peripheral zone of the prostate'
# Name of the image grid the predictions are resampled to. In the visible code it is
# only used in the log messages of resample_preds().
# NOTE(review): this option was written for the qin-prostate-repeatability notebook
# (SeriesDescription dependence, etc.) and is not modular for multi-modal input.
desired_grid = "T2" #Other option is "ADC" 

## Global paths

In [None]:
import os

# All paths are published through os.environ so that both Python code and the
# `!` shell commands of later cells can read them.

# Working folders for the downloaded / converted IDC data (prostatex collection)
os.environ["prostatex_root"] = os.path.join(os.getcwd(), "prostateX")
os.environ["prostatex_dicom"] = os.path.join(os.environ["prostatex_root"], "dicom")
os.environ["prostatex_nii"] = os.path.join(os.environ["prostatex_root"], "nii")

# nnU-Net folders
os.environ["nnUNet"] = os.path.join(os.getcwd(), "nnUNet")
os.environ["nnUNet_data"] = os.path.join(os.environ["nnUNet"], "data")
os.environ['nnUNet_raw_data_base'] = os.path.join(os.environ["nnUNet_data"], "nnUNet_raw_data")
os.environ['nnUNet_preprocessed'] = os.path.join(os.environ["nnUNet_data"], "processed")
os.environ["nnUNet_models"] = os.path.join(os.environ["nnUNet"], "models")
os.environ["RESULTS_FOLDER"] = os.path.join(os.environ["nnUNet"], "output", "preds")
os.environ["nnUNet_preds_post_processed"] = os.path.join(os.environ["nnUNet"], "output", "preds_processed")
os.environ["nnUNet_preds_resampled"] = os.path.join(os.environ["nnUNet"], "output", "preds_resampled")
os.environ["nnUNet_preds_resampled_dcm"] = os.path.join(os.environ["nnUNet"], "output", "preds_resampled_dcm")
os.environ["nnUNet_raw_data"] = os.path.join(os.environ["nnUNet"], "raw_data")
# Path where the pre-trained model weights are stored (a file, not a folder).
# NOTE(review): the download cell later in this notebook saves the weights as
# "Task024_Promise.zip" -- confirm which file name is intended here.
os.environ["PATH_TO_MODEL_FILE"] = os.path.join(os.environ["nnUNet"], "models", "Task024_Prostate.zip")

# Misc
os.environ["IDC_Downloads"] = os.path.join(os.getcwd(), "IDC_DL")
os.environ["IDC_Downloads_Sorted"] = os.path.join(os.getcwd(), "IDC_DL", "Sorted")
os.environ["logs"] = os.path.join(os.getcwd(), "logs")

# Create the working folders. The keys are listed explicitly instead of
# substring-matching every environment variable name, so an unrelated variable
# that happens to contain e.g. "IDC" or "logs" can never trigger a mkdir.
_DIR_ENV_KEYS = (
  "prostatex_root", "prostatex_dicom", "prostatex_nii",
  "nnUNet", "nnUNet_data", "nnUNet_raw_data_base", "nnUNet_preprocessed",
  "nnUNet_models", "RESULTS_FOLDER", "nnUNet_preds_post_processed",
  "nnUNet_preds_resampled", "nnUNet_preds_resampled_dcm", "nnUNet_raw_data",
  "IDC_Downloads", "IDC_Downloads_Sorted", "logs",
)
for _dir_key in _DIR_ENV_KEYS:
  # os.makedirs handles paths with spaces and does not need a shell
  os.makedirs(os.environ[_dir_key], exist_ok=True)

# Custom functions

In [None]:
def convert_image_dcm_to_nrrd(input_path, output_path_root, target_format="nii", prefix=""):
  if not os.path.exists(output_path_root): 
    !mkdir -p $output_path_root
  !dcm2niix -z y -m y -f %i_{prefix} -o $output_path_root $input_path
  # out_path_file = f"{output_path_root}/{prefix}.nrrd" 
  # !plastimatch convert \
  # --input $input_path \
  # --output-img $out_path_file

In [None]:
def convert_seg_to_nii(input_path, output_path):
  if not os.path.exists(output_path): 
    !mkdir -p $output_path
  
  print(f'input path : {input_path}')
  print(f'output_path : {output_path}')
  !segimage2itkimage --inputDICOM $input_path --outputDirectory $output_path \
  --outputType nii 

In [None]:
def convert_dcm_sorted(input_path,output_path, idc_df):
  """Convert every sorted DICOM series found under ``input_path`` to NIfTI.

  ``input_path`` is expected to be organised in three folder levels
  (PatientID/StudyInstanceUID/SeriesInstanceUID, as produced by the dicomsort
  call in download_idc_data_serie_uid). Each series folder is logged and then
  converted into ``<output_path>/nii``.

  Args:
    input_path: root of the sorted DICOM tree.
    output_path: root folder; converted volumes go to its "nii" sub-folder.
    idc_df: unused, kept so existing calls stay valid.
  """
  # without recursive=True, "**" behaves like "*", so this matches the same
  # three-level folders as the original pattern
  for serie_folder in sorted(glob.glob(os.path.join(input_path, "*", "*", "*"))):
    dcm_files = sorted(glob.glob(os.path.join(serie_folder, "*.dcm")))
    if not dcm_files:
      # stray file or empty folder: nothing to convert
      print(f"Skipping {serie_folder} : no .dcm file found")
      continue
    # Read the header once; the pixel data is not needed for logging.
    # dcmread replaces the deprecated pydicom.read_file alias.
    header = pydicom.dcmread(dcm_files[0], stop_before_pixels=True)
    seriesInstanceUID = os.path.basename(serie_folder)
    print(f"Serie processed : {serie_folder}")
    print(f"SeriesDescription : {header.SeriesDescription}")
    print(f"Modality : {header.Modality}")
    #convert to nii
    convert_image_dcm_to_nrrd(input_path=serie_folder,
                              output_path_root=os.path.join(output_path, "nii"),
                              prefix=f"{seriesInstanceUID}")

In [None]:
# https://pydicom.github.io/pydicom/stable/tutorials/dicom_json.html
def get_seg_dcm_tags_pydicom(seg_path_dcm):
  """Read a DICOM file and return a selection of its tags as a plain dict.

  The dataset is converted to its DICOM JSON form and, for each tag of
  interest, the first entry of its 'Value' list is extracted. A missing tag
  raises KeyError.

  Args:
    seg_path_dcm: path of the DICOM file to read.

  Returns:
    dict with the keys ReferencedSeriesInstanceUID, StudyInstanceUID,
    patientID, SOPClassUID, SOPInstanceUID, SeriesInstanceUID, Modality,
    SeriesDescription, studydesc, series_time, study_time, series_date and
    study_date.
  """
  json_ds = pydicom.dcmread(seg_path_dcm).to_json_dict()

  def first_value(tag):
    # first entry of the 'Value' list of a top-level tag
    return json_ds[tag]['Value'][0]

  # (output key, DICOM tag) pairs, read in this order
  top_level_tags = (
    ('StudyInstanceUID', '0020000D'),
    ('patientID', '00100020'),
    ('SOPClassUID', '00080016'),
    ('SOPInstanceUID', '00080018'),
    ('SeriesInstanceUID', '0020000E'),
    ('Modality', '00080060'),
    ('SeriesDescription', '0008103E'),
    ('studydesc', '00081030'),    # StudyDescription
    ('series_time', '00080031'),  # SeriesTime
    ('study_time', '00080030'),   # StudyTime
    ('series_date', '00080021'),  # SeriesDate
    ('study_date', '00080020'),   # StudyDate
  )

  # 00081115 == Referenced Series Sequence, whose first item carries
  # 0020000E == Series Instance UID of the referenced (T2) series
  referenced_series_item = first_value('00081115')
  out_dict = {
    'ReferencedSeriesInstanceUID': referenced_series_item['0020000E']['Value'][0],
  }
  for out_key, tag in top_level_tags:
    out_dict[out_key] = first_value(tag)
  return out_dict

In [None]:
def add_ohif_url_nnunet(row, datastore='', dataset='', app='',
                        project='idc-sandbox-003', location='us-central1'):
    """Build the OHIF viewer URL of the study referenced by ``row``.

    Empty arguments fall back to the defaults that were previously hard-coded,
    so existing calls return the same URL; non-empty arguments are now
    honoured instead of being silently overwritten.

    Args:
        row: mapping (e.g. a DataFrame row) with a 'StudyInstanceUID' entry.
        datastore: DICOM store name; defaults to
            'whole_prostate_nnunet_id24_2ServersIDC'.
        dataset: healthcare dataset name; defaults to 'prostate-seg'.
        app: host name of the viewer; defaults to
            'fir-idc-prostate-ohif.web.app'.
        project: GCP project placed in the URL.
        location: GCP location placed in the URL.

    Returns:
        str: the viewer URL.
    """
    app = app or 'fir-idc-prostate-ohif.web.app'
    dataset = dataset or 'prostate-seg'
    # other store seen in this project: pz_tz_nnunet_id05_2ServersIDC
    datastore = datastore or 'whole_prostate_nnunet_id24_2ServersIDC'
    studyUID = row['StudyInstanceUID']
    return f'https://{app}/viewer/{studyUID}!secondGoogleServer=/projects/{project}/locations/{location}/datasets/{dataset}/dicomStores/{datastore}'

In [None]:
def download_idc_data_serie_uid(idc_df):
  # save the list of GCS URLs into a file
  selection_manifest = os.path.join(os.environ["IDC_Downloads"], "idc_manifest.txt")
  idc_df["gcs_url"].to_csv(selection_manifest, header=False, index=False)
  # let's make sure the download folder is clean, in case you ran this cell earlier
  # for a different dataset
  # !rm -rf {os.environ["IDC_Downloads"]+"/*.dcm"}
  !cat {selection_manifest} | gsutil -m cp -I {os.environ["IDC_Downloads"]}
  !python dicomsort/dicomsort.py -k -u {os.environ["IDC_Downloads"]} {os.environ["IDC_Downloads_Sorted"]}/%PatientID/%StudyInstanceUID/%SeriesInstanceUID/%SOPInstanceUID.dcm
  # !rm -rf {os.environ["qin_prostate_rep_dicom"]+"/*"} 
  !mv {os.environ['IDC_Downloads_Sorted']+'/*'} {os.environ["prostatex_dicom"]}
  convert_dcm_sorted(input_path=os.environ["prostatex_dicom"],
                  output_path=os.environ["prostatex_root"], idc_df=selection_df)
  return selection_df

In [None]:
def reformat_image_nnunet():
  #reformats images to correct format, 
  #from global path to nnunet folder==nnUNet preprocessed
  for mr_vol in glob.glob(os.path.join(os.environ["prostatex_nii"], f"*.nii.gz")):
    serieUID = mr_vol.split('/')[-1].split("_")[1].replace(".nii.gz","")#.split(".")[0]
    patientID = mr_vol.split('/')[-1].split("_")[0]
    nnunet_idx = "0000" #if "T2" in mr_vol.split('/')[-2] else "0001"#0000 for T2 and 0001 for ADC
    nnunet_path = os.path.join(os.environ["nnUNet_preprocessed"], 
                                "_".join([patientID, serieUID, nnunet_idx]) + ".nii.gz") 
    !cp $mr_vol $nnunet_path

In [None]:
def largest_component_retrieval(input_path : str, output_path=None):
    """Keep only the largest connected component of a binary segmentation.

    The binary image is converted into a connected-component image (one
    integer label per component), the components are relabelled by decreasing
    size, and the component with label 1 (the largest) is kept.

    Args:
        input_path (str): path of the input NIfTI segmentation, binary image.
        output_path (str): where the result is written; nothing is written
            when it is None.

    Raises:
        AssertionError: if ``input_path`` does not exist or the image does not
            hold exactly two distinct values.
    """
    assert os.path.exists(input_path), f"Input segmentation not found : {input_path}"
    input_image = sitk.ReadImage(input_path, imageIO="NiftiImageIO")
    # make sure it is a binary image (an empty prediction has one value only)
    n_values = len(np.unique(sitk.GetArrayFromImage(input_image)))
    assert n_values == 2, f"Expected a binary image, found {n_values} distinct value(s) in {input_path}"
    component_image = sitk.ConnectedComponent(input_image)
    sorted_component_image = sitk.RelabelComponent(component_image, sortByObjectSize=True)
    largest_component_binary_image = sorted_component_image == 1
    if output_path is not None:
        print(output_path)
        sitk.WriteImage(largest_component_binary_image, output_path, imageIO="NiftiImageIO")
    else:
        print("No writing on disk of largest component of input.")
    # sanity checks == logs
    print(f"Input path : {input_path}")
    print(f"Output path : {output_path}")
    # After relabelling, the labels are consecutive starting at 1, so the
    # highest label equals the number of components. Counting the unique
    # values instead would include the background (0) and report one too many.
    n_components = int(sitk.GetArrayFromImage(sorted_component_image).max())
    print(f"Number of components found : {n_components}")

In [None]:
def resample_preds(input_path_nnunet_preds="", input_path_t2_idc="", output_path=""):
  for pred_path in sorted(glob.glob(os.path.join(input_path_nnunet_preds, "*.nii.gz"))):
    search_t2_path = os.path.join(input_path_t2_idc, \
                                  f"{pred_path.split('/')[-1].split('_')[0]}_{pred_path.split('/')[-1].split('_')[1].replace('.nii.gz','')}*.nii.gz") #PatientID_SerieUID.nii.gz
                                      #get serieUID
    print(f"search path for {desired_grid} : {search_t2_path}")
    t2_path = glob.glob(search_t2_path, recursive=True)[0]                              
    print(f"{desired_grid}_path found : {t2_path}")
    print(f"pred path : {pred_path}")
    out_processed = os.path.join(os.environ["nnUNet_preds_post_processed"], pred_path.split("/")[-1])
    largest_component_retrieval(input_path=pred_path,output_path=out_processed)
    resample_args_to_t2_origin = {"input" : out_processed,#change to pred path if no largest_component_retrieval necessary 
                          "output" : os.path.join(output_path, 
                                                  f"{pred_path.split('/')[-1][:-7]}_resampled.nii.gz"),
                          "fixed" : t2_path,
                          "interpolation" : "nn"}
    
    path_log = os.path.join(os.environ["logs"], 'log_pypla_res_pred' + pred_path.split('/')[-1].split('.')[0] + '.txt')            
    !touch $path_log
    pypla.resample(verbose = False, **resample_args_to_t2_origin, path_to_log_file=path_log)
    print()

In [None]:
def seg_nii_to_dicom(idc_df, input_path_nii="", input_path_dcm_idc="", output_path_root=""):
  assert os.path.exists(input_path_nii)
  assert os.path.exists(input_path_dcm_idc)
  !mkdir -p $output_path_root
  for nii_seg_pred in glob.glob(os.path.join(input_path_nii, '*.nii.gz')):
    patID = nii_seg_pred.split('/')[-1].split('_')[0]
    study_mr_t2_serieUID = nii_seg_pred.split('/')[-1].split('_')[1].replace(".nii.gz","")

    # study_mr_t2_serieUID = idc_df[idc_df["StudyInstanceUID"] == study_current]["SeriesInstanceUID"].unique()[0]
    #find t2 dcm folder
    t2_dcm_folder = glob.glob(os.path.join(input_path_dcm_idc, patID, "**", study_mr_t2_serieUID))[0]
    #find seg dcm file
    # find nii seg folder == preds resampled
    assert os.path.exists(t2_dcm_folder)
    print('\nConverting...')
    print(f'pred nnunet processed : {nii_seg_pred}')
    print(f't2_dcm_folder : {t2_dcm_folder}')
    output_path = os.path.join(output_path_root, '_'.join([patID, study_mr_t2_serieUID])+'.dcm')
    #find gt seg dcm file == orginal idc dcm files
    #convert nii pred to dcm
    !itkimage2segimage --inputImageList $nii_seg_pred \
    --inputDICOMDirectory  $t2_dcm_folder \
    --inputMetadata  $seg_dcm_metadata_json_file \
    --outputDICOM $output_path 
    print("Done!")

# Auth login

In [None]:
import os
# dcm2niix 
!sudo apt-get install dcm2niix 
# cmd = 'dcm2niix -z y -m y -o %s  %s ' % (pat_dir_nifti_path, path_to_dicom_ct_folder)
# print(cmd)
# ret = os.system(cmd)
# print (ret)
# !pip install dcm2niix

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libyaml-cpp0.6
Suggested packages:
  pigz
The following NEW packages will be installed:
  dcm2niix libyaml-cpp0.6
0 upgraded, 2 newly installed, 0 to remove and 24 not upgraded.
Need to get 300 kB of archives.
After this operation, 1,110 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/main amd64 libyaml-cpp0.6 amd64 0.6.2-4ubuntu1 [124 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 dcm2niix amd64 1.0.20181125-1build1 [176 kB]
Fetched 300 kB in 1s (412 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend 

##Colab

In [None]:
# Colab only: authenticate the current user. Must run before the cells that use
# Google Cloud services (BigQuery client, gsutil / gcloud commands).
from google.colab import auth
auth.authenticate_user()

# Setup GCP Project ID

In [None]:
# GCP project ID; it is read back from the environment when the BigQuery client is created.
project_id = "idc-sandbox-003"
os.environ["GCP_PROJECT_ID"] = project_id

In [None]:
# Sanity check: show the folder the model zip file is downloaded to.
print(os.environ["nnUNet_models"])

/content/nnUNet/models


# Setup of the Colab VM



In the following cells we will confirm you have a GPU before doing anything else, and will install and import all the Python dependencies. 

The main python packages we need to install are:
* `nnunet` - which is the [codebase for the nn-UNet framework](https://github.com/MIC-DKFZ/nnUNet) we are going to be using for the segmentation step;
* `pydicom`, a Python [package](https://github.com/pydicom/pydicom) that lets the user read, modify, and write DICOM data in an easy "pythonic" way - that we are going to use to distinguish different DICOM objects from each other.

## GPU checks

In [None]:
# check wether the Colab Instance was correctly initialized with a GPU instance
gpu_list = !nvidia-smi --list-gpus

has_gpu = False if "failed" in gpu_list[0] else True

if not has_gpu:
  print("Your Colab VM does not have a GPU - check \"Runtime > Change runtime type\"")

In [None]:
# check which model of GPU the notebook is equipped with - a Tesla K80 or T4
# T4 is the better performing of the two - and can about halve the GPU processing time

!nvidia-smi

Sun May  7 02:19:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Environment Setup

Here we will configure the Linux environment variables needed to run the nnU-Net pipeline. 

Three main variables are needed by default to run the nnU-Net segmentation pipelines:
* `nnUNet_raw_data_base` is the path to the folder where the segmentation pipeline expects to find the data to process;
* `nnUNet_preprocessed` is the path to the folder where the preprocessed data are saved;
* `RESULTS_FOLDER` is the path to the folder storing by default the model weights and, in our case, for simplicity, the segmentation masks produced by the pipeline.

We will use the additional variable `PATH_TO_MODEL_FILE` to point to the location where the pre-trained model weights for the chosen model will be stored (more on this later).

Please notice that these variables need to be set using `os.environ[]` in Google Colab - as `!export` is not sufficient to guarantee the variables are kept from one cell to the other. For more in-depth information regarding what the nnU-Net framework uses these folders for, please visit [the dedicated nnU-Net documentation page](https://github.com/MIC-DKFZ/nnUNet/blob/master/documentation/setting_up_paths.md)

## Install command-line tools


[Plastimatch](https://plastimatch.org/index.html) is considered to be the swiss army knife of medical images processing: we will use it to convert DICOM (CT, RTSTRUCT) series to NRRD files - but it can be used for a multitude of other tasks, such as registration, resampling, cropping, and computing statistics to name a few. Plastimatch is also available as a 3DSlicer plug-in and can be used directly from the Slicer GUI.

For the sake of clarity and simplicity, we will call Plastimatch from a very simple [Python wrapper](https://github.com/denbonte/pyplastimatch) written for the occasion (unfortunately, Plastimatch does not provide an official one).

In [None]:
%%capture
!sudo apt update

!sudo apt install plastimatch

In [None]:
# Confirm that plastimatch is installed by printing its version.
!echo $(plastimatch --version)

plastimatch version 1.8.0


[dcmqi](https://github.com/QIICR/dcmqi) is an open source library that can help with the conversion between imaging research formats and the standard DICOM representation for image analysis results. More specifically, you can use dcmqi convert DICOM Segmentation objects (DICOM SEG) into research formats, such as NIfTI and NRRD.

In [None]:
%%capture
# Download the dcmqi v1.2.5 Linux release, unpack it and copy its command-line
# tools to /usr/local/bin so they can be called from any cell.
!wget https://github.com/QIICR/dcmqi/releases/download/v1.2.5/dcmqi-1.2.5-linux.tar.gz
!tar zxvf dcmqi-1.2.5-linux.tar.gz
!cp dcmqi-1.2.5-linux/bin/* /usr/local/bin/

Finally, we are going to install [Subversion](https://subversion.apache.org/), a tool that will allow us to clone GitHub repositories only partially (to save time and space).

In [None]:
%%capture

!sudo apt install subversion

In [None]:
# Confirm that Subversion is installed by printing the first lines of its version banner.
!echo $(svn --version | head -n 2)

svn, version 1.13.0 (r1867053) compiled May 12 2022, 20:47:08 on x86_64-pc-linux-gnu


## Install Python packages

In [None]:
%%capture
# Python dependencies of the notebook (installation output is hidden by %%capture).
# nnunet provides the nnUNet_* command-line tools used below.
!pip install nnunet
!pip install pydicom
!pip install nibabel
!pip install dcm2niix
!pip install SimpleITK

Unpack and install model we downloaded earlier (under `PATH_TO_MODEL_FILE`). This step can take about 1-2 minutes.

In [None]:
%%capture
# Install the pre-trained model from the zip file named by the PATH_TO_MODEL_FILE
# environment variable.
# NOTE(review): no visible cell creates that file before this point (the download
# cell further down saves "Task024_Promise.zip" and installs it itself), and
# %%capture hides any error printed here -- confirm this cell is still needed.
!nnUNet_install_pretrained_model_from_zip $PATH_TO_MODEL_FILE

Next we set up a few things to help with visualization of the segmentations later.

In [None]:
import os
import sys
import shutil
import csv
import random

import os
import glob
import csv
import json

import nibabel as nib

import time
import gdown

import json
import pprint
import numpy as np
import pandas as pd

import pydicom
import nibabel as nib
import SimpleITK as sitk

# from medpy.metric.binary import dc as dice_coef
# from medpy.metric.binary import hd as hausdorff_distance
# from medpy.metric.binary import asd as avg_surf_distance

# from medpy.filter.binary import largest_connected_component

# use the "tensorflow_version" magic to make sure TF 1.x is imported
# %tensorflow_version 2.x
# import tensorflow as tf
# import keras

print("\nThis Colab instance is equipped with a GPU.")


This Colab instance is equipped with a GPU.


In [None]:
# PyPlastimatch - python wrapper for Plastimatch (and interactive notebook visualisation)
!svn checkout https://github.com/AIM-Harvard/pyplastimatch/trunk/pyplastimatch pyplastimatch

A    pyplastimatch/__init__.py
A    pyplastimatch/pyplastimatch.py
A    pyplastimatch/utils
A    pyplastimatch/utils/__init__.py
A    pyplastimatch/utils/data.py
A    pyplastimatch/utils/eval.py
A    pyplastimatch/utils/viz.py
Checked out revision 25.


In [None]:
# dicomsort is the Python package that can sort DICOM files into a
# folder organization based on user-specified DICOM attributes.
# It is cloned into ./dicomsort and run as a script (dicomsort/dicomsort.py).
!git clone https://github.com/pieper/dicomsort.git

Cloning into 'dicomsort'...
remote: Enumerating objects: 169, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 169 (delta 23), reused 34 (delta 17), pack-reused 126[K
Receiving objects: 100% (169/169), 87.85 KiB | 3.99 MiB/s, done.
Resolving deltas: 100% (86/86), done.


In [None]:
from pyplastimatch import pyplastimatch as pypla
from pyplastimatch.utils import viz as viz_utils
from pyplastimatch.utils import data as data_utils

# Data selection, downloading and structuring -- Conversion to DICOM

We want to select here the `prostatex` collection, and more particularly the T2-weighted MR series that are referenced by an existing prostate segmentation.

In order to use data hosted by IDC effectively, you will need to utilize metadata to navigate what data is available and to select specific files that are relevant in your analysis. The main metadata table you will need for this purpose is the [`bigquery-public-data.idc_current.dicom_all`](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=idc_current&t=dicom_all&page=table) table.

This query has one row per file hosted by IDC. All of IDC data is in DICOM format, and each of the rows in this table will have all of the DICOM attributes extracted from a given file. It will also have various columns containing non-DICOM metadata, such as the name of the collection where the file is included, size of the file, and URL that can be used to retrieve that file.

To query IDC BigQuery tables, you can use one of the following approaches:
1. `%%bigquery` magic will allow you to define your query in plain SQL, and load the result of the query into a Pandas dataframe.
2. [BigQuery Python API](https://googleapis.dev/python/bigquery/latest/index.html) is more flexible in allowing you to parameterize your query.
3. [Google Cloud BigQuery console](https://console.cloud.google.com/bigquery) is very convenient for interactive query exploration of tables.
4. [`gcloud bq`](https://cloud.google.com/bigquery/docs/bq-command-line-tool) is the command line tool that comes as part of [Cloud SDK](https://cloud.google.com/sdk) and is convenient for scripting interactions from the shell. Cloud SDK is preinstalled on Colab.

In the following cells we will utilize `%%bigquery`, Python BigQuery SDK and BigQuery console for working with IDC BigQuery tables.

First, to verify that you are authenticated, and your project ID is working, let's run a test query against IDC BigQuery table to get the summary statistics about the data available in IDC.


Given `SeriesInstanceUID` value identifying the image series, we can query the IDC metadata table to get the list of files (defined by the Google Storage URLs) corresponding to this series.

All of the DICOM metadata for each of the DICOM files is available in the BigQuery table we will be querying. We will get not just the `gcs_url`, but also identifiers for the Study, Series and Instance, to better understand organization of data, and since `StudyInstanceUID` will be handy later when we get to the visualization of the data.

In [None]:
from google.cloud import bigquery
# BigQuery client bound to the GCP project configured earlier in the notebook
bq_client = bigquery.Client(os.environ["GCP_PROJECT_ID"])

In [None]:
# Selection of the input data, in two steps:
#  1. the CTE `prostatex_seg` lists the SeriesInstanceUIDs referenced by the prostatex
#     segmentation objects whose first segment is coded 'Prostate' (41216001);
#  2. the main query returns every dicom_all row (one per file) of those referenced
#     series that is an MR image with "t2" in its SeriesDescription.
# The IDC release is pinned to idc_v14.
selection_query = f"""
  -- get all prostatex segmentations objects
WITH
  prostatex_seg AS (
  SELECT
    DISTINCT(ReferencedSeriesSequence[SAFE_OFFSET(0)].SeriesInstanceUID) AS RefSerieUID
  FROM
    `bigquery-public-data.idc_v14.dicom_all`
  WHERE
    collection_id = 'prostatex'
    AND SegmentSequence[SAFE_OFFSET(0)].SegmentedPropertyTypeCodeSequence[SAFE_OFFSET(0)].CodeMeaning = 'Prostate'
    AND SegmentSequence[SAFE_OFFSET(0)].SegmentedPropertyTypeCodeSequence[SAFE_OFFSET(0)].CodeValue = '41216001' )
SELECT
  dc_all.*
FROM
  `bigquery-public-data.idc_v14.dicom_all` AS dc_all
JOIN
  prostatex_seg
ON
  dc_all.SeriesInstanceUID = prostatex_seg.RefSerieUID
WHERE
  dc_all.collection_id = 'prostatex'
  AND dc_all.Modality = 'MR'
  AND LOWER(dc_all.SeriesDescription) LIKE '%t2%'
ORDER BY
  PatientID"""
# run the query and load the result into a DataFrame (one row per DICOM file)
selection_result = bq_client.query(selection_query)
selection_df = selection_result.result().to_dataframe()

Get SerieUIDs in bucket processed from nnunet

In [None]:
!rm bucketUIDs_processed.csv
!gcloud storage ls --recursive gs://idc_prostatex/model1/preds_processed_dcm/* > bucketUIDs_processed.csv
sereUID_processed_dcm = pd.read_csv("bucketUIDs_processed.csv", names=["serieUID"], skiprows=[0])
seriesInstanceUID_dcm_processed_lst = [x.split("/")[-1].split("_")[1].replace(".dcm", "") for x in sereUID_processed_dcm.serieUID.values]

rm: cannot remove 'bucketUIDs_processed.csv': No such file or directory


In [None]:
# Series still to process: selected series minus those already found in the bucket.
# The set difference also removes the duplicates of selection_df (one row per file);
# the resulting order is arbitrary.
serieUID_current_lst = list(set(selection_df.SeriesInstanceUID.values) - set(seriesInstanceUID_dcm_processed_lst))

In [None]:
# number of series left to process
len(serieUID_current_lst)

65

# Main loop

In [None]:
# Metadata JSON handed to itkimage2segimage (--inputMetadata) in seg_nii_to_dicom()
seg_whole_prostate_json = "https://www.dropbox.com/s/yhkqnbqqc9fhgps/task024_whole_prostate.json?dl=0"
seg_dcm_metadata_json_file = "pred_metadata.json"  # local file name, read as a global by seg_nii_to_dicom()
!wget -O $seg_dcm_metadata_json_file $seg_whole_prostate_json

--2023-05-07 02:21:09--  https://www.dropbox.com/s/yhkqnbqqc9fhgps/task024_whole_prostate.json?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/yhkqnbqqc9fhgps/task024_whole_prostate.json [following]
--2023-05-07 02:21:09--  https://www.dropbox.com/s/raw/yhkqnbqqc9fhgps/task024_whole_prostate.json
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uccfd1f0e2f8f8399880556fef2b.dl.dropboxusercontent.com/cd/0/inline/B7kHKS_A51gJxxtZMJbCes9Yq7eua4R0A_-aCj58hTWe8y_CS3HZ-D8-wcFhb1GfQlsWgTbWnFhHaXsVh-s7T9xC-6fxVVHuMe_LilbioJjQtJV1nCLkRaQ4oyVq_qY3rZ8ZvuhndCRP6ueeh43NxlCiA_inxjyWA808O02Q8k2rnw/file# [following]
--2023-05-07 02:21:10--  https://uccfd1f0e2f8f8399880556fef2b.dl.dropboxusercontent.com/cd/0/inline/B7kHKS_A51gJxxtZMJbCes9Yq7eua4

## Download and install nnUnet pre-trained model -- task024 -- personal dropbox

nnUnet pre-trained models zip files can also be found [here](https://zenodo.org/record/4003545#.YsWmH-zMLt8).

In [None]:
%%capture
# this will usually take between one and five minutes (but can sometimes take up to eight)
# Download the Task024 pre-trained weights into the nnUNet_models folder and install them.
seg_model_url = "https://www.dropbox.com/s/u9m37l8et4hgu4h/Task024_Promise.zip?dl=0"
out_path_mod = os.path.join(os.environ["nnUNet_models"], "Task024_Promise.zip")  # e.g. /content/nnUNet/models/Task024_Promise.zip on Colab
!wget -O $out_path_mod $seg_model_url
!nnUNet_install_pretrained_model_from_zip $out_path_mod

In [None]:
def reset_folders():
  """Empty the per-series working folders and remove the previous predictions.

  The folders named by the environment variables listed below are deleted and
  re-created empty, then every ``*.nii.gz`` file directly inside
  ``RESULTS_FOLDER`` is removed (its sub-folders are left untouched).

  The variables are listed explicitly, parents before children, rather than
  substring-matching every environment variable name: an unrelated variable
  can then never be wiped, and the result no longer depends on the iteration
  order of the environment.
  """
  import glob
  import shutil

  working_dir_keys = (
    "prostatex_root", "prostatex_dicom", "prostatex_nii",
    "nnUNet_preprocessed", "nnUNet_preds_post_processed",
    "nnUNet_preds_resampled", "nnUNet_preds_resampled_dcm",
    "IDC_Downloads", "IDC_Downloads_Sorted",
  )
  for key in working_dir_keys:
    path = os.environ.get(key)
    if not path:
      # variable not configured in this session: nothing to reset
      continue
    shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path, exist_ok=True)

  # previous predictions; RESULTS_FOLDER replaces the hard-coded
  # /content/nnUNet/output/preds, which only exists on Colab
  for stale_pred in glob.glob(os.path.join(os.environ["RESULTS_FOLDER"], "*.nii.gz")):
    os.remove(stale_pred)

In [None]:
# Whole process, one series at a time: every iteration starts from empty working
# folders, so only the current series is downloaded, segmented and uploaded.
# There is no error handling: a failure in any step stops the loop; re-running the
# bucket listing cells above rebuilds the list of series still to process.
for serieUID_current in serieUID_current_lst:
  # reset processing folders
  reset_folders()
  # download the IDC files of the current series, sort them and convert them to NIfTI
  idc_df = download_idc_data_serie_uid(idc_df=selection_df[selection_df.SeriesInstanceUID \
                                                           == serieUID_current])
  # copy the converted volume to the nnU-Net input folder using the nnU-Net naming
  reformat_image_nnunet()
  # Inference on IDC data.
  # NOTE(review): --save_npz writes additional files next to the predictions and
  # reset_folders() only removes *.nii.gz there -- confirm they are needed.
  !nnUNet_predict --input_folder {os.environ["nnUNet_preprocessed"]} \
                --output_folder {os.environ["RESULTS_FOLDER"]} \
                --task_name "Task024_Promise" --model $model_type \
                --save_npz 
  # keep the largest component of each prediction and resample it to its T2 volume
  resample_preds(input_path_nnunet_preds=os.environ["RESULTS_FOLDER"],\
                  input_path_t2_idc=os.environ["prostatex_nii"], output_path=os.environ["nnUNet_preds_resampled"])
  # Convert nnunet preds to dicom
  seg_nii_to_dicom(idc_df=idc_df, input_path_nii=os.environ["nnUNet_preds_resampled"], \
                   input_path_dcm_idc=os.environ["prostatex_dicom"], output_path_root=os.environ["nnUNet_preds_resampled_dcm"])
  # upload the DICOM SEG files to the bucket that is listed to detect processed series
  !gsutil -m cp -r {os.environ['nnUNet_preds_resampled_dcm']}/* gs://idc_prostatex/model1/preds_processed_dcm/

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Convert 21 DICOM as /content/prostateX/nii/ProstateX-0102_1.3.6.1.4.1.14519.5.2.1.7311.5101.248203933751895346486742820088 (384x384x21x1)
Conversion required 0.520525 seconds (0.520420 for core code).


Please cite the following paper when using nnUNet:

Isensee, F., Jaeger, P.F., Kohl, S.A.A. et al. "nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation." Nat Methods (2020). https://doi.org/10.1038/s41592-020-01008-z


If you have questions or suggestions, feel free to open an issue at https://github.com/MIC-DKFZ/nnUNet

using model stored in  /content/nnUNet/output/preds/nnUNet/3d_fullres/Task024_Promise/nnUNetTrainerV2__nnUNetPlansv2.1
This model expects 1 input modalities for each image
Found 1 unique case ids, here are some examples: ['ProstateX-0102_1.3.6.1.4.1.14519.5.2.1.7311.5101.248203933751895346486742820088']
If they don't look right, make sure to d