<a href="https://colab.research.google.com/github/AIM-Harvard/pyplastimatch/blob/main/notebooks/pyplastimatch_MWE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Download Binaries and Setup

In [1]:
%%capture
!pip install pyplastimatch itk

In [2]:
# Install the Plastimatch binaries through the PyPlastimatch helper. As the log
# printed below shows, the helper detects the system distribution, downloads a
# matching precompiled binary into a temp directory, and installs it together
# with its dependencies.
from pyplastimatch.utils.install import install_precompiled_binaries

install_precompiled_binaries()

PyPlastimatch Plastimatch installation utility.
NOTE: this utility is not meant to be replace the normal install of Plastimatch via apt.
Rather, it is meant to be used in case a Plastimatch binary is not available for a specific distribution.

System distribution: Ubuntu 22.04

Downloading meta JSON in the temp directory /tmp/tmp4878jdl9/release_meta.json... Done.
Matching distribution found in the latest PyPlastimatch release.

Downloading binary in the temp directory /tmp/tmp4878jdl9/plastimatch-ubuntu_22_04... Done.

Installing binaries... Done.
Installing dependencies... Done.


---

In [3]:
# Call the plastimatch executable without arguments: it prints its version and
# the list of available commands, confirming the binary is reachable from the shell.
!plastimatch

plastimatch version 1.9.4-46-g950dde17
Usage: plastimatch command [options]
Commands:
  add           adjust        average       bbox          boundary    
  crop          compare       compose       convert       dice        
  diff          dmap          dose          drr           dvh         
  fdk           fill          filter        gamma         header      
  intersect     jacobian      lm-warp       mabs          mask        
  maximum       ml-convert    multiply      probe         register    
  resample      scale         segment       sift          stats       
  synth         synth-vf      threshold     thumbnail     union       
  warp          wed           xf-convert    xf-invert   

For detailed usage of a specific command, type:
  plastimatch command


---

## Run Plastimatch

Sanity check to verify that Plastimatch works as intended, using data from the Imaging Data Commons (IDC).

In [4]:
import os
import time
import random

import pyplastimatch as pypla

In [5]:
from google.colab import files
from google.cloud import storage
from google.cloud import bigquery as bq

# Google Cloud project ID, passed as `--project` to the %%bigquery cell below.
# NOTE(review): presumably this must be replaced with a project the authenticated
# user can run queries in - confirm.
project_id = "idc-sandbox-000"

In [6]:
# Authenticate the current Colab user with Google Cloud.
# NOTE(review): presumably required by the BigQuery query further down - confirm.
from google.colab import auth
auth.authenticate_user()

In [7]:
s5cmd_release_url = "https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz"
s5cmd_download_path = "s5cmd_2.0.0_Linux-64bit.tar.gz"
s5cmd_path = "s5cmd"

!wget $s5cmd_release_url
!mkdir -p $s5cmd_path && tar zxf $s5cmd_download_path -C $s5cmd_path
!cp s5cmd/s5cmd /usr/bin && rm $s5cmd_download_path && rm -r $s5cmd_path

--2023-09-27 16:55:04--  https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/73909333/2e177ae0-614f-48ba-92fd-04cf9bf41529?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230927%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230927T165504Z&X-Amz-Expires=300&X-Amz-Signature=0160409078503d3cf37476c2feea38811dfcfc6da9e0f91a4c88949e976b0c18&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=73909333&response-content-disposition=attachment%3B%20filename%3Ds5cmd_2.0.0_Linux-64bit.tar.gz&response-content-type=application%2Foctet-stream [following]
--2023-09-27 16:55:04--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/73909333/2e177ae0-614f-48ba-92fd-

In [8]:
%%bigquery cohort_df --project=$project_id

-- Fetch the distinct (patient, modality, collection, DOI, study, series, instance, URL)
-- rows of every study that has at least one row in the selected collections.
-- The result is stored in the `cohort_df` dataframe.
SELECT DISTINCT
  dp.PatientID,
  dp.Modality,
  dp.collection_id,
  dp.source_DOI,
  dp.StudyInstanceUID,
  dp.SeriesInstanceUID,
  dp.SOPInstanceUID,
  dp.gcs_url
FROM
  `bigquery-public-data.idc_v15.dicom_pivot` AS dp
WHERE
  dp.StudyInstanceUID IN (
    -- studies belonging to the collections of interest
    SELECT DISTINCT
      studies.StudyInstanceUID
    FROM
      `bigquery-public-data.idc_v15.dicom_pivot` AS studies
    WHERE
      studies.collection_id IN ('Community', 'nsclc_radiomics')
  )
ORDER BY
  PatientID ASC,
  Modality ASC,
  collection_id ASC,
  source_DOI ASC,
  StudyInstanceUID ASC,
  SeriesInstanceUID ASC,
  SOPInstanceUID ASC,
  gcs_url ASC

Query is running:   0%|          |

Downloading:   0%|          |

In [9]:
# keep only the CT rows of the cohort
cohort_df = cohort_df[cohort_df["Modality"] == "CT"].reset_index(drop = True)

# randomly select one Series from the cohort: the table holds one row per DICOM
# instance, so sample from the unique UIDs to give every series the same chance
# of being picked (instead of favouring the series with more instances)
series_uids = list(cohort_df["SeriesInstanceUID"].unique())
if not series_uids:
  raise ValueError("No CT series found in the selected cohort.")

sid = random.choice(series_uids)

# all the rows (i.e., all the files) belonging to the selected series
series_df = cohort_df[cohort_df["SeriesInstanceUID"] == sid].reset_index(drop = True)

In [10]:
def download_series(download_path, series_df):

  """
  Download raw DICOM data and run dicomsort to standardise the input format.
  Arguments:
    download_path : required - path to the folder where the raw data will be downloaded.
    patient_df    : required - Pandas dataframe storing all the information required
                               to pull data  from the IDC buckets.
  """

  s5cmd_gs_file_path = "data/gcs_url_s5cmd.txt"

  sid = series_df["PatientID"].values[0]
  download_path = os.path.join(download_path, sid)

  gcsurl_temp = "cp " + series_df["gcs_url"].str.replace("gs://","s3://") + " " + download_path
  gcsurl_temp.to_csv(s5cmd_gs_file_path, header=False, index=False)

  if not os.path.exists(download_path): os.mkdir(download_path)

  start_time = time.time()
  print("Copying files from IDC buckets to %s..."%(download_path))

  !s5cmd --no-sign-request --endpoint-url https://storage.googleapis.com run data/gcs_url_s5cmd.txt

  elapsed = time.time() - start_time
  print("Done in %g seconds."%elapsed)

  return download_path

In [11]:
%%capture

!mkdir data

# cross-load data
download_path = download_series(download_path = "data", series_df = series_df)

In [12]:
# destination of the converted CT volume
ct_nrrd_path = "data/image.nrrd"

# arguments of the conversion: the folder storing the downloaded DICOM series
# goes in, a single NRRD image comes out
convert_args_ct = {
  "input" : download_path,
  "output-img" : ct_nrrd_path,
}

In [13]:
# run the DICOM to NRRD conversion, printing the command being executed
pypla.convert(**convert_args_ct, verbose = True)


Running 'plastimatch convert' with the specified arguments:
  --input data/LUNG1-078
  --output-img data/image.nrrd
... Done.


In [14]:
# List the content of the data folder to check the converted image was written.
!ls -lh data

total 37M
-rw-r--r-- 1 root root 16K Sep 27 16:55 gcs_url_s5cmd.txt
-rw-r--r-- 1 root root 37M Sep 27 16:55 image.nrrd
drwxr-xr-x 2 root root 12K Sep 27 16:55 LUNG1-078
