# Env setup and google auth

In [None]:
# Authenticate this Colab runtime with the user's Google account.
# Presumably these credentials are what the %%bigquery cell further down relies on -- confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Install the lz4 command-line tool.
# -y answers apt's confirmation prompt so a fresh "Run all" never stalls or aborts; -q trims progress noise.
!apt-get install -y -q lz4

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  lz4
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 90.0 kB of archives.
After this operation, 236 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 lz4 amd64 1.9.3-2build2 [90.0 kB]
Fetched 90.0 kB in 0s (183 kB/s)
Selecting previously unselected package lz4.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack .../lz4_1.9.3-2build2_amd64.deb ...
Unpacking lz4 (1.9.3-2build2) ...
Setting up lz4 (1.9.3-2build2) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
# s5cmd and pydicom are pinned to the versions reported by the last recorded run of this cell.
%pip install -q pyyaml s5cmd==0.2.0 pydicom==3.0.1

Collecting s5cmd
  Downloading s5cmd-0.2.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.7 kB)
Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Downloading s5cmd-0.2.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: s5cmd, pydicom
Successfully installed pydicom-3.0.1 s5cmd-0.2.0


# Python imports

In [None]:
import os
import glob
import yaml
import math
import pandas as pd
import json

# Global variables

## Terra data table default column scheme

In [None]:
# Column layout of one Terra data table row.
# The first entry is only a placeholder: build_data_table replaces it with
# the 'entity:<table id>_id' column required by Terra.
terra_data_columns = [
    "default_id",
    # Output archive files -- leave empty, Terra will fill these values
    "mhubCompressedOutputFile",
    "radsAiCompressedOutputFile",
    "finalCompressedOutputFile",
    # Resampling scheme
    "res_scheme_format",
    # Combination -- which DICOM values characterize the whole prostate gland?
    "dicomCodeValuesProstate_lst",
    "dicomCodeMeaningProstate_lst",
    "dicomCodingSchemeDesignatorProstate_lst",
    # Radiomics / AI -- which AI SEG segments get radiomics computed into the SR DICOM?
    "dicomSrAiCodeValues_lst",
    "dicomSrAiCodeMeaning_lst",
    "dicomSrAiCodingSchemeDesignator_lst",
    # MHub -- models to run inference with, and their associated config files
    "mhub_model_name_lst",
    "mhubai_custom_config_lst",
    # Custom SegmentAlgorithm name for each MHub model that is run;
    # define as ['','','',''] if not desired
    "mhubaiCustomSegmentAlgorithmName_lst",
    # IDC series UID parameters -- which (images, idc_seg_experts) from IDC to run
    "collection_id",
    "seriesInstanceUIDs",
    "adcSeriesInstanceUIDs",
]

## Terra parameters per collection basis

### ProstateX

In [None]:
# Id of the Terra data table that receives the ProstateX inference-only batches.
terra_table_id_prostatex_inference_only = "terra_mhub_prostatex_inference_only"

# Constant cell values shared by every row of that table, in the order of
# terra_data_columns from 'mhubCompressedOutputFile' through 'collection_id'.
# The series UID columns are not listed here: build_data_table appends them per batch.
terra_prostatex_zonal_only_values = [
    # mhubCompressedOutputFile
    json.dumps(""),
    # radsAiCompressedOutputFile
    json.dumps(""),
    # finalCompressedOutputFile
    json.dumps(""),
    # res_scheme_format -- stored as a plain string, not JSON-encoded
    "nii",
    # dicomCodeValuesProstate_lst
    json.dumps(["41216001", "T-9200B"]),
    # dicomCodeMeaningProstate_lst
    json.dumps(["Prostatic_structure", "Prostate"]),
    # dicomCodingSchemeDesignatorProstate_lst
    json.dumps(["SCT", "SRT"]),
    # dicomSrAiCodeValues_lst
    json.dumps(["41216001", "279706003", "399384005"]),
    # dicomSrAiCodeMeaning_lst
    json.dumps([
        "Prostatic_structure",
        "Structure_of_peripheral_glandular_zone_of_prostate_(body_structure)",
        "Structure_of_transition_zone_of_prostate_(body_structure)",
    ]),
    # dicomSrAiCodingSchemeDesignator_lst
    json.dumps(["SCT", "SCT", "SCT"]),
    # mhub_model_name_lst
    json.dumps([
        "bamf_nnunet_mr_prostate",
        "nnunet_prostate_task24",
        "monai_prostate158",
        "nnunet_prostate_zonal_task05",
    ]),
    # mhubai_custom_config_lst
    json.dumps(["default", "default", "default", "default"]),
    # mhubaiCustomSegmentAlgorithmName_lst
    json.dumps([
        "bamf_nnunet_mr_prostate",
        "nnunet_prostate_task24",
        "monai_prostate158",
        "nnunet_prostate_zonal_task05",
    ]),
    # collection_id
    json.dumps("prostatex_inference_only"),
]

# Function to build terra data table

In [None]:
def build_data_table(idc_data_df,#idc dataframe, one row per analyzed T2 series with its adc / expert seg series UIDs
                     terra_col_scheme_lst, #terra data table column scheme
                     terra_table_id, #every terra data table needs an id, such as 'PLACEHOLDER_id'
                     terra_col_values_lst, #values associated to the column scheme
                     idcSegColName,#idc expert seg df column name, or None when there is no expert seg
                     t2ImageColName,#t2 image df column name
                     adcImageColName,#adc image df column name
                     idcSegColNameAdd=None,#idc expert seg second set df column name
                     series_per_batch=12#number of idc_data_df rows grouped into one terra row
                     ):
  """Group the rows of ``idc_data_df`` into batches and build one Terra row per batch.

  Each output row is laid out as::

      [batch index] + terra_col_values_lst + [JSON-encoded series UID lists]

  where the trailing JSON cells depend on which expert-segmentation columns
  are supplied:

  * ``idcSegColNameAdd`` given: expert seg, second expert seg, T2, ADC
  * only ``idcSegColName`` given: expert seg, ``json.dumps("")``, T2, ADC
  * neither given: T2, ADC

  Args:
    idc_data_df: DataFrame holding the series UID columns named below.
    terra_col_scheme_lst: Column names of the Terra table. The first entry is
      a placeholder and is replaced by ``'entity:<terra_table_id>_id'``.
    terra_table_id: Identifier of the Terra data table.
    terra_col_values_lst: Constant values repeated in every row, placed right
      after the batch index.
    idcSegColName: Column of ``idc_data_df`` with the expert segmentation
      series UIDs, or ``None``.
    t2ImageColName: Column of ``idc_data_df`` with the T2 series UIDs.
    adcImageColName: Column of ``idc_data_df`` with the ADC series UIDs.
    idcSegColNameAdd: Column of ``idc_data_df`` with a second set of expert
      segmentation series UIDs, or ``None``.
    series_per_batch: Maximum number of ``idc_data_df`` rows per Terra row.

  Returns:
    A DataFrame with one row per batch and the columns
    ``['entity:<terra_table_id>_id'] + terra_col_scheme_lst[1:]``. It is empty
    when ``idc_data_df`` has no rows.

  Raises:
    ValueError: If ``series_per_batch`` is smaller than 1, or if the number of
      values in a row does not match the number of table columns.
    KeyError: If one of the requested column names is missing from
      ``idc_data_df``.
  """
  if series_per_batch < 1:
    raise ValueError("series_per_batch must be a positive integer")

  # Terra expects the id column to be named 'entity:<table id>_id'; the first
  # entry of the column scheme is only a placeholder for it.
  out_columns = [f'entity:{terra_table_id}_id'] + list(terra_col_scheme_lst[1:])

  def dump_column(batch_df, col_name):
    # .tolist() converts numpy scalars to plain Python values, so integer
    # columns can be JSON-encoded as well as string ones.
    return json.dumps(batch_df[col_name].tolist())

  rows = []
  batch_starts = range(0, len(idc_data_df), series_per_batch)
  for batch_idx, start in enumerate(batch_starts):
    batch_df = idc_data_df.iloc[start:start + series_per_batch]

    if idcSegColNameAdd is not None:
      # Two sets of expert segmentations.
      seg_cells = [dump_column(batch_df, idcSegColName),
                   dump_column(batch_df, idcSegColNameAdd)]
    elif idcSegColName is not None:
      # Only one set: the second expert seg cell stays an empty JSON string.
      seg_cells = [dump_column(batch_df, idcSegColName), json.dumps("")]
    else:
      # No expert segmentation at all: no seg cells are emitted.
      seg_cells = []

    rows.append([batch_idx]
                + list(terra_col_values_lst)
                + seg_cells
                + [dump_column(batch_df, t2ImageColName),
                   dump_column(batch_df, adcImageColName)])

  if rows and len(rows[0]) != len(out_columns):
    raise ValueError(
        f"each row holds {len(rows[0])} values but the table has "
        f"{len(out_columns)} columns; check terra_col_scheme_lst against "
        "terra_col_values_lst and the expert seg column names")

  # Build the frame once instead of growing it row by row.
  return pd.DataFrame(rows, columns=out_columns)

# Query IDC data

Now that the model environment is set up, it's time to download some data.

For our study, we want to pair each T2 image with one ADC image from the same study.

Let's query all T2W images from ProstateX from the same studies we did evaluation on. This subset of images will be used to study volume repeatability across one study.

In [None]:
%%bigquery cohort_prostatex_inference_df --project idc-sandbox-003
-- Pair every ProstateX T2 series (description containing 't2_tse_tra') with
-- one ADC series of the same study. The three expert_* columns are returned
-- as empty strings.
WITH
  -- One row per matching T2 series.
  t2_series AS(
  SELECT
    DISTINCT dc_all.SeriesInstanceUID,
    dc_all.StudyInstanceUID,
    dc_all.PatientID
  FROM
    `bigquery-public-data.idc_current.dicom_all` AS dc_all
  WHERE
    dc_all.collection_id = 'prostatex'
    AND dc_all.Modality = 'MR'
    AND LOWER(dc_all.SeriesDescription) LIKE '%t2_tse_tra%'),
  -- One ADC series per study that has at least one matching T2 series.
  -- NOTE(review): the ordering is SeriesDate ascending but SeriesTime
  -- descending -- confirm this mix is intended.
  -- SeriesInstanceUID is only a tie-breaker so the pick is deterministic.
  adc_series AS(
  SELECT
    dc_adc.StudyInstanceUID,
    ARRAY_AGG(dc_adc.SeriesInstanceUID
    ORDER BY
      dc_adc.SeriesDate, dc_adc.SeriesTime DESC, dc_adc.SeriesInstanceUID
    LIMIT
      1)[SAFE_OFFSET(0)] AS SeriesInstanceUID,
  FROM
    `bigquery-public-data.idc_current.dicom_all` AS dc_adc
  JOIN
    t2_series
  ON
    dc_adc.StudyInstanceUID = t2_series.StudyInstanceUID
  WHERE
    LOWER(dc_adc.SeriesDescription) LIKE '%adc%'
    AND collection_id = "prostatex"
  GROUP BY
    dc_adc.StudyInstanceUID)
-- t2_series already holds exactly the T2 series to report, so it is joined
-- to adc_series directly instead of scanning dicom_all a third time.
SELECT
  DISTINCT t2_series.SeriesInstanceUID AS t2_serieUID,
  adc_series.SeriesInstanceUID AS adc_serieUID,
  t2_series.StudyInstanceUID,
  "" AS expert_zonal_prostate_serieUID,
  "" AS expert_zonal_prostate_image_serieUID,
  "" AS expert_zonal_prostate_studyUID
FROM
  t2_series
INNER JOIN
  adc_series
ON
  t2_series.StudyInstanceUID = adc_series.StudyInstanceUID
-- Fixed row order so that downstream batching is reproducible between runs.
ORDER BY
  StudyInstanceUID,
  t2_serieUID

Query is running:   0%|          |

Downloading:   0%|          |

# Generate .tsv terra data tables per collection basis

## ProstateX

## QIN-Prostate-Repeatability

In [None]:
# Batch the ProstateX inference cohort into Terra rows.
# No expert segmentation columns are used, so both seg column names are None.
terra_prostatex_inference_df = build_data_table(
    idc_data_df=cohort_prostatex_inference_df,
    terra_table_id=terra_table_id_prostatex_inference_only,
    terra_col_scheme_lst=terra_data_columns,
    terra_col_values_lst=terra_prostatex_zonal_only_values,
    idcSegColName=None,
    idcSegColNameAdd=None,
    t2ImageColName="t2_serieUID",
    adcImageColName="adc_serieUID",
)

In [None]:
# Write the batch table as a tab-separated file, without the pandas index column.
terra_prostatex_inference_df.to_csv(
    "terra_prostatex_inference_only.tsv",
    sep="\t",
    index=False,
)