# Env setup and google auth

In [None]:
# Authenticate the Colab user with Google.
# NOTE(review): presumably this provides the credentials used by the
# %%bigquery cells further down -- confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
!apt-get install lz4

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
lz4 is already the newest version (1.9.3-2build2).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
!pip install pyyaml s5cmd pydicom



# Python imports

In [None]:
import os
import glob
import yaml
import math
import pandas as pd
import json

# Global variables

## Terra data table default column scheme

In [None]:
# Column scheme shared by every Terra data table built in this notebook.
# The first entry is a placeholder for the table id column; the remaining
# names follow the same order as the per-collection value lists.
terra_data_columns = (
    ["default_id"]
    # OUTPUT ARCHIVE FILES -- leave empty, terra will fill these values
    + [
        "mhubCompressedOutputFile",
        "evalCompressedOutputFile",
        "radsAiCompressedOutputFile",
        "idcExpertCompressedOutputFile",
        "radsIdcExpertCompressedOutputFile",
        "finalCompressedOutputFile",
    ]
    # Resampling scheme
    + ["res_scheme_format"]
    # AI EVAL -- which segments from AI SEGS do we want to evaluate?
    + [
        "dicomAiCodeValuesEval_lst",
        "dicomAiCodeMeaningEval_lst",
        "dicomAiCodingSchemeDesignatorEval_lst",
    ]
    # IDC EVAL -- which expert annotation SEG segments?
    + [
        "dicomIdcCodeValuesEval_lst",
        "dicomIdcCodeMeaningEval_lst",
        "dicomIdcCodingSchemeDesignatorEval_lst",
    ]
    # IDC EVAL -- second set of annotation SEGs
    + [
        "dicomIdcAddCodeValuesEval_lst",
        "dicomIdcAddCodeMeaningEval_lst",
        "dicomIdcAddCodingSchemeDesignatorEval_lst",
    ]
    # Combination -- which DICOM values characterize the whole prostate gland?
    + [
        "dicomCodeValuesProstate_lst",
        "dicomCodeMeaningProstate_lst",
        "dicomCodingSchemeDesignatorProstate_lst",
    ]
    # Radiomics, AI -- which AI SEG segments to compute radiomics into SR DICOM?
    + [
        "dicomSrAiCodeValues_lst",
        "dicomSrAiCodeMeaning_lst",
        "dicomSrAiCodingSchemeDesignator_lst",
    ]
    # Radiomics, IDC -- which IDC expert SEG segments to compute radiomics into SR DICOM?
    + [
        "dicomSrIdcCodeValues_lst",
        "dicomSrIdcCodeMeaning_lst",
        "dicomSrIdcCodingSchemeDesignator_lst",
    ]
    # MHUB -- models to run inference from, and associated config files.
    # mhubaiCustomSegmentAlgorithmName_lst holds one custom SegmentAlgorithm
    # name per mhub model; define as ['','','',''] if not desired.
    + [
        "mhub_model_name_lst",
        "mhubai_custom_config_lst",
        "mhubaiCustomSegmentAlgorithmName_lst",
    ]
    # IDC seriesUIDs parameters -- which (images, idc_seg_experts) from IDC to run.
    # NOTE(review): "idcAddSegSeriesInstancceUIDs" (double "c") looks like a typo,
    # but the name is written to the TSV as-is; confirm what the Terra workflow
    # expects before renaming it.
    + [
        "collection_id",
        "idcSegSeriesInstanceUIDs",
        "idcAddSegSeriesInstancceUIDs",
        "seriesInstanceUIDs",
        "adcSeriesInstanceUIDs",
    ]
)

## Terra parameters per collection basis

### Prostate-MRI-US-Biopsy

In [None]:
terra_table_id_prostate_mri_us_biopsy = "terra_mhub_prostate_mri_us_biopsy"

# Fixed column values for the Prostate-MRI-US-Biopsy table, listed in column
# order. Every value is JSON-encoded except res_scheme_format, which is kept
# as a bare string. The four per-batch series UID columns are not listed here;
# build_data_table appends them for each batch.
terra_prostate_mri_us_biopsy_values = [
    setting if column == "res_scheme_format" else json.dumps(setting)
    for column, setting in {
        # output archives -- left empty here
        "mhubCompressedOutputFile": "",
        "evalCompressedOutputFile": "",
        "radsAiCompressedOutputFile": "",
        "idcExpertCompressedOutputFile": "",
        "radsIdcExpertCompressedOutputFile": "",
        "finalCompressedOutputFile": "",
        "res_scheme_format": "nrrd",
        # AI segments to evaluate
        "dicomAiCodeValuesEval_lst": ["41216001", "279706003", "399384005"],
        "dicomAiCodeMeaningEval_lst": [
            "Prostatic_structure",
            "Structure_of_peripheral_glandular_zone_of_prostate_(body_structure)",
            "Structure_of_transition_zone_of_prostate_(body_structure)",
        ],
        "dicomAiCodingSchemeDesignatorEval_lst": ["SCT", "SCT", "SCT"],
        # expert segments to evaluate against
        "dicomIdcCodeValuesEval_lst": ["41216001", "None", "None"],
        "dicomIdcCodeMeaningEval_lst": ["Prostate", "None", "None"],
        "dicomIdcCodingSchemeDesignatorEval_lst": ["SCT", "None", "None"],
        # no second set of expert segmentations
        "dicomIdcAddCodeValuesEval_lst": "",
        "dicomIdcAddCodeMeaningEval_lst": "",
        "dicomIdcAddCodingSchemeDesignatorEval_lst": "",
        # codes that denote the whole prostate gland
        "dicomCodeValuesProstate_lst": ["41216001", "T-9200B"],
        "dicomCodeMeaningProstate_lst": ["Prostatic_structure", "Prostate"],
        "dicomCodingSchemeDesignatorProstate_lst": ["SCT", "SRT"],
        # radiomics on AI segments
        "dicomSrAiCodeValues_lst": ["41216001", "279706003", "399384005"],
        "dicomSrAiCodeMeaning_lst": [
            "Prostatic_structure",
            "Structure_of_peripheral_glandular_zone_of_prostate_(body_structure)",
            "Structure_of_transition_zone_of_prostate_(body_structure)",
        ],
        "dicomSrAiCodingSchemeDesignator_lst": ["SCT", "SCT", "SCT"],
        # radiomics on expert segments
        "dicomSrIdcCodeValues_lst": ["41216001"],
        "dicomSrIdcCodeMeaning_lst": ["Prostate"],
        "dicomSrIdcCodingSchemeDesignator_lst": ["SCT"],
        # mhub models, their configs and custom SegmentAlgorithm names
        "mhub_model_name_lst": [
            "bamf_nnunet_mr_prostate", "nnunet_prostate_task24",
            "monai_prostate158", "nnunet_prostate_zonal_task05",
        ],
        "mhubai_custom_config_lst": ["default", "default", "default", "default"],
        "mhubaiCustomSegmentAlgorithmName_lst": [
            "bamf_nnunet_mr_prostate", "nnunet_prostate_task24",
            "monai_prostate158", "nnunet_prostate_zonal_task05",
        ],
        "collection_id": "prostate_mri_us_biopsy",
    }.items()
]

### ProstateX

Parameters for studies that have both zonal and whole-prostate DICOM SEGs

In [None]:
terra_table_id_prostatex_zonal_whole = "terra_mhub_prostatex_zonal_whole_test"

# Fixed column values for the ProstateX table with two expert SEG sets, listed
# in column order. Every value is JSON-encoded except res_scheme_format, which
# is kept as a bare string. The four per-batch series UID columns are not
# listed here; build_data_table appends them for each batch.
terra_prostatex_zonal_whole_values = [
    setting if column == "res_scheme_format" else json.dumps(setting)
    for column, setting in {
        # output archives -- left empty here
        "mhubCompressedOutputFile": "",
        "evalCompressedOutputFile": "",
        "radsAiCompressedOutputFile": "",
        "idcExpertCompressedOutputFile": "",
        "radsIdcExpertCompressedOutputFile": "",
        "finalCompressedOutputFile": "",
        "res_scheme_format": "nii",
        # AI segments to evaluate
        "dicomAiCodeValuesEval_lst": ["41216001", "279706003", "399384005"],
        "dicomAiCodeMeaningEval_lst": [
            "Prostatic_structure",
            "Structure_of_peripheral_glandular_zone_of_prostate_(body_structure)",
            "Structure_of_transition_zone_of_prostate_(body_structure)",
        ],
        "dicomAiCodingSchemeDesignatorEval_lst": ["SCT", "SCT", "SCT"],
        # first expert set
        "dicomIdcCodeValuesEval_lst": ["41216001", "None", "None"],
        "dicomIdcCodeMeaningEval_lst": ["Prostate", "None", "None"],
        "dicomIdcCodingSchemeDesignatorEval_lst": ["SCT", "None", "None"],
        # second expert set
        "dicomIdcAddCodeValuesEval_lst": ["None", "279706003", "399384005"],
        "dicomIdcAddCodeMeaningEval_lst": [
            "None", "Peripheral_zone_of_prostate", "Transition_zone_of_prostate",
        ],
        "dicomIdcAddCodingSchemeDesignatorEval_lst": ["None", "SCT", "SCT"],
        # codes that denote the whole prostate gland
        "dicomCodeValuesProstate_lst": ["41216001", "T-9200B"],
        "dicomCodeMeaningProstate_lst": ["Prostatic_structure", "Prostate"],
        "dicomCodingSchemeDesignatorProstate_lst": ["SCT", "SRT"],
        # radiomics on AI segments
        "dicomSrAiCodeValues_lst": ["41216001", "279706003", "399384005"],
        "dicomSrAiCodeMeaning_lst": [
            "Prostatic_structure",
            "Structure_of_peripheral_glandular_zone_of_prostate_(body_structure)",
            "Structure_of_transition_zone_of_prostate_(body_structure)",
        ],
        "dicomSrAiCodingSchemeDesignator_lst": ["SCT", "SCT", "SCT"],
        # radiomics on expert segments
        "dicomSrIdcCodeValues_lst": ["41216001", "279706003", "399384005"],
        "dicomSrIdcCodeMeaning_lst": [
            "Prostate", "Peripheral_zone_of_prostate", "Transition_zone_of_prostate",
        ],
        "dicomSrIdcCodingSchemeDesignator_lst": ["SCT", "SCT", "SCT"],
        # mhub models, their configs and custom SegmentAlgorithm names
        "mhub_model_name_lst": [
            "bamf_nnunet_mr_prostate", "nnunet_prostate_task24",
            "monai_prostate158", "nnunet_prostate_zonal_task05",
        ],
        "mhubai_custom_config_lst": ["default", "default", "default", "default"],
        "mhubaiCustomSegmentAlgorithmName_lst": [
            "bamf_nnunet_mr_prostate", "nnunet_prostate_task24",
            "monai_prostate158", "nnunet_prostate_zonal_task05",
        ],
        "collection_id": "prostatex_2_expert_seg_sets",
    }.items()
]

In [None]:
terra_table_id_prostatex_zonal_only = "terra_mhub_prostatex_zonal_only_test"

# Fixed column values for the ProstateX table with a single expert SEG set,
# listed in column order. Every value is JSON-encoded except res_scheme_format,
# which is kept as a bare string. The four per-batch series UID columns are not
# listed here; build_data_table appends them for each batch.
terra_prostatex_zonal_only_values = [
    setting if column == "res_scheme_format" else json.dumps(setting)
    for column, setting in {
        # output archives -- left empty here
        "mhubCompressedOutputFile": "",
        "evalCompressedOutputFile": "",
        "radsAiCompressedOutputFile": "",
        "idcExpertCompressedOutputFile": "",
        "radsIdcExpertCompressedOutputFile": "",
        "finalCompressedOutputFile": "",
        "res_scheme_format": "nii",
        # AI segments to evaluate
        "dicomAiCodeValuesEval_lst": ["41216001", "279706003", "399384005"],
        "dicomAiCodeMeaningEval_lst": [
            "Prostatic_structure",
            "Structure_of_peripheral_glandular_zone_of_prostate_(body_structure)",
            "Structure_of_transition_zone_of_prostate_(body_structure)",
        ],
        "dicomAiCodingSchemeDesignatorEval_lst": ["SCT", "SCT", "SCT"],
        # expert segments to evaluate against
        "dicomIdcCodeValuesEval_lst": ["41216001", "279706003", "399384005"],
        "dicomIdcCodeMeaningEval_lst": [
            "Prostate", "Peripheral_zone_of_prostate", "Transition_zone_of_prostate",
        ],
        "dicomIdcCodingSchemeDesignatorEval_lst": ["SCT", "SCT", "SCT"],
        # no second set of expert segmentations
        "dicomIdcAddCodeValuesEval_lst": "",
        "dicomIdcAddCodeMeaningEval_lst": "",
        "dicomIdcAddCodingSchemeDesignatorEval_lst": "",
        # codes that denote the whole prostate gland
        "dicomCodeValuesProstate_lst": ["41216001", "T-9200B"],
        "dicomCodeMeaningProstate_lst": ["Prostatic_structure", "Prostate"],
        "dicomCodingSchemeDesignatorProstate_lst": ["SCT", "SRT"],
        # radiomics on AI segments
        "dicomSrAiCodeValues_lst": ["41216001", "279706003", "399384005"],
        "dicomSrAiCodeMeaning_lst": [
            "Prostatic_structure",
            "Structure_of_peripheral_glandular_zone_of_prostate_(body_structure)",
            "Structure_of_transition_zone_of_prostate_(body_structure)",
        ],
        "dicomSrAiCodingSchemeDesignator_lst": ["SCT", "SCT", "SCT"],
        # radiomics on expert segments
        "dicomSrIdcCodeValues_lst": ["41216001", "279706003", "399384005"],
        "dicomSrIdcCodeMeaning_lst": [
            "Prostate", "Peripheral_zone_of_prostate", "Transition_zone_of_prostate",
        ],
        "dicomSrIdcCodingSchemeDesignator_lst": ["SCT", "SCT", "SCT"],
        # mhub models, their configs and custom SegmentAlgorithm names
        "mhub_model_name_lst": [
            "bamf_nnunet_mr_prostate", "nnunet_prostate_task24",
            "monai_prostate158", "nnunet_prostate_zonal_task05",
        ],
        "mhubai_custom_config_lst": ["default", "default", "default", "default"],
        "mhubaiCustomSegmentAlgorithmName_lst": [
            "bamf_nnunet_mr_prostate", "nnunet_prostate_task24",
            "monai_prostate158", "nnunet_prostate_zonal_task05",
        ],
        # NOTE(review): this label says "only_whole_prostate" while the table id
        # above says "zonal_only" -- confirm the intended collection_id.
        "collection_id": "prostatex_only_whole_prostate",
    }.items()
]

### QIN-Prostate-Repeatability

In [None]:
terra_table_id_qin_prostate_repeatability = "terra_mhub_qin_prostate_repeatability_test"

# Fixed column values for the QIN-Prostate-Repeatability table, listed in
# column order. Every value is JSON-encoded except res_scheme_format, which is
# kept as a bare string. The four per-batch series UID columns are not listed
# here; build_data_table appends them for each batch.
terra_qin_prostate_repeatability_values = [
    setting if column == "res_scheme_format" else json.dumps(setting)
    for column, setting in {
        # output archives -- left empty here
        "mhubCompressedOutputFile": "",
        "evalCompressedOutputFile": "",
        "radsAiCompressedOutputFile": "",
        "idcExpertCompressedOutputFile": "",
        "radsIdcExpertCompressedOutputFile": "",
        "finalCompressedOutputFile": "",
        "res_scheme_format": "nii",
        # AI segments to evaluate
        "dicomAiCodeValuesEval_lst": ["41216001", "279706003", "399384005"],
        "dicomAiCodeMeaningEval_lst": [
            "Prostatic_structure",
            "Structure_of_peripheral_glandular_zone_of_prostate_(body_structure)",
            "Structure_of_transition_zone_of_prostate_(body_structure)",
        ],
        "dicomAiCodingSchemeDesignatorEval_lst": ["SCT", "SCT", "SCT"],
        # expert segments to evaluate against (SRT-coded in this collection)
        "dicomIdcCodeValuesEval_lst": ["T-9200B", "T-D05E4", "None"],
        "dicomIdcCodeMeaningEval_lst": ["Prostate", "Peripheral_zone_of_the_prostate", "None"],
        "dicomIdcCodingSchemeDesignatorEval_lst": ["SRT", "SRT", "None"],
        # no second set of expert segmentations
        "dicomIdcAddCodeValuesEval_lst": "",
        "dicomIdcAddCodeMeaningEval_lst": "",
        "dicomIdcAddCodingSchemeDesignatorEval_lst": "",
        # codes that denote the whole prostate gland
        "dicomCodeValuesProstate_lst": ["41216001", "T-9200B"],
        "dicomCodeMeaningProstate_lst": ["Prostatic_structure", "Prostate"],
        "dicomCodingSchemeDesignatorProstate_lst": ["SCT", "SRT"],
        # radiomics on AI segments
        "dicomSrAiCodeValues_lst": ["41216001", "279706003", "399384005"],
        "dicomSrAiCodeMeaning_lst": [
            "Prostatic_structure",
            "Structure_of_peripheral_glandular_zone_of_prostate_(body_structure)",
            "Structure_of_transition_zone_of_prostate_(body_structure)",
        ],
        "dicomSrAiCodingSchemeDesignator_lst": ["SCT", "SCT", "SCT"],
        # radiomics on expert segments
        "dicomSrIdcCodeValues_lst": ["T-9200B", "T-D05E4"],
        "dicomSrIdcCodeMeaning_lst": ["Prostate", "Peripheral_zone_of_the_prostate"],
        "dicomSrIdcCodingSchemeDesignator_lst": ["SRT", "SRT"],
        # mhub models, their configs and custom SegmentAlgorithm names
        "mhub_model_name_lst": [
            "bamf_nnunet_mr_prostate", "nnunet_prostate_task24",
            "monai_prostate158", "nnunet_prostate_zonal_task05",
        ],
        "mhubai_custom_config_lst": ["default", "default", "default", "default"],
        "mhubaiCustomSegmentAlgorithmName_lst": [
            "bamf_nnunet_mr_prostate", "nnunet_prostate_task24",
            "monai_prostate158", "nnunet_prostate_zonal_task05",
        ],
        "collection_id": "qin_prostate_repeatability",
    }.items()
]

# Function to build terra data table

In [None]:
def build_data_table(idc_data_df,#idc dataframe containing T2 analyzed images serieUIDs and associated adc and expert seg seriesInstanceUIDs
                     terra_col_scheme_lst, #terra data table column scheme
                     terra_table_id, #every terra data table needs an id, such as 'PLACEHOLDER_id'
                     terra_col_values_lst, #values associated to the column scheme
                     idcSegColName,#idc expert seg df column name
                     t2ImageColName,#t2 image df column name
                     adcImageColName,#adc image df column name
                     idcSegColNameAdd=None,#idc expert seg second set df column name
                     series_per_batch=12#number of idc_data_df rows grouped into one terra table row
                     ):
  """Group the rows of ``idc_data_df`` into batches, one Terra table row per batch.

  Each output row is laid out as::

      [batch index] + terra_col_values_lst
      + [expert SEG UIDs, second-set expert SEG UIDs, T2 UIDs, ADC UIDs]

  where the four trailing entries are JSON-encoded lists taken from the
  batch's rows. When ``idcSegColNameAdd`` is None the second-set entry is the
  JSON-encoded empty string.

  Args:
    idc_data_df: dataframe holding the series UID columns named below.
    terra_col_scheme_lst: column names; the first entry is replaced by
      ``entity:{terra_table_id}_id``.
    terra_table_id: table id used to build the id column name.
    terra_col_values_lst: fixed values repeated on every row.
    idcSegColName: ``idc_data_df`` column with expert SEG series UIDs.
    t2ImageColName: ``idc_data_df`` column with T2 image series UIDs.
    adcImageColName: ``idc_data_df`` column with ADC image series UIDs.
    idcSegColNameAdd: optional ``idc_data_df`` column with a second set of
      expert SEG series UIDs.
    series_per_batch: maximum number of ``idc_data_df`` rows per output row.

  Returns:
    A dataframe with one row per batch (empty if ``idc_data_df`` is empty).

  Raises:
    ValueError: if ``series_per_batch`` is smaller than 1, or if a row does
      not have exactly as many values as there are columns.
  """
  if series_per_batch < 1:
    raise ValueError(f"series_per_batch must be >= 1, got {series_per_batch}")

  # The first entry of the column scheme is a placeholder for the id column
  batch_id_column = f'entity:{terra_table_id}_id'
  out_columns = [batch_id_column] + list(terra_col_scheme_lst[1:])

  rows = []
  for batch_idx, start in enumerate(range(0, len(idc_data_df), series_per_batch)):
    # positional slice: the batch is the next `series_per_batch` rows
    batch_df = idc_data_df.iloc[start:start + series_per_batch]

    if idcSegColNameAdd is not None:#idc data df contains a col name with second set of expert SEG
      idc_seg_add_json = json.dumps(batch_df[idcSegColNameAdd].tolist())
    else:#no second set of EXPERT SEG
      idc_seg_add_json = json.dumps("")

    row = [batch_idx] + list(terra_col_values_lst) + [
        json.dumps(batch_df[idcSegColName].tolist()),
        idc_seg_add_json,
        json.dumps(batch_df[t2ImageColName].tolist()),
        json.dumps(batch_df[adcImageColName].tolist())]
    if len(row) != len(out_columns):
      raise ValueError(
          f"row has {len(row)} values but the column scheme has {len(out_columns)} columns")
    rows.append(row)

  # Build the frame once instead of growing it row by row
  return pd.DataFrame(rows, columns=out_columns)

# Query IDC data

Now that the model environment is set up, it's time to download some data.

For our study, we retrieve, for each T2 image within a study, one ADC image from the same study and the expert prostate segmentation(s) of that T2 image.

## Prostate-MRI-US-Biopsy collection

Let's query all T2W images from PROSTATE-MRI-US-BIOPSY that have an expert whole-prostate SEG DICOM object, together with an ADC series from the same study

In [None]:
%%bigquery cohort_prostate_mri_us_biopsy_df --project idc-sandbox-003
WITH
  # expert whole-prostate segmentations (manual or semi-automatic)
  idc_seg_whole_prostate AS (
  SELECT
    segmentations.StudyInstanceUID,
    segmentations.SeriesInstanceUID,
    segmentations.segmented_SeriesInstanceUID,
    # for debugging:
    dicom_all.SeriesDescription
  FROM
    `bigquery-public-data.idc_current.segmentations` AS segmentations
  JOIN
    `bigquery-public-data.idc_current.dicom_all` AS dicom_all
  ON
    segmentations.SeriesInstanceUID = dicom_all.SeriesInstanceUID
  WHERE
    dicom_all.analysis_result_id = "Prostate-MRI-US-Biopsy-DICOM-Annotations"
    # the next line is equivalent, but less clear
    #lower(dicom_all.source_doi) = "10.5281/zenodo.10069910"
    AND segmentations.SegmentedPropertyType.CodeValue = '41216001'
    AND segmentations.SegmentedPropertyType.CodingSchemeDesignator = 'SCT'
    AND segmentations.SegmentAlgorithmType in UNNEST(['MANUAL', 'SEMIAUTOMATIC'])),
  # select T2, here we do not check for multiple T2 within the same study, since we know there is only one per study
  t2_series AS(
  SELECT
    DISTINCT dc_all.SeriesInstanceUID,
    dc_all.StudyInstanceUID,
    dc_all.PatientID
  FROM
    `bigquery-public-data.idc_current.dicom_all` AS dc_all
  WHERE
    dc_all.collection_id = 'prostate_mri_us_biopsy'
    AND dc_all.Modality = 'MR'
    AND LOWER(dc_all.SeriesDescription) LIKE '%t2%'
  ORDER BY
    PatientID),
  # ADC selection, but since we know there are studies with more than one, we select the latest.
  # Both sort keys are DESC so that the most recent date, then the most recent time, comes first.
  adc_series AS(
  SELECT
    dc_adc.StudyInstanceUID,
    ARRAY_AGG(dc_adc.SeriesInstanceUID
    ORDER BY
      dc_adc.SeriesDate DESC, dc_adc.SeriesTime DESC
    LIMIT
      1)[SAFE_OFFSET(0)] AS SeriesInstanceUID,
  FROM
    `bigquery-public-data.idc_current.dicom_all` AS dc_adc
  JOIN
    t2_series
  ON
    dc_adc.StudyInstanceUID = t2_series.StudyInstanceUID
  WHERE
    LOWER(dc_adc.SeriesDescription) LIKE '%adc%'
  GROUP BY
    dc_adc.StudyInstanceUID)
# one row per T2 series that has both an ADC series in its study and an expert SEG
SELECT
  t2_series.SeriesInstanceUID AS t2_serieUID,
  adc_series.SeriesInstanceUID AS adc_serieUID,
  idc_seg_whole_prostate.SeriesInstanceUID AS expertWholeProstateSeriesInstanceUID,
  idc_seg_whole_prostate.SeriesDescription AS expertWholeProstateSeriesDescription,
  t2_series.StudyInstanceUID
FROM
  t2_series
INNER JOIN
  adc_series
ON
  t2_series.StudyInstanceUID = adc_series.StudyInstanceUID
INNER JOIN
  idc_seg_whole_prostate
ON
  t2_series.SeriesInstanceUID = idc_seg_whole_prostate.segmented_SeriesInstanceUID

Query is running:   0%|          |

Downloading:   0%|          |

## ProstateX collection

Let's query all T2W images from ProstateX that have both whole prostate and zonal expert SEG DCM objects -- n=66

In [None]:
%%bigquery cohort_prostatex_zonal_whole_df --project idc-sandbox-003
WITH
  # manual expert SEG series containing a peripheral-zone segment (SCT 279706003)
  idc_seg_zonal_prostate AS(
  SELECT
    DISTINCT segmentations.SeriesInstanceUID,
    segmentations.segmented_SeriesInstanceUID AS image_serieUID,
    segmentations.StudyInstanceUID
  FROM
    `bigquery-public-data.idc_current.segmentations` AS segmentations
  JOIN
    `bigquery-public-data.idc_current.dicom_all` AS dicom_all
  ON
    segmentations.SeriesInstanceUID = dicom_all.SeriesInstanceUID
  WHERE
    segmentations.SegmentedPropertyType.CodeValue = '279706003'
    AND dicom_all.collection_id = "prostatex"
    AND segmentations.SegmentedPropertyType.CodingSchemeDesignator = 'SCT'
    AND segmentations.SegmentAlgorithmType in UNNEST(['MANUAL'])
    AND SeriesDescription NOT LIKE '%AIMI%'
    AND Modality = 'SEG'),
  # manual expert SEG series containing a whole-prostate segment (SCT 41216001)
  idc_seg_whole_prostate AS(
  SELECT
    DISTINCT segmentations.SeriesInstanceUID,
    segmentations.segmented_SeriesInstanceUID AS image_serieUID,
    segmentations.StudyInstanceUID
  FROM
    `bigquery-public-data.idc_current.segmentations` AS segmentations
  JOIN
    `bigquery-public-data.idc_current.dicom_all` AS dicom_all
  ON
    segmentations.SeriesInstanceUID = dicom_all.SeriesInstanceUID
  WHERE
    segmentations.SegmentedPropertyType.CodeValue = '41216001'
    AND dicom_all.collection_id = "prostatex"
    AND segmentations.SegmentedPropertyType.CodingSchemeDesignator = 'SCT'
    AND segmentations.SegmentAlgorithmType in UNNEST(['MANUAL'])
    AND SeriesDescription NOT LIKE '%AIMI%'
    AND Modality = 'SEG'),
  t2_series AS(
  SELECT
    DISTINCT dc_all.SeriesInstanceUID,
    dc_all.StudyInstanceUID,
    dc_all.PatientID
  FROM
    `bigquery-public-data.idc_current.dicom_all` AS dc_all
  WHERE
    dc_all.collection_id = 'prostatex'
    AND dc_all.Modality = 'MR'
    AND LOWER(dc_all.SeriesDescription) LIKE '%t2_tse_tra%'
  ORDER BY
    PatientID),
  # one ADC series per study: when a study has several, keep the most recent one
  # (both sort keys DESC: latest date first, then latest time)
  adc_series AS(
  SELECT
    dc_adc.StudyInstanceUID,
    ARRAY_AGG(dc_adc.SeriesInstanceUID
    ORDER BY
      dc_adc.SeriesDate DESC, dc_adc.SeriesTime DESC
    LIMIT
      1)[SAFE_OFFSET(0)] AS SeriesInstanceUID,
  FROM
    `bigquery-public-data.idc_current.dicom_all` AS dc_adc
  JOIN
    t2_series
  ON
    dc_adc.StudyInstanceUID = t2_series.StudyInstanceUID
  WHERE
    LOWER(dc_adc.SeriesDescription) LIKE '%adc%'
    AND collection_id = "prostatex"
  GROUP BY
    dc_adc.StudyInstanceUID),
  concat_t2_adc_series AS (
  SELECT
    t2_series.SeriesInstanceUID AS t2_serieUID,
    adc_series.SeriesInstanceUID AS adc_serieUID,
    t2_series.StudyInstanceUID
  FROM
    t2_series
  INNER JOIN
    adc_series
  ON
    t2_series.StudyInstanceUID = adc_series.StudyInstanceUID)
# keep only T2 series that have BOTH a zonal and a whole-prostate expert SEG
SELECT
  concat_t2_adc_series.t2_serieUID,
  concat_t2_adc_series.adc_serieUID,
  concat_t2_adc_series.StudyInstanceUID,
  idc_zonal_prostate_seg.SeriesInstanceUID as expert_zonal_prostate_serieUID,
  idc_whole_prostate_seg.SeriesInstanceUID as expert_whole_prostate_serieUID,
  idc_zonal_prostate_seg.image_serieUID as expert_zonal_prostate_image_serieUID,
  idc_whole_prostate_seg.image_serieUID as expert_whole_prostate_image_serieUID,
  idc_zonal_prostate_seg.StudyInstanceUID as expert_zonal_prostate_studyUID,
  idc_whole_prostate_seg.StudyInstanceUID as expert_whole_prostate_studyUID
FROM
  concat_t2_adc_series
INNER JOIN idc_seg_zonal_prostate idc_zonal_prostate_seg ON concat_t2_adc_series.t2_serieUID = idc_zonal_prostate_seg.image_serieUID
INNER JOIN idc_seg_whole_prostate idc_whole_prostate_seg ON concat_t2_adc_series.t2_serieUID = idc_whole_prostate_seg.image_serieUID

Query is running:   0%|          |

Downloading:   0%|          |

Let's query all T2W images from ProstateX that have ONLY zonal SEG DCM objects -- n=32

In [None]:
%%bigquery cohort_prostatex_zonal_only_df --project idc-sandbox-003
WITH
  # manual expert SEG series containing a peripheral-zone segment (SCT 279706003)
  idc_seg_zonal_prostate AS(
  SELECT
    DISTINCT segmentations.SeriesInstanceUID,
    segmentations.segmented_SeriesInstanceUID AS image_serieUID,
    segmentations.StudyInstanceUID
  FROM
    `bigquery-public-data.idc_current.segmentations` AS segmentations
  JOIN
    `bigquery-public-data.idc_current.dicom_all` AS dicom_all
  ON
    segmentations.SeriesInstanceUID = dicom_all.SeriesInstanceUID
  WHERE
    segmentations.SegmentedPropertyType.CodeValue = '279706003'
    AND dicom_all.collection_id = "prostatex"
    AND segmentations.SegmentedPropertyType.CodingSchemeDesignator = 'SCT'
    AND segmentations.SegmentAlgorithmType in UNNEST(['MANUAL'])
    AND SeriesDescription NOT LIKE '%AIMI%'
    AND Modality = 'SEG'),
  # manual expert SEG series containing a whole-prostate segment (SCT 41216001)
  idc_seg_whole_prostate AS(
  SELECT
    DISTINCT segmentations.SeriesInstanceUID,
    segmentations.segmented_SeriesInstanceUID AS image_serieUID,
    segmentations.StudyInstanceUID
  FROM
    `bigquery-public-data.idc_current.segmentations` AS segmentations
  JOIN
    `bigquery-public-data.idc_current.dicom_all` AS dicom_all
  ON
    segmentations.SeriesInstanceUID = dicom_all.SeriesInstanceUID
  WHERE
    segmentations.SegmentedPropertyType.CodeValue = '41216001'
    AND dicom_all.collection_id = "prostatex"
    AND segmentations.SegmentedPropertyType.CodingSchemeDesignator = 'SCT'
    AND segmentations.SegmentAlgorithmType in UNNEST(['MANUAL'])
    AND SeriesDescription NOT LIKE '%AIMI%'
    AND Modality = 'SEG'),
  t2_series AS(
  SELECT
    DISTINCT dc_all.SeriesInstanceUID,
    dc_all.StudyInstanceUID,
    dc_all.PatientID
  FROM
    `bigquery-public-data.idc_current.dicom_all` AS dc_all
  WHERE
    dc_all.collection_id = 'prostatex'
    AND dc_all.Modality = 'MR'
    AND LOWER(dc_all.SeriesDescription) LIKE '%t2_tse_tra%'
  ORDER BY
    PatientID),
  # one ADC series per study: when a study has several, keep the most recent one
  # (both sort keys DESC: latest date first, then latest time)
  adc_series AS(
  SELECT
    dc_adc.StudyInstanceUID,
    ARRAY_AGG(dc_adc.SeriesInstanceUID
    ORDER BY
      dc_adc.SeriesDate DESC, dc_adc.SeriesTime DESC
    LIMIT
      1)[SAFE_OFFSET(0)] AS SeriesInstanceUID,
  FROM
    `bigquery-public-data.idc_current.dicom_all` AS dc_adc
  JOIN
    t2_series
  ON
    dc_adc.StudyInstanceUID = t2_series.StudyInstanceUID
  WHERE
    LOWER(dc_adc.SeriesDescription) LIKE '%adc%'
    AND collection_id = "prostatex"
  GROUP BY
    dc_adc.StudyInstanceUID),
  concat_t2_adc_series AS (
  SELECT
    t2_series.SeriesInstanceUID AS t2_serieUID,
    adc_series.SeriesInstanceUID AS adc_serieUID,
    t2_series.StudyInstanceUID
  FROM
    t2_series
  INNER JOIN
    adc_series
  ON
    t2_series.StudyInstanceUID = adc_series.StudyInstanceUID)
# anti-join: keep T2 series that have a zonal expert SEG but NO whole-prostate
# expert SEG (the LEFT JOIN leaves the whole-prostate columns NULL for those)
SELECT
  concat_t2_adc_series.t2_serieUID,
  concat_t2_adc_series.adc_serieUID,
  concat_t2_adc_series.StudyInstanceUID,
  idc_zonal_prostate_seg.SeriesInstanceUID as expert_zonal_prostate_serieUID,
  idc_whole_prostate_seg.SeriesInstanceUID as expert_whole_prostate_serieUID,
  idc_zonal_prostate_seg.image_serieUID as expert_zonal_prostate_image_serieUID,
  idc_whole_prostate_seg.image_serieUID as expert_whole_prostate_image_serieUID,
  idc_zonal_prostate_seg.StudyInstanceUID as expert_zonal_prostate_studyUID,
  idc_whole_prostate_seg.StudyInstanceUID as expert_whole_prostate_studyUID
FROM
  concat_t2_adc_series
INNER JOIN idc_seg_zonal_prostate idc_zonal_prostate_seg ON concat_t2_adc_series.t2_serieUID = idc_zonal_prostate_seg.image_serieUID
LEFT JOIN idc_seg_whole_prostate idc_whole_prostate_seg ON concat_t2_adc_series.t2_serieUID = idc_whole_prostate_seg.image_serieUID
WHERE idc_whole_prostate_seg.image_serieUID is NULL

Query is running:   0%|          |

Downloading:   0%|          |

## QIN-Prostate-Repeatability collection

In [None]:
%%bigquery cohort_qin_prostate_repeatability_df --project idc-sandbox-003
WITH
  # we know there is only one prostate segmentation per study
  idc_seg_whole_prostate AS (
  SELECT
    segmentations.StudyInstanceUID,
    segmentations.SeriesInstanceUID,
    segmentations.segmented_SeriesInstanceUID,
    # for debugging:
    dicom_all.SeriesDescription
  FROM
    `bigquery-public-data.idc_current.segmentations` AS segmentations
  JOIN
    `bigquery-public-data.idc_current.dicom_all` AS dicom_all
  ON
    segmentations.SeriesInstanceUID = dicom_all.SeriesInstanceUID
  WHERE
    dicom_all.SeriesDescription = "T2 Weighted Axial Segmentations"
    AND segmentations.SegmentedPropertyType.CodeValue = 'T-9200B'
    AND segmentations.SegmentAlgorithmType in UNNEST(['MANUAL'])
    AND segmentations.SegmentedPropertyType.CodingSchemeDesignator = 'SRT'),
  # select T2, here we do not check for multiple T2 within the same study, since we know there is only one per study
  t2_series AS(
  SELECT
    DISTINCT dc_all.SeriesInstanceUID,
    dc_all.StudyInstanceUID,
    dc_all.PatientID
  FROM
    `bigquery-public-data.idc_current.dicom_all` AS dc_all
  WHERE
    dc_all.collection_id = 'qin_prostate_repeatability'
    AND dc_all.Modality = 'MR'
    AND LOWER(dc_all.SeriesDescription) LIKE '%t2%'
  ORDER BY
    PatientID),
  # ADC selection: one series per study; when a study has several, keep the most
  # recent one (both sort keys DESC: latest date first, then latest time)
  adc_series AS(
  SELECT
    dc_adc.StudyInstanceUID,
    ARRAY_AGG(dc_adc.SeriesInstanceUID
    ORDER BY
      dc_adc.SeriesDate DESC, dc_adc.SeriesTime DESC
    LIMIT
      1)[SAFE_OFFSET(0)] AS SeriesInstanceUID,
  FROM
    `bigquery-public-data.idc_current.dicom_all` AS dc_adc
  JOIN
    t2_series
  ON
    dc_adc.StudyInstanceUID = t2_series.StudyInstanceUID
  WHERE
    LOWER(dc_adc.SeriesDescription) LIKE '%apparent%'
  GROUP BY
    dc_adc.StudyInstanceUID)
# one row per T2 series that has both an ADC series in its study and an expert SEG
SELECT
  t2_series.SeriesInstanceUID AS t2_serieUID,
  adc_series.SeriesInstanceUID AS adc_serieUID,
  idc_seg_whole_prostate.SeriesInstanceUID AS expertWholeProstateSeriesInstanceUID,
  idc_seg_whole_prostate.SeriesDescription AS expertWholeProstateSeriesDescription,
  t2_series.StudyInstanceUID
FROM
  t2_series
INNER JOIN
  adc_series
ON
  t2_series.StudyInstanceUID = adc_series.StudyInstanceUID
INNER JOIN
  idc_seg_whole_prostate
ON
  t2_series.SeriesInstanceUID = idc_seg_whole_prostate.segmented_SeriesInstanceUID

Query is running:   0%|          |

Downloading:   0%|          |

# Generate .tsv terra data tables per collection basis

## Prostate-MRI-US-Biopsy

In [None]:
# Batch the Prostate-MRI-US-Biopsy cohort into Terra table rows.
# This collection has a single set of expert SEGs, so no second SEG column is passed.
terra_prostate_mri_us_biopsy_df = build_data_table(
    idc_data_df=cohort_prostate_mri_us_biopsy_df,
    terra_col_scheme_lst=terra_data_columns,
    terra_table_id=terra_table_id_prostate_mri_us_biopsy,
    terra_col_values_lst=terra_prostate_mri_us_biopsy_values,
    idcSegColName="expertWholeProstateSeriesInstanceUID",
    t2ImageColName="t2_serieUID",
    adcImageColName="adc_serieUID",
    idcSegColNameAdd=None,
)

In [None]:
# Write the table as tab-separated values, without the dataframe index.
terra_prostate_mri_us_biopsy_df.to_csv("prostate_mri_us_biopsy.tsv", sep="\t", index=False)

## ProstateX

In [None]:
# Batch the ProstateX studies that have two expert SEG sets: whole-prostate
# SEGs are the primary set, zonal SEGs are passed as the second set.
terra_prostatex_zonal_whole_df = build_data_table(
    idc_data_df=cohort_prostatex_zonal_whole_df,
    terra_col_scheme_lst=terra_data_columns,
    terra_table_id=terra_table_id_prostatex_zonal_whole,
    terra_col_values_lst=terra_prostatex_zonal_whole_values,
    idcSegColName="expert_whole_prostate_serieUID",
    t2ImageColName="t2_serieUID",
    adcImageColName="adc_serieUID",
    idcSegColNameAdd="expert_zonal_prostate_serieUID",
)

In [None]:
# Batch the ProstateX studies that only have zonal expert SEGs
# (the zonal SEGs are the primary, and only, expert set).
terra_prostatex_zonal_only_df = build_data_table(
    idc_data_df=cohort_prostatex_zonal_only_df,
    terra_col_scheme_lst=terra_data_columns,
    terra_table_id=terra_table_id_prostatex_zonal_only,
    terra_col_values_lst=terra_prostatex_zonal_only_values,
    idcSegColName="expert_zonal_prostate_serieUID",
    t2ImageColName="t2_serieUID",
    adcImageColName="adc_serieUID",
    idcSegColNameAdd=None,
)

Combine both dfs together

In [None]:
# Stack the two ProstateX tables. Each table's own id column is dropped first
# (the two are named differently); ignore_index renumbers the combined rows.
terra_prostatex_all_df = pd.concat(
    [
        terra_prostatex_zonal_whole_df.drop(
            columns=[f"entity:{terra_table_id_prostatex_zonal_whole}_id"]),
        terra_prostatex_zonal_only_df.drop(
            columns=[f"entity:{terra_table_id_prostatex_zonal_only}_id"]),
    ],
    ignore_index=True,
)

In [None]:
# Give the combined ProstateX table a fresh sequential id and make it the first column.
# The id column is located by name rather than by position, so re-running this
# cell leaves the column order unchanged instead of moving a data column to the front.
prostatex_all_id_col = "entity:terra_mhub_prostatex_all_test_id"
terra_prostatex_all_df[prostatex_all_id_col] = list(range(len(terra_prostatex_all_df)))
df_cols = list(terra_prostatex_all_df.columns.values)
cols = [prostatex_all_id_col] + [col for col in df_cols if col != prostatex_all_id_col]
terra_prostatex_all_df = terra_prostatex_all_df[cols]

In [None]:
# Write the table as tab-separated values, without the dataframe index.
terra_prostatex_all_df.to_csv("prostatex_all.tsv", sep="\t", index=False)

## QIN-Prostate-Repeatability

In [None]:
# Batch the QIN-Prostate-Repeatability cohort into Terra table rows.
# This collection has a single set of expert SEGs, so no second SEG column is passed.
terra_qin_prostate_repeatability_df = build_data_table(
    idc_data_df=cohort_qin_prostate_repeatability_df,
    terra_col_scheme_lst=terra_data_columns,
    terra_table_id=terra_table_id_qin_prostate_repeatability,
    terra_col_values_lst=terra_qin_prostate_repeatability_values,
    idcSegColName="expertWholeProstateSeriesInstanceUID",
    t2ImageColName="t2_serieUID",
    adcImageColName="adc_serieUID",
    idcSegColNameAdd=None,
)

In [None]:
# Write the table as tab-separated values, without the dataframe index.
terra_qin_prostate_repeatability_df.to_csv("qin_prostate_repeatability.tsv", sep="\t", index=False)

# Combine all the terra data together

In [None]:
# Stack the per-collection tables. Each table's own id column is dropped first
# (they are named differently); ignore_index renumbers the combined rows.
terra_all_df = pd.concat(
    [
        terra_prostate_mri_us_biopsy_df.drop(
            columns=[f"entity:{terra_table_id_prostate_mri_us_biopsy}_id"]),
        terra_prostatex_all_df.drop(
            columns=["entity:terra_mhub_prostatex_all_test_id"]),
        terra_qin_prostate_repeatability_df.drop(
            columns=[f"entity:{terra_table_id_qin_prostate_repeatability}_id"]),
    ],
    ignore_index=True,
)


In [None]:
# Give the all-collections table a fresh sequential id and make it the first column.
# The id column is located by name rather than by position, so re-running this
# cell leaves the column order unchanged instead of moving a data column to the front.
terra_all_id_col = "entity:terra_mhub_all_collections_v3_id"
terra_all_df[terra_all_id_col] = list(range(len(terra_all_df)))
terra_all_df = terra_all_df.reset_index(drop=True)
df_cols = list(terra_all_df.columns.values)
cols = [terra_all_id_col] + [col for col in df_cols if col != terra_all_id_col]
terra_all_df = terra_all_df[cols]

In [None]:
# Write the table as tab-separated values, without the dataframe index.
terra_all_df.to_csv("mhub_terra_all.tsv", sep="\t", index=False)