Skip to content

Commit

Permalink
feat: Onboard Imaging Data Commons (IDC) v7 dataset (#287)
Browse files Browse the repository at this point in the history
* feat: Onboard Imaging Data Commons (IDC) v7 dataset

* Update datasets/idc/dataset.yaml

* updates black formatter due to a recurring bug

* move the images folder under pipelines

* move images folder

* use black@stable for Python linting

* use black@stable for Python linting

* adds missing queries

Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
  • Loading branch information
adlersantos and leahecole committed Feb 11, 2022
1 parent b71b113 commit dfda5d9
Show file tree
Hide file tree
Showing 14 changed files with 737 additions and 15 deletions.
10 changes: 10 additions & 0 deletions datasets/idc/infra/idc_dataset.tf
Expand Up @@ -91,6 +91,16 @@ output "bigquery_dataset-idc_v6-dataset_id" {
value = google_bigquery_dataset.idc_v6.dataset_id
}

resource "google_bigquery_dataset" "idc_v7" {
dataset_id = "idc_v7"
project = var.project_id
description = "Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v7 data"
}

output "bigquery_dataset-idc_v7-dataset_id" {
value = google_bigquery_dataset.idc_v7.dataset_id
}

resource "google_bigquery_dataset" "idc_current" {
dataset_id = "idc_current"
project = var.project_id
Expand Down
2 changes: 1 addition & 1 deletion datasets/idc/pipelines/_images/copy_bq_datasets/script.py
Expand Up @@ -153,7 +153,7 @@ def trigger_config(
) -> None:
now = time.time()
seconds = int(now)
nanos = int((now - seconds) * 10 ** 9)
nanos = int((now - seconds) * pow(10, 9))

try:
client.start_manual_transfer_runs(
Expand Down
@@ -0,0 +1 @@
select * from `PROJECT.idc_CURRENT_VERSION.dicom_derived_all`
@@ -0,0 +1 @@
select * from `PROJECT.idc_CURRENT_VERSION.dicom_pivot_v7`
@@ -0,0 +1,55 @@
WITH
pre_dicom_all AS (
SELECT
aux.tcia_api_collection_id AS tcia_api_collection_id,
aux.idc_webapp_collection_id AS collection_id,
aux.collection_timestamp AS collection_timestamp,
aux.collection_hash as collection_hash,
aux.collection_init_idc_version AS collection_init_idc_version,
aux.collection_revised_idc_version AS collection_revised_idc_version,
aux.access AS access,
dcm.PatientID as PatientID,
aux.idc_case_id as idc_case_id,
aux.patient_hash as patient_hash,
aux.patient_init_idc_version AS patient_init_idc_version,
aux.patient_revised_idc_version AS patient_revised_idc_version,
dcm.StudyInstanceUID AS StudyInstanceUID,
aux.study_uuid as crdc_study_uuid,
aux.study_hash as study_hash,
aux.study_init_idc_version AS study_init_idc_version,
aux.study_revised_idc_version AS study_revised_idc_version,
dcm.SeriesInstanceUID AS SeriesInstanceUID,
aux.series_uuid as crdc_series_uuid,
aux.series_hash as series_hash,
aux.series_init_idc_version AS series_init_idc_version,
aux.series_revised_idc_version AS series_revised_idc_version,
dcm.SOPInstanceUID AS SOPInstanceUID,
aux.instance_uuid as crdc_instance_uuid,
aux.gcs_url as gcs_url,
aux.instance_size as instance_size,
aux.instance_hash as instance_hash,
aux.instance_init_idc_version AS instance_init_idc_version,
aux.instance_revised_idc_version AS instance_revised_idc_version,
aux.source_doi as Source_DOI,
aux.license_url as license_url,
aux.license_long_name as license_long_name,
aux.license_short_name as license_short_name,
dcm.* except(PatientID, StudyInstanceUID, SeriesInstanceUID, SOPInstanceUID)
FROM
`PROJECT.DATASET.auxiliary_metadata` AS aux
INNER JOIN
`PROJECT.DATASET.dicom_metadata` AS dcm
ON
aux.SOPInstanceUID = dcm.SOPInstanceUID)

SELECT
data_collections.Location AS tcia_tumorLocation,
data_collections.Species AS tcia_species,
data_collections.CancerType AS tcia_cancerType,
pre_dicom_all.*
FROM
pre_dicom_all
INNER JOIN
`PROJECT.DATASET.original_collections_metadata` AS data_collections
ON
pre_dicom_all.collection_id = data_collections.idc_webapp_collection_id
@@ -0,0 +1,5 @@
SELECT
SOPInstanceUID,
SAFE_CAST(SliceThickness AS FLOAT64) AS SliceThickness
FROM
`PROJECT.DATASET.dicom_metadata` AS dcm
@@ -0,0 +1,75 @@
SELECT
pivot.PatientID,
pivot.BodyPartExamined,
pivot.SeriesInstanceUID,
pivot.SliceThickness,
pivot.SeriesNumber,
pivot.SeriesDescription,
pivot.StudyInstanceUID,
pivot.StudyDescription,
pivot.StudyDate,
pivot.SOPInstanceUID,
pivot.Modality,
pivot.SOPClassUID,
pivot.collection_id,
pivot.AnatomicRegionSequence,
pivot.FrameOfReferenceUID,
pivot.crdc_study_uuid,
pivot.crdc_series_uuid,
pivot.crdc_instance_uuid,
pivot.Program,
pivot.tcia_tumorLocation,
pivot.source_DOI,
pivot.tcia_species,
pivot.license_short_name,
pivot.Manufacturer,
pivot.ManufacturerModelName,
Apparent_Diffusion_Coefficient,
Internal_structure,
Sphericity,
Calcification,
Lobular_Pattern,
Spiculation,
Margin,
Texture,
Subtlety_score,
Malignancy,
SUVbw,
Volume,
Diameter,
Surface_area_of_mesh,
Total_Lesion_Glycolysis,
Standardized_Added_Metabolic_Activity,
Percent_Within_First_Quarter_of_Intensity_Range,
Percent_Within_Third_Quarter_of_Intensity_Range,
Percent_Within_Fourth_Quarter_of_Intensity_Range,
Percent_Within_Second_Quarter_of_Intensity_Range,
Standardized_Added_Metabolic_Activity_Background,
Glycolysis_Within_First_Quarter_of_Intensity_Range,
Glycolysis_Within_Third_Quarter_of_Intensity_Range,
Glycolysis_Within_Fourth_Quarter_of_Intensity_Range,
Glycolysis_Within_Second_Quarter_of_Intensity_Range,
SegmentedPropertyCategoryCodeSequence,
SegmentedPropertyTypeCodeSequence,
SegmentNumber,
SegmentAlgorithmType,
AdditionalPatientHistory,
Allergies,
ImageType,
LastMenstrualDate,
MedicalAlerts,
EthnicGroup,
Occupation,
PatientAge,
PatientComments,
PatientSize,
PatientWeight,
PregnancyStatus,
ReasonForStudy,
RequestedProcedureComments,
SmokingStatus,
pivot.access,
pivot.gcs_url
FROM `PROJECT.DATASET.dicom_derived_all` pivot
JOIN `PROJECT.DATASET.dicom_all` dicom_all
ON pivot.SOPInstanceUID = dicom_all.SOPInstanceUID
@@ -0,0 +1,208 @@
WITH
measurementGroups AS (
WITH
contentSequenceLevel1 AS (
WITH
structuredReports AS (
SELECT
PatientID,
SOPInstanceUID,
SeriesDescription,
ContentSequence
FROM
`PROJECT.DATASET.dicom_metadata`
WHERE
( SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.11"
OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.22"
OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.33"
OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.34"
OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.35" )
AND ARRAY_LENGTH(ContentTemplateSequence) <> 0
AND ContentTemplateSequence [
OFFSET
(0)].TemplateIdentifier = "1500"
AND ContentTemplateSequence [
OFFSET
(0)].MappingResource = "DCMR" )
SELECT
PatientID,
SOPInstanceUID,
SeriesDescription,
contentSequence
FROM
structuredReports
CROSS JOIN
UNNEST(ContentSequence) AS contentSequence )
SELECT
PatientID,
SOPInstanceUID,
SeriesDescription,
contentSequence,
measurementGroup_number
FROM
contentSequenceLevel1
CROSS JOIN
UNNEST (contentSequence.ContentSequence) AS contentSequence
WITH
OFFSET
AS measurementGroup_number
WHERE
contentSequence.ValueType = "CONTAINER"
AND contentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodeMeaning = "Measurement Group" ),
measurementGroups_withTrackingID AS (
SELECT
SOPInstanceUID,
PatientID,
SeriesDescription,
measurementGroup_number,
unnestedContentSequence.TextValue AS trackingIdentifier,
measurementGroups.contentSequence
FROM
measurementGroups
CROSS JOIN
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence
WHERE
unnestedContentSequence.ValueType = "TEXT"
AND ( unnestedContentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodeValue = "112039"
AND unnestedContentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodingSchemeDesignator = "DCM" ) ),
measurementGroups_withTrackingUID AS (
SELECT
SOPInstanceUID,
measurementGroup_number,
unnestedContentSequence.UID AS trackingUniqueIdentifier
FROM
measurementGroups
CROSS JOIN
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence
WHERE
unnestedContentSequence.ValueType = "UIDREF"
AND ( unnestedContentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodeValue = "112040"
AND unnestedContentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodingSchemeDesignator = "DCM" ) ),
measurementGroups_withSegmentation AS (
SELECT
SOPInstanceUID,
measurementGroup_number,
unnestedContentSequence.ReferencedSOPSequence[
OFFSET
(0)].ReferencedSOPInstanceUID AS segmentationInstanceUID,
unnestedContentSequence.ReferencedSOPSequence[
OFFSET
(0)].ReferencedSegmentNumber AS segmentationSegmentNumber
FROM
measurementGroups
CROSS JOIN
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence
WHERE
unnestedContentSequence.ValueType = "IMAGE"
AND unnestedContentSequence.ReferencedSOPSequence[
OFFSET
(0)].ReferencedSOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ),
measurementGroups_withSourceSeries AS (
SELECT
SOPInstanceUID,
measurementGroup_number,
unnestedContentSequence.UID AS sourceSegmentedSeriesUID
FROM
measurementGroups
CROSS JOIN
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence
WHERE
unnestedContentSequence.ValueType = "UIDREF"
AND ( unnestedContentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodeValue = "121232"
AND unnestedContentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodingSchemeDesignator = "DCM" ) ),
measurementGroups_withFinding AS (
SELECT
SOPInstanceUID,
measurementGroup_number,
unnestedContentSequence.ConceptCodeSequence [
OFFSET
(0)] AS finding
FROM
measurementGroups
CROSS JOIN
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence
WHERE
unnestedContentSequence.ValueType = "CODE"
AND ( unnestedContentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodeValue = "121071"
AND unnestedContentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodingSchemeDesignator = "DCM" ) ),
measurementGroups_withFindingSite AS (
SELECT
SOPInstanceUID,
measurementGroup_number,
unnestedContentSequence.ConceptCodeSequence [
OFFSET
(0)] AS findingSite
FROM
measurementGroups
CROSS JOIN
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence
WHERE
unnestedContentSequence.ValueType = "CODE"
AND ( unnestedContentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodeValue = "G-C0E3"
AND unnestedContentSequence.ConceptNameCodeSequence [
OFFSET
(0)].CodingSchemeDesignator = "SRT" ) )
SELECT
mWithUID.SOPInstanceUID,
mWithUID.measurementGroup_number,
mWithUID.trackingUniqueIdentifier,
mWithID.trackingIdentifier,
mWithID.PatientID,
mWithID.SeriesDescription,
mWithFinding.finding,
mWithFindingSite.findingSite,
mWithSourceSeries.sourceSegmentedSeriesUID,
mWithSegmentation.segmentationInstanceUID,
mWithSegmentation.segmentationSegmentNumber,
mWithID.contentSequence
FROM
measurementGroups_withTrackingUID AS mWithUID
JOIN
measurementGroups_withTrackingID AS mWithID
---
ON
mWithID.SOPInstanceUID = mWithUID.SOPInstanceUID
AND mWithID.measurementGroup_number = mWithUID.measurementGroup_number
JOIN
measurementGroups_withFinding AS mWithFinding
ON
mWithID.SOPInstanceUID = mWithFinding.SOPInstanceUID
AND mWithID.measurementGroup_number = mWithFinding.measurementGroup_number
JOIN
measurementGroups_withFindingSite AS mWithFindingSite
ON
mWithID.SOPInstanceUID = mWithFindingSite.SOPInstanceUID
AND mWithID.measurementGroup_number = mWithFindingSite.measurementGroup_number
JOIN
measurementGroups_withSourceSeries AS mWithSourceSeries
ON
mWithID.SOPInstanceUID = mWithSourceSeries.SOPInstanceUID
AND mWithID.measurementGroup_number = mWithSourceSeries.measurementGroup_number
JOIN
measurementGroups_withSegmentation AS mWithSegmentation
ON
mWithID.SOPInstanceUID = mWithSegmentation.SOPInstanceUID
AND mWithID.measurementGroup_number = mWithSegmentation.measurementGroup_number
---
ORDER BY
trackingUniqueIdentifier

0 comments on commit dfda5d9

Please sign in to comment.