Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Onboard Imaging Data Commons (IDC) v7 dataset (#287)
* feat: Onboard Imaging Data Commons (IDC) v7 dataset * Update datasets/idc/dataset.yaml * updates black formatter due to a recurring bug * move the images folder under pipelines * move images folder * use black@stable for Python linting * use black@stable for Python linting * adds missing queries Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
- Loading branch information
1 parent
b71b113
commit dfda5d9
Showing
14 changed files
with
737 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 change: 1 addition & 0 deletions
1
datasets/idc/pipelines/_images/generate_bq_views/queries/current/dicom_derived_all.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
select * from `PROJECT.idc_CURRENT_VERSION.dicom_derived_all` |
1 change: 1 addition & 0 deletions
1
datasets/idc/pipelines/_images/generate_bq_views/queries/current/dicom_pivot_v7.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
select * from `PROJECT.idc_CURRENT_VERSION.dicom_pivot_v7` |
55 changes: 55 additions & 0 deletions
55
datasets/idc/pipelines/_images/generate_bq_views/queries/v7/dicom_all.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
WITH | ||
pre_dicom_all AS ( | ||
SELECT | ||
aux.tcia_api_collection_id AS tcia_api_collection_id, | ||
aux.idc_webapp_collection_id AS collection_id, | ||
aux.collection_timestamp AS collection_timestamp, | ||
aux.collection_hash as collection_hash, | ||
aux.collection_init_idc_version AS collection_init_idc_version, | ||
aux.collection_revised_idc_version AS collection_revised_idc_version, | ||
aux.access AS access, | ||
dcm.PatientID as PatientID, | ||
aux.idc_case_id as idc_case_id, | ||
aux.patient_hash as patient_hash, | ||
aux.patient_init_idc_version AS patient_init_idc_version, | ||
aux.patient_revised_idc_version AS patient_revised_idc_version, | ||
dcm.StudyInstanceUID AS StudyInstanceUID, | ||
aux.study_uuid as crdc_study_uuid, | ||
aux.study_hash as study_hash, | ||
aux.study_init_idc_version AS study_init_idc_version, | ||
aux.study_revised_idc_version AS study_revised_idc_version, | ||
dcm.SeriesInstanceUID AS SeriesInstanceUID, | ||
aux.series_uuid as crdc_series_uuid, | ||
aux.series_hash as series_hash, | ||
aux.series_init_idc_version AS series_init_idc_version, | ||
aux.series_revised_idc_version AS series_revised_idc_version, | ||
dcm.SOPInstanceUID AS SOPInstanceUID, | ||
aux.instance_uuid as crdc_instance_uuid, | ||
aux.gcs_url as gcs_url, | ||
aux.instance_size as instance_size, | ||
aux.instance_hash as instance_hash, | ||
aux.instance_init_idc_version AS instance_init_idc_version, | ||
aux.instance_revised_idc_version AS instance_revised_idc_version, | ||
aux.source_doi as Source_DOI, | ||
aux.license_url as license_url, | ||
aux.license_long_name as license_long_name, | ||
aux.license_short_name as license_short_name, | ||
dcm.* except(PatientID, StudyInstanceUID, SeriesInstanceUID, SOPInstanceUID) | ||
FROM | ||
`PROJECT.DATASET.auxiliary_metadata` AS aux | ||
INNER JOIN | ||
`PROJECT.DATASET.dicom_metadata` AS dcm | ||
ON | ||
aux.SOPInstanceUID = dcm.SOPInstanceUID) | ||
|
||
SELECT | ||
data_collections.Location AS tcia_tumorLocation, | ||
data_collections.Species AS tcia_species, | ||
data_collections.CancerType AS tcia_cancerType, | ||
pre_dicom_all.* | ||
FROM | ||
pre_dicom_all | ||
INNER JOIN | ||
`PROJECT.DATASET.original_collections_metadata` AS data_collections | ||
ON | ||
pre_dicom_all.collection_id = data_collections.idc_webapp_collection_id |
5 changes: 5 additions & 0 deletions
5
datasets/idc/pipelines/_images/generate_bq_views/queries/v7/dicom_metadata_curated.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
SELECT | ||
SOPInstanceUID, | ||
SAFE_CAST(SliceThickness AS FLOAT64) AS SliceThickness | ||
FROM | ||
`PROJECT.DATASET.dicom_metadata` AS dcm |
75 changes: 75 additions & 0 deletions
75
datasets/idc/pipelines/_images/generate_bq_views/queries/v7/dicom_pivot_v7.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
SELECT | ||
pivot.PatientID, | ||
pivot.BodyPartExamined, | ||
pivot.SeriesInstanceUID, | ||
pivot.SliceThickness, | ||
pivot.SeriesNumber, | ||
pivot.SeriesDescription, | ||
pivot.StudyInstanceUID, | ||
pivot.StudyDescription, | ||
pivot.StudyDate, | ||
pivot.SOPInstanceUID, | ||
pivot.Modality, | ||
pivot.SOPClassUID, | ||
pivot.collection_id, | ||
pivot.AnatomicRegionSequence, | ||
pivot.FrameOfReferenceUID, | ||
pivot.crdc_study_uuid, | ||
pivot.crdc_series_uuid, | ||
pivot.crdc_instance_uuid, | ||
pivot.Program, | ||
pivot.tcia_tumorLocation, | ||
pivot.source_DOI, | ||
pivot.tcia_species, | ||
pivot.license_short_name, | ||
pivot.Manufacturer, | ||
pivot.ManufacturerModelName, | ||
Apparent_Diffusion_Coefficient, | ||
Internal_structure, | ||
Sphericity, | ||
Calcification, | ||
Lobular_Pattern, | ||
Spiculation, | ||
Margin, | ||
Texture, | ||
Subtlety_score, | ||
Malignancy, | ||
SUVbw, | ||
Volume, | ||
Diameter, | ||
Surface_area_of_mesh, | ||
Total_Lesion_Glycolysis, | ||
Standardized_Added_Metabolic_Activity, | ||
Percent_Within_First_Quarter_of_Intensity_Range, | ||
Percent_Within_Third_Quarter_of_Intensity_Range, | ||
Percent_Within_Fourth_Quarter_of_Intensity_Range, | ||
Percent_Within_Second_Quarter_of_Intensity_Range, | ||
Standardized_Added_Metabolic_Activity_Background, | ||
Glycolysis_Within_First_Quarter_of_Intensity_Range, | ||
Glycolysis_Within_Third_Quarter_of_Intensity_Range, | ||
Glycolysis_Within_Fourth_Quarter_of_Intensity_Range, | ||
Glycolysis_Within_Second_Quarter_of_Intensity_Range, | ||
SegmentedPropertyCategoryCodeSequence, | ||
SegmentedPropertyTypeCodeSequence, | ||
SegmentNumber, | ||
SegmentAlgorithmType, | ||
AdditionalPatientHistory, | ||
Allergies, | ||
ImageType, | ||
LastMenstrualDate, | ||
MedicalAlerts, | ||
EthnicGroup, | ||
Occupation, | ||
PatientAge, | ||
PatientComments, | ||
PatientSize, | ||
PatientWeight, | ||
PregnancyStatus, | ||
ReasonForStudy, | ||
RequestedProcedureComments, | ||
SmokingStatus, | ||
pivot.access, | ||
pivot.gcs_url | ||
FROM `PROJECT.DATASET.dicom_derived_all` pivot | ||
JOIN `PROJECT.DATASET.dicom_all` dicom_all | ||
ON pivot.SOPInstanceUID = dicom_all.SOPInstanceUID |
208 changes: 208 additions & 0 deletions
208
datasets/idc/pipelines/_images/generate_bq_views/queries/v7/measurement_groups.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
WITH | ||
measurementGroups AS ( | ||
WITH | ||
contentSequenceLevel1 AS ( | ||
WITH | ||
structuredReports AS ( | ||
SELECT | ||
PatientID, | ||
SOPInstanceUID, | ||
SeriesDescription, | ||
ContentSequence | ||
FROM | ||
`PROJECT.DATASET.dicom_metadata` | ||
WHERE | ||
( SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.11" | ||
OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.22" | ||
OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.33" | ||
OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.34" | ||
OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.35" ) | ||
AND ARRAY_LENGTH(ContentTemplateSequence) <> 0 | ||
AND ContentTemplateSequence [ | ||
OFFSET | ||
(0)].TemplateIdentifier = "1500" | ||
AND ContentTemplateSequence [ | ||
OFFSET | ||
(0)].MappingResource = "DCMR" ) | ||
SELECT | ||
PatientID, | ||
SOPInstanceUID, | ||
SeriesDescription, | ||
contentSequence | ||
FROM | ||
structuredReports | ||
CROSS JOIN | ||
UNNEST(ContentSequence) AS contentSequence ) | ||
SELECT | ||
PatientID, | ||
SOPInstanceUID, | ||
SeriesDescription, | ||
contentSequence, | ||
measurementGroup_number | ||
FROM | ||
contentSequenceLevel1 | ||
CROSS JOIN | ||
UNNEST (contentSequence.ContentSequence) AS contentSequence | ||
WITH | ||
OFFSET | ||
AS measurementGroup_number | ||
WHERE | ||
contentSequence.ValueType = "CONTAINER" | ||
AND contentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodeMeaning = "Measurement Group" ), | ||
measurementGroups_withTrackingID AS ( | ||
SELECT | ||
SOPInstanceUID, | ||
PatientID, | ||
SeriesDescription, | ||
measurementGroup_number, | ||
unnestedContentSequence.TextValue AS trackingIdentifier, | ||
measurementGroups.contentSequence | ||
FROM | ||
measurementGroups | ||
CROSS JOIN | ||
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence | ||
WHERE | ||
unnestedContentSequence.ValueType = "TEXT" | ||
AND ( unnestedContentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodeValue = "112039" | ||
AND unnestedContentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodingSchemeDesignator = "DCM" ) ), | ||
measurementGroups_withTrackingUID AS ( | ||
SELECT | ||
SOPInstanceUID, | ||
measurementGroup_number, | ||
unnestedContentSequence.UID AS trackingUniqueIdentifier | ||
FROM | ||
measurementGroups | ||
CROSS JOIN | ||
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence | ||
WHERE | ||
unnestedContentSequence.ValueType = "UIDREF" | ||
AND ( unnestedContentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodeValue = "112040" | ||
AND unnestedContentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodingSchemeDesignator = "DCM" ) ), | ||
measurementGroups_withSegmentation AS ( | ||
SELECT | ||
SOPInstanceUID, | ||
measurementGroup_number, | ||
unnestedContentSequence.ReferencedSOPSequence[ | ||
OFFSET | ||
(0)].ReferencedSOPInstanceUID AS segmentationInstanceUID, | ||
unnestedContentSequence.ReferencedSOPSequence[ | ||
OFFSET | ||
(0)].ReferencedSegmentNumber AS segmentationSegmentNumber | ||
FROM | ||
measurementGroups | ||
CROSS JOIN | ||
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence | ||
WHERE | ||
unnestedContentSequence.ValueType = "IMAGE" | ||
AND unnestedContentSequence.ReferencedSOPSequence[ | ||
OFFSET | ||
(0)].ReferencedSOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ), | ||
measurementGroups_withSourceSeries AS ( | ||
SELECT | ||
SOPInstanceUID, | ||
measurementGroup_number, | ||
unnestedContentSequence.UID AS sourceSegmentedSeriesUID | ||
FROM | ||
measurementGroups | ||
CROSS JOIN | ||
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence | ||
WHERE | ||
unnestedContentSequence.ValueType = "UIDREF" | ||
AND ( unnestedContentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodeValue = "121232" | ||
AND unnestedContentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodingSchemeDesignator = "DCM" ) ), | ||
measurementGroups_withFinding AS ( | ||
SELECT | ||
SOPInstanceUID, | ||
measurementGroup_number, | ||
unnestedContentSequence.ConceptCodeSequence [ | ||
OFFSET | ||
(0)] AS finding | ||
FROM | ||
measurementGroups | ||
CROSS JOIN | ||
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence | ||
WHERE | ||
unnestedContentSequence.ValueType = "CODE" | ||
AND ( unnestedContentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodeValue = "121071" | ||
AND unnestedContentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodingSchemeDesignator = "DCM" ) ), | ||
measurementGroups_withFindingSite AS ( | ||
SELECT | ||
SOPInstanceUID, | ||
measurementGroup_number, | ||
unnestedContentSequence.ConceptCodeSequence [ | ||
OFFSET | ||
(0)] AS findingSite | ||
FROM | ||
measurementGroups | ||
CROSS JOIN | ||
UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence | ||
WHERE | ||
unnestedContentSequence.ValueType = "CODE" | ||
AND ( unnestedContentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodeValue = "G-C0E3" | ||
AND unnestedContentSequence.ConceptNameCodeSequence [ | ||
OFFSET | ||
(0)].CodingSchemeDesignator = "SRT" ) ) | ||
SELECT | ||
mWithUID.SOPInstanceUID, | ||
mWithUID.measurementGroup_number, | ||
mWithUID.trackingUniqueIdentifier, | ||
mWithID.trackingIdentifier, | ||
mWithID.PatientID, | ||
mWithID.SeriesDescription, | ||
mWithFinding.finding, | ||
mWithFindingSite.findingSite, | ||
mWithSourceSeries.sourceSegmentedSeriesUID, | ||
mWithSegmentation.segmentationInstanceUID, | ||
mWithSegmentation.segmentationSegmentNumber, | ||
mWithID.contentSequence | ||
FROM | ||
measurementGroups_withTrackingUID AS mWithUID | ||
JOIN | ||
measurementGroups_withTrackingID AS mWithID | ||
--- | ||
ON | ||
mWithID.SOPInstanceUID = mWithUID.SOPInstanceUID | ||
AND mWithID.measurementGroup_number = mWithUID.measurementGroup_number | ||
JOIN | ||
measurementGroups_withFinding AS mWithFinding | ||
ON | ||
mWithID.SOPInstanceUID = mWithFinding.SOPInstanceUID | ||
AND mWithID.measurementGroup_number = mWithFinding.measurementGroup_number | ||
JOIN | ||
measurementGroups_withFindingSite AS mWithFindingSite | ||
ON | ||
mWithID.SOPInstanceUID = mWithFindingSite.SOPInstanceUID | ||
AND mWithID.measurementGroup_number = mWithFindingSite.measurementGroup_number | ||
JOIN | ||
measurementGroups_withSourceSeries AS mWithSourceSeries | ||
ON | ||
mWithID.SOPInstanceUID = mWithSourceSeries.SOPInstanceUID | ||
AND mWithID.measurementGroup_number = mWithSourceSeries.measurementGroup_number | ||
JOIN | ||
measurementGroups_withSegmentation AS mWithSegmentation | ||
ON | ||
mWithID.SOPInstanceUID = mWithSegmentation.SOPInstanceUID | ||
AND mWithID.measurementGroup_number = mWithSegmentation.measurementGroup_number | ||
--- | ||
ORDER BY | ||
trackingUniqueIdentifier |
Oops, something went wrong.