# CCDI Hub DICOM Analysis

This notebook extracts metadata from all DICOM files in the sample and assesses the 3 data quality measures: consistency, completeness, and outliers

Author: Lucy Han, Booz Allen Hamilton

In [1]:
# install pydicom
%pip install pydicom

Note: you may need to restart the kernel to use updated packages.


In [2]:
# import pydicom, pandas, and os
import pydicom
import pandas as pd
import os

In [3]:
# checking current path for Jupyter Notebook
current_path = os.getcwd()
print(current_path)

/sbgenomics/workspace


## Get and sort MCI DICOM files

In [1]:
# set file paths
file_src = '/sbgenomics/project-files/Reference_Files/Filtered'

In [5]:
RMS_manifest_file = file_src + '/filtered_CCDI Hub_RMS Path DICOM Files Manifest_2025-09-18 21-24-21.csv'
MCI_manifest_file = file_src + '/filtered_CCDI Hub_MCI Path DICOM Files Manifest_2025-09-18 21-12-26.csv'

In [6]:
RMS_manifest_df = pd.read_csv(RMS_manifest_file)
MCI_manifest_df = pd.read_csv(MCI_manifest_file)

In [7]:
RMS_manifest_df.shape

(246, 9)

In [8]:
MCI_manifest_df.shape

(462, 10)

In [9]:
MCI_manifest_df.head(10)

Unnamed: 0,File Name,Data Category,File Description,File Type,File Size,File Access,Study ID,Participant ID,Sample ID,Unnamed: 9
0,0049cac2-5490-4212-8014-e6176d1f3ff1.dcm,Pathology Imaging,,dicom,627.72 MB,Open,phs002790,PBBWZZ,0DJM8P,
1,0049cac2-5490-4212-8014-e6176d1f3ff1.dcm,Pathology Imaging,,dicom,627.72 MB,Open,phs002790,PBBWZZ,0DJM98,
2,0302064c-08a3-4134-b0a4-767b971bb761.dcm,Pathology Imaging,,dicom,714.79 MB,Open,phs002790,PBCDDH,0DPBFU,
3,0302064c-08a3-4134-b0a4-767b971bb761.dcm,Pathology Imaging,,dicom,714.79 MB,Open,phs002790,PBCDDH,0DPBGV,
4,03f7f0e3-c95f-4b88-b33c-0f5595343407.dcm,Pathology Imaging,,dicom,190.44 MB,Open,phs002790,PBBXEL,0DJM8R,
5,03f7f0e3-c95f-4b88-b33c-0f5595343407.dcm,Pathology Imaging,,dicom,190.44 MB,Open,phs002790,PBBXEL,0DJM95,
6,04588d47-8c04-448d-8370-2761be9a1087.dcm,Pathology Imaging,,dicom,409.15 MB,Open,phs002790,PBBSRF,0DFXFF,
7,04588d47-8c04-448d-8370-2761be9a1087.dcm,Pathology Imaging,,dicom,409.15 MB,Open,phs002790,PBBSRF,0DFXH2,
8,0497ff2c-a847-4d04-8367-40f60bbb11a9.dcm,Pathology Imaging,,dicom,1.44 GB,Open,phs002790,PBCKEL,0DUB86,
9,0497ff2c-a847-4d04-8367-40f60bbb11a9.dcm,Pathology Imaging,,dicom,1.44 GB,Open,phs002790,PBCKEL,0DUB9K,


In [11]:
dicomFileList = [] # initiate list to hold file directories

In [12]:
# local path to the file directory
# update the line below to match your directory
src = "/sbgenomics/project-files/MCI/MCI_Imaging"

# loop through all files in src directory
for root, dirs, files in os.walk(src):
    for file in files: 
        if file in list(MCI_manifest_df['File Name']): # check if .dcm is in the file name
            dicomFileList.append(os.path.join(root, file))


In [None]:
# number of files in the dicomFileList for the MCI study
len(dicomFileList)

In [14]:
# local path to the file directory
# update the line below to match your directory
src = "/sbgenomics/project-files/RMS-Mutation-Prediction/RMS_Imaging"

# loop through all files in src directory
for root, dirs, files in os.walk(src):
    for file in files: 
        if file in list(RMS_manifest_df['File Name']): # check if .dcm is in the file name
            dicomFileList.append(os.path.join(root, file))

In [15]:
# number of files in the dicomFileList for the MCI and the RMS studies
len(dicomFileList)

399

In [16]:
dicomFileList = list(set(dicomFileList))

In [17]:
len(dicomFileList)

399

# Completeness and Consistency dictionaries for DICOM headers

In [18]:
# import pydicom, os, counter, and math
import pydicom
import os
from collections import Counter
import math

In [19]:
def recurse(d, d_dict, elem_list, k="", first_file=False):
    # function to recursively extract data elements from the DICOM headers
    
    for elem in d:
        if elem.VR == 'SQ':
            if elem.keyword not in ['CodingSchemeIdentificationSequence',
                                    'Specimen Preparation Sequence', 
                                    'SpecimenDescriptionSequence',
                                    'DimensionOrganizationSequence',
                                   'DimensionIndexSequence']:
                for item in elem.value:
                    recurse(item, d_dict, elem_list, elem.keyword, first_file)
        else:
            if elem.keyword not in ["ICCProfile", ""]:
                dict_key = f"{k}:{elem.keyword}".lstrip(":")                
                if dict_key not in d_dict.keys():
                    if first_file:
                        d_dict[dict_key] = [] #initiate empty list
                    continue  
                d_dict[dict_key].append(elem.value) # add value to the list
                elem_list.append(dict_key)

In [20]:
def get_values_dict(file_list):
    # function that returns a dictionary of values from the dicom headers

    # file_list is an input list of the the files to extract data from

    field_dict = {} # initiate dictionary
    list_of_elements = []
    ds = pydicom.filereader.dcmread(file_list[0], stop_before_pixels=True) # read file
    recurse(ds, field_dict, list_of_elements, k="", first_file=True) #recurse

    file_counter = 0 # initate counter

    # loop through files
    for file in file_list:
        ds = pydicom.filereader.dcmread(file, stop_before_pixels=True) # read file

        # loop through headers
        list_of_elements = []
        recurse(ds, field_dict, elem_list=list_of_elements) # recurse

        elements_not_in_first = list(set(field_dict.keys()) - set(list_of_elements))
        for e in elements_not_in_first:
            field_dict[e].append('Not Present')
        
        file_counter+=1 # increase counter
        print(f"{file} processed. {file_counter} files have been read.")
    return field_dict

In [21]:
def get_completeness_consistency(values_dict):
    # function that returns a dictionary about the consistency of the data

    # values_dict is a dictionary of key (headers) and the values from the dicom headers

    consistency_dict = {} # initiate dictionary

    fields = values_dict.keys() # get keys

    # loop through keys
    for field in fields:
        try:
            set(values_dict[field]) # get unique set
            consistency_dict[field] = Counter(values_dict[field]) # count values per item in set
        except: # if data type that a set cannot be taken from
            consistency_dict[field] = values_dict[field] # add the values to the dictionary
    return consistency_dict

In [22]:
# #Example of reading a DICOM image and printing the metadata headers
ds = pydicom.filereader.dcmread("/sbgenomics/project-files/MCI/MCI_Imaging/815b7904-14f5-4007-9edf-3792185b6e18.dcm")
print(ds)

Dataset.file_meta -------------------------------
(0002,0000) File Meta Information Group Length  UL: 224
(0002,0001) File Meta Information Version       OB: b'\x00\x01'
(0002,0002) Media Storage SOP Class UID         UI: VL Whole Slide Microscopy Image Storage
(0002,0003) Media Storage SOP Instance UID      UI: 1.3.6.1.4.1.5962.99.1.1907836886.1276421301.1719894788054.11.0
(0002,0010) Transfer Syntax UID                 UI: JPEG Baseline (Process 1)
(0002,0012) Implementation Class UID            UI: 1.3.6.1.4.1.5962.99.2
(0002,0013) Implementation Version Name         SH: 'PIXELMEDJAVA001'
(0002,0016) Source Application Entity Title     AE: 'OURAETITLE'
-------------------------------------------------
(0008,0008) Image Type                          CS: ['DERIVED', 'PRIMARY', 'VOLUME', 'NONE']
(0008,0012) Instance Creation Date              DA: '20240702'
(0008,0013) Instance Creation Time              TM: '043524.332'
(0008,0014) Instance Creator UID                UI: 1.3.6.1.4.1.5

In [25]:
# get dictionary of field values
field_values_dict = get_values_dict(dicomFileList)

/sbgenomics/project-files/RMS-Mutation-Prediction/RMS_Imaging/b9de96fe-afc1-43a2-9b0e-855233df30cf.dcm processed. 1 files have been read.
/sbgenomics/project-files/RMS-Mutation-Prediction/RMS_Imaging/40a77216-1153-49f6-9a0c-ab542d77b334.dcm processed. 2 files have been read.
/sbgenomics/project-files/MCI/MCI_Imaging/92297591-f9f7-4271-8adf-5e2b8130fbdd.dcm processed. 3 files have been read.
/sbgenomics/project-files/RMS-Mutation-Prediction/RMS_Imaging/4a4766fa-fcac-4bb1-9d79-7c575efc1988.dcm processed. 4 files have been read.
/sbgenomics/project-files/RMS-Mutation-Prediction/RMS_Imaging/6cb660ed-f540-4322-a13e-9f09322cf51e.dcm processed. 5 files have been read.
/sbgenomics/project-files/MCI/MCI_Imaging/c317160f-054b-4c92-82ae-86ede93a7c15.dcm processed. 6 files have been read.
/sbgenomics/project-files/RMS-Mutation-Prediction/RMS_Imaging/82a95076-edbd-4671-9fac-da083f34950d.dcm processed. 7 files have been read.
/sbgenomics/project-files/MCI/MCI_Imaging/ba0d9d33-7b21-4b2a-94d0-4acf55ee

In [26]:
len(field_values_dict.keys())

101

In [27]:
# print the key, value pair in the field_values dictionary
for key, value in field_values_dict.items():
    print(f"{key}: {len(value)}")

ImageType: 399
InstanceCreationDate: 399
InstanceCreationTime: 399
InstanceCreatorUID: 399
SOPClassUID: 399
AcquisitionUID: 399
SOPInstanceUID: 399
PyramidUID: 399
StudyDate: 399
SeriesDate: 399
AcquisitionDate: 399
ContentDate: 399
AcquisitionDateTime: 399
StudyTime: 399
SeriesTime: 399
AcquisitionTime: 399
ContentTime: 399
AccessionNumber: 399
Modality: 399
Manufacturer: 399
ReferringPhysicianName: 399
TimezoneOffsetFromUTC: 399
StudyDescription: 399
SeriesDescription: 399
AdmittingDiagnosesDescription: 399
AdmittingDiagnosesCodeSequence:CodeValue: 399
AdmittingDiagnosesCodeSequence:CodingSchemeDesignator: 399
AdmittingDiagnosesCodeSequence:CodeMeaning: 399
ManufacturerModelName: 399
VolumetricProperties: 399
PatientName: 399
PatientID: 399
PatientBirthDate: 399
PatientSex: 399
EthnicGroup: 399
DeviceSerialNumber: 399
SoftwareVersions: 399
ContentQualification: 399
ContributingEquipmentSequence:Manufacturer: 399
ContributingEquipmentSequence:InstitutionName: 399
ContributingEquipment

In [28]:
# get consistency values dictionary
consistency_values_dict = get_completeness_consistency(field_values_dict)

In [29]:
# print results by looping through the consistency_values dictionary
for key in consistency_values_dict.keys():
    values = consistency_values_dict[key]
    count = len(values)
    if count > 30: # supress output if the unique counts are greater than 30
        values = "Output supressed"
    print(f"{key} has count of {count}: {values}")

ImageType has count of 399: Output supressed
InstanceCreationDate has count of 24: Counter({'20240928': 42, '20240927': 41, '20240701': 37, '20230618': 26, '20230603': 26, '20240702': 21, '20230623': 20, '20230613': 19, '20230611': 19, '20230624': 18, '20230616': 14, '20230604': 14, '20230614': 13, '20240926': 12, '20230621': 12, '20230615': 12, '20230607': 7, '20230617': 7, '20230605': 7, '20230608': 7, '20230606': 7, '20230622': 6, '20230609': 6, '20230610': 6})
InstanceCreationTime has count of 399: Output supressed
InstanceCreatorUID has count of 1: Counter({'1.3.6.1.4.1.5962.99.3': 399})
SOPClassUID has count of 1: Counter({'1.2.840.10008.5.1.4.1.1.77.1.6': 399})
AcquisitionUID has count of 68: Output supressed
SOPInstanceUID has count of 399: Output supressed
PyramidUID has count of 69: Output supressed
StudyDate has count of 32: Output supressed
SeriesDate has count of 33: Output supressed
AcquisitionDate has count of 33: Output supressed
ContentDate has count of 33: Output supr

In [30]:
# pd.set_option('display.max_columns', None)

In [31]:
# dicom_df = pd.DataFrame(field_values_dict)

In [34]:
# dicom_df.to_csv("/sbgenomics/output-files/dicom_metadata_extract.csv", index=False)

# Final Completeness, Consistency, Outlier, and Imbalance calculations

In [44]:
# initiate counters
missing_count = 0
values_dict = field_values_dict.copy()
numeric_fields_list = []
outlier_count = 0

In [45]:
# loop through all keys in the consistency values dictionary
for key in consistency_values_dict.keys():
        values = consistency_values_dict[key]

        #print results
        if type(values) != list and len(values)<30: 
            print(f"{key} has count of {len(values)}: {values}")
        else: 
            print(f"{key} has count of {len(values)}")
        
        # get missing and outlier counts
        try:
            # get missing counts
            missing_count = missing_count + consistency_values_dict[key]["Not Present"] # count number of "Not Present"
            missing_count = missing_count + consistency_values_dict[key][""] # count number of empty string

            # get outlier counts

            # remove "Not present" for outlier calculation
            if "Not present" in values: 
                values_dict[key] = values_dict[key].remove("Not Present")

            # remove empty string for outlier calculation
            if "" in values: 
                values_dict[key] = values_dict[key].remove("")

            # field is numeric    
            if all(isinstance(value, int) for value in values_dict[key]) or all(isinstance(value, float) for value in values_dict[key]):
                numeric_fields_list.append(key) # add key to numeric_fields list

                values_dict[key] = values_dict[key].sort() # sort the values
                values_array = np.array(values_dict[key]) # turn type into an array
                first_quantile = np.quantile(values_array, 0.25) # get first quantile
                third_quantile = np.quantile(values_array, 0.75) # get third quantile
                IQR = third_quantile - first_quantile # get interquantile range
                
                # get list of outliers under the first quantile
                if any(v < (first_quantile - IQR) for v in values_dict[key]):
                    outlier_list = [v for v in values_dict[key] if v < (first_quantile - IQR)]
                    print(f"low outliers: {outlier_list}")
                    outlier_count = outlier_count + len(outlier_list)

                # get list of outliers over the third quantile
                if any(v > (third_quantile + IQR) for v in values_dict[key]):
                    outlier_list = [v for v in values_dict[key] if v > (third_quantile + IQR)]
                    print(f"high outliers: {outlier_list}")
                    outlier_count = outlier_count + len(outlier_list)

        except Exception as e: # handle exception
            continue

print(f"Number of missing values: {missing_count}")
        

ImageType has count of 399
InstanceCreationDate has count of 24: Counter({'20240928': 42, '20240927': 41, '20240701': 37, '20230618': 26, '20230603': 26, '20240702': 21, '20230623': 20, '20230613': 19, '20230611': 19, '20230624': 18, '20230616': 14, '20230604': 14, '20230614': 13, '20240926': 12, '20230621': 12, '20230615': 12, '20230607': 7, '20230617': 7, '20230605': 7, '20230608': 7, '20230606': 7, '20230622': 6, '20230609': 6, '20230610': 6})
InstanceCreationTime has count of 399
InstanceCreatorUID has count of 1: Counter({'1.3.6.1.4.1.5962.99.3': 399})
SOPClassUID has count of 1: Counter({'1.2.840.10008.5.1.4.1.1.77.1.6': 399})
AcquisitionUID has count of 68
SOPInstanceUID has count of 399
PyramidUID has count of 69
StudyDate has count of 32
SeriesDate has count of 33
AcquisitionDate has count of 33
ContentDate has count of 33
AcquisitionDateTime has count of 65
StudyTime has count of 63
SeriesTime has count of 65
AcquisitionTime has count of 65
ContentTime has count of 65
Accessi

In [46]:
# percentage missing
missing_count/(len(values_dict.keys())*399) * 100

3.2308494007295465

In [47]:
# percentage outliers
outlier_count/(len(numeric_fields_list)*399) * 100

0.0

In [48]:
# numeric fields list
numeric_fields_list

['SeriesNumber',
 'InstanceNumber',
 'SamplesPerPixel',
 'PlanarConfiguration',
 'NumberOfFrames',
 'Rows',
 'Columns',
 'BitsAllocated',
 'BitsStored',
 'HighBit',
 'PixelRepresentation',
 'TotalPixelMatrixColumns',
 'TotalPixelMatrixRows',
 'TotalPixelMatrixOriginSequence:XOffsetInSlideCoordinateSystem',
 'TotalPixelMatrixOriginSequence:YOffsetInSlideCoordinateSystem',
 'OpticalPathSequence:ObjectiveLensPower',
 'NumberOfOpticalPaths',
 'TotalPixelMatrixFocalPlanes']