In [1]:
import pandas as pd
import os
from collections import OrderedDict
from pathlib import Path
import gzip
import json

import pydicom
from pydicom._dicom_dict import DicomDictionary

# we need the location of MIMIC-CXR 2.0.0
# we use this to get cxr-record-list.csv.gz
# NOTE: this is an absolute path; adjust it to point at your local copy
mimic_cxr_path = Path('/db/mimic-cxr')

# we also need dicom-metadata.csv.gz and dicom-metadata.json.gz in the
# working directory; these are generated by export_metadata.py in this folder.

To preserve the sequence attributes from the DICOM headers, we stored them in a JSON file (`dicom-metadata.json.gz`). We load that file now.

In [2]:
# read the gzipped JSON holding the DICOM sequence attributes
with gzip.open('dicom-metadata.json.gz', 'r') as fp:
    dicom_entries = json.load(fp)

# the file holds a list of dicts; fold them into one dict
# (if a key repeats, the later entry wins)
dcm_metadata = {}
for entry in dicom_entries:
    dcm_metadata.update(entry)

del dicom_entries

# collect every distinct top-level tag seen across all entries,
# as integers in ascending order
json_keys = sorted({
    int(tag)
    for attributes in dcm_metadata.values()
    for tag in attributes
})

n_attrib = len(json_keys)
print(f'There are {n_attrib} top-level attributes in the DICOM json.')

# show an example
dcm_metadata['000046e4-e4d7f796-72c3dba4-8b67a485-0eea211d']

There are 10 top-level attributes in the DICOM json.


{'528434': [{'524544': 'C12',
   '524546': 'CLP',
   '524548': 'CHEST (PORTABLE AP)'}],
 '533016': [{'524544': 'T-D3000',
   '524546': 'SNM3',
   '524548': 'Chest',
   '524549': 'DCMR',
   '524550': '20020904',
   '524559': '4031'}],
 '1179748': [{'524544': '113100',
   '524546': 'DCM',
   '524547': '20170914',
   '524548': 'Basic Application Confidentiality Profile'},
  {'524544': '113105',
   '524546': 'DCM',
   '524547': '20170914',
   '524548': 'Clean Descriptors Option'},
  {'524544': '113107',
   '524546': 'DCM',
   '524547': '20170914',
   '524548': 'Retain Longitudinal Temporal Information Modified Dates Option'},
  {'524544': '113101',
   '524546': 'DCM',
   '524547': '20170914',
   '524548': 'Clean Pixel Data Option'},
  {'524544': '113103',
   '524546': 'DCM',
   '524547': '20170914',
   '524548': 'Clean Graphics Option'}],
 '5505568': [{'524544': 'R-10206',
   '524546': 'SNM3',
   '524548': 'antero-posterior',
   '524549': 'DCMR',
   '524550': '20040302',
   '524559': '4010

There are three very useful items in this JSON that we'd like to have in an easier form for all images: the procedure code sequence (`'528434'`), the coded view position (`'5505568'`), and the coded patient orientation (`'5506064'`). For convenience, we will pull the textual description of each (the code meaning, `'524548'`), rather than the ontology code itself.

In [3]:
# top-level sequence tags to flatten, in output column order
cols = ['528434', '5505568', '5506064']
# tag of the textual description (code meaning) inside each sequence item
code_meaning_tag = '524548'

# Build exactly one value per requested column for every DICOM.
# A missing or empty sequence yields None in *its own* position, so the
# remaining values stay aligned with their columns (filtering the missing
# entries out instead would shift later values into the wrong column).
code_meanings = {
    dicom_id: [
        attributes[c][0].get(code_meaning_tag) if attributes.get(c) else None
        for c in cols
    ]
    for dicom_id, attributes in dcm_metadata.items()
}

# human readable column names: <sequence keyword>_<code meaning keyword>
# passing them to from_dict guarantees all three columns exist even when
# some sequence is absent from every record
dcm_metadata_simple = pd.DataFrame.from_dict(
    code_meanings,
    orient='index',
    columns=[
        DicomDictionary[int(c)][-1] + '_' + DicomDictionary[int(code_meaning_tag)][-1]
        for c in cols
    ],
)
dcm_metadata_simple.head()

Unnamed: 0,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,CHEST (PA AND LAT),postero-anterior,Erect
174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,CHEST (PA AND LAT),lateral,Erect
2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,CHEST (PA AND LAT),postero-anterior,Erect
e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,CHEST (PA AND LAT),lateral,Erect
68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,CHEST (PORTABLE AP),antero-posterior,


In [12]:
# Preview the raw DICOM metadata CSV (column labels are DICOM tag numbers).
# Read the first rows straight from the file rather than relying on a
# `metadata` variable, which is only defined in a later cell and would raise
# NameError on a fresh kernel (Restart & Run All).
metadata_raw_preview = pd.read_csv('dicom-metadata.csv.gz', index_col=0, nrows=5)
metadata_raw_preview.index.name = 'dicom_id'
metadata_raw_preview

Unnamed: 0_level_0,524293,524296,524310,524312,524320,524321,524322,524323,524336,524337,...,1578288,1610546,4194912,1577040,1577236,1605968,4195073,4195086,4198403,1577328
dicom_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,ISO_IR 100,"['DERIVED', 'PRIMARY']",1.2.840.10008.5.1.4.1.1.1.1,2.25.3543748844510614920925352225862149680,21800506,21800506,21800506.0,21800506,213014.531,213026.75,...,,,,,,,,,,
174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,ISO_IR 100,"['DERIVED', 'PRIMARY']",1.2.840.10008.5.1.4.1.1.1.1,2.25.30925724177439423411425919179398157560,21800506,21800506,21800506.0,21800506,213014.531,213133.484,...,,,,,,,,,,
2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,ISO_IR 100,"['DERIVED', 'PRIMARY']",1.2.840.10008.5.1.4.1.1.1.1,2.25.56006540967197077610238991327864082702,21800626,21800626,21800626.0,21800626,165500.312,165512.437,...,,,,,,,,,,
e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,ISO_IR 100,"['DERIVED', 'PRIMARY']",1.2.840.10008.5.1.4.1.1.1.1,2.25.298436961669509509569879822879236656638,21800626,21800626,21800626.0,21800626,165500.312,165558.968,...,,,,,,,,,,
68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,ISO_IR 100,"['DERIVED', 'PRIMARY']",1.2.840.10008.5.1.4.1.1.1.1,2.25.139183506679367140539825912154983541585,21800723,21800723,21800723.0,21800723,80556.875,80714.5,...,,,,,,,,,,


In [13]:
# MIMIC-CXR 2.0.0 record list, indexed by DICOM identifier
records = pd.read_csv(mimic_cxr_path / 'cxr-record-list.csv.gz').set_index('dicom_id')

# DICOM tags (as they appear in the CSV header) that we keep
useful_tags = ['4194900', '1593601', '2621456', '2621457', '524320', '524336', '1577984']

# flat meta-data derived from MIMIC-CXR, restricted to the useful tags
metadata = (
    pd.read_csv('dicom-metadata.csv.gz', index_col=0)
    .rename_axis('dicom_id')
    [useful_tags]
)

# swap the numeric tags for their human readable DICOM keywords
metadata.columns = [DicomDictionary[int(tag)][-1] for tag in useful_tags]

# start from the record list, then left-join the CSV meta-data and the
# code meanings extracted from the JSON file (all keyed on the index)
metadata = (
    records[['subject_id', 'study_id']]
    .merge(metadata, how='left', left_index=True, right_index=True)
    .merge(dcm_metadata_simple, how='left', left_index=True, right_index=True)
)
metadata.head()

Unnamed: 0_level_0,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,AcquisitionDeviceProcessingDescription,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
dicom_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,,CHEST (PA AND LAT),postero-anterior,Erect
174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,,CHEST (PA AND LAT),lateral,Erect
2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,,CHEST (PA AND LAT),postero-anterior,Erect
e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,,CHEST (PA AND LAT),lateral,Erect
68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,,CHEST (PORTABLE AP),antero-posterior,


In [14]:
# order rows by subject, then study, and write the gzipped CSV (index included)
output_path = 'mimic-cxr-2.0.0-metadata.csv.gz'
metadata = metadata.sort_values(by=['subject_id', 'study_id'])
metadata.to_csv(output_path, index=True, compression='gzip')