In [None]:
import pandas as pd
import os
from collections import OrderedDict
from pathlib import Path
import gzip
import json

import pydicom
from pydicom._dicom_dict import DicomDictionary

# Location of the MIMIC-CXR 2.0.0 dataset on local disk.
# It is used below to locate cxr-record-list.csv.gz.
mimic_cxr_path = Path('/db/mimic-cxr')

# This notebook also reads dicom-metadata.csv.gz and dicom-metadata.json.gz
# from the current working directory; both files are generated by
# export_metadata.py in this folder.

In order to store sequences from the DICOM, we created a JSON. We will load in that JSON now.

In [None]:
# Read the gzipped JSON export: a list of one-item dicts.
with gzip.open('dicom-metadata.json.gz', 'r') as json_file:
    dicom_records = json.load(json_file)

# Collapse the list of one-item dicts into a single flat dict
# (later entries overwrite earlier ones on a repeated key).
dcm_metadata = dict()
for record in dicom_records:
    dcm_metadata.update(record)

# the raw list is no longer needed
del dicom_records

# Collect every distinct top-level tag seen across all entries,
# as integers in ascending order.
json_keys = sorted({
    int(tag)
    for attributes in dcm_metadata.values()
    for tag in attributes.keys()
})

# report how many unique top-level meta-data fields the json holds
n_attrib = len(json_keys)
print(f'There are {n_attrib} top-level attributes in the DICOM json.')

# show an example
dcm_metadata['000046e4-e4d7f796-72c3dba4-8b67a485-0eea211d']

There are three very useful items in this sequence that we'd like to have in an easier form for all images: the procedure code sequence (`'528434'`), the coded view position (`'5505568'`), and the coded patient orientation (`'5506064'`). For convenience, we will pull the textual description of each (`'524548'`), rather than the ontology code itself.

In [None]:
# Decimal DICOM tags of the coded sequences to flatten:
#   '528434'  = (0008,1032) procedure code sequence
#   '5505568' = (0054,0220) coded view position
#   '5506064' = (0054,0410) coded patient orientation
cols = ['528434', '5505568', '5506064']
# '524548' = (0008,0104): the textual description stored inside each coded item
code_meaning = '524548'

# Build one row per DICOM with exactly len(cols) entries.
# A sequence that is absent or empty must yield None *in its own position*;
# dropping it instead would shift the remaining values left into the wrong
# column (e.g. a view position would end up under the procedure code column).
simple_rows = {}
for k, v in dcm_metadata.items():
    simple_rows[k] = [
        # only the first item of each sequence is used
        v[c][0][code_meaning] if c in v and len(v[c]) > 0 else None
        for c in cols
    ]

# convert columns to be human readable: <sequence keyword>_<code meaning keyword>
simple_columns = [
    DicomDictionary[int(c)][-1] + '_' + DicomDictionary[int(code_meaning)][-1]
    for c in cols
]

# Passing the column names here (rather than assigning them afterwards) keeps
# the frame well-formed even when there are no rows at all.
dcm_metadata_simple = pd.DataFrame.from_dict(
    simple_rows, orient='index', columns=simple_columns
)
dcm_metadata_simple.head()

In [None]:
# MIMIC-CXR 2.0.0 record list, indexed by DICOM identifier
records = pd.read_csv(mimic_cxr_path / 'cxr-record-list.csv.gz').set_index('dicom_id')

# CSV of meta-data derived from MIMIC-CXR; its first column becomes the index.
# Only the useful attributes are kept (columns are decimal DICOM tag numbers).
useful_tags = ['4194900', '1593601', '2621456', '2621457', '524320', '524336']
metadata = (
    pd.read_csv('dicom-metadata.csv.gz', index_col=0)
    .rename_axis('dicom_id')
    .loc[:, useful_tags]
)

# swap each numeric tag for the last field of its DicomDictionary entry
# so the columns are human readable
metadata.columns = [DicomDictionary[int(tag)][-1] for tag in metadata.columns]

# Left-join onto the record list so every listed DICOM keeps a row,
# first with the CSV attributes and then with the fields taken from the JSON.
metadata = (
    records[['subject_id', 'study_id']]
    .merge(metadata, how='left', left_index=True, right_index=True)
    .merge(dcm_metadata_simple, how='left', left_index=True, right_index=True)
)
metadata.head()

In [None]:
# order the rows by subject, then by study, before exporting
metadata = metadata.sort_values(by=['subject_id', 'study_id'])
# write the table, index included, as a gzip-compressed CSV
metadata.to_csv('mimic-cxr-2.0.0-metadata.csv.gz', compression='gzip', index=True)