In [None]:
import datetime
import os
from collections import OrderedDict
from pathlib import Path
import json
import gzip


import pydicom
from pydicom.tag import Tag
from pydicom._dicom_dict import DicomDictionary
import pandas as pd
import numpy as np

# Summarize dataset

In [None]:
# Read the record list; every row is counted below as one DICOM.
df = pd.read_csv('cxr-record-list.csv.gz', header=0, sep=',')

# Report the row count, then the number of distinct studies and subjects.
n = len(df)
print(f'{n} DICOMs in MIMIC-CXR v2.0.0.')

n = df.loc[:, 'study_id'].nunique()
print(f'  {n} studies.')

n = df.loc[:, 'subject_id'].nunique()
print(f'  {n} subjects.')

# Keep the DICOM identifiers as a set; the meta-data is later filtered
# by membership in it.
dicom_id_values = df['dicom_id'].tolist()
dicoms = set(dicom_id_values)

# Examination type

This requires loading the DICOM meta-data. Here we load a previously generated, gzip-compressed JSON file that contains all the meta-data attributes (a CSV export of the same meta-data also exists, but loading it is commented out below).

In [None]:
# Folder with the pre-generated meta-data exports. It must contain:
#   dicom-metadata.csv.gz
#   dicom-metadata.json.gz
data_path = Path('/db/eddata/dicom-metadata')

# Loading the CSV export is disabled; kept here for reference only.
#md = pd.read_csv(data_path / 'dicom-metadata.csv.gz',
#                 header=0,
#                 sep=',', compression='gzip', index_col=0)
#md.columns = [int(c) for c in md.columns]

# Parse the gzip-compressed JSON export.
with gzip.open(data_path / 'dicom-metadata.json.gz', 'r') as fp:
    records = json.load(fp)

# The JSON holds a list of dicts (one item each, per the original note);
# fold them into a single dict. If a key repeats, the last value wins.
dcm_metadata = {
    key: value
    for record in records
    for key, value in record.items()
}
# Drop the raw list so it does not stay in kernel memory.
del records

# Gather every top-level key that occurs in any record, convert each to int,
# de-duplicate, and keep the result as an ascending list.
json_keys = sorted({
    int(key)
    for attributes in dcm_metadata.values()
    for key in attributes.keys()
})

# Number of distinct top-level attributes across all records.
n_attrib = len(json_keys)
print(f'There are {n_attrib} top-level attributes in the DICOM json.')

# filter metadata to dicoms in our list
dcm_metadata = {
    dicom_id: attributes
    for dicom_id, attributes in dcm_metadata.items()
    if dicom_id in dicoms
}

# grab the examination type: the first item stored under key '528434'.
# NOTE(review): '528434' is presumably the decimal form of DICOM tag
# (0008,1032), Procedure Code Sequence -- confirm against the JSON export.
# Records where the key is absent, or present but empty, are skipped so that
# indexing [0] cannot raise on an empty sequence.
exam_records = {
    dicom_id: attributes['528434'][0]
    for dicom_id, attributes in dcm_metadata.items()
    if attributes.get('528434')
}
exam_frame = pd.DataFrame.from_dict(exam_records, orient='index')

# filter to mimic-cxr images: the left join keeps one row per row of `df`;
# DICOMs without an exam record get missing values in the exam columns.
dcm_exam = df[['dicom_id']].merge(exam_frame, how='left', left_on='dicom_id', right_index=True)

# using the observed meta-data, create a mapping of the examination codes to their description
code_pairs = dcm_exam[['524544', '524548']].drop_duplicates()
# Detect missing codes by value rather than by identity with `np.nan`:
# an `is not np.nan` test only matches that one float object and lets
# None / pd.NA / other NaN objects slip through as dictionary keys.
code_pairs = code_pairs[code_pairs['524544'].notna()]
exam_mapping = dict(zip(code_pairs['524544'], code_pairs['524548']))

In [None]:
# Headline numbers: total rows, then how many rows carry an exam code.
n = len(dcm_exam)
print(f'{n} DICOMs.')

n = dcm_exam['524544'].notna().sum()
f = n / len(dcm_exam) * 100.0
print(f'  {n} ({f:3.1f}%) have an exam name.\n')

# create table in the paper: count DICOMs per (code, name) pair, add each
# pair's share of all rows in percent, and list the most frequent pairs first.
# Column names are then made interpretable for the published table.
grp = (
    dcm_exam
    .groupby(['524544', '524548'])[['dicom_id']]
    .count()
    .assign(**{
        '(%)': lambda counts: np.round(
            counts['dicom_id'] / len(dcm_exam) * 100.0, 2
        )
    })
    .reset_index()
    .sort_values('dicom_id', ascending=False)
    .rename(columns={
        '524544': 'Code',
        '524548': 'Examination name',
        'dicom_id': 'Number of DICOMs'
    })
)

# Write the table as LaTeX without the row index.
grp.to_latex('table_exam_names.tex', index=False)

print('Exam names:')
display(grp)