In [1]:
import datetime
import os
from collections import OrderedDict
from pathlib import Path
import json
import gzip


import pydicom
from pydicom.tag import Tag
from pydicom._dicom_dict import DicomDictionary
import pandas as pd
import numpy as np

# Summarize dataset

In [3]:
# load in mapping file
# The dataset root defaults to the original location; set the MIMIC_CXR_PATH
# environment variable to run the notebook against a copy stored elsewhere.
mimic_cxr_path = Path(os.environ.get('MIMIC_CXR_PATH', '/db/mimic-cxr'))
df = pd.read_csv(mimic_cxr_path / 'cxr-record-list.csv.gz', header=0, sep=',')

# one row per DICOM in the record list
n = df.shape[0]
print(f'{n} DICOMs in MIMIC-CXR v2.0.0.')

# distinct studies referenced by the record list
n = df['study_id'].nunique()
print(f'  {n} studies.')

# distinct subjects referenced by the record list
n = df['subject_id'].nunique()
print(f'  {n} subjects.')

# set of DICOM identifiers for fast membership checks
dicoms = set(df['dicom_id'].tolist())

377110 DICOMs in MIMIC-CXR v2.0.0.
  227835 studies.
  65379 subjects.


# Load dataframes

In [4]:
# split assignment table and metadata table, both read from the dataset root
df_split = pd.read_csv(mimic_cxr_path / 'mimic-cxr-2.0.0-split.csv.gz')
df_metadata = pd.read_csv(mimic_cxr_path / 'mimic-cxr-2.0.0-metadata.csv.gz')

# preview the split table so the cell reproduces its recorded output
df_split.head()

Unnamed: 0,dicom_id,study_id,subject_id,split
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,10000032,train
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,10000032,train
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train


## Generate view

In [17]:
# initialize view with a mapping from ViewPosition
# ViewPosition values grouped by the coarse view category they are assigned to.
_POSITIONS_BY_VIEW = {
    'frontal': ['AP', 'PA'],
    'lateral': ['LATERAL', 'LL'],
    'other': [
        'LPO',
        'RAO',
        'RPO',
        'LAO',
        # the below are overwritten in some instances by manual review
        'AP AXIAL',
        'XTABLE LATERAL',
        'AP LLD',
        'PA LLD',
        'L5 S1',
        'SWIMMERS',
        'AP RLD',
        'PA RLD',
    ],
}

# Flatten the grouping into the ViewPosition -> view lookup table.
VIEW_MAP = {
    position: category
    for category, positions in _POSITIONS_BY_VIEW.items()
    for position in positions
}

# ViewPosition values that are not keys of VIEW_MAP (including nulls) map to NaN.
df_metadata['view'] = df_metadata['ViewPosition'].map(VIEW_MAP)

# for 'other' category, currently many of these are simply unknown
# so try to update them with acq device map
ADPD_MAP = {
    'CHEST, LATERAL': 'lateral',
    'CHEST, PA': 'frontal',
    # manually checked 100 records, below is always frontal
    'CHEST, PORTABLE': 'frontal',
    'CHEST, PA X-WISE': 'frontal',
    'CHEST, AP (GRID)': 'frontal',
    'CHEST LAT': 'lateral',
    'CHEST PA': 'frontal',
    'CHEST, AP NON-GRID': 'frontal',
    'CHEST AP NON GRID': 'frontal',
    'CHEST PA X-WISE': 'frontal',
    'CHEST AP GRID': 'frontal',
    'CHEST, PORTABLE X-WISE': 'other',
    # below have < 25 samples each
    'CHEST PORT': 'frontal',
    'CHEST PORT X-WISE': 'frontal',
    # manually classified below
    'SHOULDER': 'other',
    'CHEST, PEDI (4-10 YRS)': 'other',
    'LOWER RIBS': 'other',
    'CHEST, DECUB.': 'other',
    'ABDOMEN, PORTABLE': 'other',
    'UPPER RIBS': 'frontal',
    'STERNUM, LATERAL': 'lateral',
    'KNEE, AP/OBL': 'other',
    'STERNUM, PA/OBL.': 'other',
    'CLAVICLE/ AC JOINTS': 'other',
    'ABDOMEN,GENERAL': 'other',
    'LOWER RIB': 'other',
    'SCOLIOSIS AP': 'frontal'
}

good_view = ['frontal', 'lateral']
# rows whose view is not yet frontal/lateral (i.e. 'other' or still unassigned)
idxUpdate = ~df_metadata['view'].isin(good_view)
c = 'AcquisitionDeviceProcessingDescription'
# Series.map yields NaN for descriptions that are missing or not in ADPD_MAP.
# Only overwrite rows that actually have a mapped value, so that a view already
# set to 'other' is not replaced by NaN when the description is unrecognised.
adpd_view = df_metadata[c].map(ADPD_MAP)
idx = idxUpdate & adpd_view.notnull()
df_metadata.loc[idx, 'view'] = adpd_view[idx]

# Manually reviewed DICOMs: dicom_id -> [ViewPosition, view].
DICOM_TO_VIEW = {
    '2164992c-f4abb30a-7aaaf4f4-383cab47-4e3eb1c8': ['PA', 'frontal'],
    '5e6881e2-ff4254e0-b99f0c2f-8964482a-031364db': ['LL', 'lateral'],
    'fcdf7a30-3236b74e-65b97587-cdd4cfde-63cd1de0': ['PA', 'frontal'],
    'fb074ec1-6715839c-84fa75e6-adc3f026-448b1481': ['PA', 'frontal'],
    'dfb8080a-8506e43e-840d9d58-0f738f41-82c120b0': ['PA', 'frontal'],
    '4b32608b-c2ead7c4-1fe5565f-42f7ab80-9dad30de': ['LL', 'lateral'],
    '53663e89-8f9ca9bb-df1bf434-8d6b1283-2b612609': ['LL', 'lateral'],
    # below are AP, but incorrectly in View Position
    '8672a4e7-366801a0-26cf2395-9344335c-aac8d728': ['AP', 'frontal'],
    '9800b28e-3ff3b417-18473be2-1a66131d-aca88488': ['AP', 'frontal'],
    '598cfe48-33a8643e-843e27e2-5dd584e7-3cd5f1c0': ['AP', 'frontal']
}

# we manually reviewed a few DICOMs to keep them in
# Only the second element of each entry (the view) is applied; the first
# element (ViewPosition) is not written back to df_metadata.
reviewed_view = df_metadata['dicom_id'].map(
    {dicom_id: labels[1] for dicom_id, labels in DICOM_TO_VIEW.items()}
)
# rows whose dicom_id appears in DICOM_TO_VIEW
was_reviewed = reviewed_view.notnull()
df_metadata.loc[was_reviewed, 'view'] = reviewed_view[was_reviewed]

# Merge dataframes

In [49]:
# Attach the metadata columns to the split table. study_id / subject_id are
# dropped from the metadata side so the merge does not duplicate them.
metadata_cols = df_metadata.drop(columns=['study_id', 'subject_id'])
df = df_split.merge(metadata_cols, on='dicom_id', how='inner')

# NegBio labels, joined on study_id
nb = pd.read_csv(mimic_cxr_path / 'mimic-cxr-2.0.0-negbio.csv.gz')
# avoid redundant columns
nb = nb.drop(columns='subject_id')
findings = [col for col in nb.columns if col != 'study_id']
df = df.merge(nb, on='study_id', how='left')

# indicator flag for the study having a NegBio finding:
# True when at least one label column other than 'No Finding' is non-null
labelled_cols = [col for col in findings if col != 'No Finding']
df['has_negbio_finding'] = df[labelled_cols].notnull().any(axis=1)

df[['dicom_id', 'split', 'view'] + findings].head().T

Unnamed: 0,0,1,2,3,4
dicom_id,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714
split,train,train,train,train,train
view,frontal,lateral,frontal,lateral,frontal
Atelectasis,,,,,
Cardiomegaly,,,,,
Consolidation,,,,,
Edema,,,,,
Enlarged Cardiomediastinum,,,,,
Fracture,,,,,
Lung Lesion,,,,,


In [51]:
splits = ['train', 'validate', 'test']
row_idx = ['frontal', 'lateral', 'other']


def _unique_counts_with_findings(frame, id_col, label):
    """Count distinct `id_col` values per split, overall and with a finding.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the columns 'split', 'has_negbio_finding' and `id_col`.
    id_col : str
        Column whose distinct values are counted within each split.
    label : str
        Row label used for the overall count.

    Returns
    -------
    pd.DataFrame
        Three rows -- `label`, '  with a finding', '  with a finding (%)' --
        with one column per split present in `frame`.
    """
    total = frame.groupby('split')[[id_col]].nunique().T
    total.index = [label]

    with_finding = (
        frame.loc[frame['has_negbio_finding']]
        .groupby('split')[[id_col]].nunique().T
    )
    with_finding.index = ['  with a finding']

    # the per-split totals are aligned against the split columns
    pct = with_finding / total.iloc[0] * 100.0
    pct.index = ['  with a finding (%)']

    return pd.concat([total, with_finding, pct], axis=0, sort=False)


# number of images per (split, view); rows with a null view are not counted
split_views = df.groupby(['split', 'view'])[['dicom_id']].count()

# views as rows, splits as columns; reindex fixes the row/column order and
# leaves NaN (instead of raising KeyError) when a view never occurs in a split
tbl = (
    split_views['dicom_id']
    .unstack('split')
    .reindex(index=row_idx, columns=splits)
)
tbl.columns = splits

tbl = pd.concat(
    [
        tbl,
        # add in the number of studies, and studies with a finding
        _unique_counts_with_findings(df, 'study_id', 'Number of studies'),
        # patients, and patients with a finding
        _unique_counts_with_findings(df, 'subject_id', 'Number of patients'),
    ],
    axis=0,
    sort=False,
)

tbl.to_latex('table2.tex')
tbl

Unnamed: 0,train,validate,test
frontal,248020.0,2041.0,3653.0
lateral,120795.0,949.0,1502.0
other,145.0,1.0,4.0
Number of studies,222758.0,1808.0,3269.0
with a finding,170420.0,1394.0,2912.0
with a finding (%),76.504548,77.10177,89.079229
Number of patients,64586.0,500.0,293.0
with a finding,44157.0,344.0,288.0
with a finding (%),68.369306,68.8,98.293515
