# 01 Parse Annotations from Json and Gather Metadata

First, load the annotation JSON and extract a CSV containing all annotation information.

In [1]:
import json
import pandas as pd
import pydicom
import os
from collections import Counter
import numpy as np
from utils import replace_item, remove_item, import_dicom_header

In [None]:
# Load the MD.ai annotation export and flatten it into one row per annotation,
# then attach a human-readable label name to every row.
json_filepath = '/data/larson2/RCC_dl/metadata/annotations_labelgroup_all_2021-02-04.json'  # annotation json imported from MD.ai

with open(json_filepath, 'r') as j:
    contents = json.load(j)

labelGroups = contents['labelGroups']
# pd.json_normalize is the public entry point; the pd.io.json alias was
# deprecated in pandas 1.0 and is no longer exported by newer releases.
annotations = pd.json_normalize(contents['datasets'][0]['annotations'])

# Using LabelGroups create dictionary to translate label id to label name.
# Every label group is walked, so the lookup stays complete no matter how many
# groups the export contains (previously exactly three were assumed).
labels = [group['labels'] for group in labelGroups]
ids = [label['id'] for group_labels in labels for label in group_labels]
names = [label['name'] for group_labels in labels for label in group_labels]
label_conv = dict(zip(ids, names))

# drop unnecessary columns
annotations = annotations.drop(
    ['id', 'parentId', 'isImported', 'createdAt', 'createdById', 'updatedAt',
     'updatedById', 'note', 'radlexTagIds', 'reviewsPositiveCount',
     'reviewsNegativeCount'],
    axis=1,
)
labelIDs = list(annotations["labelId"])
# convert labelIDs to names of the labels (raises KeyError on an id that is
# missing from the label groups, which would indicate an inconsistent export)
labelNames = [label_conv[label_id] for label_id in labelIDs]
annotations["labelName"] = labelNames
print(annotations[:10])

Pull out the Study and Series UIDs for scans with annotations and iterate over them, pulling out the necessary metadata. Also extract the prone/stacked markers and the phase name from the annotations. Compile all metadata into a dataframe and save it as a CSV. (Note: the cell below takes a while to run, ~1.5 hr for me.)

In [None]:
# Build one metadata row per annotated (study, series) pair: DICOM header
# fields plus the contrast phase and prone/stacked flags derived from the
# annotation label names. Writes anon_metadata_phase.csv and
# anon_metadata_annotations.csv to the current working directory.
dicom_base_dir = '/data/larson2/RCC_dl/1.all/'

# pull out unique exam/series pairs that are annotated
uniqid = annotations[['StudyInstanceUID', 'SeriesInstanceUID']].drop_duplicates()
uniqid2 = uniqid.to_dict('records')

# Collect the label names of every series once, instead of re-scanning the
# whole annotation table for each series inside the loop.
labels_by_series = (
    annotations
    .groupby(['StudyInstanceUID', 'SeriesInstanceUID'])['labelName']
    .apply(list)
    .to_dict()
)

# Label suffix -> canonical phase name. 'concon' is kept exactly as it was
# mapped originally (presumably a misspelled 'noncon' label -- TODO confirm).
phase_aliases = {
    'ART': 'arterial',
    'art': 'arterial',
    'concon': 'noncon',
    'NC': 'noncon',
    'pv': 'portven',
    'PV': 'portven',
    'DEL': 'delay',
    'del': 'delay',
}
# Label suffixes that do not describe a contrast phase and are discarded.
non_phase_labels = ('tumor', 'BB', 'NewBB')

order = ['Patient ID', 'StudyInstanceUID', 'SeriesInstanceUID', 'Series Number', 'Phase Name', 'Pixel Spacing', 'Slice Thickness', 'Prone', 'Stacked']
fulldata = []
for position, pair in enumerate(uniqid2, start=1):
    # for each series pull out metadata and save as a list of dictionaries
    print('on', position, 'out of', len(uniqid2))
    StUID = pair['StudyInstanceUID']
    SerUID = pair['SeriesInstanceUID']
    # The trailing '' keeps the trailing separator of the original path in
    # case import_dicom_header relies on it.
    dicom_dir = os.path.join(dicom_base_dir, StUID, SerUID, '')
    if not os.path.isdir(dicom_dir):
        print('No DICOM:', dicom_dir)
        continue
    metadata = import_dicom_header(dicom_dir)

    # extract name of label for each exam/series pair; only the part after the
    # last underscore is used
    series_labels = labels_by_series.get((StUID, SerUID), [])
    sernames = list({name.split("_")[-1] for name in series_labels})

    # using names of the annotations- determine which phase it is and flag prone/stacked
    for alias, canonical in phase_aliases.items():
        if alias in sernames:
            sernames = replace_item(sernames, alias, canonical)
    for label in non_phase_labels:
        if label in sernames:
            sernames = remove_item(sernames, label)
    proneflag = 'prone' in sernames
    if proneflag:
        sernames = remove_item(sernames, 'prone')
    stackflag = 'stacked' in sernames
    if stackflag:
        sernames = remove_item(sernames, 'stacked')
    # Sorted so that the phase picked below is reproducible between runs when
    # a series carries more than one phase label (set order is arbitrary).
    sernames = sorted(set(sernames))

    # save all data in a dictionary
    savedata = {
        'Patient ID': metadata[0x0010, 0x0020].value,
        'StudyInstanceUID': StUID,
        'SeriesInstanceUID': SerUID,
        'Series Number': metadata[0x0020, 0x0011].value,
        'Phase Name': sernames[0] if sernames else 'none',
        'Pixel Spacing': metadata[0x0028, 0x0030].value,
        'Slice Thickness': metadata[0x0018, 0x0050].value,
        'Prone': proneflag,
        'Stacked': stackflag,
    }
    fulldata.append(savedata)

# save combined data
# Passing columns= fixes the column order and also yields a well-formed empty
# frame when no DICOM directory was found. The freshly built frame has no
# 'Unnamed: 0' columns, so the former drop of those columns (which raised
# KeyError) is gone.
savedf = pd.DataFrame(fulldata, columns=order)
savedf.to_csv('anon_metadata_phase.csv')

# merge with annotation dataframe
# Only SeriesInstanceUID is used as the key, so both frames keep their own
# study column as StudyInstanceUID_x / StudyInstanceUID_y; the
# instance-number cell further down reads StudyInstanceUID_x.
annsdf = pd.merge(annotations, savedf, left_on='SeriesInstanceUID', right_on='SeriesInstanceUID')
annsdf.to_csv('anon_metadata_annotations.csv')



10 annotated exams do not have corresponding DICOMs in folder.

In [8]:
# Reload the merged annotation/metadata table from disk.
csv_filepath = '/data/larson2/RCC_dl/metadata/anon_metadata_annotations.csv'
annsdf = pd.read_csv(filepath_or_buffer=csv_filepath)
print(annsdf.shape[0])  # number of individual annotations (one per row)

26425


Append the instance number (to get the slice location of each annotation) to anon_metadata_annotations.csv

In future: can be combined with previous cells to optimize 

In [None]:
# Look up the DICOM Instance Number (tag 0020,0013) of every annotated slice
# and append it to the annotation table, then overwrite
# anon_metadata_annotations.csv in the current working directory.
dicom_base_dir = '/data/larson2/RCC_dl/1.all/'

# pull out unique exam/series/slice that are annotated
SOPIDs = annsdf[['StudyInstanceUID_x', 'SeriesInstanceUID', 'SOPInstanceUID']].drop_duplicates()
SOPlist = list(SOPIDs.T.to_dict().values())

fulldata = []
# enumerate() supplies the progress counter directly; looking the entry up
# with SOPlist.index() on every iteration made the loop quadratic.
for position, sop in enumerate(SOPlist, start=1):
    print('on', position, 'out of', len(SOPlist))
    StUID = sop['StudyInstanceUID_x']
    SerUID = sop['SeriesInstanceUID']
    SopUID = sop['SOPInstanceUID']

    dicom_dir = os.path.join(dicom_base_dir, StUID, SerUID, str(SopUID) + '.dcm')
    if not os.path.isfile(dicom_dir):
        print('No DICOM:', dicom_dir)  # nan dicoms are due to global annotations (prone etc)
        continue
    # Header only: the pixel data is not needed for the instance number.
    metadata = pydicom.dcmread(dicom_dir, stop_before_pixels=True)

    # Slice Location (0020,1041) is deliberately not read: it is missing for
    # some cases.
    fulldata.append({
        'SOPInstanceUID': SopUID,
        'StudyInstanceUID': StUID,
        'SeriesInstanceUID': SerUID,
        'Instance Number': metadata[0x0020, 0x0013].value,
    })

# save combined data
# Explicit columns keep the merge below valid even if no DICOM file was found.
SOPdf = pd.DataFrame(
    fulldata,
    columns=['SOPInstanceUID', 'StudyInstanceUID', 'SeriesInstanceUID', 'Instance Number'],
)

# merge with annotation dataframe
# NOTE(review): this is an inner merge, so annotations whose slice file was
# skipped above are absent from the result -- confirm that this is intended.
new_annsdf = pd.merge(annsdf, SOPdf, on=['SeriesInstanceUID', 'SOPInstanceUID'])
# Drop the two suffixed study columns left over from the earlier merge; the
# plain StudyInstanceUID column contributed by SOPdf remains.
new_annsdf = new_annsdf.drop(['StudyInstanceUID_x', 'StudyInstanceUID_y'], axis=1)
new_annsdf.to_csv('anon_metadata_annotations.csv')
print(new_annsdf.head())

### Next, pull some useful values from the data

In [12]:
# read in csv created in above cell 
csv_filepath = '/data/larson2/RCC_dl/metadata/anon_metadata_phase.csv'
savedf = pd.read_csv(csv_filepath)

In [13]:
# Unique values for each metadata field
metadatacounts = savedf.nunique()
print(metadatacounts)

Unnamed: 0           2407
Patient ID            876
StudyInstanceUID      962
SeriesInstanceUID    2407
Series Number         116
Phase Name              5
Pixel Spacing         427
Slice Thickness        18
Prone                   2
Stacked                 2
dtype: int64


In [14]:
# Unique phases
print(Counter(savedf['Phase Name']))
#print(Counter(savedf['Pixel Spacing']))
#print(Counter(savedf['Slice Thickness']))

Counter({'noncon': 871, 'portven': 589, 'delay': 491, 'arterial': 442, 'none': 14})


Noncontrast scans:

In [None]:
# Keep the series whose phase name contains 'noncon' and count the distinct
# values per column.
is_noncon = savedf['Phase Name'].str.contains('noncon')
noncon = savedf[is_noncon]
nonconcounts = noncon.nunique()
print(nonconcounts)
# Optional diagnostics:
#print(Counter(noncon['Pixel Spacing']))
#print(Counter(noncon['Slice Thickness']))
#print(noncon.loc[noncon['Stacked']==True])
#print(Counter(noncon['Stacked']))

Portal Venous:

In [12]:
# Keep the series whose phase name contains 'portven' and count the distinct
# values per column.
is_portven = savedf['Phase Name'].str.contains('portven')
portven = savedf[is_portven]
portvencounts = portven.nunique()
print(portvencounts)
# Optional diagnostics:
#print(Counter(portven['Pixel Spacing']))
#print(Counter(portven['Slice Thickness']))

Unnamed: 0           589
Patient ID           578
StudyInstanceUID     582
SeriesInstanceUID    589
Series Number         53
Phase Name             1
Pixel Spacing        264
Slice Thickness       12
Prone                  2
Stacked                1
dtype: int64


Delay: 

In [15]:
# Keep the series whose phase name contains 'delay' and count the distinct
# values per column.
is_delay = savedf['Phase Name'].str.contains('delay')
delay = savedf[is_delay]
delaycounts = delay.nunique()
print(delaycounts)
# Optional diagnostics:
#print(Counter(delay['Pixel Spacing']))
#print(Counter(delay['Slice Thickness']))

Unnamed: 0           491
Patient ID           482
StudyInstanceUID     483
SeriesInstanceUID    491
Series Number         41
Phase Name             1
Pixel Spacing        219
Slice Thickness       12
Prone                  2
Stacked                2
dtype: int64


Arterial:

In [16]:
# Keep the series whose phase name contains 'arterial' and count the distinct
# values per column.
is_arterial = savedf['Phase Name'].str.contains('arterial')
arterial = savedf[is_arterial]
arterialcounts = arterial.nunique()
print(arterialcounts)
# Optional diagnostics:
#print(Counter(arterial['Pixel Spacing']))
#print(Counter(arterial['Slice Thickness']))

Unnamed: 0           442
Patient ID           432
StudyInstanceUID     434
SeriesInstanceUID    442
Series Number         34
Phase Name             1
Pixel Spacing        194
Slice Thickness       14
Prone                  2
Stacked                2
dtype: int64


Investigate scans with no phase name:

In [None]:
# List the series whose phase name contains 'none', i.e. those for which no
# phase label was found.
has_no_phase = savedf['Phase Name'].str.contains('none')
none = savedf[has_no_phase]
print(none)
# Optional diagnostics:
#print(Counter(none['Pixel Spacing']))
#print(Counter(none['Slice Thickness']))

# these scans are just annotated with a "tumor" label, no phase label