In [1]:
import json
import pandas as pd

In [2]:
# constants
# Key used to select this project's entry under each export row's 'projects' mapping.
project_id = "clvmkr23r0cb407xt8zz95ou2"
# Input: path of the JSON export that the loading cell reads.
export_file = "data_resources/export_after_review.json"
# Output: path of the CSV the parsed label table is written to.
labels_save_file = "data_resources/labels_after_review.csv"

In [3]:
# load in export.json
# JSON interchange text is UTF-8; pin the encoding so the read does not
# depend on the platform's default locale encoding.
with open(export_file, 'r', encoding='utf-8') as fh:
    export_json = json.load(fh)

In [4]:
# Flatten the export into one list of label objects, tagging each label
# with the identifiers of the data row it came from.
all_labels = []
for export_row in export_json:
    project_labels = export_row['projects'][project_id]['labels']
    row_info = export_row["data_row"]
    row_id = row_info["id"]
    row_key = row_info["global_key"]
    for label_obj in project_labels:
        # the label object itself is tagged in place (no copy is made)
        label_obj["data_row_id"] = row_id
        label_obj["global_key"] = row_key
        all_labels.append(label_obj)

In [5]:
def parse_classification(clf):
    """Flatten one classification object into a small dict of column values.

    Parameters
    ----------
    clf : dict
        Classification object. Its 'value' entry selects the handling:
        'anatomical_view' reads clf['radio_answer']['value'];
        'notes' reads clf['text_answer']['content'].

    Returns
    -------
    dict
        {'notes': <text>} for a notes classification, or
        {'anatomical_view': <view>} for a view classification, with an
        additional 'imaging_modality' entry whenever the view is anything
        other than 'non_pda_view'.

    Raises
    ------
    ValueError
        If clf['value'] is neither 'anatomical_view' nor 'notes'.
    AssertionError
        If a view other than 'non_pda_view' does not carry exactly one
        sub-classification whose value is 'imaging_modality'.
    """
    key = clf['value']

    if key == 'notes':
        return {key: clf['text_answer']['content']}
    if key != 'anatomical_view':
        raise ValueError("Invalid classification value: " + clf['value'])

    radio = clf['radio_answer']
    view = radio['value']
    # fix the improperly coded "pda_view"
    if view == '1':
        view = 'pda_view'
    clf_dict = {key: view}

    # get the imaging modality for pda and pda-related views
    if view != 'non_pda_view':
        # These integrity checks are raised explicitly (rather than via the
        # assert statement) so they still run under `python -O`; the
        # exception type stays AssertionError for compatibility.
        subclfs = radio.get('classifications')
        if subclfs is None:
            raise AssertionError(f"For key 'anatomical_view', should have sub-classifications. {radio}")
        if len(subclfs) != 1:
            raise AssertionError(f"Expected one classification. Received {len(subclfs)}")

        subclf = subclfs[0]
        if subclf['value'] != 'imaging_modality':
            raise AssertionError(f"Expected 'imaging_modality'. Received {subclf['value']}")
        clf_dict['imaging_modality'] = subclf['radio_answer']['value']

    return clf_dict


# turn one label object into a flat record
def parse_labels(label):
    """Build a flat dict (one future table row) from a single label object.

    The record starts with the label's identifiers, creation details and
    performance details, then gains one entry per parsed classification
    via parse_classification.

    Raises AssertionError when the label has no 'annotations' entry or the
    annotations have no 'classifications' entry.
    """
    # Key order here fixes the column order of the resulting table.
    record = {
        'id': label['id'],
        'data_row_id': label['data_row_id'],
        'global_key': label['global_key'],
        'created_at': label['label_details']['created_at'],
        'created_by': label['label_details']['created_by'],
        'seconds_to_create': label['performance_details']['seconds_to_create'],
        'skipped': label['performance_details']['skipped'],
    }

    # locate the classification list, failing loudly if either level is absent
    annotations = label.get('annotations')
    assert annotations is not None, "No annotations"

    classifications = annotations.get('classifications')
    assert classifications is not None, "No classifications"

    # fold every parsed classification into the record
    for classification in classifications:
        record.update(parse_classification(classification))

    return record

# one parsed record per label, gathered into a single table
df_labs = pd.DataFrame(list(map(parse_labels, all_labels)))
df_labs

Unnamed: 0,id,data_row_id,global_key,created_at,created_by,seconds_to_create,skipped,anatomical_view,imaging_modality,notes
0,clvnyawdw02sk070hbxyd7l4g,clvmkoj5ectty07487t4qlptz,1_76_1.2.840.59519430613589928298844279859441,2024-05-01T16:29:24.000+00:00,baker@musc.edu,2016,False,non_pda_view,,
1,clvnyawsj21uf07f15kffgyib,clvmkoj5ecttz0748ewpuquuy,1_23_1.2.840.62292556923670791241514372480335,2024-05-01T16:29:42.000+00:00,baker@musc.edu,18,False,non_pda_view,,
2,clvo18j2p242k07f1akjh5wjh,clvmkoj5ectu00748kbn1o4r1,1_18_1.2.840.54997166204916837963487368101055,2024-05-01T16:29:52.000+00:00,baker@musc.edu,9,False,non_pda_view,,
3,clvo18x0p24zn07g199h12nn0,clvmkoj5ectu10748tj3yaqzz,1_16_1.2.840.52511812615726335165825448877932,2024-05-01T16:29:56.000+00:00,baker@musc.edu,4,False,non_pda_view,,
4,clvo194an242y07f181qvez0k,clvmkoj5ectu20748ysp3wskb,1_37_1.2.840.12659387360726375267236143825160,2024-05-01T16:30:01.000+00:00,baker@musc.edu,5,False,non_pda_view,,
...,...,...,...,...,...,...,...,...,...,...
2397,clvu79fnp03sr07gd3mcq9usq,clvmkojnvcvih074834u56bj3,1_5_1.2.840.99810563647194059285337754442311,2024-05-06T00:04:50.000+00:00,addison.gearhart@cardio.chboston.org,7,False,pda_view,color_compare,
2398,clvu79he21ety07ck0u3n5zpl,clvmkojnvcvii0748zho083i0,1_6_1.2.840.99810563647194059285337754442311,2024-05-06T00:05:00.000+00:00,addison.gearhart@cardio.chboston.org,10,False,pda_view,color_compare,
2399,clvu79n6r1kpc07hpbg6k64tt,clvmkojnvcvij07481plge0zh,1_7_1.2.840.99810563647194059285337754442311,2024-05-06T00:05:04.000+00:00,addison.gearhart@cardio.chboston.org,3,False,pda_view,color_compare,
2400,clvu79uk503ir07fndnndej6x,clvmkojnvcvik0748jb92pwns,1_9_1.2.840.99810563647194059285337754442311,2024-05-06T00:05:06.000+00:00,addison.gearhart@cardio.chboston.org,2,False,non_pda_view,,


In [6]:
# how many labels fall under each anatomical view
df_labs["anatomical_view"].value_counts()

non_pda_view        1944
pda_related_view     277
pda_view             181
Name: anatomical_view, dtype: int64

In [7]:
# write the parsed label table to disk, leaving out the row index
df_labs.to_csv(path_or_buf=labels_save_file, index=False)