In [4]:
import json
import pandas as pd
import glob

In [5]:
# Root directory of the extracted frames; it is searched recursively for .jpg files
# and is also where the final boston_frames.csv is written.
data_dir = "/project/dane2/wficai/pda/external_validation/Boston/frames"
# Study-level table; the has_pda and Hashed_Study_UID columns are read from it.
study_csv = "data_resources/study_id_to_pda_status_mapping.csv"
# Clip-level label table; global_key, anatomical_view and imaging_modality are read from it.
labelbox_csv = "data_resources/ground_truth.csv"

# Labeled anatomical-view and imaging-modality data (used for filtering)

In [6]:
# Load the clip-level labels and keep only what is needed to join against frames:
# the clip identifier (exported as "global_key") plus its view and modality labels.
# No modality filtering is applied at this stage; every labelled clip is kept.
df_vm = (
    pd.read_csv(labelbox_csv)
    .rename(columns={"global_key": "clip_id"})
    .loc[:, ["clip_id", "anatomical_view", "imaging_modality"]]
)

df_vm

Unnamed: 0,clip_id,anatomical_view,imaging_modality
0,1_100_1.2.840.11348449676493461455641392115705,non_pda_view,
1,1_100_1.2.840.24572707487245121949231819213812,non_pda_view,
2,1_100_1.2.840.25823283352704988781894431829478,non_pda_view,
3,1_100_1.2.840.85458994400069811464357653232227,non_pda_view,
4,1_100_1.2.840.91191565485023629653170076312991,non_pda_view,
...,...,...,...
2179,1_9_1.2.840.87814992980701538212818673097911,pda_related_view,color_doppler
2180,1_9_1.2.840.91191565485023629653170076312991,non_pda_view,
2181,1_9_1.2.840.94689438369083512870257995817850,non_pda_view,
2182,1_9_1.2.840.98762134419995893455563586563363,non_pda_view,


# PDA Status data

In [7]:
# Study-level PDA status: keep the status flag and the hashed study UID,
# exposing the latter under the name used everywhere else (study_id).
df_study = (
    pd.read_csv(study_csv)
    .loc[:, ["has_pda", "Hashed_Study_UID"]]
    .rename(columns={"Hashed_Study_UID": "study_id"})
)
df_study.head()

Unnamed: 0,has_pda,study_id
0,1,1.2.840.36739489188022575661183006720752
1,1,1.2.840.59459975202128111902049599635364
2,1,1.2.840.27479349261379474174666099986067
3,1,1.2.840.85458994400069811464357653232227
4,1,1.2.840.58026814883990290449038193782372


In [8]:
# how many studies carry each PDA status
df_study["has_pda"].value_counts()

1    64
0    56
Name: has_pda, dtype: int64

In [9]:
# lookup table: study_id -> has_pda flag (if a study_id repeats, the last row wins)
study_id_to_pda = {
    study_id: has_pda
    for study_id, has_pda in zip(df_study["study_id"], df_study["has_pda"])
}

## Create csv containing all frames

In [10]:
# Recursively collect every .jpg file under the data directory.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to make the frame table, and the CSV written from it, reproducible.
jpg_files = sorted(glob.glob(f"{data_dir}/**/*.jpg", recursive=True))

jpg_files[:5]

['/project/dane2/wficai/pda/external_validation/Boston/frames/1.2.840.10008274105826202655205768128764/1_103_1.2.840.10008274105826202655205768128764/frame_0001.jpg',
 '/project/dane2/wficai/pda/external_validation/Boston/frames/1.2.840.10008274105826202655205768128764/1_103_1.2.840.10008274105826202655205768128764/frame_0002.jpg',
 '/project/dane2/wficai/pda/external_validation/Boston/frames/1.2.840.10008274105826202655205768128764/1_103_1.2.840.10008274105826202655205768128764/frame_0003.jpg',
 '/project/dane2/wficai/pda/external_validation/Boston/frames/1.2.840.10008274105826202655205768128764/1_103_1.2.840.10008274105826202655205768128764/frame_0004.jpg',
 '/project/dane2/wficai/pda/external_validation/Boston/frames/1.2.840.10008274105826202655205768128764/1_103_1.2.840.10008274105826202655205768128764/frame_0005.jpg']

In [11]:
# build the dataframe from the file paths
# has_pda flag -> patient_type label stored in the frame table
_PATIENT_TYPE_BY_STATUS = {0: "nopda", 1: "pda"}

# Available split labels; every record built by extract_record uses "Test".
_SPLIT_LABELS = dict(
    Train = "TRAIN",
    Train_pdaFiltered = "TRAIN_PDAFILTERED",
    Test = "TEST",
    Val = "VAL"
)


def extract_record(file_path, study_to_pda=None):
    """Build one frame-table row from a frame path.

    The path is split on "/" and is expected to end in
    ``<study_id>/<clip_id>/<frame file>``; the two directory names become the
    ``study_id`` and ``clip_id`` fields.

    Parameters
    ----------
    file_path : str
        Path to a single frame image, using "/" as the separator.
    study_to_pda : mapping, optional
        Maps study_id to its has_pda flag (0 or 1). When omitted, the
        notebook-level ``study_id_to_pda`` dictionary is used.

    Returns
    -------
    dict
        Keys ``study_id``, ``clip_id``, ``patient_type`` ("pda"/"nopda"),
        ``Split`` (always "TEST") and ``png_path`` (the unmodified input path).

    Raises
    ------
    ValueError
        If the path has fewer than three "/"-separated components.
    KeyError
        If the study is absent from the mapping, or its flag is not 0 or 1.
    """
    if study_to_pda is None:
        # fall back to the notebook-level lookup built from the study CSV
        study_to_pda = study_id_to_pda

    parts = file_path.split("/")
    if len(parts) < 3:
        raise ValueError(
            f"expected a path ending in <study_id>/<clip_id>/<frame>, got {file_path!r}"
        )
    study_id, clip_id = parts[-3], parts[-2]

    if study_id not in study_to_pda:
        raise KeyError(
            f"study_id {study_id!r} (from {file_path!r}) is missing from the PDA status mapping"
        )
    status = study_to_pda[study_id]
    if status not in _PATIENT_TYPE_BY_STATUS:
        raise KeyError(
            f"study_id {study_id!r} has unexpected has_pda value {status!r}; expected 0 or 1"
        )

    record = dict(
        study_id = study_id,
        clip_id = clip_id,
        patient_type = _PATIENT_TYPE_BY_STATUS[status],
        Split = _SPLIT_LABELS["Test"],
        # column keeps its historical name even though the frames are .jpg files
        png_path = file_path
    )

    return record

# one row per frame file, in the same order as jpg_files
df_frames = pd.DataFrame(list(map(extract_record, jpg_files)))
df_frames

Unnamed: 0,study_id,clip_id,patient_type,Split,png_path
0,1.2.840.10008274105826202655205768128764,1_103_1.2.840.10008274105826202655205768128764,pda,TEST,/project/dane2/wficai/pda/external_validation/...
1,1.2.840.10008274105826202655205768128764,1_103_1.2.840.10008274105826202655205768128764,pda,TEST,/project/dane2/wficai/pda/external_validation/...
2,1.2.840.10008274105826202655205768128764,1_103_1.2.840.10008274105826202655205768128764,pda,TEST,/project/dane2/wficai/pda/external_validation/...
3,1.2.840.10008274105826202655205768128764,1_103_1.2.840.10008274105826202655205768128764,pda,TEST,/project/dane2/wficai/pda/external_validation/...
4,1.2.840.10008274105826202655205768128764,1_103_1.2.840.10008274105826202655205768128764,pda,TEST,/project/dane2/wficai/pda/external_validation/...
...,...,...,...,...,...
237936,1.2.840.99810563647194059285337754442311,1_9_1.2.840.99810563647194059285337754442311,pda,TEST,/project/dane2/wficai/pda/external_validation/...
237937,1.2.840.99810563647194059285337754442311,1_9_1.2.840.99810563647194059285337754442311,pda,TEST,/project/dane2/wficai/pda/external_validation/...
237938,1.2.840.99810563647194059285337754442311,1_9_1.2.840.99810563647194059285337754442311,pda,TEST,/project/dane2/wficai/pda/external_validation/...
237939,1.2.840.99810563647194059285337754442311,1_9_1.2.840.99810563647194059285337754442311,pda,TEST,/project/dane2/wficai/pda/external_validation/...


# Merge the label data

In [12]:
# Attach the view/modality labels to every frame of each labelled clip.
# An inner merge repeats a clip's frames once per matching label row, so a clip_id
# that appears more than once in the label table would silently duplicate frames
# (and could attach conflicting labels). Exact duplicate label rows are dropped;
# clips whose remaining rows disagree must be resolved by hand.
labels_per_clip = df_vm.drop_duplicates()
conflicting_clips = labels_per_clip.loc[
    labels_per_clip["clip_id"].duplicated(keep=False), "clip_id"
].unique()
if len(conflicting_clips) > 0:
    raise ValueError(
        f"{len(conflicting_clips)} clip_id(s) have conflicting label rows; "
        f"resolve them in the label file before merging: {list(conflicting_clips)[:10]}"
    )

# validate= makes pandas itself enforce one label row per clip
df_merged = labels_per_clip.merge(
    df_frames, on="clip_id", how="inner", validate="one_to_many"
)
df_merged.head()

Unnamed: 0,clip_id,anatomical_view,imaging_modality,study_id,patient_type,Split,png_path
0,1_100_1.2.840.11348449676493461455641392115705,non_pda_view,,1.2.840.11348449676493461455641392115705,nopda,TEST,/project/dane2/wficai/pda/external_validation/...
1,1_100_1.2.840.11348449676493461455641392115705,non_pda_view,,1.2.840.11348449676493461455641392115705,nopda,TEST,/project/dane2/wficai/pda/external_validation/...
2,1_100_1.2.840.11348449676493461455641392115705,non_pda_view,,1.2.840.11348449676493461455641392115705,nopda,TEST,/project/dane2/wficai/pda/external_validation/...
3,1_100_1.2.840.11348449676493461455641392115705,non_pda_view,,1.2.840.11348449676493461455641392115705,nopda,TEST,/project/dane2/wficai/pda/external_validation/...
4,1_100_1.2.840.11348449676493461455641392115705,non_pda_view,,1.2.840.11348449676493461455641392115705,nopda,TEST,/project/dane2/wficai/pda/external_validation/...


In [13]:
# frames per anatomical view label
df_merged["anatomical_view"].value_counts()

non_pda_view        189379
pda_related_view     25093
pda_view             23469
Name: anatomical_view, dtype: int64

In [14]:
# frames per imaging modality label (missing modalities are not counted)
df_merged["imaging_modality"].value_counts()

color_compare    23984
color_doppler    13750
grayscale        10774
sepia               54
Name: imaging_modality, dtype: int64

In [15]:
# number of distinct studies that survived the merge
df_merged["study_id"].nunique()

115

In [16]:
# Count the distinct clips in each study, name the count column num_clips,
# and save the table alongside the other data resources.
df_numclips = (
    df_merged
    .groupby("study_id", as_index=False)["clip_id"]
    .nunique()
    .rename(columns={"clip_id": "num_clips"})
)

df_numclips.to_csv("data_resources/num_clips_per_study.csv", index=False)

In [17]:
# Read the complete study-id list (headerless file, ids in the first column) and
# show the studies that are absent from the merged table.
# Original author's note: all of these had no videos.
all_studies = set(
    pd.read_csv("data_resources/anonymous_project_ids.csv", header=None)[0]
)
all_studies.difference(df_merged["study_id"])

{'1.2.840.26595384206698375655240022415660',
 '1.2.840.60754225459836956382043835511575',
 '1.2.840.91321638748569760616789524179261',
 '1.2.840.94708146669374965808973852599351',
 '1.2.840.97991357500518621499961722121851'}

In [18]:
# Keep only frames whose clip shows a PDA or PDA-related view AND was acquired
# in one of the colour modalities.
is_relevant_view = df_merged["anatomical_view"].isin({"pda_view", "pda_related_view"})
is_color_modality = df_merged["imaging_modality"].isin({"color_doppler", "color_compare"})
df_merged_filtered = df_merged[is_relevant_view & is_color_modality]

df_merged_filtered["anatomical_view"].value_counts()

pda_related_view    19281
pda_view            18453
Name: anatomical_view, dtype: int64

In [19]:
# frames per imaging modality after filtering
df_merged_filtered["imaging_modality"].value_counts()

color_compare    23984
color_doppler    13750
Name: imaging_modality, dtype: int64

In [20]:
# write the filtered frame table next to the frames, without the row index
boston_frames_csv = f"{data_dir}/boston_frames.csv"
df_merged_filtered.to_csv(boston_frames_csv, index=False)

In [21]:
# column names of the filtered frame table (the frame that was saved to boston_frames.csv)
df_merged_filtered.columns

Index(['clip_id', 'anatomical_view', 'imaging_modality', 'study_id',
       'patient_type', 'Split', 'png_path'],
      dtype='object')

In [22]:
# distinct studies per patient type after filtering
df_merged_filtered.groupby("patient_type")["study_id"].nunique()

patient_type
nopda    25
pda      43
Name: study_id, dtype: int64

In [23]:
# studies per PDA status in the full study table, for comparison
df_study["has_pda"].value_counts()

1    64
0    56
Name: has_pda, dtype: int64

In [24]:
# distinct studies per patient type before the view/modality filter
case_counts = df_merged.groupby("patient_type")["study_id"].nunique()
case_counts

patient_type
nopda    54
pda      61
Name: study_id, dtype: int64

In [25]:
# distinct studies per patient type after the view/modality filter
case_counts_filtered = df_merged_filtered.groupby("patient_type")["study_id"].nunique()
case_counts_filtered

patient_type
nopda    25
pda      43
Name: study_id, dtype: int64

In [29]:
# number of distinct clips per patient type after filtering
df_merged_filtered.groupby("patient_type")["clip_id"].nunique()

patient_type
nopda    124
pda      170
Name: clip_id, dtype: int64

In [26]:
# studies per patient type that drop out when restricting to
# PDA/PDA-related views and Color/Color-compare modalities
case_counts.sub(case_counts_filtered)

patient_type
nopda    29
pda      18
Name: study_id, dtype: int64

In [27]:
# share of studies per patient type lost to the same filter
# (expressed as a fraction between 0 and 1, not multiplied by 100)
case_counts.sub(case_counts_filtered).div(case_counts)

patient_type
nopda    0.537037
pda      0.295082
Name: study_id, dtype: float64

In [28]:
# For each patient type, print the label, the number of studies that are present
# in the merged table but absent after filtering, and the list of those studies.
for patient_type in ("nopda", "pda"):
    studies_before = set(
        df_merged.loc[df_merged["patient_type"] == patient_type, "study_id"].unique()
    )
    studies_after = set(
        df_merged_filtered.loc[
            df_merged_filtered["patient_type"] == patient_type, "study_id"
        ].unique()
    )
    lost_studies = list(studies_before - studies_after)
    print(patient_type)
    print(len(lost_studies))
    print(lost_studies)

nopda
29
['1.2.840.84002470499053283543609265219828', '1.2.840.29087821537513420828502041121163', '1.2.840.33124713069181157712639256930551', '1.2.840.69098020689821592844678554432486', '1.2.840.35412864504535983470948300452134', '1.2.840.50306364364537300025742392045750', '1.2.840.49710080439991270883856486540767', '1.2.840.39750729294416064079599818390141', '1.2.840.57897923621531947579919907476535', '1.2.840.32855709290993807630403066995188', '1.2.840.33945255176679040104276525087404', '1.2.840.40018499378290661054803104073436', '1.2.840.10472209319244886281629301110259', '1.2.840.36160138704975197646181797455316', '1.2.840.12899775971724433627705526634095', '1.2.840.87316487902102827221149309547093', '1.2.840.23072791691187263025871063465686', '1.2.840.52547442026877166430097988637699', '1.2.840.50624933167155463546068355260985', '1.2.840.32173852505293724842363809519660', '1.2.840.49216492365852693001558640730638', '1.2.840.66154620674410818733891486147076', '1.2.840.8458852175926