In [5]:
import json
import pandas as pd
import glob

In [6]:
# Root folder of the CHOC echo image tree used for external validation.
data_dir = (
    "/project/dane2/wficai/pda/external_validation/CHOC/CHOC_echo_images"
)
# File name (inside data_dir) of the JSON that maps each patient to its echo clips.
mapping_json = (
    "patient_echoclip_mapping Feb 24 2022.json"
)

## Clip-patient mapping

In [7]:
# Read the patient -> list-of-clips mapping from the JSON file in the data directory.
with open(f"{data_dir}/{mapping_json}", "r") as f:
    mapping = json.load(f)

# One row per patient holding its clip list, then explode so that every
# (patient, clip) pair gets its own row. The exploded rows keep the index of
# the patient row they came from, so index values repeat.
df = pd.DataFrame(
    [
        {'patient_id': patient, 'clip_id': clips}
        for patient, clips in mapping.items()
    ]
).explode('clip_id')

# Set of every clip id that appears in the mapping.
all_clips = set(df['clip_id'].tolist())

df

Unnamed: 0,patient_id,clip_id
0,PATIENT1,ECHOCLIP1
1,PATIENT2,ECHOCLIP2
1,PATIENT2,ECHOCLIP278
2,PATIENT3,ECHOCLIP3
2,PATIENT3,ECHOCLIP115
...,...,...
296,PATIENT297,ECHOCLIP451
297,PATIENT298,ECHOCLIP452
297,PATIENT298,ECHOCLIP453
298,PATIENT299,ECHOCLIP456


In [8]:
# Show the raw patient -> clip-list dictionary loaded from the mapping JSON.
mapping

{'PATIENT1': ['ECHOCLIP1'],
 'PATIENT2': ['ECHOCLIP2', 'ECHOCLIP278'],
 'PATIENT3': ['ECHOCLIP3', 'ECHOCLIP115'],
 'PATIENT4': ['ECHOCLIP4', 'ECHOCLIP221'],
 'PATIENT5': ['ECHOCLIP5'],
 'PATIENT6': ['ECHOCLIP6', 'ECHOCLIP387'],
 'PATIENT7': ['ECHOCLIP7', 'ECHOCLIP190', 'ECHOCLIP191', 'ECHOCLIP261'],
 'PATIENT8': ['ECHOCLIP8', 'ECHOCLIP38', 'ECHOCLIP293', 'ECHOCLIP294'],
 'PATIENT9': ['ECHOCLIP9', 'ECHOCLIP43'],
 'PATIENT10': ['ECHOCLIP10'],
 'PATIENT11': ['ECHOCLIP11'],
 'PATIENT12': ['ECHOCLIP12'],
 'PATIENT13': ['ECHOCLIP13'],
 'PATIENT14': ['ECHOCLIP14'],
 'PATIENT15': ['ECHOCLIP15', 'ECHOCLIP76', 'ECHOCLIP252'],
 'PATIENT16': ['ECHOCLIP16'],
 'PATIENT17': ['ECHOCLIP17'],
 'PATIENT18': ['ECHOCLIP18'],
 'PATIENT19': ['ECHOCLIP19'],
 'PATIENT20': ['ECHOCLIP20', 'ECHOCLIP454', 'ECHOCLIP455'],
 'PATIENT21': ['ECHOCLIP21'],
 'PATIENT22': ['ECHOCLIP22', 'ECHOCLIP153', 'ECHOCLIP154'],
 'PATIENT23': ['ECHOCLIP23'],
 'PATIENT24': ['ECHOCLIP24', 'ECHOCLIP109', 'ECHOCLIP279', 'ECHOCLIP280'],
 

## Create frame CSV used in pipeline

In [9]:
# Recursively collect every .jpg file found anywhere below the data directory.
jpg_files = list(glob.iglob(f"{data_dir}/**/*.jpg", recursive=True))

# Optional filter, currently disabled (so no paths are dropped here).
# NOTE(review): the substring match is case-sensitive and extract_record keys
# the split folder as "Train_pdaFiltered" -- confirm the casing before enabling.
# jpg_files = [f for f in jpg_files if "Train_PDAfiltered" not in f]

jpg_files

['/project/dane2/wficai/pda/external_validation/CHOC/CHOC_echo_images/Test/NEG/ECHOCLIP274/NEG_ECHOCLIP274_001.jpg',
 '/project/dane2/wficai/pda/external_validation/CHOC/CHOC_echo_images/Test/NEG/ECHOCLIP274/NEG_ECHOCLIP274_002.jpg',
 '/project/dane2/wficai/pda/external_validation/CHOC/CHOC_echo_images/Test/NEG/ECHOCLIP274/NEG_ECHOCLIP274_003.jpg',
 '/project/dane2/wficai/pda/external_validation/CHOC/CHOC_echo_images/Test/NEG/ECHOCLIP274/NEG_ECHOCLIP274_004.jpg',
 '/project/dane2/wficai/pda/external_validation/CHOC/CHOC_echo_images/Test/NEG/ECHOCLIP274/NEG_ECHOCLIP274_005.jpg',
 '/project/dane2/wficai/pda/external_validation/CHOC/CHOC_echo_images/Test/NEG/ECHOCLIP274/NEG_ECHOCLIP274_006.jpg',
 '/project/dane2/wficai/pda/external_validation/CHOC/CHOC_echo_images/Test/NEG/ECHOCLIP274/NEG_ECHOCLIP274_007.jpg',
 '/project/dane2/wficai/pda/external_validation/CHOC/CHOC_echo_images/Test/NEG/ECHOCLIP274/NEG_ECHOCLIP274_008.jpg',
 '/project/dane2/wficai/pda/external_validation/CHOC/CHOC_echo_i

In [10]:
# build the dataframe from the file paths
# Lookup tables translating on-disk folder names into the labels stored in the
# frame table. Defined once here instead of being rebuilt on every call.
_CONDITION_LABELS = {
    "POS": "pda",
    "NEG": "nopda",
}
_SPLIT_LABELS = {
    "Train": "TRAIN",
    "Train_pdaFiltered": "TRAIN_PDAFILTERED",
    "Test": "TEST",
    "Val": "VAL",
}


def _translate_folder(table, folder, role, file_path):
    """Return ``table[folder]``, raising a KeyError that names the offending path."""
    try:
        return table[folder]
    except KeyError:
        raise KeyError(
            f"unexpected {role} folder {folder!r} in {file_path!r}; "
            f"expected one of {sorted(table)}"
        ) from None


def extract_record(file_path):
    """Build one frame-table row from the path of a frame image.

    The path must end in ``<split>/<condition>/<clip_id>/<file>``, for example
    ``.../Test/NEG/ECHOCLIP274/NEG_ECHOCLIP274_001.jpg``.

    Parameters
    ----------
    file_path : str
        "/"-separated path to a frame image.

    Returns
    -------
    dict
        Keys, in order: ``clip_id`` (parent folder name), ``patient_type``
        ("pda" / "nopda"), ``Split`` ("TRAIN", "TRAIN_PDAFILTERED", "TEST",
        "VAL") and ``png_path`` (``file_path`` unchanged).

    Raises
    ------
    KeyError
        If the condition or split folder is not a known name. The message
        includes the folder, the full path and the accepted names.
    IndexError
        If the path has too few components to contain a split folder.
    """
    parts = file_path.split("/")

    try:
        # Fields are evaluated in the same order as the columns they produce,
        # so the first malformed component determines the error raised.
        record = dict(
            clip_id=parts[-2],
            patient_type=_translate_folder(
                _CONDITION_LABELS, parts[-3], "condition", file_path
            ),
            Split=_translate_folder(_SPLIT_LABELS, parts[-4], "split", file_path),
            # The key is "png_path" although the notebook collects .jpg files;
            # the name is kept so the resulting columns stay unchanged.
            png_path=file_path,
        )
    except IndexError:
        raise IndexError(
            f"path {file_path!r} has too few components; expected "
            ".../<split>/<condition>/<clip_id>/<file>"
        ) from None

    return record

# Turn every collected image path into a record and stack the records into
# a table with one row per frame.
df_frames = pd.DataFrame(list(map(extract_record, jpg_files)))
df_frames

Unnamed: 0,clip_id,patient_type,Split,png_path
0,ECHOCLIP274,nopda,TEST,/project/dane2/wficai/pda/external_validation/...
1,ECHOCLIP274,nopda,TEST,/project/dane2/wficai/pda/external_validation/...
2,ECHOCLIP274,nopda,TEST,/project/dane2/wficai/pda/external_validation/...
3,ECHOCLIP274,nopda,TEST,/project/dane2/wficai/pda/external_validation/...
4,ECHOCLIP274,nopda,TEST,/project/dane2/wficai/pda/external_validation/...
...,...,...,...,...
30604,ECHOCLIP94,pda,VAL,/project/dane2/wficai/pda/external_validation/...
30605,ECHOCLIP94,pda,VAL,/project/dane2/wficai/pda/external_validation/...
30606,ECHOCLIP94,pda,VAL,/project/dane2/wficai/pda/external_validation/...
30607,ECHOCLIP94,pda,VAL,/project/dane2/wficai/pda/external_validation/...


In [11]:
# Attach the owning patient to every frame by joining the patient/clip table
# with the frame table on the clip identifier.
# The key and join type are spelled out so the join cannot silently change if
# either frame later gains another shared column. An inner join keeps only
# clips that appear in both the mapping and the frame table.
df_merged = df.merge(df_frames, on="clip_id", how="inner")

df_merged

Unnamed: 0,patient_id,clip_id,patient_type,Split,png_path
0,PATIENT1,ECHOCLIP1,pda,VAL,/project/dane2/wficai/pda/external_validation/...
1,PATIENT1,ECHOCLIP1,pda,VAL,/project/dane2/wficai/pda/external_validation/...
2,PATIENT1,ECHOCLIP1,pda,VAL,/project/dane2/wficai/pda/external_validation/...
3,PATIENT1,ECHOCLIP1,pda,VAL,/project/dane2/wficai/pda/external_validation/...
4,PATIENT1,ECHOCLIP1,pda,VAL,/project/dane2/wficai/pda/external_validation/...
...,...,...,...,...,...
30604,PATIENT300,ECHOCLIP461,nopda,TRAIN_PDAFILTERED,/project/dane2/wficai/pda/external_validation/...
30605,PATIENT300,ECHOCLIP461,nopda,TRAIN_PDAFILTERED,/project/dane2/wficai/pda/external_validation/...
30606,PATIENT300,ECHOCLIP461,nopda,TRAIN_PDAFILTERED,/project/dane2/wficai/pda/external_validation/...
30607,PATIENT300,ECHOCLIP461,nopda,TRAIN_PDAFILTERED,/project/dane2/wficai/pda/external_validation/...


In [12]:
# Frame count for every (Split, patient_type) combination.
(
    df_merged
    .groupby(by=['Split', 'patient_type'], as_index=False)
    .png_path
    .count()
)

Unnamed: 0,Split,patient_type,png_path
0,TEST,nopda,3072
1,TEST,pda,2568
2,TRAIN,nopda,9147
3,TRAIN,pda,852
4,TRAIN_PDAFILTERED,nopda,9149
5,TRAIN_PDAFILTERED,pda,852
6,VAL,nopda,2301
7,VAL,pda,2668


In [14]:
# Number of distinct clips in each split.
(
    df_merged
    .groupby(by=['Split'], as_index=False)
    .clip_id
    .nunique()
)

Unnamed: 0,Split,clip_id
0,TEST,72
1,TRAIN,129
2,TRAIN_PDAFILTERED,153
3,VAL,74


In [16]:
# Number of distinct clips for every (Split, patient_type) combination.
df_merged.groupby(by=['Split', 'patient_type'])["clip_id"].nunique()

Split              patient_type
TEST               nopda            38
                   pda              34
TRAIN              nopda           116
                   pda              13
TRAIN_PDAFILTERED  nopda           116
                   pda              37
VAL                nopda            36
                   pda              38
Name: clip_id, dtype: int64

In [19]:
# Number of distinct patients for every (Split, patient_type) combination.
df_merged.groupby(by=['Split', 'patient_type'])["patient_id"].nunique()

Split              patient_type
TEST               nopda           31
                   pda             25
TRAIN              nopda           99
                   pda             10
TRAIN_PDAFILTERED  nopda           99
                   pda             32
VAL                nopda           30
                   pda             27
Name: patient_id, dtype: int64

In [None]:
# Write the merged frame table to a CSV inside the data directory,
# leaving out the DataFrame index column.
df_merged.to_csv(
    path_or_buf=f"{data_dir}/pda_train_val_test.csv",
    index=False,
)