In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import json
import glob

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF

In [2]:
datestamp = '20221008'

In [3]:
pda_source_dir = '/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior Views/'
nopda_source_dir = '/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superior Views/'
export_dir = f'/zfs/wficai/pda/model_data/{datestamp}/'

if not os.path.exists(export_dir):
    os.makedirs(export_dir, exist_ok=True)

pda_label_jsons = ['/zfs/wficai/pda/model_data/20220711_PDA_small_incomplete_batch.json', '/zfs/wficai/pda/model_data/pda_export-2022-10-09T02 09 53.372Z.json']
nopda_label_jsons = ['/zfs/wficai/pda/model_data/nopda_export-2022-10-09T02 12 19.017Z.json']

In [4]:
res_x = 224
res_y = 224

# Process source video data

In [5]:
pda_vids = [{'patient_type': 'pda', 'mp4_path': path} for path in glob.glob(f"{pda_source_dir}/*.mp4")]
nopda_vids = [{'patient_type': 'nopda', 'mp4_path': path} for path in glob.glob(f"{nopda_source_dir}/*.mp4")]
df_vids = pd.DataFrame(pda_vids+nopda_vids)
df_vids['external_id'] = ["_".join(fname[-1].split('_')[:2]) for fname in df_vids.mp4_path.str.split('/')]
df_vids

Unnamed: 0,patient_type,mp4_path,external_id
0,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...,study26_dicom51
1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...,study54_dicom98
2,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...,study48_dicom136
3,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...,study22_dicom70
4,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...,study19_dicom17
...,...,...,...
5154,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...,study61_dicom49
5155,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...,study34_dicom78
5156,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...,study24_dicom113
5157,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...,study55_dicom122


In [6]:
df_vids.groupby('patient_type').size()

patient_type
nopda    2468
pda      2691
dtype: int64

In [9]:
# number of studies
df_vids_temp = df_vids
df_vids_temp['study'] = df_vids.external_id.apply(lambda x: x.split('_')[0])
print(len(df_vids_temp.drop_duplicates(subset=['study', 'patient_type'])))
del df_vids_temp

165


# Process label data

In [10]:
# load the json data
labels = []
for path in pda_label_jsons + nopda_label_jsons:
    with open(path, 'r') as f:
        labels += json.load(f)

In [11]:
# parse
type_map = {
    "PDA Classification -- Non-PDA Views": "nopda",
    "PDA Classification": "pda",
    "PDA Classification -- PDA Views": "pda",
}

labels_parsed = []
for lab in labels:
    if type(lab['Label'])==dict:
        res = lab['Label']
        res['external_id'] = lab['External ID']
        res['study'] = lab['External ID'].split('_')[0]
        res['patient_type'] = type_map[lab['Project Name']]
        labels_parsed.append(res)

In [12]:
df_labs = pd.DataFrame(labels_parsed)
df_labs

Unnamed: 0,view,mode,diagnosis,external_id,study,patient_type
0,pdaView,2d,[],study1_dicom8,study1,pda
1,pdaView,2d,[],study1_dicom10,study1,pda
2,pdaView,color,[],study1_dicom11,study1,pda
3,nonPDAView,2d,,study1_dicom1,study1,pda
4,nonPDAView,2d,[],study1_dicom2,study1,pda
...,...,...,...,...,...,...
3190,nonPDAView,2d,[],study45_dicom54,study45,nopda
3191,pdaRelatedView,color,[],study57_dicom101,study57,nopda
3192,nonPDAView,2d,[],study9_dicom11,study9,nopda
3193,pdaRelatedView,color_compare,[],study24_dicom110,study24,nopda


In [13]:
# number of studies
len(df_labs.drop_duplicates(subset=['study', 'patient_type']))

139

In [14]:
# ensure uniqueness of external id among each patient_type
assert not (df_labs.groupby(['patient_type', 'external_id']).size()>1).any(), "patient_type-external_id pairs must be unique"

In [15]:
df_labs.groupby(['patient_type']).size()

patient_type
nopda    1238
pda      1957
dtype: int64

In [16]:
df_labs.groupby(['view']).size()

view
nonPDAView        2094
pdaRelatedView     638
pdaView            457
dtype: int64

In [17]:
df_labs.groupby(['mode']).size()

mode
2d               1376
color            1067
color_compare     748
dtype: int64

In [18]:
df_labs.groupby(['patient_type', 'view', 'mode']).size()

patient_type  view            mode         
nopda         nonPDAView      2d               406
                              color            275
                              color_compare    190
              pdaRelatedView  2d                98
                              color             85
                              color_compare     59
              pdaView         2d                32
                              color             41
                              color_compare     51
pda           nonPDAView      2d               582
                              color            425
                              color_compare    216
              pdaRelatedView  2d               157
                              color            146
                              color_compare     93
              pdaView         2d               100
                              color             95
                              color_compare    138
dtype: int64

In [19]:
df_labs.query('view!="nonPDAView" and mode!="2d"')

Unnamed: 0,view,mode,diagnosis,external_id,study,patient_type
2,pdaView,color,[],study1_dicom11,study1,pda
8,pdaView,color,[],study1_dicom12,study1,pda
9,pdaView,color,[],study1_dicom15,study1,pda
11,pdaRelatedView,color,[],study1_dicom17,study1,pda
20,pdaRelatedView,color,[],study1_dicom29,study1,pda
...,...,...,...,...,...,...
3167,pdaView,color,[],study37_dicom84,study37,nopda
3187,pdaView,color,[],study44_dicom56,study44,nopda
3189,pdaRelatedView,color_compare,[],study40_dicom77,study40,nopda
3191,pdaRelatedView,color,[],study57_dicom101,study57,nopda


In [20]:
df_labs.query('view!="nonPDAView" and mode!="2d"').groupby(['patient_type']).size()

patient_type
nopda    236
pda      477
dtype: int64

# Merge datasets

In [21]:
df_model = df_labs.merge(df_vids, how='inner')

In [22]:
df_model

Unnamed: 0,view,mode,diagnosis,external_id,study,patient_type,mp4_path
0,pdaView,2d,[],study1_dicom8,study1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
1,pdaView,2d,[],study1_dicom10,study1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
2,pdaView,color,[],study1_dicom11,study1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
3,nonPDAView,2d,,study1_dicom1,study1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
4,nonPDAView,2d,[],study1_dicom2,study1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
...,...,...,...,...,...,...,...
3172,nonPDAView,2d,[],study45_dicom54,study45,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
3173,pdaRelatedView,color,[],study57_dicom101,study57,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
3174,nonPDAView,2d,[],study9_dicom11,study9,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
3175,pdaRelatedView,color_compare,[],study24_dicom110,study24,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...


In [23]:
df_model[['study', 'patient_type']].drop_duplicates()

Unnamed: 0,study,patient_type
0,study1,pda
30,study2,pda
56,study3,pda
87,study4,pda
102,study5,pda
...,...,...
2222,study41,nopda
2228,study16,nopda
2246,study36,nopda
2327,study46,nopda


# Preprocess frames

In [24]:
def my_line(x_coord, slope, intercept):
    return x_coord*slope + intercept

def mask_image(img, slope, intercept):
    img = img.clone()

    # list all x and y coordinate indices
    y_coordinates = torch.arange(0, img.shape[-2])
    x_coordinates = torch.arange(0, img.shape[-1])

    # compute the y-values associated with given slope and intercept
    # for each x coordinate
    y_line_points = my_line(x_coordinates, slope=slope, intercept=intercept)

    # locate all pixels with y coordinates less than line values
    mask = y_coordinates[:,None] < y_line_points 

    # make symmetric mask by flipping
    mask = mask | mask.flip(-1)

    # expand mask dimensions to match image dimensions
    expanded_mask = mask[None].expand(img.shape[0],img.shape[1],-1, -1)

    # zero out image in masked regions
    img[expanded_mask] = 0

    return img

In [25]:
def preproc(frames):

    frames = frames.permute(0,3,1,2)
    
    #Resize
    splitFrames = torch.split(frames, 10)
    resizedFrames = []
    aspectRatio = (frames.shape[3])/(frames.shape[2])
    for f in splitFrames:
        h = int(1.2 * res_y)
        w = int(h*aspectRatio)
        f = TF.resize(f, (h, w))
        f = TF.center_crop(f, (res_x, res_y))
        
        resizedFrames.append(f)

    resizedFrames = torch.concat(resizedFrames)
    
    #Image Masking
    yIntercept = int(resizedFrames.shape[2]/3.2)
    finalResult = mask_image(resizedFrames, -2, yIntercept)
    
    #tor
    return finalResult

In [26]:
video_metadata = []
for ix, row in df_model.iterrows():
    print(" "*100, end='\r')
    print(f"Converting video {ix+1} of {len(df_model)}.", end="\r")
    
    frames, _, _ =  torchvision.io.read_video(row['mp4_path'])
    
    # store some video metadata
    video_metadata.append({'external_id': row['external_id'], 'patient_type': row['patient_type'], 'num_frames': frames.shape[0]})
    
    frames = preproc(frames)
    
    # save individual frames
    for ix, f in enumerate(frames):
        filename = f"{export_dir}{row['patient_type']}_{row['external_id']}_frame{str(ix).zfill(4)}.jpg"
        torchvision.io.write_jpeg(f, filename)

Converting video 1 of 3177.                                                                         



Converting video 3177 of 3177.                                                                      

In [28]:
df_vid_meta = pd.DataFrame(video_metadata)
df_vid_meta = df_vid_meta.rename(columns={'patient_id': 'patient_type'})
df_vid_meta

Unnamed: 0,external_id,patient_type,num_frames
0,study1_dicom8,pda,134
1,study1_dicom10,pda,100
2,study1_dicom11,pda,70
3,study1_dicom1,pda,109
4,study1_dicom2,pda,72
...,...,...,...
3172,study45_dicom54,nopda,105
3173,study57_dicom101,nopda,49
3174,study9_dicom11,nopda,107
3175,study24_dicom110,nopda,147


# Prepare CSV data tables

We will prepare the following metadata tables:
1. study.csv -- each row corresponds to a particular study. We need this in order to correctly split studies into train/test partitions. Indexed by `patient_type`-`study` pairs. Note: `study` alone is not unique.
2. video.csv -- each row corresponds to a particular video from a study. We use this to store the video metadata since our annotations are at the video level. `patient_type`-`external_id` pairs. 
3. frame.csv -- each row corresponds to a particular frame from a video. This contains the filepaths to the frame images. Can be merged with video on `patient_type`-`external_id`.

### 3. frame.csv

In [29]:
frames = glob.glob(export_dir + '*.jpg')

def parse_path(fp):
    fn = fp.split('/')[-1]
    
    return dict(
        patient_type = fn.split('_')[0],
        external_id = '_'.join(fn.split('_')[1:3]),
        png_path = fp
    )

df_frames = pd.DataFrame([parse_path(p) for p in frames])

df_frames

Unnamed: 0,patient_type,external_id,png_path
0,pda,study27_dicom42,/zfs/wficai/pda/model_data/20221008/pda_study2...
1,pda,study59_dicom33,/zfs/wficai/pda/model_data/20221008/pda_study5...
2,nopda,study76_dicom45,/zfs/wficai/pda/model_data/20221008/nopda_stud...
3,nopda,study34_dicom113,/zfs/wficai/pda/model_data/20221008/nopda_stud...
4,pda,study53_dicom50,/zfs/wficai/pda/model_data/20221008/pda_study5...
...,...,...,...
286975,pda,study59_dicom49,/zfs/wficai/pda/model_data/20221008/pda_study5...
286976,nopda,study35_dicom65,/zfs/wficai/pda/model_data/20221008/nopda_stud...
286977,pda,study60_dicom26,/zfs/wficai/pda/model_data/20221008/pda_study6...
286978,pda,study48_dicom76,/zfs/wficai/pda/model_data/20221008/pda_study4...


In [30]:
df_frames.to_csv(f'/zfs/wficai/pda/model_data/{datestamp}_frame.csv', index=False)

### 2. video.csv

In [31]:
df_video = df_vid_meta.merge(df_model, how='inner')

# drop if missing view or mode
df_video = df_video.dropna(subset=['view', 'mode'])

df_video

Unnamed: 0,external_id,patient_type,num_frames,view,mode,diagnosis,study,mp4_path
0,study1_dicom8,pda,134,pdaView,2d,[],study1,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
1,study1_dicom10,pda,100,pdaView,2d,[],study1,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
2,study1_dicom11,pda,70,pdaView,color,[],study1,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
3,study1_dicom1,pda,109,nonPDAView,2d,,study1,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
4,study1_dicom2,pda,72,nonPDAView,2d,[],study1,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
...,...,...,...,...,...,...,...,...
3172,study45_dicom54,nopda,105,nonPDAView,2d,[],study45,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
3173,study57_dicom101,nopda,49,pdaRelatedView,color,[],study57,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
3174,study9_dicom11,nopda,107,nonPDAView,2d,[],study9,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
3175,study24_dicom110,nopda,147,pdaRelatedView,color_compare,[],study24,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...


In [32]:
df_video.to_csv(f'/zfs/wficai/pda/model_data/{datestamp}_video.csv', index=False)

### 1. study.csv

In [33]:
# construct a study table. 
df_study = df_video.groupby(['patient_type', 'study'], as_index=False).agg({'external_id': 'nunique', 'num_frames': 'sum'}).rename(columns={'external_id': 'num_videos'})
df_study

Unnamed: 0,patient_type,study,num_videos,num_frames
0,nopda,study1,12,610
1,nopda,study10,19,1370
2,nopda,study11,8,595
3,nopda,study12,7,413
4,nopda,study13,13,1006
...,...,...,...,...
133,pda,study62,25,2105
134,pda,study63,1,54
135,pda,study7,24,2392
136,pda,study8,46,4244


In [34]:
df_study.to_csv(f'/zfs/wficai/pda/model_data/{datestamp}_study.csv', index=False)