In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import json
import glob

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF

In [2]:
datestamp = '20221028'

In [3]:
pda_source_dir = '/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior Views/'
nopda_source_dir = '/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superior Views/'
export_dir = f'/zfs/wficai/pda/model_data/{datestamp}/'

if not os.path.exists(export_dir):
    os.makedirs(export_dir, exist_ok=True)

pda_label_jsons = ['/zfs/wficai/pda/model_data/20220711_PDA_small_incomplete_batch.json', '/zfs/wficai/pda/model_data/pda_export-2022-10-28T19 50 43.345Z.json']
nopda_label_jsons = ['/zfs/wficai/pda/model_data/nopda_export-2022-10-28T19 38 31.411Z.json']

In [4]:
res_x = 224
res_y = 224

# Process source video data

In [5]:
pda_vids = [{'patient_type': 'pda', 'mp4_path': path} for path in glob.glob(f"{pda_source_dir}/*.mp4")]
nopda_vids = [{'patient_type': 'nopda', 'mp4_path': path} for path in glob.glob(f"{nopda_source_dir}/*.mp4")]
df_vids = pd.DataFrame(pda_vids+nopda_vids)
df_vids['external_id'] = ["_".join(fname[-1].split('_')[:2]) for fname in df_vids.mp4_path.str.split('/')]
df_vids

Unnamed: 0,patient_type,mp4_path,external_id
0,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...,study26_dicom51
1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...,study54_dicom98
2,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...,study48_dicom136
3,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...,study22_dicom70
4,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...,study19_dicom17
...,...,...,...
5154,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...,study61_dicom49
5155,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...,study34_dicom78
5156,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...,study24_dicom113
5157,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...,study55_dicom122


In [6]:
df_vids.groupby('patient_type').size()

patient_type
nopda    2468
pda      2691
dtype: int64

In [7]:
# number of studies
df_vids_temp = df_vids
df_vids_temp['study'] = df_vids.external_id.apply(lambda x: x.split('_')[0])
print(len(df_vids_temp.drop_duplicates(subset=['study', 'patient_type'])))
del df_vids_temp

165


# Process label data

In [8]:
# load the json data
labels = []
for path in pda_label_jsons + nopda_label_jsons:
    with open(path, 'r') as f:
        labels += json.load(f)

In [9]:
# parse
type_map = {
    "PDA Classification -- Non-PDA Views": "nopda",
    "PDA Classification": "pda",
    "PDA Classification -- PDA Views": "pda",
}

labels_parsed = []
for lab in labels:
    if type(lab['Label'])==dict:
        res = lab['Label']
        res['external_id'] = lab['External ID']
        res['study'] = lab['External ID'].split('_')[0]
        res['patient_type'] = type_map[lab['Project Name']]
        labels_parsed.append(res)

In [10]:
df_labs = pd.DataFrame(labels_parsed)
df_labs

Unnamed: 0,view,mode,diagnosis,external_id,study,patient_type
0,pdaView,2d,[],study1_dicom8,study1,pda
1,pdaView,2d,[],study1_dicom10,study1,pda
2,pdaView,color,[],study1_dicom11,study1,pda
3,nonPDAView,2d,,study1_dicom1,study1,pda
4,nonPDAView,2d,[],study1_dicom2,study1,pda
...,...,...,...,...,...,...
5137,pdaRelatedView,color_compare,[],study24_dicom113,study24,nopda
5138,nonPDAView,color,[],study55_dicom122,study55,nopda
5139,pdaView,color,[],study9_dicom7,study9,nopda
5140,nonPDAView,2d,[],study44_dicom95,study44,nopda


In [11]:
# number of studies
len(df_labs.drop_duplicates(subset=['study', 'patient_type']))

165

In [12]:
# ensure uniqueness of external id among each patient_type
assert not (df_labs.groupby(['patient_type', 'external_id']).size()>1).any(), "patient_type-external_id pairs must be unique"

In [13]:
df_labs.groupby(['patient_type']).size()

patient_type
nopda    2463
pda      2679
dtype: int64

In [14]:
df_labs.groupby(['view']).size()

view
nonPDAView        3350
pdaRelatedView    1099
pdaView            682
dtype: int64

In [15]:
df_labs.groupby(['mode']).size()

mode
2d               2164
color            1669
color_compare    1299
dtype: int64

In [16]:
df_labs.groupby(['patient_type', 'view', 'mode']).size()

patient_type  view            mode         
nopda         nonPDAView      2d               783
                              color            557
                              color_compare    381
              pdaRelatedView  2d               202
                              color            182
                              color_compare    119
              pdaView         2d                54
                              color             87
                              color_compare     96
pda           nonPDAView      2d               764
                              color            537
                              color_compare    327
              pdaRelatedView  2d               224
                              color            195
                              color_compare    176
              pdaView         2d               136
                              color            111
                              color_compare    197
dtype: int64

In [17]:
df_labs.query('view!="nonPDAView" and mode!="2d"').shape

(1175, 6)

In [18]:
df_labs.query('view!="nonPDAView" and mode!="2d"').groupby(['patient_type']).size()

patient_type
nopda    485
pda      690
dtype: int64

# Merge datasets

In [19]:
df_model = df_labs.merge(df_vids, how='inner')

In [20]:
df_model

Unnamed: 0,view,mode,diagnosis,external_id,study,patient_type,mp4_path
0,pdaView,2d,[],study1_dicom8,study1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
1,pdaView,2d,[],study1_dicom10,study1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
2,pdaView,color,[],study1_dicom11,study1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
3,nonPDAView,2d,,study1_dicom1,study1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
4,nonPDAView,2d,[],study1_dicom2,study1,pda,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
...,...,...,...,...,...,...,...
5095,pdaRelatedView,color_compare,[],study24_dicom113,study24,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
5096,nonPDAView,color,[],study55_dicom122,study55,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
5097,pdaView,color,[],study9_dicom7,study9,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
5098,nonPDAView,2d,[],study44_dicom95,study44,nopda,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...


In [21]:
df_model[['study', 'patient_type']].drop_duplicates()

Unnamed: 0,study,patient_type
0,study1,pda
30,study2,pda
56,study3,pda
87,study4,pda
102,study5,pda
...,...,...
2920,study41,nopda
2926,study16,nopda
2944,study36,nopda
3025,study46,nopda


# Preprocess frames

In [22]:
def my_line(x_coord, slope, intercept):
    return x_coord*slope + intercept

def mask_image(img, slope, intercept):
    img = img.clone()

    # list all x and y coordinate indices
    y_coordinates = torch.arange(0, img.shape[-2])
    x_coordinates = torch.arange(0, img.shape[-1])

    # compute the y-values associated with given slope and intercept
    # for each x coordinate
    y_line_points = my_line(x_coordinates, slope=slope, intercept=intercept)

    # locate all pixels with y coordinates less than line values
    mask = y_coordinates[:,None] < y_line_points 

    # make symmetric mask by flipping
    mask = mask | mask.flip(-1)

    # expand mask dimensions to match image dimensions
    expanded_mask = mask[None].expand(img.shape[0],img.shape[1],-1, -1)

    # zero out image in masked regions
    img[expanded_mask] = 0

    return img

In [23]:
def preproc(frames):

    frames = frames.permute(0,3,1,2)
    
    #Resize
    splitFrames = torch.split(frames, 10)
    resizedFrames = []
    aspectRatio = (frames.shape[3])/(frames.shape[2])
    for f in splitFrames:
        h = int(1.2 * res_y)
        w = int(h*aspectRatio)
        f = TF.resize(f, (h, w))
        f = TF.center_crop(f, (res_x, res_y))
        
        resizedFrames.append(f)

    resizedFrames = torch.concat(resizedFrames)
    
    #Image Masking
    yIntercept = int(resizedFrames.shape[2]/3.2)
    finalResult = mask_image(resizedFrames, -2, yIntercept)
    
    #tor
    return finalResult

In [24]:
from concurrent.futures import ProcessPoolExecutor

In [25]:
def convert_video(ix, row):
    if ix % 50==0:
        print(f"Converting video {ix+1} of {len(df_model)}.", end="\r")
    
    frames, _, _ =  torchvision.io.read_video(row['mp4_path'], pts_unit='sec')
    
    # store some video metadata
    metadata = {'external_id': row['external_id'], 'patient_type': row['patient_type'], 'num_frames': frames.shape[0]}
    
    frames = preproc(frames)
    
    # save individual frames
    for ix, f in enumerate(frames):
        filename = f"{export_dir}{row['patient_type']}_{row['external_id']}_frame{str(ix).zfill(4)}.jpg"
        torchvision.io.write_jpeg(f, filename)
        
    return metadata
        
futures = []
with ProcessPoolExecutor(10) as executor:
    for ix, row in df_model.iterrows():
        futures.append(executor.submit(convert_video, ix=ix, row=row))
        
video_metadata = [f.result() for f in futures]

Converting video 5051 of 5100.

In [26]:
df_vid_meta = pd.DataFrame(video_metadata)
df_vid_meta = df_vid_meta.rename(columns={'patient_id': 'patient_type'})
df_vid_meta

Unnamed: 0,external_id,patient_type,num_frames
0,study1_dicom8,pda,134
1,study1_dicom10,pda,100
2,study1_dicom11,pda,70
3,study1_dicom1,pda,109
4,study1_dicom2,pda,72
...,...,...,...
5095,study24_dicom113,nopda,153
5096,study55_dicom122,nopda,13
5097,study9_dicom7,nopda,114
5098,study44_dicom95,nopda,72


# Prepare CSV data tables

We will prepare the following metadata tables:
1. study.csv -- each row corresponds to a particular study. We need this in order to correctly split studies into train/test partitions. Indexed by `patient_type`-`study` pairs. Note: `study` alone is not unique.
2. video.csv -- each row corresponds to a particular video from a study. We use this to store the video metadata since our annotations are at the video level. `patient_type`-`external_id` pairs. 
3. frame.csv -- each row corresponds to a particular frame from a video. This contains the filepaths to the frame images. Can be merged with video on `patient_type`-`external_id`.

### 3. frame.csv

In [27]:
frames = glob.glob(export_dir + '*.jpg')

def parse_path(fp):
    fn = fp.split('/')[-1]
    
    return dict(
        patient_type = fn.split('_')[0],
        external_id = '_'.join(fn.split('_')[1:3]),
        png_path = fp
    )

df_frames = pd.DataFrame([parse_path(p) for p in frames])

df_frames

Unnamed: 0,patient_type,external_id,png_path
0,pda,study49_dicom76,/zfs/wficai/pda/model_data/20221028/pda_study4...
1,nopda,study74_dicom32,/zfs/wficai/pda/model_data/20221028/nopda_stud...
2,pda,study84_dicom33,/zfs/wficai/pda/model_data/20221028/pda_study8...
3,pda,study23_dicom14,/zfs/wficai/pda/model_data/20221028/pda_study2...
4,nopda,study37_dicom68,/zfs/wficai/pda/model_data/20221028/nopda_stud...
...,...,...,...
473489,pda,study48_dicom64,/zfs/wficai/pda/model_data/20221028/pda_study4...
473490,pda,study28_dicom65,/zfs/wficai/pda/model_data/20221028/pda_study2...
473491,pda,study17_dicom8,/zfs/wficai/pda/model_data/20221028/pda_study1...
473492,pda,study85_dicom96,/zfs/wficai/pda/model_data/20221028/pda_study8...


In [28]:
df_frames.to_csv(f'/zfs/wficai/pda/model_data/{datestamp}_frame.csv', index=False)

### 2. video.csv

In [29]:
df_video = df_vid_meta.merge(df_model, how='inner')

# drop if missing view or mode
df_video = df_video.dropna(subset=['view', 'mode'])

df_video

Unnamed: 0,external_id,patient_type,num_frames,view,mode,diagnosis,study,mp4_path
0,study1_dicom8,pda,134,pdaView,2d,[],study1,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
1,study1_dicom10,pda,100,pdaView,2d,[],study1,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
2,study1_dicom11,pda,70,pdaView,color,[],study1,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
3,study1_dicom1,pda,109,nonPDAView,2d,,study1,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
4,study1_dicom2,pda,72,nonPDAView,2d,[],study1,/zfs/wficai/pda/batch_1/PDA_Batch_1/Superior V...
...,...,...,...,...,...,...,...,...
5095,study24_dicom113,nopda,153,pdaRelatedView,color_compare,[],study24,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
5096,study55_dicom122,nopda,13,nonPDAView,color,[],study55,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
5097,study9_dicom7,nopda,114,pdaView,color,[],study9,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...
5098,study44_dicom95,nopda,72,nonPDAView,2d,[],study44,/zfs/wficai/pda/batch_1/Non-PDA_Batch_1/Superi...


In [30]:
df_video.to_csv(f'/zfs/wficai/pda/model_data/{datestamp}_video.csv', index=False)

### 1. study.csv

In [31]:
# construct a study table. 
df_study = df_video.groupby(['patient_type', 'study'], as_index=False).agg({'external_id': 'nunique', 'num_frames': 'sum'}).rename(columns={'external_id': 'num_videos'})
df_study

Unnamed: 0,patient_type,study,num_videos,num_frames
0,nopda,study1,24,1510
1,nopda,study10,30,2152
2,nopda,study11,16,1328
3,nopda,study12,15,956
4,nopda,study13,25,1884
...,...,...,...,...
158,pda,study86,51,6893
159,pda,study87,24,2224
160,pda,study88,16,1719
161,pda,study89,56,7369


In [32]:
df_study.to_csv(f'/zfs/wficai/pda/model_data/{datestamp}_study.csv', index=False)