In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import json
import glob

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF

In [2]:
pda_source_dir = '/mnt/data/pda/superior_views/PDA/'
nopda_source_dir = '/mnt/data/pda/superior_views/non_PDA/'
export_dir = '/mnt/data/pda/model_data/20220822/'

if not os.path.exists(export_dir):
    os.makedirs(export_dir, exist_ok=True)

pda_label_jsons = ['../label_data/20220711_PDA.json', '../label_data/20220822_PDA.json']
nopda_label_jsons = ['../label_data/20220817_NOPDA.json']

In [3]:
res_x = 224
res_y = 224

# Process source video data

In [4]:
pda_vids = [{'patient_type': 'pda', 'mp4_path': path} for path in glob.glob(f"{pda_source_dir}/*.mp4")]
nopda_vids = [{'patient_type': 'nopda', 'mp4_path': path} for path in glob.glob(f"{nopda_source_dir}/*.mp4")]
df_vids = pd.DataFrame(pda_vids+nopda_vids)
df_vids['external_id'] = ["_".join(fname[-1].split('_')[:2]) for fname in df_vids.mp4_path.str.split('/')]
df_vids

Unnamed: 0,patient_type,mp4_path,external_id
0,pda,/mnt/data/pda/superior_views/PDA/study48_dicom...,study48_dicom134
1,pda,/mnt/data/pda/superior_views/PDA/study57_dicom...,study57_dicom76
2,pda,/mnt/data/pda/superior_views/PDA/study11_dicom...,study11_dicom58
3,pda,/mnt/data/pda/superior_views/PDA/study28_dicom...,study28_dicom58
4,pda,/mnt/data/pda/superior_views/PDA/study79_dicom...,study79_dicom1
...,...,...,...
5155,nopda,/mnt/data/pda/superior_views/non_PDA/study7_di...,study7_dicom103
5156,nopda,/mnt/data/pda/superior_views/non_PDA/study25_d...,study25_dicom72
5157,nopda,/mnt/data/pda/superior_views/non_PDA/study53_d...,study53_dicom29
5158,nopda,/mnt/data/pda/superior_views/non_PDA/study8_di...,study8_dicom48


In [5]:
df_vids.groupby('patient_type').size()

patient_type
nopda    2468
pda      2692
dtype: int64

# Process label data

In [6]:
# load the json data
labels = []
for path in pda_label_jsons + nopda_label_jsons:
    with open(path, 'r') as f:
        labels += json.load(f)

In [7]:
# parse
type_map = {
    "PDA Classification -- Non-PDA Views": "nopda",
    "PDA Classification": "pda",
    "PDA Classification -- PDA Views": "pda",
}

labels_parsed = []
for lab in labels:
    if type(lab['Label'])==dict:
        res = lab['Label']
        res['external_id'] = lab['External ID']
        res['study'] = lab['External ID'].split('_')[0]
        res['patient_type'] = type_map[lab['Project Name']]
        labels_parsed.append(res)

In [8]:
df_labs = pd.DataFrame(labels_parsed)
df_labs

Unnamed: 0,view,mode,diagnosis,external_id,study,patient_type
0,pdaView,2d,[],study1_dicom8,study1,pda
1,pdaView,2d,[],study1_dicom10,study1,pda
2,pdaView,color,[],study1_dicom11,study1,pda
3,nonPDAView,2d,,study1_dicom1,study1,pda
4,nonPDAView,2d,[],study1_dicom2,study1,pda
...,...,...,...,...,...,...
2277,pdaRelatedView,color_compare,[],study15_dicom54,study15,nopda
2278,nonPDAView,color,[],study69_dicom50,study69,nopda
2279,pdaRelatedView,color,[],study19_dicom75,study19,nopda
2280,nonPDAView,color_compare,[],study42_dicom94,study42,nopda


In [9]:
# ensure uniqueness of external id among each patient_type
assert not (df_labs.groupby(['patient_type', 'external_id']).size()>1).any(), "patient_type-external_id pairs must be unique"

In [10]:
df_labs.groupby(['patient_type']).size()

patient_type
nopda    1153
pda      1129
dtype: int64

In [11]:
df_labs.groupby(['view']).size()

view
nonPDAView        1501
pdaRelatedView     451
pdaView            326
dtype: int64

In [12]:
df_labs.groupby(['mode']).size()

mode
2d               979
color            761
color_compare    540
dtype: int64

In [13]:
df_labs.groupby(['patient_type', 'view', 'mode']).size()

patient_type  view            mode         
nopda         nonPDAView      2d               378
                              color            261
                              color_compare    177
              pdaRelatedView  2d                86
                              color             78
                              color_compare     57
              pdaView         2d                31
                              color             35
                              color_compare     49
pda           nonPDAView      2d               329
                              color            250
                              color_compare    106
              pdaRelatedView  2d                93
                              color             76
                              color_compare     61
              pdaView         2d                61
                              color             61
                              color_compare     89
dtype: int64

# Merge datasets

In [14]:
df_model = df_labs.merge(df_vids, how='inner')

In [15]:
df_model

Unnamed: 0,view,mode,diagnosis,external_id,study,patient_type,mp4_path
0,pdaView,2d,[],study1_dicom8,study1,pda,/mnt/data/pda/superior_views/PDA/study1_dicom8...
1,pdaView,2d,[],study1_dicom10,study1,pda,/mnt/data/pda/superior_views/PDA/study1_dicom1...
2,pdaView,color,[],study1_dicom11,study1,pda,/mnt/data/pda/superior_views/PDA/study1_dicom1...
3,nonPDAView,2d,,study1_dicom1,study1,pda,/mnt/data/pda/superior_views/PDA/study1_dicom1...
4,nonPDAView,2d,[],study1_dicom2,study1,pda,/mnt/data/pda/superior_views/PDA/study1_dicom2...
...,...,...,...,...,...,...,...
2259,pdaRelatedView,color_compare,[],study15_dicom54,study15,nopda,/mnt/data/pda/superior_views/non_PDA/study15_d...
2260,nonPDAView,color,[],study69_dicom50,study69,nopda,/mnt/data/pda/superior_views/non_PDA/study69_d...
2261,pdaRelatedView,color,[],study19_dicom75,study19,nopda,/mnt/data/pda/superior_views/non_PDA/study19_d...
2262,nonPDAView,color_compare,[],study42_dicom94,study42,nopda,/mnt/data/pda/superior_views/non_PDA/study42_d...


# Preprocess frames

In [16]:
def my_line(x_coord, slope, intercept):
    return x_coord*slope + intercept

def mask_image(img, slope, intercept):
    img = img.clone()

    # list all x and y coordinate indices
    y_coordinates = torch.arange(0, img.shape[-2])
    x_coordinates = torch.arange(0, img.shape[-1])

    # compute the y-values associated with given slope and intercept
    # for each x coordinate
    y_line_points = my_line(x_coordinates, slope=slope, intercept=intercept)

    # locate all pixels with y coordinates less than line values
    mask = y_coordinates[:,None] < y_line_points 

    # make symmetric mask by flipping
    mask = mask | mask.flip(-1)

    # expand mask dimensions to match image dimensions
    expanded_mask = mask[None].expand(img.shape[0],img.shape[1],-1, -1)

    # zero out image in masked regions
    img[expanded_mask] = 0

    return img

In [17]:
def preproc(frames):

    frames = frames.permute(0,3,1,2)
    
    #Resize
    splitFrames = torch.split(frames, 10)
    resizedFrames = []
    aspectRatio = (frames.shape[3])/(frames.shape[2])
    for f in splitFrames:
        h = int(1.2 * res_y)
        w = int(h*aspectRatio)
        f = TF.resize(f, (h, w))
        f = TF.center_crop(f, (res_x, res_y))
        
        resizedFrames.append(f)

    resizedFrames = torch.concat(resizedFrames)
    
    #Image Masking
    yIntercept = int(resizedFrames.shape[2]/3.2)
    finalResult = mask_image(resizedFrames, -2, yIntercept)
    
    #tor
    return finalResult

In [18]:
video_metadata = []
for ix, row in df_model.iterrows():
    print(" "*100, end='\r')
    print(f"Converting video {ix+1} of {len(df_model)}.", end="\r")
    
    frames, _, _ =  torchvision.io.read_video(row['mp4_path'])
    
    # store some video metadata
    video_metadata.append({'external_id': row['external_id'], 'patient_type': row['patient_type'], 'num_frames': frames.shape[0]})
    
    frames = preproc(frames)
    
    # save individual frames
    for ix, f in enumerate(frames):
        filename = f"{export_dir}{row['patient_type']}_{row['external_id']}_frame{str(ix).zfill(4)}.jpg"
        torchvision.io.write_jpeg(f, filename)

Converting video 1 of 2264.                                                                         



Converting video 81 of 2264.                                                                        

KeyboardInterrupt: 

In [26]:
df_vid_meta = pd.DataFrame(video_metadata)
df_vid_meta

Unnamed: 0,external_id,patient_id,num_frames
0,study1_dicom8,pda,134
1,study1_dicom10,pda,100
2,study1_dicom11,pda,70
3,study1_dicom1,pda,109
4,study1_dicom2,pda,72
...,...,...,...
1930,study15_dicom54,nopda,44
1931,study69_dicom50,nopda,40
1932,study19_dicom75,nopda,45
1933,study42_dicom94,nopda,86


In [27]:
df_vid_meta = df_vid_meta.rename(columns={'patient_id': 'patient_type'})
df_vid_meta

Unnamed: 0,external_id,patient_type,num_frames
0,study1_dicom8,pda,134
1,study1_dicom10,pda,100
2,study1_dicom11,pda,70
3,study1_dicom1,pda,109
4,study1_dicom2,pda,72
...,...,...,...
1930,study15_dicom54,nopda,44
1931,study69_dicom50,nopda,40
1932,study19_dicom75,nopda,45
1933,study42_dicom94,nopda,86


# Prepare CSV data tables

We will prepare the following metadata tables:
1. study.csv -- each row corresponds to a particular study. We need this in order to correctly split studies into train/test partitions. Indexed by `patient_type`-`study` pairs. Note: `study` alone is not unique.
2. video.csv -- each row corresponds to a particular video from a study. We use this to store the video metadata since our annotations are at the video level. `patient_type`-`external_id` pairs. 
3. frame.csv -- each row corresponds to a particular frame from a video. This contains the filepaths to the frame images. Can be merged with video on `patient_type`-`external_id`.

### 3. frame.csv

In [74]:
frames = glob.glob(export_dir + '*.png')

def parse_path(fp):
    fn = fp.split('/')[-1]
    
    return dict(
        patient_type = fn.split('_')[0],
        external_id = '_'.join(fn.split('_')[1:3]),
        png_path = fp
    )

df_frames = pd.DataFrame([parse_path(p) for p in frames])

df_frames

Unnamed: 0,patient_type,external_id,png_path
0,nopda,study30_dicom52,/mnt/data/pda/model_data/20220817/nopda_study3...
1,pda,study10_dicom62,/mnt/data/pda/model_data/20220817/pda_study10_...
2,pda,study22_dicom78,/mnt/data/pda/model_data/20220817/pda_study22_...
3,pda,study3_dicom1,/mnt/data/pda/model_data/20220817/pda_study3_d...
4,pda,study17_dicom40,/mnt/data/pda/model_data/20220817/pda_study17_...
...,...,...,...
176688,nopda,study19_dicom75,/mnt/data/pda/model_data/20220817/nopda_study1...
176689,nopda,study47_dicom3,/mnt/data/pda/model_data/20220817/nopda_study4...
176690,pda,study3_dicom31,/mnt/data/pda/model_data/20220817/pda_study3_d...
176691,nopda,study27_dicom66,/mnt/data/pda/model_data/20220817/nopda_study2...


In [75]:
df_frames.to_csv('../label_data/20220817_frame.csv', index=False)

### 2. video.csv

In [85]:
df_video = df_vid_meta.merge(df_model, how='inner')

# drop if missing view or mode
df_video = df_video.dropna(subset=['view', 'mode'])

df_video

Unnamed: 0,external_id,patient_type,num_frames,view,mode,diagnosis,study,mp4_path
0,study1_dicom8,pda,134,pdaView,2d,[],study1,/mnt/data/pda/h264_superior_views/PDA/study1_d...
1,study1_dicom10,pda,100,pdaView,2d,[],study1,/mnt/data/pda/h264_superior_views/PDA/study1_d...
2,study1_dicom11,pda,70,pdaView,color,[],study1,/mnt/data/pda/h264_superior_views/PDA/study1_d...
3,study1_dicom1,pda,109,nonPDAView,2d,,study1,/mnt/data/pda/h264_superior_views/PDA/study1_d...
4,study1_dicom2,pda,72,nonPDAView,2d,[],study1,/mnt/data/pda/h264_superior_views/PDA/study1_d...
...,...,...,...,...,...,...,...,...
1930,study15_dicom54,nopda,44,pdaRelatedView,color_compare,[],study15,/mnt/data/pda/h264_superior_views/non_PDA/stud...
1931,study69_dicom50,nopda,40,nonPDAView,color,[],study69,/mnt/data/pda/h264_superior_views/non_PDA/stud...
1932,study19_dicom75,nopda,45,pdaRelatedView,color,[],study19,/mnt/data/pda/h264_superior_views/non_PDA/stud...
1933,study42_dicom94,nopda,86,nonPDAView,color_compare,[],study42,/mnt/data/pda/h264_superior_views/non_PDA/stud...


In [91]:
df_video['mode'].value_counts()

2d               829
color            630
color_compare    472
Name: mode, dtype: int64

In [87]:
df_video.to_csv('../label_data/20220817_video.csv', index=False)

### 1. study.csv

In [88]:
# construct a study table. 
df_study = df_video.groupby(['patient_type', 'study'], as_index=False).agg({'external_id': 'nunique', 'num_frames': 'sum'}).rename(columns={'external_id': 'num_videos'})
df_study

Unnamed: 0,patient_type,study,num_videos,num_frames
0,nopda,study1,10,534
1,nopda,study10,17,1124
2,nopda,study11,8,595
3,nopda,study12,7,413
4,nopda,study13,12,847
...,...,...,...,...
98,pda,study5,48,5539
99,pda,study6,30,2102
100,pda,study7,24,2392
101,pda,study8,46,4244


In [90]:
df_study.to_csv('../label_data/20220817_study.csv', index=False)