In [7]:
import os
import csv
import random
from collections import defaultdict

# This will be used later on when storing the annotation data in a useful way.
def default_inner_dict():
    return {
        'SOPInstanceUID': set(),
        'free_fluid_label': 0,
        'frame_count': 0
           }

In [2]:
image_metadata_fname = '/scratch/users/austin.zane/ucsf_fast/data/labeled_fast_morison/free_fluid_labels.csv'

with open(image_metadata_fname, 'r') as file:
    # Rownames should be ['filename', 'free_fluid_label', 'creator_id', 'dataset_name', 'study_id'].
    csv_reader = csv.DictReader(file)
    
    # We only care about these three columns.
    image_fnames = [(row['study_id'], row['filename'], row['free_fluid_label']) for row in csv_reader]

Notes for the following data processing steps. 
1. For now, I want to exclude the positive series with SOPInstanceUID = 1.2.840.114340.3.48100016190144.3.20201005.183539.6271.4. The annotation is very messy and would only serve to hurt performance. The main issue is that it is no longer a worm and is instead a giant blob that deviates significantly from Morison's Pouch. This is a model made to detect bleeding in Morison's Pouch, specifically. This means that there could very well be bleeding nearby that a human would catch but this model will miss.

2. There are some studies that contain both positive and negative series. I'm going to split the data into train/validation/test on a study-by-study level and do it separately for positives and negatives to preserve the ratio, but this is made more difficult when there are dependencies between series that prevent us from labeling an entire study as positive or negative. As a result, I will ignore the negative worm annotations in these instances because the postive annotations are much more valuable.

In [33]:
# Get list of study IDs that contain positive images so I know to ignore negative annotations in these studies.
pos_exam_names = set()

for study_id, fname, label in image_fnames:
    if label == '1':
        pos_exam_names.add(study_id)

In [37]:
exam_dict = defaultdict(default_inner_dict)

for study_id, fname, label in image_fnames:
    if fname.split('_')[0] == '1.2.840.114340.3.48100016190144.3.20201005.183539.6271.4':
        bad_frames+=1
        continue
    if study_id in pos_exam_names and label == '-1':
        continue
    
    exam_dict[study_id]['SOPInstanceUID'].add(fname)
    exam_dict[study_id]['frame_count'] += 1
    
    if exam_dict[study_id]['free_fluid_label'] * int(label) == -1:
        print('Problem with label')
    elif exam_dict[study_id]['free_fluid_label'] == 0:
        exam_dict[study_id]['free_fluid_label'] = label

Quick check to see how many of each type of study I have and how many frames.

In [48]:
pos_exams = {}
total_frames_pos = 0

for k in exam_dict:
    if exam_dict[k]['free_fluid_label'] == '1':
        pos_exams[k] = exam_dict[k]
        print(f'Study: {k}, Count: {pos_exams[k]['frame_count']}')
        total_frames_pos += pos_exams[k]['frame_count']
print(f'\nTotal number of studies is: {len(pos_exams)}')
print(f'Total number of frames is: {total_frames_pos}\n')

neg_exams = {}
total_frames_neg = 0

for k in exam_dict:
    if exam_dict[k]['free_fluid_label'] == '-1':
        neg_exams[k] = exam_dict[k]
        print(f'Study: {k}, Count: {neg_exams[k]['frame_count']}')
        total_frames_neg += neg_exams[k]['frame_count']
print(f'\nTotal number of studies is: {len(neg_exams)}')
print(f'Total number of frames is: {total_frames_neg}\n')

Study: 1.2.124.113532.80.22017.45499.20181212.101448.4178768, Count: 85
Study: 1.2.124.113532.80.22185.43466.20201005.192348.26378127, Count: 120
Study: 1.2.840.114340.3.8251017179172.1.20150128.164353.2928, Count: 176
Study: 1.2.840.114340.3.8251017179172.1.20150413.182024.3279, Count: 116
Study: 1.2.840.114340.3.8251017179172.1.20140531.233136.1747, Count: 86
Study: 1.2.840.114340.3.8251017179172.1.20141207.22848.2724, Count: 27
Study: 1.2.840.114340.3.8251017179172.1.20150117.145059.2895, Count: 107
Study: 1.2.840.114340.3.8251017179172.1.20150110.210256.2861, Count: 40

Total number of studies is: 8
Total number of frames is: 757

Study: 1.2.840.114340.3.48100023067031.1.20191026.182159.266, Count: 3
Study: 1.3.6.1.4.1.30071.8.345051894651.5889832971367579, Count: 1
Study: 1.2.840.114340.3.8251050064157.1.20180731.171754.506, Count: 1
Study: 1.3.6.1.4.1.30071.8.345051894651.5975873832447853, Count: 1
Study: 2.25.307180669327115312541721960352445902433, Count: 1
Study: 1.3.6.1.4.1.3

In [71]:
pos_exam_names = list(pos_exams.keys())
neg_exam_names = list(neg_exams.keys())

random.shuffle(pos_exam_names)
random.shuffle(neg_exam_names)

# We know we want 3, 2, 3 for positives. Get proportional indices for negatives.
n = float(len(neg_exam_names))
train_end = int(n * (3.0/8.0))
val_end = int(n * (5.0/8.0))

train_pos, val_pos, test_pos = pos_exam_names[:3], pos_exam_names[3:5], pos_exam_names[5:]
train_neg, val_neg, test_neg = neg_exam_names[:train_end], neg_exam_names[train_end:val_end], neg_exam_names[val_end:]


In [54]:
def count_frames(study_list):
    total = 0
    for k in study_list:
        total += exam_dict[k]['frame_count']
    return total
    

In [72]:
print(f'Positive sets')
print(f'Num of pos train frames: {count_frames(train_pos)}')
print(f'Num of pos vali. frames: {count_frames(val_pos)}')
print(f'Num of pos test frames: {count_frames(test_pos)}')


print(f'Negative sets')
print(f'Num of neg train frames: {count_frames(train_neg)}')
print(f'Num of neg vali. frames: {count_frames(val_neg)}')
print(f'Num of neg test frames: {count_frames(test_neg)}')

Positive sets
Num of pos train frames: 343
Num of pos vali. frames: 112
Num of pos test frames: 302
Negative sets
Num of neg train frames: 194
Num of neg vali. frames: 95
Num of neg test frames: 190


In [92]:
print(f'Train:\n')
print(f'train = [')
for i in train_pos:
    print(f"\t'{i}',")
for i in train_neg:
    print(f"\t'{i}',")
print(f']\n')
    
print(f'Validation:\n')
print(f'val = [')
for i in val_pos:
    print(f"\t'{i}',")
for i in val_neg:
    print(f"\t'{i}',")
print(f']\n')
    
print(f'Test:\n')
print(f'test = [')
for i in test_pos:
    print(f"\t'{i}',")
for i in test_neg:
    print(f"\t'{i}',")
print(f']\n')



Train:

train = [
	'1.2.840.114340.3.8251017179172.1.20150117.145059.2895',
	'1.2.124.113532.80.22185.43466.20201005.192348.26378127',
	'1.2.840.114340.3.8251017179172.1.20150413.182024.3279',
	'1.3.6.1.4.1.30071.8.345051894651.5792422788785640',
	'2.25.177088945512522741463896912923250539068',
	'1.3.6.1.4.1.30071.8.345051894651.5859709797197467',
	'1.3.6.1.4.1.30071.8.345051894651.5780083138551689',
	'1.3.6.1.4.1.30071.8.345051894651.5927222542216282',
	'1.2.840.114340.3.48100021226225.1.20190905.174151.2270',
	'1.3.6.1.4.1.30071.8.345051894651.5809582418560733',
	'1.3.6.1.4.1.30071.8.345051894651.5975873832447853',
	'1.2.840.114340.3.8251050064157.1.20181020.134451.813',
	'1.2.840.114340.3.8251050064157.1.20180802.120625.517',
	'1.3.6.1.4.1.30071.8.345051894651.5998089161221277',
	'1.2.840.114340.3.8251050064157.1.20190428.203851.1330',
	'1.3.6.1.4.1.30071.8.345051894651.5929714404515387',
	'2.25.307180669327115312541721960352445902433',
	'1.2.840.114340.3.48100023067031.1.20191110.3

The above have been placed in the main config file. Next, I need to get a new visualization set from the validation set.

In [97]:
vis_exams = random.sample(val_pos, 2) + random.sample(val_neg, 3)

In [105]:
vis_series = []
for e in vis_exams:
    vis_series.append(random.choice(list(exam_dict[e]['SOPInstanceUID'])))

In [106]:
vis_series

['1.2.840.114340.3.8251017179172.3.20141207.22903.11967.6_16.jpg',
 '1.2.840.114340.3.48100016196004.3.20181212.112840.4792.4_68.jpg',
 '1.2.840.114340.3.48100021226225.3.20190716.165507.5358.4_167.jpg',
 '1.2.840.114340.3.48100027231036.3.20180619.193417.1817.4_82.jpg',
 '1.2.840.114340.3.8251050064157.3.20181111.150608.3461.6_3.jpg']