In [1]:
import csv
# Load every grading record. Later cells unpack each row as
# (filename, grader, grade).
# newline='' is what the csv module asks for when it is handed a file object:
# it lets the reader see the raw line endings so that newlines embedded in
# quoted fields are parsed correctly.
with open('original_gradings.csv', 'r', newline='') as f:
    all_gradings = list(csv.reader(f))

In [2]:
# Distinct grader identifiers that occur in the grading table.
{grader_id for _filename, grader_id, _grade in all_gradings}

{'CH', 'GA', 'JT', 'KP', 'SB'}

In [3]:
# Junior graders; 'GA' is the one remaining grader found in the data and is
# appended last so it ends up as the final grader column.
juniors = ('CH', 'JT', 'KP', 'SB')
all_graders = (*juniors, 'GA')

In [4]:
# Build, in one pass over the records:
#   all_files          - the set of graded filenames
#   all_gradings_index - (filename, grader) -> grade lookup
all_files = set()
all_gradings_index = {}
for filename, grader, grade in all_gradings:
    all_files.add(filename)
    all_gradings_index[filename, grader] = grade

# If the last two numbers match, no (filename, grader) pair occurs twice.
len(all_files), len(all_gradings), len(all_gradings_index)

(7414, 22252, 22252)

In [5]:
def get_group_label(filename):
    """Return the group label for one file.

    The label is the juniors' grade when at least two junior graders graded
    the file and every one of them gave the same grade. In all other cases
    (a single junior grade, none at all, or any disagreement) the grade given
    by 'GA' is returned.

    Reads the module-level ``juniors`` and ``all_gradings_index``.
    Raises KeyError if the fallback is needed and 'GA' has no grade for the file.
    """
    junior_votes = []
    for grader in juniors:
        key = (filename, grader)
        if key in all_gradings_index:
            junior_votes.append(all_gradings_index[key])

    unanimous = len(junior_votes) >= 2 and len(set(junior_votes)) == 1
    if unanimous:
        return junior_votes[0]

    # Not enough junior grades, or the juniors disagree: defer to GA.
    return all_gradings_index[filename, 'GA']
    

In [6]:
# Smoke check on a single file. The file is taken from the sorted names
# because the iteration order of a set of strings can change between kernel
# restarts (hash randomisation), which would make this output unreproducible.
get_group_label(sorted(all_files)[0])

'normal'

In [7]:
# Per-file labels under the two labelling schemes:
#   group_labels - junior consensus with GA as fallback (get_group_label)
#   ga_labels    - GA's grade alone
group_labels = {}
ga_labels = {}
for image_name in all_files:
    group_labels[image_name] = get_group_label(image_name)
    ga_labels[image_name] = all_gradings_index[image_name, 'GA']

In [8]:
from collections import Counter
# Class balance under the group labelling.
group_label_counts = Counter(group_labels.values())
group_label_counts

Counter({'normal': 6385, 'pre-plus': 235, 'ungradable': 564, 'plus': 230})

In [9]:
# Class balance when only GA's grades are used.
ga_label_counts = Counter(ga_labels.values())
ga_label_counts

Counter({'normal': 6898, 'ungradable': 232, 'plus': 197, 'pre-plus': 87})

## Eye laterality

In [10]:
import pandas as pd 
def clean_path(filepath):
    """Map a path from a grading sheet onto a filename known in ``all_files``.

    Only the part after the last backslash is used. That name is tried, in
    this order, as-is, without its first two characters, with '.png'
    appended, and without its last character; the first variant present in
    the module-level ``all_files`` is returned.

    Raises IndexError when none of the variants is a known file.
    """
    basename = filepath.split('\\')[-1]
    variants = (
        basename,
        basename[2:],
        basename + '.png',
        basename[:-1],
    )
    known = [candidate for candidate in variants if candidate in all_files]
    return known[0]

# Spellings accepted for each eye, mapped to a one-letter code. Built once
# rather than on every call. Matching is exact (case- and space-sensitive).
_LATERALITY_CODES = {
    'Right': 'R',
    'right': 'R',
    'R': 'R',

    'L': 'L',
    'Left': 'L',
    'left': 'L',
    'lef': 'L',
    'left ': 'L',
}


def clean_laterality(laterality):
    """Normalise a raw laterality entry to 'R', 'L' or '?'.

    Parameters
    ----------
    laterality : object
        Raw cell value; any value that is not one of the known spellings
        (presumably including NaN from empty spreadsheet cells) is treated
        as unknown.

    Returns
    -------
    str
        'R' or 'L' for a recognised spelling, '?' otherwise.
    """
    try:
        return _LATERALITY_CODES[laterality]
    except (KeyError, TypeError):
        # KeyError: value is not a known spelling.
        # TypeError: value is unhashable and cannot be looked up at all.
        # Only these two are caught so that KeyboardInterrupt and other
        # unrelated errors are no longer silently swallowed.
        return '?'

from collections import defaultdict

# Collect, for every image, the laterality entered by each junior grader.
# One workbook per junior is read from grading_folder.
# NOTE(review): grading_folder is an absolute path on one specific machine;
# the cell cannot run elsewhere without editing it.
grading_folder = '/media/QNAP/People/GongyuZhang/ROP/data/raw/Grading/gradingonIGC/'
eye_grading = defaultdict(list)

for grader in juniors:
    orig_xls = '{}{}_ROP_grading.xlsx'.format(grading_folder, grader)
    df = pd.read_excel(orig_xls, engine='openpyxl', index_col=False)

    for _, row in df.iterrows():
        # Normalise both fields before storing: the path becomes a known
        # filename, the laterality becomes 'R', 'L' or '?'.
        path = clean_path(row.Filepath)
        laterality = clean_laterality(row.Laterality)
        eye_grading[path].append(laterality)

# How often each (sorted) combination of recorded lateralities occurs.
laterality_patterns = Counter(tuple(sorted(votes)) for votes in eye_grading.values())
laterality_patterns

Counter({('R', 'R'): 2494,
         ('L', 'R'): 836,
         ('L', 'L'): 3089,
         ('?', 'R'): 351,
         ('?', 'L'): 279,
         ('?', '?'): 351,
         ('L', 'L', 'L', 'L'): 4,
         ('L', 'L', 'L', 'R'): 3,
         ('R', 'R', 'R', 'R'): 3,
         ('L', 'R', 'R', 'R'): 1,
         ('?', '?', 'R', 'R'): 1,
         ('L', 'L', 'R'): 1,
         ('R',): 1})

In [11]:
# Final laterality for each observed (sorted) combination of recorded values,
# grouped here by outcome. A plain two-way L/R disagreement and a pair of
# unknowns stay unknown ('?'); otherwise the only or most frequent known eye
# wins.
laterality_replace = {
    # resolved to the right eye
    **dict.fromkeys(
        [
            ('R', 'R'),
            ('?', 'R'),
            ('R', 'R', 'R', 'R'),
            ('L', 'R', 'R', 'R'),
            ('?', '?', 'R', 'R'),
            ('R',),
        ],
        'R',
    ),
    # resolved to the left eye
    **dict.fromkeys(
        [
            ('L', 'L'),
            ('?', 'L'),
            ('L', 'L', 'L', 'L'),
            ('L', 'L', 'L', 'R'),
            ('L', 'L', 'R'),
        ],
        'L',
    ),
    # left unresolved
    **dict.fromkeys(
        [
            ('L', 'R'),
            ('?', '?'),
        ],
        '?',
    ),
}

In [12]:
# Resolve every image to a single laterality ('R', 'L' or '?').
# A combination missing from laterality_replace raises KeyError here.
laterality_index = {}
for path, votes in eye_grading.items():
    laterality_index[path] = laterality_replace[tuple(sorted(votes))]

# Make split

In [13]:
# Labels used as the reference for building the split: the group labels.
# This is an alias of the same dict, not a copy.
reference = group_labels

In [14]:
import numpy as np

In [15]:
# Group the resolved lateralities by patient. The patient id is the part of
# the filename before the first '.'.
patient_lateralities = defaultdict(list)
for image_name in reference:
    patient_id = image_name.split('.')[0]
    patient_lateralities[patient_id].append(laterality_index[image_name])

In [16]:
# Filenames chosen for the test set; select_files adds to this set in place.
test_set = set()
# Files that may still be drawn. Starts as every reference file and is
# replaced by the (possibly reduced) mapping that select_files returns.
eligible = reference

def select_files(criterium, eligible):
    """Draw one image (plus a fellow-eye image when available) into the test set.

    Parameters
    ----------
    criterium : callable
        Called with a grading; only files whose grading it accepts can be drawn.
    eligible : dict
        filename -> grading for the files that may still be drawn.

    Returns
    -------
    dict
        ``eligible`` itself, unchanged, when the drawn image has unknown
        laterality (nothing is selected, but one random draw is consumed);
        otherwise a new dict without any file of the selected patient.

    Side effects
    ------------
    Adds the selected filename(s) to the module-level ``test_set`` and
    advances NumPy's global random state. Reads the module-level
    ``laterality_index``.

    Raises
    ------
    ValueError
        From ``np.random.choice`` when no eligible file satisfies ``criterium``.
    """
    # Sorted so the draw depends only on the seed, not on dict ordering.
    candidates = sorted(name for name, grading in eligible.items()
                        if criterium(grading))

    filename = np.random.choice(candidates)

    # Explicit lookup instead of a bare except: an eye that is neither 'L'
    # nor 'R' has no fellow eye, and unrelated errors are no longer hidden.
    fellow_eye = {'L': 'R', 'R': 'L'}.get(laterality_index[filename])
    if fellow_eye is None:
        # Laterality is unknown: skip this draw, leave the pool as it is.
        return eligible

    test_set.add(filename)

    # The patient id is the part of the filename before the first '.'.
    p_id = filename.split('.')[0]

    # Images of the same patient's other eye that also satisfy the criterium.
    fellow_candidates = sorted(
        name for name, grading in eligible.items()
        if criterium(grading)
        and name.split('.')[0] == p_id
        and laterality_index[name] == fellow_eye
    )

    # Second draw only happens when a fellow-eye image exists.
    if fellow_candidates:
        test_set.add(np.random.choice(fellow_candidates))

    # Remove every file of this patient so the patient is never drawn again.
    return {name: grading for name, grading in eligible.items()
            if name.split('.')[0] != p_id}

# Fill the test set in two phases, each running until the set reaches the
# given size: first files graded 'plus' or 'pre-plus', then 'normal' files.
# One call can add up to two files, so a phase may overshoot its target by one.
np.random.seed(1)
for target_size, criterium in (
    (100, lambda grading: grading in ('plus', 'pre-plus')),
    (200, lambda grading: grading == 'normal'),
):
    while len(test_set) < target_size:
        eligible = select_files(criterium, eligible)

In [17]:
# (number of test files, number of distinct patients they belong to)
test_patients = {name.split('.')[0] for name in test_set}
len(test_set), len(test_patients)

(200, 112)

In [18]:
# Number of files left in the pool after the test-set draw.
len(eligible)

6620

In [19]:
# Group labels of the test files and of the files left in the pool
# (the development set).
test_set_index = {image: group_labels[image] for image in test_set}
development = {image: group_labels[image] for image in eligible}

# Everything that ended up in neither set.
excluded = set(reference) - test_set - set(development)

In [20]:
# Class balance of the test set (group labels).
test_label_counts = Counter(test_set_index.values())
test_label_counts

Counter({'plus': 51, 'normal': 100, 'pre-plus': 49})

In [21]:
# Class balance of the development set (group labels).
development_label_counts = Counter(development.values())
development_label_counts

Counter({'normal': 5877, 'pre-plus': 122, 'ungradable': 552, 'plus': 69})

## Split folds

In [22]:
# Assign development files to folds by patient, so that all files of one
# patient land in the same fold.
n_folds = 5
development_patients = {f.split('.')[0] for f in development}

# NOTE(review): the seed is set, but nothing random follows in this cell;
# np.array_split cuts the *sorted* patient ids into consecutive chunks.
# Confirm that an unshuffled assignment is intended.
np.random.seed(1)
patient_folds = np.array_split(sorted(development_patients), n_folds)

# patient id -> fold number
patient_fold_index = {}
for fold_number, fold_patients in enumerate(patient_folds):
    for patient in fold_patients:
        patient_fold_index[patient] = fold_number

# filename -> fold number, looked up via the file's patient id
fold_index = {}
for f in development:
    fold_index[f] = patient_fold_index[f.split('.')[0]]

In [23]:
# One output row per file:
#   (file, subset, <grade per grader in all_graders order>, group label)
header = ('file', 'subset', *all_graders, 'group')

final_split = []
for f in sorted(all_files):
    # Checked in this order: test set, then excluded, then development fold.
    if f in test_set:
        subset = 'test'
    elif f in excluded:
        subset = 'excluded'
    else:
        subset = 'fold_{}'.format(fold_index[f])

    # '?' stands in for a grader who has no grade for this file.
    graders_labels = tuple(all_gradings_index.get((f, grader), '?')
                           for grader in all_graders)
    final_split.append((f, subset, *graders_labels, group_labels[f]))

In [27]:
# Per-subset summary: file count, then the number of referable files
# ('plus' / 'pre-plus') and of the patients they belong to, once by the
# group label (last column) and once by GA's grade (second-to-last column).
referable_grades = ('plus', 'pre-plus')
summary_columns = (
    ('referable files (group)', 'referable patients (group)', -1),
    ('referable files (GA)', 'referable patients (GA)', -2),
)

subsets = {r[1] for r in final_split}
for subset in sorted(subsets):
    rows = [r for r in final_split if r[1] == subset]
    print(subset)
    print('files', len(rows))

    for files_title, patients_title, column in summary_columns:
        referable_files = [r[0] for r in rows if r[column] in referable_grades]
        referable_patients = {name.split('.')[0] for name in referable_files}
        print(files_title, len(referable_files))
        print(patients_title, len(referable_patients))
    print()
    

excluded
files 594
referable files (group) 174
referable patients (group) 51
referable files (GA) 127
referable patients (GA) 36

fold_0
files 1349
referable files (group) 48
referable patients (group) 18
referable files (GA) 25
referable patients (GA) 7

fold_1
files 1353
referable files (group) 42
referable patients (group) 18
referable files (GA) 16
referable patients (GA) 5

fold_2
files 1313
referable files (group) 33
referable patients (group) 11
referable files (GA) 21
referable patients (GA) 5

fold_3
files 1300
referable files (group) 35
referable patients (group) 16
referable files (GA) 19
referable patients (GA) 7

fold_4
files 1305
referable files (group) 33
referable patients (group) 13
referable files (GA) 19
referable patients (GA) 7

test
files 200
referable files (group) 100
referable patients (group) 61
referable files (GA) 57
referable patients (GA) 39



In [28]:
# Write the header row followed by one row per file.
# newline='' is what the csv module asks for on files it writes to: the
# writer emits its own line terminator, and without this flag platforms that
# translate '\n' (Windows) would produce '\r\r\n' and show blank lines
# between rows.
with open('data_split.csv', 'w', newline='') as f:
    csv.writer(f).writerows([header] + final_split)