In [1]:
import shutil
import pandas as pd
import os
import glob
import random 

def construct_class_dict(csv_path, mode, include_M_3 = True):
    """Group the record IDs of a labelling CSV by Mandard class.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file readable by ``pandas.read_csv``. It must contain the columns
        ``Mandard`` and ``record_id``.
    mode : {'all', 'bin'}
        ``'all'``: one entry per selected Mandard label (see the note in the
        body about how the labels are selected).
        ``'bin'``: two entries, ``'True'`` (Mandard I and II) and ``'False'``
        (Mandard IV and V, plus III when ``include_M_3`` is truthy).
    include_M_3 : bool, default True
        Only used in ``'bin'`` mode. When false, Mandard III records are left
        out of both classes.

    Returns
    -------
    dict
        Maps each class name to the list of ``record_id`` values of the
        matching rows, in file order.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'all'`` nor ``'bin'`` (previously this
        silently returned an empty dict).
    """
    if mode not in ('all', 'bin'):
        raise ValueError(f"mode must be 'all' or 'bin', got {mode!r}")

    data = pd.read_csv(csv_path)
    class_dict = {}

    if mode == 'all':
        # Positions into the unique Mandard values, in order of first appearance.
        # NOTE(review): these positions depend on the row order of the CSV;
        # presumably they pick the five Mandard grades and skip other/blank
        # labels - confirm against the sheet. Fewer than seven distinct values
        # raises IndexError.
        vals = [1,2,3,5,6]
        classes = data['Mandard'].unique()[vals]
        print(classes)

        for class_name in classes:
            # A boolean mask (rather than a string-built query) also works for
            # labels that contain quote characters.
            class_dict[class_name] = list(data.loc[data['Mandard'] == class_name, 'record_id'])

    else:
        true_labels = [
            'I - No residual cancer',
            'II - rare residual cancer cells',
        ]
        false_labels = [
            'IV - Residual cancer outgrowing fibrosis',
            'V - Absence of regressive changes',
        ]
        if include_M_3:
            false_labels.insert(0, 'III - Fibrosis outgrowing residual cancer')

        class_dict['True'] = list(data.loc[data['Mandard'].isin(true_labels), 'record_id'])
        class_dict['False'] = list(data.loc[data['Mandard'].isin(false_labels), 'record_id'])

    return class_dict

def make_directories(class_dict, target_directory):
    """Create one sub-directory per class name under ``target_directory``.

    Parameters
    ----------
    class_dict : dict
        Only the keys are used; each key becomes a directory name.
    target_directory : str
        Parent directory. It is created (including intermediate directories)
        if it does not exist yet.

    Directories that already exist are left untouched, so the call is safe to
    repeat.
    """
    for class_name in class_dict:
        # makedirs + exist_ok replaces the check-then-mkdir pair: it also
        # creates a missing parent instead of raising FileNotFoundError.
        os.makedirs(os.path.join(target_directory, class_name), exist_ok=True)
        
def sort_subjects(class_dict, origin_dir, target_dir):
    """Copy each subject's files from ``origin_dir`` into its class folder.

    For every ``subject`` listed under ``class_name`` in ``class_dict``, all
    entries in ``origin_dir`` whose name starts with ``"<subject>_"`` are
    copied to ``<target_dir>/<class_name>/`` under their original file name.

    Parameters
    ----------
    class_dict : dict
        Maps class name to an iterable of subject identifiers. Identifiers are
        converted with ``str`` so numeric IDs work too.
    origin_dir : str
        Directory holding the source files.
    target_dir : str
        Directory that holds (or will hold) one sub-directory per class.
    """
    for class_name, class_list in class_dict.items():
        class_dir = os.path.join(target_dir, class_name)
        # Ensure the destination exists so copyfile cannot fail on a missing folder.
        os.makedirs(class_dir, exist_ok=True)

        for subject in class_list:
            # Escape the fixed part of the pattern: glob metacharacters such as
            # '[' in origin_dir would otherwise silently match nothing.
            subject_prefix = glob.escape(os.path.join(origin_dir, str(subject)))

            for file in glob.glob(f'{subject_prefix}_*'):
                # basename is separator-agnostic, unlike splitting on '/'.
                shutil.copyfile(file, os.path.join(class_dir, os.path.basename(file)))

In [24]:
# Build the binary ('True'/'False') class dictionary from the labelling sheet
# and report how many record IDs fall into the 'False' class.
deepslide_dir = '/data/gpfs/projects/punim2070/code/deepslide/deepslide'
csv_path = os.path.join(
    deepslide_dir,
    'OES_tools',
    'Slide Scanning Log.xlsx - Labelling.csv',
)
class_dict = construct_class_dict(csv_path, mode='bin')
print(len(class_dict['False']))


39


In [2]:
# Paths used by the sorting workflow: source images, label sheet, output folder.
mode = 'bin'
origin_dir = '/data/gpfs/projects/punim2070/data_jpg/OAC Slides JPEG Export_downsampled10'
deepslide_dir = '/data/gpfs/projects/punim2070/code/deepslide/deepslide'
target_dir = os.path.join(deepslide_dir, 'all_wsi')
csv_path = os.path.join(
    deepslide_dir,
    'OES_tools',
    'Slide Scanning Log.xlsx - Labelling.csv',
)

# Sanity check: each of the three paths should print True.
print(os.path.exists(target_dir), '\n', os.path.exists(csv_path), '\n', os.path.exists(origin_dir), '\n')

class_dict = construct_class_dict(csv_path, mode=mode)

True 
 True 
 True 



In [2]:
deepslide_dir = '/data/gpfs/projects/punim2070/code/deepslide/deepslide'
csv_path = os.path.join(deepslide_dir, 'OES_tools', 'Slide Scanning Log.xlsx - Labelling.csv')
mode = 'bin'
class_dict = construct_class_dict(csv_path, mode)

# Fixed seed so that re-running the cell reproduces the same split.
SPLIT_SEED = 0
# Number of subjects per class held out for each of the test and validation sets.
N_HELD_OUT_PER_CLASS = 2


def split_subjects(class_dict, n_held_out, seed):
    """Split every class of ``class_dict`` into train / validation / test lists.

    For each class, ``n_held_out`` subjects are sampled for the test set, then
    another ``n_held_out`` are sampled from the remainder for the validation
    set; whatever is left becomes the training set. ``class_dict`` itself is
    not modified.

    Parameters
    ----------
    class_dict : dict
        Maps class name to a list of hashable subject identifiers.
    n_held_out : int
        Subjects per class to draw for each of the test and validation sets.
    seed : int
        Seed for the private random generator used for sampling.

    Returns
    -------
    tuple of dict
        ``(train, val, test)``, each mapping class name to a list of subjects.

    Raises
    ------
    ValueError
        Propagated from ``random.Random.sample`` when a class has fewer
        subjects than are requested.
    """
    # A private generator keeps the split independent of the global random state.
    rng = random.Random(seed)
    train, val, test = {}, {}, {}

    for class_name, subjects in class_dict.items():
        test[class_name] = rng.sample(subjects, n_held_out)
        held_out = set(test[class_name])
        remaining = [subject for subject in subjects if subject not in held_out]

        val[class_name] = rng.sample(remaining, n_held_out)
        held_out = set(val[class_name])
        train[class_name] = [subject for subject in remaining if subject not in held_out]

    return train, val, test


train_class_dict, val_class_dict, test_class_dict = split_subjects(
    class_dict, N_HELD_OUT_PER_CLASS, SPLIT_SEED
)

print(train_class_dict)
print(val_class_dict)
print(test_class_dict)



{'True': ['8', '9', '39', '48', '60', '73', '77', '121', '131', '151', '159', '161', '162', '179', '188', '194', '261', '280', '318', '338'], 'False': ['5', '6', '20', '27', '56', '68', '82', '83', '91', '93', '98', '111', '119', '124', '128', '136', '146', '147', '148', '153', '157', '160', '163', '167', '174', '177', '193', '202', '213', '228', '248', '294', '295', '304', '343']}
{'True': ['21', '125'], 'False': ['118', '62']}
{'True': ['352', '175'], 'False': ['206', '26']}


In [5]:
# Hand-written train/val/test class dicts. They rebind the same three names
# that the random-split cell assigns, so whichever cell ran last wins.
# Author's note: manually altered class dicts which contain the Mandard 1s and
# 5s (to check whether the previously used Mandard grades are ambiguous).
# NOTE(review): the ID strings are presumably record_id values from the
# labelling CSV - confirm they are still in sync with the sheet.
train_class_dict = {'True': ['8', '9', '48', '73', '77', '121', '125', '131', '151', '159', '161', '162', '175', '179', '188', '194', '261', '318', '338', '352'], 'False': ['5', '6', '20', '26', '27', '56', '62', '68', '83', '91', '98', '111', '118', '119', '124', '128', '136', '147', '148', '153', '157', '160', '163', '167', '174', '177', '193', '202', '206', '213', '228', '248', '294', '295', '304']}
val_class_dict = {'True': ['21','280'], 'False': ['82', '146']}
test_class_dict = {'True': ['60', '39'], 'False': ['93', '343']}

In [8]:
# Copy every subject's files into <deepslide_dir>/<split folder>/<class name>/.
directories = {'wsi_train_1': train_class_dict, 'wsi_val_1': val_class_dict, 'wsi_test_1': test_class_dict}
origin_dir = '/data/gpfs/projects/punim2070/data_jpg/OAC Slides JPEG Export_downsampled10'

# The loop variable is deliberately not named class_dict, so the notebook-level
# class_dict is not overwritten by the last split processed.
for folder, split_class_dict in directories.items():
    target_dir = os.path.join(deepslide_dir, folder)
    # Ensure the split folder itself exists before class folders are created in it.
    os.makedirs(target_dir, exist_ok=True)
    make_directories(split_class_dict, target_dir)
    sort_subjects(split_class_dict, origin_dir, target_dir)
    

In [70]:
# Print the number of entries in each class folder: train/True, train/False,
# val/True, val/False (one count per line, in that order).
for split_name in ('train', 'val'):
    for label in ('True', 'False'):
        print(len(os.listdir(f'/data/gpfs/projects/punim2070/code/deepslide/deepslide/train_folder/{split_name}/{label}')))

51294
51294
8263
3665


In [15]:
# The text before the first underscore of each file name is collected per
# class folder; the resulting sets show which subjects are in the validation set.
true_subjects = {
    file_name.split('_')[0]
    for file_name in os.listdir('/data/gpfs/projects/punim2070/code/deepslide/deepslide/train_folder/val/True')
}
print(f'The TRUE subjects in the validation set are: {true_subjects}')

false_subjects = {
    file_name.split('_')[0]
    for file_name in os.listdir('/data/gpfs/projects/punim2070/code/deepslide/deepslide/train_folder/val/False')
}
print(f'The FALSE subjects in the validation set are: {false_subjects}')

The TRUE subjects in the validation set are: {'161', '151'}
The FALSE subjects in the validation set are: {'213', '124'}
