In [1]:
import glob
import os
import shutil
from PIL import Image 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nibabel as nib
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight



from modules.scandata import MriScan, MriSlice, TumourSegmentation, ScanType, ScanPlane, PatientRecord
#from modules.exceptions import ScanDataDirectoryNotFound, ScanFileNotFound

# Seed for test/train split and dropping images for undersampling background cases
RSEED=78 

TESTSIZE=150

In [2]:
class ScanDataDirectoryNotFound(Exception):
    pass
class ScanFileNotFound(Exception):
    pass

In [3]:
# Create data frame of patients data files
raw_data_dir = os.path.join('data', 'UPENN-GBM')
raw_scan_data_dir = os.path.join(raw_data_dir, 'images_structural')
raw_segmentation_dir = os.path.join(raw_data_dir, 'automated_segm')

for dir in raw_scan_data_dir, raw_segmentation_dir:
    if not os.path.exists(dir):
        raise ScanDataDirectoryNotFound(f'{dir} does not exist')

raw_files_list = []

# Only work on preop patient data -- identifiers end *_11
# Test set, take only TESTSIZE patients
patients = 0
for patient_scan_dir in glob.glob(os.path.join(raw_scan_data_dir, 'UPENN*11')):
    patient_identifier = os.path.basename(patient_scan_dir)
    t1_filename = os.path.join(patient_scan_dir, f'{patient_identifier}_T1.nii.gz')
    t1ce_filename = os.path.join(patient_scan_dir, f'{patient_identifier}_T1GD.nii.gz')
    t2_filename = os.path.join(patient_scan_dir, f'{patient_identifier}_T2.nii.gz')
    FLAIR_filename = os.path.join(patient_scan_dir, f'{patient_identifier}_FLAIR.nii.gz')
    seg_filename = os.path.join(
        raw_segmentation_dir, f'{patient_identifier}_automated_approx_segm.nii.gz'
    )

    patient_raw_files = [ 
            patient_identifier, 
            patient_scan_dir,
            t1_filename,
            t1ce_filename,
            t2_filename,
            FLAIR_filename,
            seg_filename,
    ]
    
    for file in patient_raw_files[-5:]:
        if not os.path.exists(file):
            raise ScanFileNotFound(
                f'{file} does not exist'
            )
    raw_files_list.append(patient_raw_files)

    #Check if enough in testing set
    patients += 1
    if patients == TESTSIZE:
        break

df_patient_files = pd.DataFrame(raw_files_list, columns=[ 
            'patient_identifier', 
            'patient_scan_dir',
            't1_filename',
            't1ce_filename',
            't2_filename',
            'FLAIR_filename',
            'seg_filename',
])   



In [4]:
df_clinical = pd.read_csv(
    os.path.join(raw_data_dir,'table_data','UPENN-GBM_clinical_info_v1.0.csv')
)

In [5]:
df_clinical['age_bin'] = '<40'
df_clinical.loc[df_clinical['Age_at_scan_years']>=40, 'age_bin'] = '40-50'
df_clinical.loc[df_clinical['Age_at_scan_years']>=50, 'age_bin'] = '50-60'
df_clinical.loc[df_clinical['Age_at_scan_years']>=60, 'age_bin'] = '60-70'
df_clinical.loc[df_clinical['Age_at_scan_years']>=70, 'age_bin'] = '70-80'
df_clinical.loc[df_clinical['Age_at_scan_years']>=80, 'age_bin'] = '>80'

df_clinical['stratify_class'] = df_clinical['Gender'] + df_clinical['age_bin']


In [6]:
if 'stratify_class' in df_patient_files.columns:
    df_patient_files.drop('stratify_class', axis=1)
df_patient_files = pd.merge(
    df_patient_files, 
    df_clinical[['ID','stratify_class']], 
    how='left', 
    left_on='patient_identifier', 
    right_on='ID'
).drop('ID', axis=1)

In [7]:
# Allocate patient data to train or test set
patients = df_patient_files[['patient_identifier', 'stratify_class']]
train_patients, test_patients = train_test_split(
    patients, 
    test_size=0.5, 
    random_state=RSEED,
    stratify=df_patient_files.stratify_class
)
df_train_patients = pd.DataFrame(train_patients)
df_test_patients = pd.DataFrame(test_patients)
df_train_patients['test_train_set'] = 'train'
df_test_patients['test_train_set'] = 'test'



In [8]:
 #df_slices = df_slices.merge(
df_patient_files = pd.merge(
    df_patient_files,
    pd.concat([df_test_patients, df_train_patients]).drop('stratify_class', axis=1),
    on='patient_identifier',
    how='left',
)


In [9]:
# Create segmentation directories
segmentation_data_dir = os.path.join(raw_data_dir, f'slice_segmentation_stratify_sample{TESTSIZE}')
images_dir_name = 'image_data'
map_dir_name = 'map_data'
try:
    os.mkdir(segmentation_data_dir)
except:
    print(f'{segmentation_data_dir} already exists')

# Split into test and train
for data_set in 'test', 'train':
    data_set_dir = os.path.join(segmentation_data_dir, data_set)
    try:
        os.mkdir(data_set_dir)
    except:
        print(f'{data_set_dir} already exists')

    for image_class in [images_dir_name, map_dir_name]:
        class_dir = os.path.join(data_set_dir, image_class)
        try:
            os.mkdir(class_dir)
        except:
            print(f'{class_dir} already exists')
            



In [10]:
deleted_slices = 0
for idx, row in enumerate(df_patient_files.itertuples()):
    test_train_set = row.test_train_set
    T1_scan = MriScan(
        filename=row.t1_filename,
        sequence=ScanType.T1
    )
    T1CE_scan = MriScan(
        filename=row.t1ce_filename,
        sequence=ScanType.T1CE
    )
    T2_scan = MriScan(
        filename=row.t2_filename,
        sequence=ScanType.T2
    )
    FLAIR_scan = MriScan(
        filename=row.FLAIR_filename,
        sequence=ScanType.FLAIR
    )
    segmentation = TumourSegmentation(
        row.seg_filename,
        scale_png_data=False
        )
    patient_data = PatientRecord()
    patient_data.add_scan_data(T1_scan)
    patient_data.add_scan_data(T1CE_scan)
    patient_data.add_scan_data(T2_scan)
    patient_data.add_scan_data(FLAIR_scan)
    patient_data.add_segmentation(segmentation)
    png_image_prefix = os.path.join(
        segmentation_data_dir,
        test_train_set,
        images_dir_name,
        f'{row.patient_identifier}_allseq',
    )
    png_map_prefix = os.path.join(
        segmentation_data_dir,
        test_train_set,
        map_dir_name,
        f'{row.patient_identifier}_map',
    )
    patient_data.save_multi_channel_png(png_image_prefix)
    patient_data.save_segmentation_png(png_map_prefix)

    # Loop through created scan pngs and remove 95% of images with no brain
    slice_count = len(glob.glob(f'{png_image_prefix}*'))
    for i in range(slice_count):
        img_file = f'{png_image_prefix}_{i:03}.png'
        img = np.array(Image.open(img_file))
        if img.max() == 0 and np.random.rand() > 0.05:
            os.remove(img_file)
            os.remove(f'{png_map_prefix}_{i:03}.png')
            deleted_slices += 1
    

FileNotFoundError: [Errno 2] No such file or directory: 'data/UPENN-GBM/slice_segmentation_stratify_sample150/train/image_data/data/UPENN-GBM/slice_segmentation_stratify_sample150/train/image_data/UPENN-GBM-00563_11_allseq_000.png'