In [1]:
import glob
import os
import shutil
from PIL import Image 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nibabel as nib
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from modules.scandata import MriScan, MriSlice, TumourSegmentation, ScanType, ScanPlane, PatientRecord
#from modules.exceptions import ScanDataDirectoryNotFound, ScanFileNotFound

# Seed for test/train split and dropping images for undersampling background cases
RSEED=78 

In [2]:
class ScanDataDirectoryNotFound(Exception):
    pass
class ScanFileNotFound(Exception):
    pass

In [3]:
# Create data frame of patients data files
raw_data_dir = os.path.join('data', 'UPENN-GBM')
raw_scan_data_dir = os.path.join(raw_data_dir, 'images_structural')
raw_segmentation_dir = os.path.join(raw_data_dir, 'automated_segm')

for dir in raw_scan_data_dir, raw_segmentation_dir:
    if not os.path.exists(dir):
        raise ScanDataDirectoryNotFound(f'{dir} does not exist')

raw_files_list = []

# Only work on preop patient data -- identifiers end *_11
for patient_scan_dir in glob.glob(os.path.join(raw_scan_data_dir, 'UPENN*11')):
    patient_identifier = os.path.basename(patient_scan_dir)
    t1_filename = os.path.join(patient_scan_dir, f'{patient_identifier}_T1.nii.gz')
    t1ce_filename = os.path.join(patient_scan_dir, f'{patient_identifier}_T1GD.nii.gz')
    t2_filename = os.path.join(patient_scan_dir, f'{patient_identifier}_T2.nii.gz')
    FLAIR_filename = os.path.join(patient_scan_dir, f'{patient_identifier}_FLAIR.nii.gz')
    seg_filename = os.path.join(
        raw_segmentation_dir, f'{patient_identifier}_automated_approx_segm.nii.gz'
    )

    patient_raw_files = [ 
            patient_identifier, 
            patient_scan_dir,
            t1_filename,
            t1ce_filename,
            t2_filename,
            FLAIR_filename,
            seg_filename,
    ]
    
    for file in patient_raw_files[-5:]:
        if not os.path.exists(file):
            raise ScanFileNotFound(
                f'{file} does not exist'
            )
    raw_files_list.append(patient_raw_files)

df_patient_files = pd.DataFrame(raw_files_list, columns=[ 
            'patient_identifier', 
            'patient_scan_dir',
            't1_filename',
            't1ce_filename',
            't2_filename',
            'FLAIR_filename',
            'seg_filename',
])   


In [4]:
# Find all possible slice segmentation types in all patients slices
seg_volumes = {
    0: 'background',  # either healthy brain tissue or nothing
    1: 'tumour', 
    2: 'edema',
    4: 'contrast',
}
slice_list = []
for idx, row in enumerate(df_patient_files.itertuples()):
#for seg_filename in df_patient_files.seg_filename:
    segmentation = TumourSegmentation(row.seg_filename)
    for slice_number , seg_slice in enumerate(segmentation.iterate_slices()):
        slice_list.append([
            row.patient_identifier,
            slice_number,
            '_'.join(
                [seg_volumes[x] for x in sorted(
                    list(set(seg_slice.slice_data.flatten()))
                )]
            )
        ])
        

In [5]:
df_slices = pd.DataFrame(slice_list, columns=['patient_identifier', 'slice_number', 'slice_class'])

In [6]:
df_slices.slice_class.unique()

array(['background_edema', 'background_tumour_edema_contrast',
       'background_edema_contrast', 'background',
       'background_tumour_edema', 'background_tumour_contrast',
       'background_tumour', 'background_contrast'], dtype=object)

In [7]:
# Allocate patient data to train or test set
patients = df_patient_files.patient_identifier
train_patients, test_patients = train_test_split(patients, test_size=0.1, random_state=RSEED)
df_train_patients = pd.DataFrame(train_patients)
df_test_patients = pd.DataFrame(test_patients)
df_train_patients['test_train_set'] = 'train'
df_test_patients['test_train_set'] = 'test'

if 'test_train_set' in df_slices.columns:
    df_slices = df_slices.drop('test_train_set', axis=1)
df_slices = df_slices.merge(
    pd.concat([df_test_patients, df_train_patients]), 
    on='patient_identifier', 
    how='left',
)


In [8]:
# Create classification directories
classification_data_dir = os.path.join(raw_data_dir, 'slice_classification_common')
try:
    os.mkdir(classification_data_dir)
except:
    print(f'{classification_data_dir} already exists')

# Split into test and train
for data_set in 'test', 'train':
    data_set_dir = os.path.join(classification_data_dir, data_set)
    try:
        os.mkdir(data_set_dir)
    except:
        print(f'{data_set_dir} already exists')

    for slice_class in df_slices.slice_class.unique():
        class_dir = os.path.join(data_set_dir, slice_class)
        try:
            os.mkdir(class_dir)
        except:
            print(f'{class_dir} already exists')
            



In [9]:
for idx, row in enumerate(df_patient_files.itertuples()):
    T1_scan = MriScan(
        filename=row.t1_filename,
        sequence=ScanType.T1
    )
    T1CE_scan = MriScan(
        filename=row.t1ce_filename,
        sequence=ScanType.T1CE
    )
    T2_scan = MriScan(
        filename=row.t2_filename,
        sequence=ScanType.T2
    )
    FLAIR_scan = MriScan(
        filename=row.FLAIR_filename,
        sequence=ScanType.FLAIR
    )
    segmentation = TumourSegmentation(
        row.seg_filename,
        )
    patient_data = PatientRecord()
    patient_data.add_scan_data(T1_scan)
    patient_data.add_scan_data(T1CE_scan)
    patient_data.add_scan_data(T2_scan)
    patient_data.add_scan_data(FLAIR_scan)
    patient_data.add_segmentation(segmentation)
    png_basename_prefix = f'{row.patient_identifier}_allseq'
    patient_data.save_multi_channel_png(os.path.join(classification_data_dir,png_basename_prefix))
    

In [10]:
df_slice_class_counts = df_slices.query(
    'test_train_set == "train"'
).slice_class.value_counts().to_frame().reset_index().set_axis(
    ['slice_class', 'slice_class_count'], axis=1
)
df_slices =  pd.merge(df_slices,df_slice_class_counts, how='left')

In [11]:
# Calculate class weight due to unbalanced classes
print(df_slices.query('test_train_set == "train"').slice_class.value_counts())
# Background only slices most common and least useful for training. 
# Take only 30 % in training sample data set and drop classes with less than 100 slices
df_sliced_dropped_background =  df_slices.query(
        'test_train_set == "train" and slice_class == "background"'
).sample(frac=0.70, random_state=RSEED)
df_sliced_dropped_rare = df_slices.query('slice_class_count < 100')
df_slices_after_drop = df_slices.drop(df_sliced_dropped_background.index)
df_slices_after_drop = df_slices_after_drop.drop(df_sliced_dropped_rare.index)
print("After Dropping:")
print(df_slices_after_drop.query('test_train_set == "train"' ).slice_class.value_counts())


background                          50528
background_tumour_edema_contrast    20376
background_edema                    12354
background_edema_contrast            1509
background_tumour_edema               260
background_contrast                    48
background_tumour_contrast             18
background_tumour                       2
Name: slice_class, dtype: int64
After Dropping:
background_tumour_edema_contrast    20376
background                          15158
background_edema                    12354
background_edema_contrast            1509
background_tumour_edema               260
Name: slice_class, dtype: int64


In [12]:
class_weight = compute_class_weight(
    class_weight='balanced',
    classes = np.unique(df_slices_after_drop['slice_class']),
    y = df_slices_after_drop['slice_class']
)
class_weight_dict = dict(
    zip(np.unique(df_slices_after_drop['slice_class']),class_weight)
)
print(class_weight_dict)

{'background': 0.5706499759268175, 'background_edema': 0.8619927272727272, 'background_edema_contrast': 7.231482611348383, 'background_tumour_edema': 35.066272189349114, 'background_tumour_edema_contrast': 0.5206413353832637}


In [13]:
# Move data to classification directories
for idx, row in enumerate(df_slices_after_drop.itertuples()):
    src_file = os.path.join(
        classification_data_dir,
        f'{row.patient_identifier}_allseq_{row.slice_number:03}.png'
    )
    dest_dir = os.path.join(classification_data_dir,row.test_train_set,row.slice_class)
    shutil.move(src_file,dest_dir)
# Delete dropped  files to clean up
for frame in df_sliced_dropped_background, df_sliced_dropped_rare:
    for idx, row in enumerate(frame.itertuples()):
        del_file = os.path.join(
            classification_data_dir,
            f'{row.patient_identifier}_allseq_{row.slice_number:03}.png'
        )
        os.remove(del_file)

In [14]:

# Delete unused class directories
for data_set in 'test', 'train':
    data_set_dir = os.path.join(classification_data_dir, data_set)
    for slice_class in df_sliced_dropped_rare.slice_class.unique():
        
        rm_dir = os.path.join(data_set_dir,slice_class)
        try:
            os.rmdir(rm_dir)
        except:
            print(f'{rm_dir} does not exist')
         
