This notebook first runs a preprocessing script and then splits the imaging data into training and test sets such that no patient appears more than once in the test set and no patient in the test set appears in the training set.

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import skimage.transform as skTrans
import nibabel as nib
import pandas as pd
import random
import os
import sys
import time

In [None]:
def normalize_img(img_array):
    """Scale an image array by its 99.5th-percentile intensity.

    The percentile is taken over the first three axes. A 3-D input is
    therefore divided by one scalar; an input with an extra trailing axis is
    scaled separately for each entry of that trailing axis.

    Parameters
    ----------
    img_array : numpy.ndarray
        Image data with at least three dimensions.

    Returns
    -------
    numpy.ndarray
        ``img_array`` divided by the percentile value(s). Where a percentile
        is exactly zero (e.g. an all-zero image) the data is left unscaled
        instead of being divided by zero, which would fill it with NaN/inf.
    """
    scale = np.quantile(img_array, 0.995, axis=(0, 1, 2))
    # Replace zero divisors by 1 so those values pass through unchanged.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return img_array / safe_scale


def create_dataset(meta, meta_all, path_to_datadir,
                   target_shape=(72, 72), out_path='mri_meta.pkl'):
    """Build the per-scan image table and write it to a pickle file.

    Every file in ``path_to_datadir`` except ``.DS_Store`` is loaded with
    nibabel. The three central orthogonal slices of the volume are resized to
    ``target_shape``, stacked, normalised with ``normalize_img`` and stored
    together with the label and subject looked up in ``meta``.

    Parameters
    ----------
    meta : pandas.DataFrame
        Must provide the columns "Image Data ID", "Group" and "Subject".
    meta_all : pandas.DataFrame
        Existing rows to extend (may be empty). The new rows carry the
        columns "img_array", "label" and "subject".
    path_to_datadir : str
        Directory containing the image files. The image ID is taken as the
        text between the last "_" and ".nii" in each file name.
    target_shape : tuple of int, optional
        Size each 2-D slice is resized to. Defaults to (72, 72).
    out_path : str, optional
        Destination of the pickled table. Defaults to 'mri_meta.pkl'.

    Returns
    -------
    None
        The result is only written to ``out_path``.

    Raises
    ------
    IndexError
        If the image ID parsed from a file name has no row in ``meta``.
    """
    records = []
    # Sorted so the row order of the output does not depend on the arbitrary
    # order in which the operating system lists the directory.
    for file in sorted(os.listdir(path_to_datadir)):
        if file == '.DS_Store':
            continue
        path = os.path.join(path_to_datadir, file)
        print(path)

        img_id = file.split('_')[-1].split('.nii')[0]
        matches = meta.index[meta["Image Data ID"] == img_id]
        if len(matches) == 0:
            raise IndexError(
                "no row in meta with Image Data ID %r (file %r)" % (img_id, file)
            )
        idx = matches[0]

        im = nib.load(path).get_fdata()
        n_i, n_j, n_k = im.shape
        # One central slice along each of the three volume axes.
        centre_slices = (
            im[(n_i - 1) // 2, :, :],
            im[:, (n_j - 1) // 2, :],
            im[:, :, (n_k - 1) // 2],
        )
        resized = [
            skTrans.resize(plane, target_shape, order=1, preserve_range=True)
            for plane in centre_slices
        ]
        # Stacking gives (3, h, w); .T reverses the axes to (w, h, 3).
        im = np.array(resized).T

        records.append({
            # Store the normalised array: the normalised result used to be
            # computed and then discarded in favour of the raw slices.
            "img_array": normalize_img(im),
            "label": meta.at[idx, "Group"],
            "subject": meta.at[idx, "Subject"],
        })

    # Build the frame once from all records instead of growing it row by row
    # (DataFrame.append was removed in pandas 2.0 and copied the frame on
    # every call).
    new_rows = pd.DataFrame(records, columns=["img_array", "label", "subject"])
    if len(meta_all):
        meta_all = pd.concat([meta_all, new_rows], ignore_index=True)
    else:
        meta_all = new_rows

    meta_all.to_pickle(out_path)



def main(path_to_meta='img_metadata.csv', path_to_datadir='imgs'):
    """Read the image metadata CSV and build the pickled image table.

    The paths are keyword arguments whose defaults are the values that were
    previously hard-coded. They are no longer read from ``sys.argv``: inside
    a notebook kernel that list holds the kernel launcher's own arguments
    rather than user-supplied paths, and indexing it fails when fewer than
    two arguments are present.

    Parameters
    ----------
    path_to_meta : str, optional
        CSV file with at least the columns "Image Data ID", "Group" and
        "Subject".
    path_to_datadir : str, optional
        Directory of image files handed to ``create_dataset``.

    Returns
    -------
    None
    """
    print(path_to_meta)

    meta = pd.read_csv(path_to_meta)
    print("opened meta")
    print(len(meta))
    # Keep only the needed columns. .copy() gives an independent frame so the
    # assignment below does not write into a slice of the CSV frame
    # (SettingWithCopyWarning).
    meta = meta[["Image Data ID", "Group", "Subject"]].copy()
    # factorize numbers the categories in order of first appearance, so the
    # author's noted mapping (MCI = 0, CN = 1, AD = 2) holds only if the CSV
    # lists the groups in that order -- TODO confirm against the data.
    meta["Group"] = pd.factorize(meta["Group"])[0]
    # Empty frame that create_dataset extends with one row per image.
    meta_all = pd.DataFrame(columns=["img_array", "label", "subject"])
    create_dataset(meta, meta_all, path_to_datadir)

# Run the preprocessing step defined above.
main()

In [None]:
# Load the table holding the image arrays, patient IDs ("subject") and diagnoses.
m2 = pd.read_pickle("mri_meta.pkl")

# Tidy the patient IDs: turn every "s" into "S" and drop newline characters.
m2 = m2.assign(subject=m2["subject"].str.replace("s", "S").str.replace("\n", ""))

# Load the overlap test set.
ts = pd.read_csv("overlap_test_set.csv")

# Discard every row whose patient ID occurs in the overlap test set.
m2 = m2.loc[~m2["subject"].isin(ts["subject"].tolist())]

In [None]:
# Unique patient IDs in order of first appearance (the author recorded 551 here).
# drop_duplicates is used instead of set(): iterating a set of strings yields a
# different order in every interpreter session, which would make any later
# sampling from this list impossible to reproduce.
subjects = m2["subject"].drop_duplicates().tolist()
len(subjects)

In [None]:
# One tenth of the number of rows left in m2, used as the size of the test set.
0.1*len(m2) #10% for testing

We have 3674 MRI scans from 551 patients (some patients repeated up to 16 times).
We selected our testing set such that it has 367 unique MRIs (10% of training), shown below.
We do not allow for any repeating patients in the testing set. We only allowed repetition during training, and no patient was included in both training and testing sets.

In [None]:
# Draw the patient IDs that will form the test set.
# NOTE(review): the notebook text speaks of 367 test scans, but only 26 IDs are
# drawn here -- confirm the intended size.
# The IDs are sorted and a dedicated seeded generator is used so the draw is
# repeatable and does not depend on the incoming list order or on the global
# `random` state.
picked_ids = random.Random(42).sample(sorted(subjects, key=str), 26)

In [None]:
# Build the test set: one randomly chosen row of m2 for each selected patient ID.
# The rows are collected in a list and concatenated once; DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
# A single seeded RandomState shared by all draws makes the selection repeatable.
test_rng = np.random.RandomState(42)
test_rows = [
    m2[m2["subject"] == patient_id].sample(n=1, random_state=test_rng)
    for patient_id in picked_ids
]
if test_rows:
    # The original row labels are kept so the chosen rows can be matched back to m2.
    test = pd.concat(test_rows)
else:
    test = pd.DataFrame(columns = ["img_array", "subject", "label"])

In [None]:
# Row labels eligible for training: every row whose patient is NOT in the test set.
# Excluding only the sampled test rows would leave the remaining rows of those
# same patients in training, so the exclusion is done per patient.
indexes = list(m2.index[~m2["subject"].isin(test["subject"])])

In [None]:
# Training set: the rows of m2 whose index label is listed in `indexes`.
train = m2.loc[m2.index.isin(indexes)]

In [None]:
# Write the image-array column of each split to its own pickle file.
train.loc[:, ["img_array"]].to_pickle("img_train.pkl")
test.loc[:, ["img_array"]].to_pickle("img_test.pkl")

In [None]:
# Write the label column of each split to its own pickle file.
train.loc[:, ["label"]].to_pickle("img_y_train.pkl")
test.loc[:, ["label"]].to_pickle("img_y_test.pkl")