This notebook splits the imaging data into training and test sets such that no patient is repeated in the test set and no patient in the test set appears in the training set.

In [1]:
import pandas as pd
import random
# Load the pickled frame that holds the image arrays, the patient IDs ("subject")
# and the diagnosis labels.
# NOTE(review): unpickling can execute arbitrary code -- only load a trusted file.
m2 = pd.read_pickle("mri_meta.pkl")

# Normalise the patient IDs: drop embedded newlines and turn every "s" into "S".
m2["subject"] = m2["subject"].str.replace("\n", "").str.replace("s", "S")

# Patients that already belong to the overlap test set.
# NOTE(review): these IDs are compared as-is (no normalisation like the one
# applied to m2 above) -- confirm the CSV already uses the cleaned format.
ts = pd.read_csv("overlap_test_set.csv")

# Keep only the rows whose patient is NOT part of the overlap test set.
m2 = m2[~m2["subject"].isin(ts["subject"].tolist())]
m2

Unnamed: 0,img_array,label,subject
0,"[[[[36.45114748 37.26595571 0.52279603], [63....",0,002_S_0413
1,"[[[[28.35968846 60.78608813 53.39332376], [33....",0,002_S_0413
2,"[[[[67.34232458 13.72514765 39.75249768], [65....",0,002_S_0413
3,"[[[[26.51095324 49.75030355 0.36219632], [84....",0,002_S_0413
4,"[[[[31.09818037 49.19945394 54.24074462], [33....",0,002_S_0413
...,...,...,...
5735,"[[[[30.15725086 52.06041119 2.73605637], [37....",0,941_S_4376
5736,"[[[[ 28.11486643 550.89959322 1.4210884 ], [...",0,941_S_4376
5737,"[[[[ 26.33395312 250.418828 1.31288741], [...",0,941_S_4376
5738,"[[[[ 18.94171287 233.47920191 1.27902779], [...",0,941_S_4376


In [2]:
# Unique patient IDs that are still available for the split.
# Sorted on purpose: iterating a set of strings yields a different order in
# every kernel session (string hashing is randomised), which would make any
# later sampling from this list impossible to reproduce.
subjects = sorted(set(m2["subject"].values))
len(subjects)

331

In [12]:
# Number of patients held out for the test set, and the seed that fixes the draw.
N_TEST_SUBJECTS = 40
SPLIT_SEED = 42

# Draw the held-out patients with a dedicated, seeded generator over a sorted
# ID list so that Restart & Run All always produces the same split. The global
# `random` state is left untouched.
picked_ids = random.Random(SPLIT_SEED).sample(sorted(subjects), N_TEST_SUBJECTS)

In [13]:
#creating the test set out of the patient IDs
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration; collect the per-patient slices and concatenate them once.
# Rows stay grouped in the order of picked_ids, as before.
test_lead_columns = ["img_array", "subject", "label"]
test_parts = [m2[m2["subject"] == patient_id] for patient_id in picked_ids]
if test_parts:
    test = pd.concat(test_parts)
else:
    # No patient was picked: fall back to an empty frame with the expected columns.
    test = pd.DataFrame(columns=test_lead_columns)

# Keep the column order the old append-based code produced: the three lead
# columns first, followed by any further columns m2 may carry.
test = test[test_lead_columns + [c for c in test.columns if c not in test_lead_columns]]
test

Unnamed: 0,img_array,subject,label
1285,"[[[[7.11156279 9.08812032 0.70540811], [ 8.263...",018_S_4399,0
1286,"[[[[85.0602055 10.82602557 12.75194104], [87....",018_S_4399,0
1287,"[[[[ 9.52641575 26.50618357 11.57507401], [13....",018_S_4399,0
1288,"[[[[12.42748664 77.79214266 6.76133152], [15....",018_S_4399,0
1289,"[[[[105.42138688 7.42936496 6.24551578], [...",018_S_4399,0
...,...,...,...
4861,"[[[[0.83766848 0.99892293 2.77571537], [1.1773...",128_S_1407,1
4862,"[[[[144.26082478 22.40667962 24.24174308], [...",128_S_1407,1
4863,"[[[[198.49663129 31.47953848 27.34683689], [...",128_S_1407,1
4864,"[[[[43.05474056 13.21280384 10.61013824], [42....",128_S_1407,1


In [14]:
# Index labels of m2 that did not end up in the test set.
all_row_labels = set(m2.index)
test_row_labels = set(test.index)
indexes = list(all_row_labels - test_row_labels)
len(indexes)

4644

In [15]:
#creating the training set using all the other data points
# Split on the patient ID itself instead of going through index labels: this
# gives the same rows whenever the index labels are unique, and does not
# silently lose training rows when they are not.
train = m2[~m2["subject"].isin(picked_ids)]

# Guard the invariant this notebook exists for: no patient on both sides.
# This also trips if `test` is stale relative to `picked_ids` (cells run out of order).
assert set(train["subject"]).isdisjoint(test["subject"]), "patient overlap between train and test"

In [16]:
# Persist the image arrays of each split as single-column DataFrames.
train.loc[:, ["img_array"]].to_pickle("img_train.pkl")
test.loc[:, ["img_array"]].to_pickle("img_test.pkl")

In [17]:
# Persist the diagnosis labels of each split as single-column DataFrames,
# row-aligned with the image pickles written from the same frames.
train.loc[:, ["label"]].to_pickle("img_y_train.pkl")
test.loc[:, ["label"]].to_pickle("img_y_test.pkl")