This notebook splits the imaging data into training and test sets such that each patient is selected for the test set at most once and no patient in the test set appears in the training set.

In [1]:
import pandas as pd
import random
# Load the metadata frame holding image arrays, patient IDs ("subject"), and diagnosis.
m2 = pd.read_pickle("pet_meta.pkl")

# Clean the patient IDs: turn every "s" into "S", then remove newline characters.
subject_ids = m2["subject"].str.replace("s", "S")
m2["subject"] = subject_ids.str.replace("\n", "")

# Load the overlap test set.
ts = pd.read_csv("overlap_test_set.csv")

# Keep only the rows whose subject is NOT listed in the overlap test set.
in_overlap = m2["subject"].isin(list(ts["subject"].values))
m2 = m2[~in_overlap]
m2

Unnamed: 0,img_array,label,subject
0,"[[[[1.4607339 1.88971052 0. ], [3.5236...",0,002_S_1261
1,"[[[[1.47235808 1.46073439 0. ], [4.3837...",0,002_S_1261
2,"[[[[1.87808691 1.46073382 0. ], [4.6664...",0,002_S_1261
3,"[[[[2.24326979 2.25489455 0. ], [5.3749...",0,002_S_1261
4,"[[[[3.32719655 2.9736367 0. ], [7.3156...",0,002_S_1261
...,...,...,...
3338,"[[[[ 8.86171522 11.51162418 0. ], [ 7....",0,941_S_4100
3339,"[[[[ 9.73837582 10.66153666 0. ], [ 8....",0,941_S_4100
3340,"[[[[9.75000057 8.87499943 0. ], [9.9107...",0,941_S_4100
3341,"[[[[8.86337582 8.875 0. ], [8.0298...",0,941_S_4100


In [2]:
# Unique patient IDs remaining in m2, in a deterministic order.
# A plain list(set(...)) of strings can come out in a different order on every
# kernel start (string hashes are randomised per process), which would make any
# later order-dependent step, such as random sampling, impossible to reproduce.
# key=str keeps the sort well-defined even if a non-string value (e.g. NaN) is present.
subjects = sorted(set(m2["subject"].values), key=str)
len(subjects)

163

In [9]:
# Choose the held-out (test) patients reproducibly.
# - SEED fixes the draw so Restart & Run All yields the same split every time.
# - A dedicated random.Random instance avoids depending on (or disturbing) the
#   global random state left behind by other cells.
# - Sampling from a sorted copy makes the result independent of the incoming
#   list order, which is not stable across sessions when built from a set.
SEED = 42
N_TEST_SUBJECTS = 15  # number of distinct patients held out for testing
picked_ids = random.Random(SEED).sample(sorted(subjects, key=str), N_TEST_SUBJECTS)

In [10]:
# Build the test set from every row belonging to the picked patient IDs.
# DataFrame.append is deprecated (pandas 1.4) and removed (pandas 2.0), and growing
# a frame row-block by row-block is quadratic; collect the per-patient slices and
# concatenate once instead. Rows stay grouped in picked_ids order and keep their
# original index labels, which the training-set construction relies on.
# Concatenating also preserves the source dtypes (appending onto an empty frame
# could turn "label" into an object column).
test_columns = ["img_array", "subject", "label"]
per_subject = [m2.loc[m2["subject"] == subject_id, test_columns] for subject_id in picked_ids]
# pd.concat raises on an empty list, so fall back to an empty frame with the same columns.
test = pd.concat(per_subject) if per_subject else pd.DataFrame(columns=test_columns)
test

Unnamed: 0,img_array,subject,label
798,"[[[[ 87.52289374 108.77633063 0. ], [...",014_S_0658,1
799,"[[[[53.45382945 98.33837636 0. ], [101...",014_S_0658,1
800,"[[[[ 43.18469584 134.38366868 0. ], [...",014_S_0658,1
801,"[[[[ 43.18469584 103.47294867 0. ], [...",014_S_0658,1
802,"[[[[102.85728472 179.90780652 0. ], [...",014_S_0658,1
...,...,...,...
2584,[[[[ 660.67545966 -490.58509784 0. ]...,127_S_0925,1
2585,[[[[-431.52617466 -666.99322538 0. ]...,127_S_0925,1
2586,"[[[[288.74824517 380.3832117 0. ], [...",127_S_0925,1
2587,"[[[[472.80809637 125.80341065 0. ], [...",127_S_0925,1


In [12]:
# Index labels of m2 that were not placed in the test set.
held_out_rows = set(test.index)
indexes = list(set(m2.index).difference(held_out_rows))
len(indexes)

2268

In [13]:
# Training set: all rows of m2 whose index label is in `indexes`,
# i.e. every data point that did not go into the test set.
keep_mask = m2.index.isin(indexes)
train = m2[keep_mask]
train

Unnamed: 0,img_array,label,subject
0,"[[[[1.4607339 1.88971052 0. ], [3.5236...",0,002_S_1261
1,"[[[[1.47235808 1.46073439 0. ], [4.3837...",0,002_S_1261
2,"[[[[1.87808691 1.46073382 0. ], [4.6664...",0,002_S_1261
3,"[[[[2.24326979 2.25489455 0. ], [5.3749...",0,002_S_1261
4,"[[[[3.32719655 2.9736367 0. ], [7.3156...",0,002_S_1261
...,...,...,...
3338,"[[[[ 8.86171522 11.51162418 0. ], [ 7....",0,941_S_4100
3339,"[[[[ 9.73837582 10.66153666 0. ], [ 8....",0,941_S_4100
3340,"[[[[9.75000057 8.87499943 0. ], [9.9107...",0,941_S_4100
3341,"[[[[8.86337582 8.875 0. ], [8.0298...",0,941_S_4100


In [14]:
# Write the "img_array" column of each split to its own pickle (as a one-column DataFrame).
for split_frame, out_path in ((train, "pet_train.pkl"), (test, "pet_test.pkl")):
    split_frame[["img_array"]].to_pickle(out_path)

In [15]:
# Write the "label" column of each split to its own pickle (as a one-column DataFrame).
for split_frame, out_path in ((train, "pet_y_train.pkl"), (test, "pet_y_test.pkl")):
    split_frame[["label"]].to_pickle(out_path)