In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
# Ask CUDA to enumerate devices by PCI bus ID, so the index used below refers to a fixed physical GPU.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# Expose only GPU 0 to this process; use a comma-separated list (e.g. "0,1") to expose several.
# These variables are set before torch is imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"]="0"  # specify which GPU(s) to be used

In [2]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

### Define training folders and files

In [3]:
# CSV of per-image labels; it is read below with 'image_id' as the index.
LABELS = './prostate-cancer-grade-assessment/train.csv'
# Directory of tile files; the first 32 characters of each file name are used below as the image id.
# NOTE(review): the fold CSV is later written under './panda-32x256x256-tiles-data/', not this
# 16x128x128 directory -- confirm that the mismatch is intended.
TRAIN = './panda-16x128x128-tiles-data/train/'

### Define K-fold and random seed

In [4]:
# Number of stratified folds; also used in the name of the saved fold CSV.
nfolds = 4
# random_state passed to the shuffled StratifiedKFold so the fold assignment is reproducible.
SEED = 2020

### Assign a fold to each case

In [5]:
center = 'radboud'  ## data provider to keep: 'karolinska' or 'radboud'

## Load the label table, keyed by 'image_id'.
df = pd.read_csv(LABELS).set_index('image_id')

## Unique image ids that have tiles on disk: the first 32 characters of every file name in TRAIN.
files = sorted({fname[:32] for fname in os.listdir(TRAIN)})

## Select the label rows for those ids, keep only the chosen provider, and turn
## 'image_id' back into an ordinary column with a fresh 0..n-1 positional index.
df = df.loc[files]
df = df[df['data_provider'] == center].reset_index()
df.head()

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score
0,0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4
1,004dd32d9cd167d9cc31c13b704498af,radboud,1,3+3
2,0068d4c7529e34fd4c9da863ce01a161,radboud,3,4+3
3,006f6aa35a78965c92fffd1fbd53a058,radboud,3,4+3
4,007433133235efc27a39f11df6940829,radboud,0,negative


In [6]:
## Stratified K-fold keeps the isup_grade class percentages roughly equal in every fold.
## This cell expects 'image_id' to be a regular column of `df` (as left by the cell above)
## and re-indexes `df` by 'image_id' at the end, so re-run the previous cell before
## re-running this one.
skf = StratifiedKFold(n_splits=nfolds, random_state=SEED, shuffle=True)
splits = list(skf.split(df, df.isup_grade))  ## [(fold0_train_idx, fold0_test_idx), (fold1_train_idx, fold1_test_idx), ...]

## Label every sample with the fold in which it serves as a test case.
## The builtin `int` is used as the dtype: the `np.int` alias was deprecated in NumPy 1.20
## and removed in 1.24, so `.astype(np.int)` raises AttributeError on current NumPy.
folds_splits = np.zeros(len(df), dtype=int)
for i, (_, test_idx) in enumerate(splits):
    folds_splits[test_idx] = i

df['split'] = folds_splits  ## add the fold-assignment column
df = df.set_index('image_id')
df.head(10)

Unnamed: 0_level_0,data_provider,isup_grade,gleason_score,split
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4,0
004dd32d9cd167d9cc31c13b704498af,radboud,1,3+3,1
0068d4c7529e34fd4c9da863ce01a161,radboud,3,4+3,2
006f6aa35a78965c92fffd1fbd53a058,radboud,3,4+3,0
007433133235efc27a39f11df6940829,radboud,0,negative,3
0076bcb66e46fb485f5ba432b9a1fe8a,radboud,3,4+3,1
008069b542b0439ed69b194674051964,radboud,4,4+4,1
00928370e2dfeb8a507667ef1d4efcbb,radboud,5,4+5,3
00951a7fad040bf7e90f32e81fc0746f,radboud,1,3+3,2
00a26aaa82c959624d90dfb69fcf259c,radboud,4,4+4,2


### Save df to file

In [7]:
## Write the fold assignments to '<center>_<nfolds>_fold_train.csv'; `df` is indexed by
## 'image_id' at this point, so the id is written as the first CSV column.
## NOTE(review): the target directory is the 32x256x256 tile set, while the ids were
## collected from the 16x128x128 tile directory -- confirm this is intended.
df.to_csv('./panda-32x256x256-tiles-data/{}_{}_fold_train.csv'.format(center,nfolds))

## Read the file back to check the index

In [8]:
## Path of the fold-assignment CSV, built from the same template that the save cell writes to.
TRAINFOLD = './panda-32x256x256-tiles-data/{}_{}_fold_train.csv'.format(center, nfolds)

In [9]:
df = pd.read_csv(TRAINFOLD) ## read the fold CSV back; no index column is set, so 'image_id' is a regular column and rows get the default 0..n-1 index
df.head(10)

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,split
0,0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4,0
1,004dd32d9cd167d9cc31c13b704498af,radboud,1,3+3,1
2,0068d4c7529e34fd4c9da863ce01a161,radboud,3,4+3,2
3,006f6aa35a78965c92fffd1fbd53a058,radboud,3,4+3,0
4,007433133235efc27a39f11df6940829,radboud,0,negative,3
5,0076bcb66e46fb485f5ba432b9a1fe8a,radboud,3,4+3,1
6,008069b542b0439ed69b194674051964,radboud,4,4+4,1
7,00928370e2dfeb8a507667ef1d4efcbb,radboud,5,4+5,3
8,00951a7fad040bf7e90f32e81fc0746f,radboud,1,3+3,2
9,00a26aaa82c959624d90dfb69fcf259c,radboud,4,4+4,2


In [10]:
## Row labels of the samples assigned to fold 1, i.e. the rows held out when fold 1 is the test fold.
val_inx = df.loc[df['split'].eq(1)].index.tolist()
print(val_inx[:10])

[1, 5, 6, 12, 14, 16, 25, 31, 32, 38]


In [11]:
## Training rows are every row label of `df` that is not in `val_inx`.
## Index.difference works on the index labels (matching how `val_inx` was built) and
## returns them sorted, so the order no longer depends on set iteration order.
train_inx = df.index.difference(val_inx).tolist()
print(train_inx[:10])

[0, 2, 3, 4, 7, 8, 9, 10, 11, 13]


In [12]:
## Sanity check: sizes of the training and validation partitions (together they should cover every row of df).
len(train_inx), len(val_inx)

(3795, 1265)

In [13]:
## Spot check: image id and ISUP grade of the first validation row, via scalar label lookups.
df.at[val_inx[0], 'image_id'], df.at[val_inx[0], 'isup_grade']

('004dd32d9cd167d9cc31c13b704498af', 1)