# Generate Folds

The MURA dataset is delivered with a predefined test and validation split. We will use this validation set as test set and then generate 4 folds from the train data for cross validation.

In [1]:
import itertools
import pprint
import shutil
import re
import pandas as pd
import numpy as np
from pathlib import Path

num_folds = 7
dataset_in = Path(r"C:\Users\jeroe\data\MURA\MURA-v1.1\MURA-v1.1")
dataset_out = Path(r"C:\Users\jeroe\data\MURA") / f'{num_folds}_folded'

## Data Copy

In [2]:
dataset_out.mkdir(parents=True, exist_ok=True)

shutil.copytree(dataset_in / 'valid', dataset_out / 'test')
shutil.copytree(dataset_in / 'train', dataset_out / 'train')

WindowsPath('C:/Users/jeroe/data/MURA/7_folded/train')

## Test set dataframe

In [3]:
df = pd.read_csv(dataset_in / 'valid_image_paths.csv', names=['full_path'])
df['full_path'] = df['full_path'].str.replace('valid', 'test').str.replace('MURA-v1.1/', '')
df.to_csv(dataset_out / 'test.csv', header=None, index=False)

## Train and validation folds

First we load the train dataset csv which we are going to make folds from.

In [13]:
df = pd.read_csv(dataset_in / 'train_image_paths.csv', names=['full_path'])
df['full_path'] = df['full_path'].str.replace('MURA-v1.1/', '')
df['category'] = df['full_path'].str.split('/').str[1]
df['patient'] = df['full_path'].str.split('/').str[2]
df['study'] = df['full_path'].str.split('/').str[3]
df

Unnamed: 0,full_path,category,patient,study
0,train/XR_SHOULDER/patient00001/study1_positive...,XR_SHOULDER,patient00001,study1_positive
1,train/XR_SHOULDER/patient00001/study1_positive...,XR_SHOULDER,patient00001,study1_positive
2,train/XR_SHOULDER/patient00001/study1_positive...,XR_SHOULDER,patient00001,study1_positive
3,train/XR_SHOULDER/patient00002/study1_positive...,XR_SHOULDER,patient00002,study1_positive
4,train/XR_SHOULDER/patient00002/study1_positive...,XR_SHOULDER,patient00002,study1_positive
...,...,...,...,...
36803,train/XR_HAND/patient11183/study1_negative/ima...,XR_HAND,patient11183,study1_negative
36804,train/XR_HAND/patient11183/study1_negative/ima...,XR_HAND,patient11183,study1_negative
36805,train/XR_HAND/patient11184/study1_negative/ima...,XR_HAND,patient11184,study1_negative
36806,train/XR_HAND/patient11184/study1_negative/ima...,XR_HAND,patient11184,study1_negative


Then we only look at the patients and split them over the folds. This way we ensure that the patients in the train set do not overlap with the validation set.

In [16]:
df_patients = df[['patient']].copy().drop_duplicates().sample(frac=1).reset_index()
patient_fold_dfs = np.array_split(df_patients, num_folds)
[len(d) for d in patient_fold_dfs]

[1598, 1598, 1598, 1598, 1598, 1597, 1597]

And now we split the original train dataset according to the patient split we made.

In [17]:
fold_dfs = [df.merge(d, on=['patient'], how='inner') for d in patient_fold_dfs]
[d.groupby('category').agg(count=('category', 'count')) for d in fold_dfs]

[             count
 category          
 XR_ELBOW       720
 XR_FINGER      737
 XR_FOREARM     285
 XR_HAND        840
 XR_HUMERUS     194
 XR_SHOULDER   1188
 XR_WRIST      1318,
              count
 category          
 XR_ELBOW       743
 XR_FINGER      729
 XR_FOREARM     256
 XR_HAND        795
 XR_HUMERUS     179
 XR_SHOULDER   1161
 XR_WRIST      1365,
              count
 category          
 XR_ELBOW       621
 XR_FINGER      744
 XR_FOREARM     294
 XR_HAND        723
 XR_HUMERUS     210
 XR_SHOULDER   1237
 XR_WRIST      1448,
              count
 category          
 XR_ELBOW       670
 XR_FINGER      776
 XR_FOREARM     262
 XR_HAND        799
 XR_HUMERUS     161
 XR_SHOULDER   1186
 XR_WRIST      1434,
              count
 category          
 XR_ELBOW       723
 XR_FINGER      639
 XR_FOREARM     249
 XR_HAND        805
 XR_HUMERUS     204
 XR_SHOULDER   1229
 XR_WRIST      1376,
              count
 category          
 XR_ELBOW       682
 XR_FINGER      784
 XR_FOREARM    

We do a quick check to verify that the patients never overlap between folds.

In [18]:
for (a, b) in itertools.combinations(range(num_folds), 2):
    merged = fold_dfs[a].merge(fold_dfs[b], on=['patient'], how='inner')
    
    print(a, b)
    print(merged)
    print("\n")

0 1
Empty DataFrame
Columns: [full_path_x, category_x, patient, study_x, index_x, full_path_y, category_y, study_y, index_y]
Index: []


0 2
Empty DataFrame
Columns: [full_path_x, category_x, patient, study_x, index_x, full_path_y, category_y, study_y, index_y]
Index: []


0 3
Empty DataFrame
Columns: [full_path_x, category_x, patient, study_x, index_x, full_path_y, category_y, study_y, index_y]
Index: []


0 4
Empty DataFrame
Columns: [full_path_x, category_x, patient, study_x, index_x, full_path_y, category_y, study_y, index_y]
Index: []


0 5
Empty DataFrame
Columns: [full_path_x, category_x, patient, study_x, index_x, full_path_y, category_y, study_y, index_y]
Index: []


0 6
Empty DataFrame
Columns: [full_path_x, category_x, patient, study_x, index_x, full_path_y, category_y, study_y, index_y]
Index: []


1 2
Empty DataFrame
Columns: [full_path_x, category_x, patient, study_x, index_x, full_path_y, category_y, study_y, index_y]
Index: []


1 3
Empty DataFrame
Columns: [full_path_x

Finally, we write out the different folds to separate csv files.

In [19]:
for i in range(num_folds):
    val_id = i
    train_ids = set(range(num_folds)) - {i}
    
    df_val = fold_dfs[val_id]
    df_train = pd.concat([fold_dfs[j] for j in train_ids])
    
    df_val[['full_path']].to_csv(dataset_out / f'fold_{i}_val.csv', header=None, index=False)
    df_train[['full_path']].to_csv(dataset_out / f'fold_{i}_train.csv', header=None, index=False)
    
    print(i, len(df_train), len(df_val))
    print(df_train.groupby('category').agg(count=('category', 'count')))
    print(df_val.groupby('category').agg(count=('category', 'count')))
    print("\n\n")

0 31526 5282
             count
category          
XR_ELBOW      4211
XR_FINGER     4369
XR_FOREARM    1540
XR_HAND       4703
XR_HUMERUS    1078
XR_SHOULDER   7191
XR_WRIST      8434
             count
category          
XR_ELBOW       720
XR_FINGER      737
XR_FOREARM     285
XR_HAND        840
XR_HUMERUS     194
XR_SHOULDER   1188
XR_WRIST      1318



1 31580 5228
             count
category          
XR_ELBOW      4188
XR_FINGER     4377
XR_FOREARM    1569
XR_HAND       4748
XR_HUMERUS    1093
XR_SHOULDER   7218
XR_WRIST      8387
             count
category          
XR_ELBOW       743
XR_FINGER      729
XR_FOREARM     256
XR_HAND        795
XR_HUMERUS     179
XR_SHOULDER   1161
XR_WRIST      1365



2 31531 5277
             count
category          
XR_ELBOW      4310
XR_FINGER     4362
XR_FOREARM    1531
XR_HAND       4820
XR_HUMERUS    1062
XR_SHOULDER   7142
XR_WRIST      8304
             count
category          
XR_ELBOW       621
XR_FINGER      744
XR_FOREARM     294
XR_HA