# Data splits

This notebook focuses on building the data splits to train the model, namely: train, validation and test.

*Constraint: every split must contain unique subjects; that is, all the MR sessions of a given subject must be contained in exactly ONE split*.

In [1]:
import pandas as pd
import os
import sys

# Make the parent directory importable so the project-local `src` package
# imported below can be resolved. The entry is a relative path, so it is
# resolved against the kernel's working directory -- this assumes the notebook
# is run from its own folder, one level below the repository root (TODO confirm).
if '../' not in sys.path:
    sys.path.append('../')

# Project-local helpers: `ds` provides the train/test splitter and `db` the
# dataset helper used throughout this notebook.
import src.utils.data_splitter as ds
import src.utils.dataset_helper as db
import importlib

In [2]:
# Re-import the project modules so that edits made to them on disk are picked
# up without restarting the kernel. A loop produces no cell output, which is
# what the trailing semicolons achieved before.
for _module in (ds, db):
    importlib.reload(_module)

In [3]:
# Short alias for the dataset helper. `DatasetHelper` is not called here, so
# `helper` is bound to the object itself rather than to a new instance; the
# calls further down presumably hit static/class methods -- verify in
# src/utils/dataset_helper.py.
helper = db.DatasetHelper

In [4]:
# Read the full dataset and discard the 'Unnamed: 0' column (presumably an
# index that was saved together with the data) in a single expression.
df = pd.read_csv('../data/csv/final_dataset.csv').drop(columns='Unnamed: 0')

In [5]:
# Summarise the columns, dtypes and non-null counts of the full dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2379 entries, 0 to 2378
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Subject                       2379 non-null   object 
 1   MR_session                    2379 non-null   object 
 2   TOTAL_HIPPOCAMPUS_VOLUME      2379 non-null   float64
 3   Left-Hippocampus_volume       2379 non-null   float64
 4   lh_parahippocampal_thickness  2379 non-null   float64
 5   lh_parahippocampal_volume     2379 non-null   int64  
 6   rh_parahippocampal_volume     2379 non-null   int64  
 7   rh_parahippocampal_thickness  2379 non-null   float64
 8   Right-Hippocampus_volume      2379 non-null   float64
 9   label                         2379 non-null   object 
 10  CDRTOT                        2379 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 204.6+ KB


## Define the test set

In [6]:
# NOTE: this rebinds `df`. From here on it refers to the undersampled dataset,
# not to the full dataset loaded earlier in the notebook.
# Unlike that earlier load, the 'Unnamed: 0' column is not dropped here, so it
# stays in the frame and is carried into every CSV written further down.
df = pd.read_csv('../data/csv/final_dataset_undersampled.csv')

- Remove error instances (impossible to segment) from the undersampled dataset

In [7]:
# MR sessions to exclude from every split. Only the uncommented ids are
# removed; the commented-out ones are kept for reference and stay in the data.
errors = [
    #'OAS30163_MR_d0091',
    'OAS30393_MR_d6803',
    #'OAS30549_MR_d1244',
    'OAS30662_MR_d2080',
    #'OAS30678_MR_d0093'
]

# Look up the index labels of the offending rows first, then drop them.
# The remaining rows keep their original index labels.
failed_rows = df.index[df['MR_session'].isin(errors)]
df = df.drop(index=failed_rows)

- Check if the sessions in the csv match the sessions in the segmented folder

In [8]:
# `os.listdir` returns directory entries in arbitrary order, so the listing is
# sorted before being compared with the sorted session ids. Without this the
# check could evaluate to False even when both sides hold the same sessions.
df['MR_session'].sort_values().to_list() == sorted(os.listdir('../data/segmented'))

True

In [9]:
# Inspect the undersampled frame after the failed sessions were removed.
# The index is no longer a contiguous range because rows were dropped by label.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1034 entries, 0 to 1035
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    1034 non-null   int64  
 1   Subject                       1034 non-null   object 
 2   MR_session                    1034 non-null   object 
 3   TOTAL_HIPPOCAMPUS_VOLUME      1034 non-null   float64
 4   Left-Hippocampus_volume       1034 non-null   float64
 5   lh_parahippocampal_thickness  1034 non-null   float64
 6   lh_parahippocampal_volume     1034 non-null   int64  
 7   rh_parahippocampal_volume     1034 non-null   int64  
 8   rh_parahippocampal_thickness  1034 non-null   float64
 9   Right-Hippocampus_volume      1034 non-null   float64
 10  label                         1034 non-null   object 
 11  CDRTOT                        1034 non-null   float64
dtypes: float64(6), int64(3), object(3)
memory usage: 105.0+ KB


- Instantiate the data splitter

In [10]:
# Build the splitter from the session ids and their labels.
# NOTE(review): the 'Subject' column is not passed, yet the notebook requires
# all sessions of one subject to end up in the same split. This presumably
# relies on the splitter deriving the subject from the session id (e.g. the
# `OAS30393` prefix of `OAS30393_MR_d6803`) -- confirm in data_splitter.py.
splitter = ds.DataSplitterTrainTest(df['MR_session'], df['label'])

- Retrieve the train and test instances

In [11]:
# Ask the splitter for the row labels of `df` that make up each split.
train_idx, test_idx = splitter.get_train_test_set_idx()

# Enforce the notebook's constraint: a subject may appear in one split only.
# Failing here is preferable to silently training and testing on the same subject.
train_subjects = set(df.loc[train_idx, 'Subject'])
test_subjects = set(df.loc[test_idx, 'Subject'])
assert train_subjects.isdisjoint(test_subjects), (
    'Subject leakage between train and test: '
    f'{sorted(train_subjects & test_subjects)}'
)

- Check train label distribution

In [13]:
# Count the labels of the training rows, selecting rows and column in one
# `.loc` call instead of chaining two indexers.
df.loc[train_idx, 'label'].value_counts()

label
Cognitevely-normal    474
Early-stage           323
Demented              102
Name: count, dtype: int64

- Check test label distribution

In [12]:
# Count the labels of the test rows, selecting rows and column in one
# `.loc` call instead of chaining two indexers.
df.loc[test_idx, 'label'].value_counts()

label
Cognitevely-normal    74
Early-stage           44
Demented              17
Name: count, dtype: int64

- Create a csv file related to test instances

In [13]:
# Persist the test rows. `to_csv` writes the index by default, so the file gets
# an unnamed leading column holding the original row labels, in addition to the
# 'Unnamed: 0' column already present in `df`.
df.loc[test_idx].to_csv('../data/csv/test.csv')

- The test volumetric images can now be moved into the dedicated folder

In [14]:
# Hand the sessions of the test split to the helper together with the source
# and destination directories.
# NOTE(review): judging by its name the helper moves the folders out of
# `segmented`; if so, this cell is not repeatable and the earlier check against
# the `segmented` listing will no longer hold on a second run -- confirm.
segmented_dir = os.path.join('..', 'data', 'segmented')
test_dir = os.path.join('..', 'data', 'test')
test_sessions = df.loc[test_idx, 'MR_session']

helper.move_folders(
    subject_experiments=test_sessions,
    base_dir=segmented_dir,
    dst_dir=test_dir
)

## Define validation set

We can proceed as before to define the validation set, which is used to evaluate the model during training

In [15]:
# Detach the training rows from `df` so that later changes cannot touch the
# original frame (`copy()` is a deep copy by default).
X_train = df.loc[train_idx].copy()

- Split the data

In [16]:
# Split the remaining training rows once more. NOTE: `splitter` and `train_idx`
# are rebound here; from now on `train_idx` / `val_idx` select rows of `X_train`.
splitter = ds.DataSplitterTrainTest(X_train['MR_session'], X_train['label'])
train_idx, val_idx = splitter.get_train_test_set_idx()

# Enforce the notebook's constraint: a subject may appear in one split only.
train_subjects = set(X_train.loc[train_idx, 'Subject'])
val_subjects = set(X_train.loc[val_idx, 'Subject'])
assert train_subjects.isdisjoint(val_subjects), (
    'Subject leakage between train and validation: '
    f'{sorted(train_subjects & val_subjects)}'
)

In [17]:
# Label counts of the training split before the majority class is reduced.
X_train.loc[train_idx, 'label'].value_counts()

label
Cognitevely-normal    415
Early-stage           291
Demented               87
Name: count, dtype: int64

- Drop some cognitively normal instances so that the majority class has 400 samples; the other classes are brought up to 400 later through augmentation

In [18]:
# Rebind `train_idx` to the helper's result, which is used below as row labels
# of `X_train`. The helper receives the training labels and the target size 400;
# the counts shown afterwards have the largest class at 400 and the others unchanged.
train_idx = helper.undersample_majority_class(X_train.loc[train_idx, 'label'], 400)

- Get the final distribution of the training set, before data augmentation

In [19]:
# Label counts of the training split after undersampling, before augmentation.
X_train.loc[train_idx, 'label'].value_counts()

label
Cognitevely-normal    400
Early-stage           291
Demented               87
Name: count, dtype: int64

- Distribution of classes for the validation set

In [20]:
# Label counts of the validation split, before augmentation.
X_train.loc[val_idx, 'label'].value_counts()

label
Cognitevely-normal    59
Early-stage           32
Demented              15
Name: count, dtype: int64

- Move train set images to a separate folder

In [21]:
# Hand the sessions of the (undersampled) training split to the helper together
# with the source and destination directories.
segmented_dir = os.path.join('..', 'data', 'segmented')
train_dir = os.path.join('..', 'data', 'train')
train_sessions = X_train.loc[train_idx, 'MR_session']

helper.move_folders(
    subject_experiments=train_sessions,
    base_dir=segmented_dir,
    dst_dir=train_dir
)

- Move the validation set images to a separate folder

In [22]:
# Hand the sessions of the validation split to the helper together with the
# source and destination directories.
segmented_dir = os.path.join('..', 'data', 'segmented')
validation_dir = os.path.join('..', 'data', 'validation')
validation_sessions = X_train.loc[val_idx, 'MR_session']

helper.move_folders(
    subject_experiments=validation_sessions,
    base_dir=segmented_dir,
    dst_dir=validation_dir
)

- Create clean csv files (with no augmented instances) for the train and validation sets

In [23]:
# Persist both splits as they are before any augmented row is added.
# `to_csv` writes the index by default, so each file gets an unnamed leading
# column holding the original row labels.
X_train.loc[train_idx].to_csv('../data/csv/train_clean.csv')
X_train.loc[val_idx].to_csv('../data/csv/validation_clean.csv')

## Add augmentation labels to train set

- This section adds augmented instances to the training data (both the train and the validation split) to improve class balance
- Let's start by augmenting MCIs

In [24]:
# Early-stage rows of the training split: the pool every draw below samples from.
mci_train_pool = X_train.loc[train_idx][X_train.loc[train_idx, 'label'] == 'Early-stage']

# One entry per transformation: (suffix, number of sessions to draw, sampling seed).
# 30 + 28 + 25 + 26 = 109 extra rows, which takes the 291 originals to 400.
mci_train_plan = [
    ('flip', 30, 42),
    ('rot30', 28, 43),
    ('rot60', 25, 44),
    ('rot90', 26, 45),
]

# The helper's results are joined with `+`, in the same order as the plan.
mci_aug_train = []
for suffix, n_rows, seed in mci_train_plan:
    mci_aug_train = mci_aug_train + helper.augment_dataframe(
        df=mci_train_pool.sample(n=n_rows, random_state=seed),
        id_col_name='MR_session',
        suffix_list=[suffix]
    )

- Augment demented label

In [25]:
# Demented rows of the training split: the pool every draw below samples from.
dem_train_pool = X_train.loc[train_idx][X_train.loc[train_idx, 'label'] == 'Demented']

# One entry per transformation: (suffix, number of sessions to draw, sampling seed).
# 78 + 79 + 77 + 79 = 313 extra rows, which takes the 87 originals to 400.
dem_train_plan = [
    ('flip', 78, 42),
    ('rot30', 79, 43),
    ('rot60', 77, 44),
    ('rot90', 79, 45),
]

# The helper's results are joined with `+`, in the same order as the plan.
dem_aug_train = []
for suffix, n_rows, seed in dem_train_plan:
    dem_aug_train = dem_aug_train + helper.augment_dataframe(
        df=dem_train_pool.sample(n=n_rows, random_state=seed),
        id_col_name='MR_session',
        suffix_list=[suffix]
    )

- Get the final training dataframe

In [27]:
# Stack the original training rows followed by the augmented early-stage and
# demented frames; the index is rebuilt as a fresh 0..n-1 range.
train_frames = [X_train.loc[train_idx], *mci_aug_train, *dem_aug_train]
X_train_final = pd.concat(objs=train_frames, ignore_index=True)

- This is the final distribution of the labels on the training set

In [28]:
# Label counts of the training set once the augmented rows are included.
X_train_final['label'].value_counts()

label
Cognitevely-normal    400
Demented              400
Early-stage           400
Name: count, dtype: int64

- Augment the validation set for the early-stage instances

In [29]:
# Early-stage rows of the validation split: the pool every draw below samples from.
mci_val_pool = X_train.loc[val_idx][X_train.loc[val_idx, 'label'] == 'Early-stage']

# One entry per transformation: (suffix, number of sessions to draw, sampling seed).
# 6 + 7 + 7 + 7 = 27 extra rows, which takes the 32 originals to 59.
# Each transformation gets its own seed (42..45), as in the other augmentation
# cells. Previously every draw used seed 42, so the three n=7 draws selected
# the very same sessions and the augmented rows came from only a handful of
# originals.
mci_val_plan = [
    ('flip', 6, 42),
    ('rot30', 7, 43),
    ('rot60', 7, 44),
    ('rot90', 7, 45),
]

# The helper's results are joined with `+`, in the same order as the plan.
mci_aug_val = []
for suffix, n_rows, seed in mci_val_plan:
    mci_aug_val = mci_aug_val + helper.augment_dataframe(
        df=mci_val_pool.sample(n=n_rows, random_state=seed),
        id_col_name='MR_session',
        suffix_list=[suffix]
    )

- Augment the validation set for the demented instances

In [30]:
# Demented rows of the validation split: the pool every draw below samples from.
dem_val_pool = X_train.loc[val_idx][X_train.loc[val_idx, 'label'] == 'Demented']

# One entry per transformation: (suffix, number of sessions to draw, sampling seed).
# 10 + 11 + 11 + 12 = 44 extra rows, which takes the 15 originals to 59.
dem_val_plan = [
    ('flip', 10, 42),
    ('rot30', 11, 43),
    ('rot60', 11, 44),
    ('rot90', 12, 45),
]

# The helper's results are joined with `+`, in the same order as the plan.
dem_aug_val = []
for suffix, n_rows, seed in dem_val_plan:
    dem_aug_val = dem_aug_val + helper.augment_dataframe(
        df=dem_val_pool.sample(n=n_rows, random_state=seed),
        id_col_name='MR_session',
        suffix_list=[suffix]
    )

- Get the final validation dataset

In [31]:
# Stack the original validation rows followed by the augmented early-stage and
# demented frames; the index is rebuilt as a fresh 0..n-1 range.
val_frames = [X_train.loc[val_idx], *mci_aug_val, *dem_aug_val]
X_val_final = pd.concat(objs=val_frames, ignore_index=True)

- This is the final distribution of labels on the validation set

In [32]:
# Label counts of the validation set once the augmented rows are included.
X_val_final['label'].value_counts()

label
Cognitevely-normal    59
Demented              59
Early-stage           59
Name: count, dtype: int64

- Build a new column to indicate the augmentation operation to perform for a specific instance

In [33]:
# Add an 'augmentation' column to both final frames, computed by the helper
# from each frame's session ids (presumably by reading the suffix appended
# during augmentation -- confirm in dataset_helper.py).
for split_frame in (X_train_final, X_val_final):
    split_frame['augmentation'] = helper.extract_augmentation_column(split_frame['MR_session'])

In [34]:
# Replace missing values in 'MR_session_original' with the sentinel 'normal' in
# both final frames. The column is not created anywhere in this notebook, so it
# presumably comes from `augment_dataframe` and is missing for the rows that
# were not augmented.
# NOTE(review): 'normal' reads like an augmentation label rather than a session
# id -- confirm that this is the column the sentinel is meant for.
for split_frame in (X_train_final, X_val_final):
    split_frame['MR_session_original'] = split_frame['MR_session_original'].fillna(value='normal')

- Save the augmented dataset to csv

In [35]:
# Persist the augmented splits. Both frames were concatenated with
# `ignore_index=True`, so the index written as the unnamed leading column is a
# fresh 0..n-1 range rather than the original row labels.
X_train_final.to_csv('../data/csv/train.csv')
X_val_final.to_csv('../data/csv/validation.csv')