# Data splits

This notebook focuses on building the data splits to train the model, namely: train, validation and test.

*Constraint: every split must contain unique subjects, i.e. all the MR sessions of a given subject must be contained in only ONE split.*

In [65]:
import pandas as pd
import os
import sys

# Put the parent directory on the import path (once) so the `src` package
# imported below can be resolved from this notebook's location.
parent_dir = '../'
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import src.utils.data_splitter as ds
import src.utils.dataset_builder as db
import importlib

# Reload the project helper modules (dataset builder first, then data splitter)
# so that edits made to them on disk are picked up without restarting the kernel.
for module in (db, ds):
    importlib.reload(module)

In [66]:
# Load the full dataset and discard the 'Unnamed: 0' column stored in the CSV.
df = (
    pd.read_csv('../data/csv/final_dataset.csv')
    .drop(columns='Unnamed: 0')
)

In [67]:
# Print the column names, non-null counts, dtypes and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2475 entries, 0 to 2474
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Subject                       2475 non-null   object 
 1   MR_session                    2475 non-null   object 
 2   TOTAL_HIPPOCAMPUS_VOLUME      2475 non-null   float64
 3   Left-Hippocampus_volume       2475 non-null   float64
 4   lh_parahippocampal_thickness  2475 non-null   float64
 5   lh_parahippocampal_volume     2475 non-null   int64  
 6   rh_parahippocampal_volume     2475 non-null   int64  
 7   rh_parahippocampal_thickness  2475 non-null   float64
 8   Right-Hippocampus_volume      2475 non-null   float64
 9   label                         2475 non-null   object 
dtypes: float64(5), int64(2), object(3)
memory usage: 193.5+ KB


In [68]:
# Short alias for `db.DatasetHelper`; note that it is bound as-is (not called/instantiated
# here) and its methods are invoked directly on this name in the cells below.
helper = db.DatasetHelper

## Undersample majority class

- Class distribution before resample

In [69]:
# Number of rows per `label` value in the full dataset, before any resampling.
df['label'].value_counts()

label
Non-Demented    1981
MCI              381
Demented         113
Name: count, dtype: int64

- Get the undersampled dataset, which will be the base dataset from now on

In [70]:
# Value passed as the third argument to the undersampling helper.
majority_class_size = 500

# Ask the helper which rows to keep, given the session ids and their labels.
# NOTE(review): the return value is used directly as a `.loc` indexer — presumably
# index labels or a boolean mask; confirm against `DatasetHelper`.
kept_rows = helper.undersample_majority_class(
    df['MR_session'],
    df['label'],
    majority_class_size,
)

# From here on `df` refers to the undersampled dataset.
df = df.loc[kept_rows]

- Class distribution after resample

In [71]:
# Number of rows per `label` value after undersampling, to compare with the counts above.
df['label'].value_counts()

label
Non-Demented    500
MCI             381
Demented        113
Name: count, dtype: int64

- Create the csv file

In [72]:
# Persist the undersampled dataset; `index=False` leaves the row index out of the file.
df.to_csv('../data/csv/final_dataset_undersampled.csv', index=False)

- At this point we can create the new folder containing the FreeSurfer folders whose MR session is in the resampled `df`

In [73]:
# MR session identifiers that remain in the undersampled dataset.
kept_sessions = df['MR_session'].to_list()

# Source and destination directories, relative to this notebook.
freesurfers_dir = os.path.join('..', 'data', 'freesurfers')
undersampled_dir = os.path.join('..', 'data', 'freesurfers_undersampled')

# NOTE(review): the helper is named `move_folders`; confirm whether it moves or
# copies the session folders from the source to the destination directory.
helper.move_folders(
    subject_experiments=kept_sessions,
    base_dir=freesurfers_dir,
    dst_dir=undersampled_dir,
)

## Define data splitting