# Train-Test Data Splitting
60% of the patient's timespan should be for training, 40% for testing
1. Load the timespan
2. Take 60% of it and add it to the first recording's start
3. Round that up to the start of an actual segment, i.e. the first segment start at or after it (so it's clear which side of the split each segment belongs to)
4. Segments starting at or before that value (<=) will be train. Segments starting after that value (>) will be test.
5. Save that value


In [1]:
import pandas as pd

from config.paths import PATHS

In [2]:
# Use the first patient directory returned by PATHS as the worked example for this notebook.
ptnt_dir = PATHS.patient_dirs()[0]
ptnt_dir

PatientDir('/data/home/webb/UNEEG_data/20240201_UNEEG_ForMayo/K37N36L4D')

In [3]:
# Look up this patient's summary row; the info table is keyed by (dataset, patient).
dataset = ptnt_dir.parent.name
ptnt_info = pd.read_pickle(PATHS.patient_info_exact_pkl).loc[(dataset, ptnt_dir.name)]
# The split only needs where the recordings begin and how long they span.
recordings_start = ptnt_info['recordings_start']
timespan = ptnt_info['timespan']
print(ptnt_info)

valid                                          True
total_seizures                                   12
valid_seizures                                   12
enough_valid_seizures                          True
recordings_start                2022-02-23 11:20:29
recordings_end           2022-06-28 11:40:10.458100
timespan                   125 days 00:19:41.458100
duration_recorded          119 days 14:16:12.821550
duration_not_recorded        5 days 10:03:28.636550
ratio_recorded                             0.956652
valid_ratio_recorded                           True
Name: (20240201_UNEEG_ForMayo, K37N36L4D), dtype: object


In [4]:
# Fraction of the patient's timespan assigned to training; the remainder is used for testing.
RATIO_OF_TIMESPAN_FOR_TRAINING = 0.6

In [5]:
# Split the timespan: the training share first, the test share is whatever is left
timespan_train = timespan * RATIO_OF_TIMESPAN_FOR_TRAINING
timespan_test = timespan - timespan_train
for label, span in (('total:', timespan), ('train:', timespan_train), ('test: ', timespan_test)):
    print(label, span)

total: 125 days 00:19:41.458100
train: 75 days 00:11:48.874860
test:  50 days 00:07:52.583240


In [6]:
# End of training portion: the recordings' start shifted forward by the training timespan.
# This is only approximate, since it need not coincide with the start of a segment.
train_end_approx = recordings_start + timespan_train
train_end_approx

Timestamp('2022-05-09 11:32:17.874860')

In [29]:
# Adjust train end to segments
segs = pd.read_csv(ptnt_dir.segments_table, usecols=['start'], parse_dates=['start'])
# searchsorted is only meaningful on sorted data; fail loudly instead of snapping to a wrong segment.
assert segs['start'].is_monotonic_increasing, 'segment starts must be sorted in ascending order'
# Index of the first segment start that is >= train_end_approx (default side='left'),
# i.e. where train_end_approx would be inserted to maintain order.
idx = segs['start'].searchsorted(train_end_approx)
# searchsorted returns len(segs) when train_end_approx lies after every segment start,
# which would make .iloc raise IndexError; fall back to the last segment in that case.
idx = min(idx, len(segs) - 1)
train_end_exact = segs['start'].iloc[idx]

In [30]:
# How far the snapped (exact) boundary moved from the raw (approximate) one.
diff = train_end_exact - train_end_approx
print('idx:', idx)
print('Exact  :', train_end_exact)
print('Approx :', train_end_approx)
print('diff   :', diff)

idx: 432113
Exact  : 2022-05-09 11:32:31.745750
Approx : 2022-05-09 11:32:17.874860
diff   : 0 days 00:00:13.870890


In [32]:
# Reload the segments table with all of its columns this time.
segs = pd.read_csv(ptnt_dir.segments_table, parse_dates=['start'])
# A segment is train when it starts at or before the exact train end; otherwise it is test.
train_mask = segs['start'].le(train_end_exact)
segs['train'] = train_mask
segs

Unnamed: 0,start,type,lead_szr,exists,file,start_index,train
0,2022-02-23 11:20:29.000000,interictal,,True,K37N36L4D_2022-02-23_11-20-29.edf,0.0,True
1,2022-02-23 11:20:43.997750,interictal,,True,K37N36L4D_2022-02-23_11-20-29.edf,3105.0,True
2,2022-02-23 11:20:58.995500,interictal,,True,K37N36L4D_2022-02-23_11-20-29.edf,6210.0,True
3,2022-02-23 11:21:13.993250,interictal,,True,K37N36L4D_2022-02-23_11-20-29.edf,9315.0,True
4,2022-02-23 11:21:28.991000,interictal,,True,K37N36L4D_2022-02-23_11-20-29.edf,12420.0,True
...,...,...,...,...,...,...,...
720181,2022-06-28 11:38:43.592750,interictal,,True,K37N36L4D_2022-06-28_06-57-47.edf,3489838.0,False
720182,2022-06-28 11:38:58.590500,interictal,,True,K37N36L4D_2022-06-28_06-57-47.edf,3492943.0,False
720183,2022-06-28 11:39:13.588250,interictal,,True,K37N36L4D_2022-06-28_06-57-47.edf,3496048.0,False
720184,2022-06-28 11:39:28.586000,interictal,,True,K37N36L4D_2022-06-28_06-57-47.edf,3499153.0,False


In [24]:
# Count how many segments fall on each side of the split (True = train, False = test).
value_counts = train_mask.value_counts()
value_counts

start
True     432114
False    288072
Name: count, dtype: int64

In [23]:
# Make sure all segs have been assigned.
# A boolean comparison never produces NaN (a missing start compares as False and would
# silently end up in test), so the start column itself has to be checked for missing values too.
(train_mask.isna() | segs['start'].isna()).any()

np.False_

In [25]:
# The train and test counts together should account for every segment.
value_counts.sum() == len(segs)

np.True_

In [27]:
# Share of all segments that landed in the training set
value_counts.loc[True] / len(segs)

np.float64(0.6000033324724446)

In [28]:
# Share of all segments that landed in the test set
value_counts.loc[False] / len(segs)

np.float64(0.39999666752755536)

In [33]:
# Keep only the segments flagged as existing; the flag column is then redundant, so drop it.
existing_segs = segs.loc[segs['exists']].drop(columns='exists')
existing_segs

Unnamed: 0,start,type,lead_szr,file,start_index,train
0,2022-02-23 11:20:29.000000,interictal,,K37N36L4D_2022-02-23_11-20-29.edf,0.0,True
1,2022-02-23 11:20:43.997750,interictal,,K37N36L4D_2022-02-23_11-20-29.edf,3105.0,True
2,2022-02-23 11:20:58.995500,interictal,,K37N36L4D_2022-02-23_11-20-29.edf,6210.0,True
3,2022-02-23 11:21:13.993250,interictal,,K37N36L4D_2022-02-23_11-20-29.edf,9315.0,True
4,2022-02-23 11:21:28.991000,interictal,,K37N36L4D_2022-02-23_11-20-29.edf,12420.0,True
...,...,...,...,...,...,...
720181,2022-06-28 11:38:43.592750,interictal,,K37N36L4D_2022-06-28_06-57-47.edf,3489838.0,False
720182,2022-06-28 11:38:58.590500,interictal,,K37N36L4D_2022-06-28_06-57-47.edf,3492943.0,False
720183,2022-06-28 11:39:13.588250,interictal,,K37N36L4D_2022-06-28_06-57-47.edf,3496048.0,False
720184,2022-06-28 11:39:28.586000,interictal,,K37N36L4D_2022-06-28_06-57-47.edf,3499153.0,False


In [35]:
# Train/test shares among the segments that actually exist.
# These also land close to the intended split ratio.
existing_segs['train'].value_counts() / len(existing_segs)

train
True     0.590647
False    0.409353
Name: count, dtype: float64