In [None]:
import pandas as pd

In [None]:
# Spreadsheet describing the random subset of admissions selected for imaging extraction.
imaging_data_path = (
    '/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/stroke_research/'
    'geneva_stroke_unit_dataset/data/perfusion_imaging_data/'
    'random_subset_for_imaging_extraction.xlsx'
)

# The preprocessed features and outcomes live in the same output folder and share one timestamp tag.
prepro_run_tag = '01012023_233050'
prepro_output_dir = f'/Users/jk1/temp/opsum_prepro_output/gsu_prepro_{prepro_run_tag}'
features_path = f'{prepro_output_dir}/preprocessed_features_{prepro_run_tag}.csv'
labels_path = f'{prepro_output_dir}/preprocessed_outcomes_{prepro_run_tag}.csv'

In [None]:
# Split / experiment configuration.
outcome = '3M mRS 0-2'  # name of the outcome passed to the data-loading helpers
test_size = 0.2         # fraction of patients held out as the test set
n_splits = 5            # number of splits (not used in the cells shown here)
seed = 42               # random_state for the train/test split

In [None]:
# Read the imaging-extraction spreadsheet into a DataFrame.
imaging_data_df = pd.read_excel(io=imaging_data_path)

In [None]:
# Preview the first five rows of the imaging table.
imaging_data_df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split
from prediction.utils.utils import check_data
from prediction.outcome_prediction.data_loading.data_formatting import format_to_2d_table_with_time, \
    link_patient_id_to_outcome

# --- Load the data ---------------------------------------------------------
# Build the feature table X and the outcome table y for the chosen outcome
# from the preprocessed feature / outcome CSV files.
X, y = format_to_2d_table_with_time(
    feature_df_path=features_path,
    outcome_df_path=labels_path,
    outcome=outcome,
)

# Number of time steps = largest hourly category + 1;
# number of channels = number of distinct sample labels.
n_time_steps = X.relative_sample_date_hourly_cat.max() + 1
n_channels = len(X.sample_label.unique())

# Sanity check: test whether the data is corrupted.
check_data(X)

# --- Split the data --------------------------------------------------------
# The split is made on patient id (not admission id): in the rare case of
# several admissions for one patient, splitting by admission would risk data
# leakage between TRAIN and TEST. Here pid = unique patient_id.

# Reduce every patient to a single outcome (to avoid duplicates).
all_pids_with_outcome = link_patient_id_to_outcome(y, outcome)
patient_ids = all_pids_with_outcome.patient_id.tolist()
patient_outcomes = all_pids_with_outcome.outcome.tolist()

# Stratify on the per-patient outcome so both sets keep the same class balance.
pid_train, pid_test, y_pid_train, y_pid_test = train_test_split(
    patient_ids,
    patient_outcomes,
    stratify=patient_outcomes,
    test_size=test_size,
    random_state=seed,
)

In [None]:
# Derive the patient id as the part of case_admission_id before the first '_'.
# The vectorised .str accessor gives the same result as the former
# apply(lambda x: x.split('_')[0]) for string ids, but propagates missing values
# instead of raising AttributeError when a case_admission_id cell is empty (NaN).
# NOTE(review): assumes the ids in pid_train / pid_test are strings of the same form — confirm.
imaging_data_df['pid'] = imaging_data_df['case_admission_id'].str.split('_').str[0]

In [None]:
# Count PATIENTS (distinct pid values) of each split that appear in the imaging subset.
# The previous version used .shape[0], i.e. it counted rows, so a patient with
# several rows/admissions in the spreadsheet was counted more than once even though
# the printed message reports a number of patients.
in_train_mask = imaging_data_df['pid'].isin(pid_train)
in_test_mask = imaging_data_df['pid'].isin(pid_test)
ctp_present_mask = imaging_data_df['CTP_present'] == 1

n_pid_train_in_imaging_set = imaging_data_df.loc[in_train_mask, 'pid'].nunique()
n_pid_test_in_imaging_set = imaging_data_df.loc[in_test_mask, 'pid'].nunique()
print(f'Number of patients in training set in imaging subset: {n_pid_train_in_imaging_set}')
print(f'Number of patients in test set in imaging subset: {n_pid_test_in_imaging_set}')

# Same counts, restricted to rows flagged as having CTP imaging (CTP_present == 1).
n_pid_train_with_imaging = imaging_data_df.loc[in_train_mask & ctp_present_mask, 'pid'].nunique()
n_pid_test_with_imaging = imaging_data_df.loc[in_test_mask & ctp_present_mask, 'pid'].nunique()

print(f'Number of patients in training set with imaging data: {n_pid_train_with_imaging}')
print(f'Number of patients in test set with imaging data: {n_pid_test_with_imaging}')

In [None]:
# Count distinct admissions (case_admission_id) per split that have a CBF value.
# NOTE(review): "with imaging" is defined here as CBF not null, whereas the patient
# counts use CTP_present == 1 — confirm the two criteria are meant to differ.
has_cbf = imaging_data_df.CBF.notna()
train_rows_with_cbf = imaging_data_df[imaging_data_df['pid'].isin(pid_train) & has_cbf]
test_rows_with_cbf = imaging_data_df[imaging_data_df['pid'].isin(pid_test) & has_cbf]

n_cid_train_with_imaging = len(train_rows_with_cbf.case_admission_id.unique())
n_cid_test_with_imaging = len(test_rows_with_cbf.case_admission_id.unique())

print(f'Number of admissions in training set with imaging data: {n_cid_train_with_imaging}')
print(f'Number of admissions in test set with imaging data: {n_cid_test_with_imaging}')