In [1]:
from load_data import *
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from datasets import *
from models import *
from train_test_utils import *
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path handed to load_longitudinal_tabular_data as the input file
path = 'full_per_visit_data_2021-03-26_processed.csv'

# Target construct to evaluate.
# Options are 'negative_valence' and 'positive_valence'
construct = 'negative_valence'

# Set to True for the multi-task setting that predicts age together with the
# construct: it selects AgeGRUNet and its checkpoints and removes 'visit_age'
# from the inputs. False evaluates the construct-only GRUNet.
age = False

# True evaluates every timestep per subject, False only the last timestep
seq2seq = False

In [3]:
# Column names belonging to each category. During a permutation run, all
# columns of exactly one of these lists are shuffled together.

sleep_variables = [
    'youthreport2_shq1', 'youthreport2_shq2', 'youthreport2_shq3', 'youthreport2_shq4', 'youthreport2_shq5',
    'shq_weekday_sleep', 'shq_weekend_sleep',
    'shq_weekend_bedtime_delay', 'shq_weekend_wakeup_delay',
    'shq_sleepiness', 'shq_circadian',
]

demographics_variables = [
    'sex', 'hispanic', 'race', 'ses_parent_yoe',
    'pds_score', 'bmi_zscore', 'site', 'visit_age',
]

druguse_variables = [
    'cahalan_score', 'exceeds_bl_drinking_2',
    'lssaga_dsm4_youth_d04_diag', 'lssaga_dsm4_youth_d05_diag',
    'highrisk_yss_extern', 'highrisk_yss_intern',
    'highrisk_pss_extern', 'highrisk_pss_intern',
    'youthreport1_yfhi4', 'youthreport1_yfhi3', 'youthreport1_yfhi5',
]

personality_variables = [
    # tipi_*
    'tipi_agv', 'tipi_csv', 'tipi_ems', 'tipi_etv', 'tipi_ope',
    # upps_*
    'upps_nug', 'upps_pmt', 'upps_psv', 'upps_pug', 'upps_sss',
    # rsq_*
    'rsq_problem_solving', 'rsq_emotion_expression', 'rsq_acceptance',
    'rsq_positive_thinking', 'rsq_emotion_regulation', 'rsq_cognitive_restructuring',
]

life_variables = [
    # leq_c_*
    'leq_c_c', 'leq_c_cnc', 'leq_c_cnu', 'leq_c_dau', 'leq_c_dcu', 'leq_c_dnc',
    'leq_c_dnu', 'leq_c_dpc', 'leq_c_nc', 'leq_c_nu', 'leq_c_sn', 'leq_c_u',
    # ctq_*
    'ctq_ea', 'ctq_en', 'ctq_minds', 'ctq_pa', 'ctq_pn', 'ctq_sa',
]

brief_variables = ['brief_inhibit_t', 'brief_beh_shift_t', 'brief_cog_shift_t']

support_variables = [
    'youthreport2_chks_set2_chks3', 'youthreport2_chks_set2_chks4',
    'youthreport2_chks_set4_chks5', 'youthreport2_chks_set4_chks6',
    'youthreport2_chks_set5_chks7', 'youthreport2_chks_set5_chks8', 'youthreport2_chks_set5_chks9',
    'youthreport2_pwmkcr_involvement_pwmkcr3',
]

neuropsych_variables = [
    # cnp_sfnb2_*
    'cnp_sfnb2_sfnb_tp', 'cnp_sfnb2_sfnb_fp', 'cnp_sfnb2_sfnb_rtc',
    'cnp_sfnb2_sfnb_tp0', 'cnp_sfnb2_sfnb_fp0', 'cnp_sfnb2_sfnb_rtc0',
    'cnp_sfnb2_sfnb_tp1', 'cnp_sfnb2_sfnb_fp1', 'cnp_sfnb2_sfnb_rtc1',
    'cnp_sfnb2_sfnb_tp2', 'cnp_sfnb2_sfnb_fp2',
    'cnp_sfnb2_sfnb_tn0', 'cnp_sfnb2_sfnb_tn1', 'cnp_sfnb2_sfnb_tn2',
    'cnp_sfnb2_sfnb_fn1', 'cnp_sfnb2_sfnb_fn2',
    'cnp_sfnb2_sfnb_rtc2', 'cnp_sfnb2_sfnb_mcr', 'cnp_sfnb2_sfnb_mrtc',
    'cnp_sfnb2_sfnb_mrt', 'cnp_sfnb2_sfnb_meff',
    # cnp_er40d_*
    'cnp_er40d_er40ang', 'cnp_er40d_er40fear', 'cnp_er40d_er40hap',
    'cnp_er40d_er40noe', 'cnp_er40d_er40sad',
    'cnp_er40d_er40_fpa', 'cnp_er40d_er40_fpf', 'cnp_er40d_er40_fph',
    'cnp_er40d_er40_fpn', 'cnp_er40d_er40_fps',
    'cnp_er40d_er40angrt', 'cnp_er40d_er40fearrt', 'cnp_er40d_er40haprt',
    'cnp_er40d_er40noert', 'cnp_er40d_er40sadrt',
    # dd100_* / dd1000_*
    'dd100_logk_1d', 'dd100_logk_7d', 'dd100_logk_1mo', 'dd100_logk_6mo',
    'dd1000_logk_1d', 'dd1000_logk_7d', 'dd1000_logk_1mo', 'dd1000_logk_6mo',
    # np_ehi_*
    'np_ehi_result',
    # stroop_*
    'stroop_total_mean', 'stroop_stroopm_rr_diffrt', 'stroop_conm_rr_mean',
    'stroop_incm_rr_mean', 'stroop_error_sum', 'stroop_miss_sum',
]

In [6]:
# In the original experiment we did 500 permutations and calculated the results;
# only 5 are run here for quicker execution
permutation_runs = 5

# Choose which category variables you want to permute
# ('sleep', 'demographics', 'druguse', 'personality', 'life', 'neuropsych', 'brief' or 'support')
category = 'personality'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

run_with_seed = 1964
# The feature permutation uses DataFrame.sample without a random_state, which
# draws from NumPy's global RNG, so NumPy has to be seeded as well for the
# permutations to be reproducible.
np.random.seed(run_with_seed)
torch.manual_seed(run_with_seed)

<torch._C.Generator at 0x13f7a16d0>

In [5]:
# Map every supported category name to the list of columns permuted for it
category_to_variables = {
    'sleep': sleep_variables,
    'demographics': demographics_variables,
    'druguse': druguse_variables,
    'personality': personality_variables,
    'life': life_variables,
    'neuropsych': neuropsych_variables,
    'brief': brief_variables,
    'support': support_variables,
}

# Fail early on a typo instead of leaving category_variables undefined
# (or silently reusing the value from a previous execution of this cell)
if category not in category_to_variables:
    raise ValueError(
        f"Unknown category {category!r}; expected one of {sorted(category_to_variables)}"
    )
category_variables = category_to_variables[category]

# Create a dataframe to keep all the results after each permutation run
write_results = pd.DataFrame(columns=['run', 'fold', 'subject_accuracy', 'subject_macro_accuracy', 'accuracy', 'balanced_accuracy', 'f1-score'])

In [7]:
def _scale_with_subject(frame, transform, first_feature_col=21):
    """Scale the feature columns of `frame` and put 'subject' back as column 0.

    The first `first_feature_col` columns should not be used as input to the
    model as they were used to determine the predicted value, so only the
    remaining columns are passed to `transform` (the scaler's `fit_transform`
    for the training rows, `transform` for the test rows). The row index of
    `frame` is kept on the result.
    """
    feature_cols = frame.columns[first_feature_col:]
    scaled = pd.DataFrame(data=transform(frame.iloc[:, first_feature_col:]),
                          columns=feature_cols, index=frame.index)
    scaled.insert(0, 'subject', frame.loc[:, 'subject'], True)
    return scaled


# Each run: rebuild the 5 stratified folds, permute the chosen category's
# columns inside every test fold, evaluate the saved per-fold model on the
# permuted test data and append one row per fold to write_results.
for run in range(permutation_runs):

    # Load all data
    # To do this faster here I save the processed file and then load it with the 'quick' param
    input_feats = load_longitudinal_tabular_data(input_path=path, write_path='', quick=True, write_csv=False)

    # Because we know aces_total should not be taken into account
    input_feats = input_feats.drop(columns=['aces_total'])

    subject_ids = list(np.unique(input_feats.loc[:, 'subject']))

    # Per-subject series of the construct label and of the visit age
    labels = {}
    ages = {}
    for key in subject_ids:
        subj_v = input_feats[input_feats['subject'] == key]
        labels[key] = subj_v.loc[:, construct]
        ages[key] = subj_v.loc[:, 'visit_age']

    # One label per subject (max over its visits) so we can create stratified splits
    split_stratified_labels = [labels[key].max() for key in subject_ids]

    if age:
        input_feats = input_feats.drop(columns=['visit_age'])

    folds = {}
    subject_folds = {}
    mfb_folds = {}
    folds_of_the_labels = {}
    folds_of_the_ages = {}

    scaler = MinMaxScaler(feature_range=(-1,1))

    kf = StratifiedKFold(n_splits=5, shuffle=False)
    X = np.array(subject_ids)
    y = np.array(split_stratified_labels)

    for fold_idx, (train_index, test_index) in enumerate(kf.split(X, y)):
        train_ids = list(X[train_index])
        test_ids = list(X[test_index])
        # Sets give constant-time membership tests in the subject loop below
        train_id_set = set(train_ids)
        test_id_set = set(test_ids)

        train_subj = input_feats.loc[input_feats['subject'].isin(train_ids)]
        test_subj = input_feats.loc[input_feats['subject'].isin(test_ids)].reset_index(drop=True)

        # Here we do the feature permutation
        # Specifically, we shuffle the rows of all the variables of one category
        # (jointly) across the test fold. The index is reset after sampling
        # because the column assignment below aligns on the row index; without
        # it the shuffle would be undone.
        shuffled_category = test_subj[category_variables].sample(frac=1).reset_index(drop=True)
        test_subj = test_subj.copy()
        test_subj[category_variables] = shuffled_category

        # The scaler is fitted on the training rows of this fold only
        X_train = _scale_with_subject(train_subj, scaler.fit_transform)
        X_test = _scale_with_subject(test_subj, scaler.transform)

        partition = {'test': [], 'train': []}
        subject_visits = {}
        for subject in input_feats.subject.unique():
            if subject in train_id_set:
                subject_visits[subject] = X_train[X_train['subject'] == subject]
                partition['train'].append(subject)
            elif subject in test_id_set:
                subject_visits[subject] = X_test[X_test['subject'] == subject]
                partition['test'].append(subject)

        folds[fold_idx] = partition

        # Subject-specific dataset with all the visits and post-processed for training
        # (the leading 'subject' column is dropped)
        subject_post = {}
        for key in partition['train'] + partition['test']:
            subject_post[key] = subject_visits[key].iloc[:, 1:]

        subject_folds[fold_idx] = subject_post
        folds_of_the_labels[fold_idx] = labels.copy()
        folds_of_the_ages[fold_idx] = ages.copy()

        # Ratio of negative to positive training rows for this fold; recorded
        # per fold but not used by the evaluation below
        y_train = train_subj.loc[:, construct]
        number_neg_samples = np.sum(y_train.values == False)
        num_pos_samples = np.sum(y_train.values == True)
        mfb_folds[fold_idx] = number_neg_samples / num_pos_samples

    # Parameters
    params = {'shuffle': True,
              'num_workers': 0,
              'batch_size': 1}

    results = {}

    for fold in folds.keys():

        partition = folds[fold]
        subject_post = subject_folds[fold]
        labels_f = folds_of_the_labels[fold]
        ages_f = folds_of_the_ages[fold]

        validation_set = Dataset(partition['test'], subject_post, labels_f, ages_f, age)
        validation_generator = torch.utils.data.DataLoader(validation_set, **params)

        # Model depth
        feature_dim = 128
        input_dim = next(iter(validation_generator))[0].shape[2]
        output_dim = 1
        n_layers = 1
        hidden_dim = 64

        if age:
            model = AgeGRUNet(feature_dim=feature_dim, input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim,
                              n_layers=n_layers, device=device)
        else:
            model = GRUNet(feature_dim=feature_dim, input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim,
                           n_layers=n_layers, seq2seq=seq2seq, device=device)

        # Path where the models have been saved
        # We have one scenario where we have predicted age + NV in every visit and another scenario
        # where we have predicted only NV in the last visit - that one is used for the MICCAI submission
        if age:
            input_path = f'{construct}_fold_{fold}_tabular_no_aces.ckpt'
        else:
            input_path = f'{construct}_fold_{fold}_tabular_no_aces_no_age_last_label.ckpt'

        model.load_state_dict(torch.load(input_path, map_location=device))

        model.to(device)

        if seq2seq:
            if age:
                results[f'split{fold}'] = evaluate_all_timesteps_age_per_subject(model=model, val_loader=validation_generator, device=device)
            else:
                results[f'split{fold}'] = evaluate_all_timesteps_per_subject(model=model, val_loader=validation_generator, device=device)
        else:
            results[f'split{fold}'] = evaluate_last_timestep(model=model, val_loader=validation_generator, device=device)

        fold_metrics = results[f'split{fold}']
        fold_results = {'run': run, 'fold': fold}
        for metric_name in ('subject_accuracy', 'subject_macro_accuracy', 'accuracy', 'balanced_accuracy', 'f1-score'):
            fold_results[metric_name] = fold_metrics[metric_name]

        report_df = pd.DataFrame.from_dict(fold_results, orient='index').transpose()
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
        write_results = pd.concat([write_results, report_df], ignore_index=True)

In [None]:
# Directory for the results file; the empty string means the current working directory
output_dir = ''
results_name = f'{construct}_{category}_permutation_acc_inference_random_no_age_last'

# One CSV per (construct, category) pair holding the per-run, per-fold metrics
output_path = Path(output_dir, results_name).with_suffix('.csv')
write_results.to_csv(output_path, sep=',')