## Impact of access to imaging on prediction

Imaging data was evaluated for a subset (10%) of the overall GSU cohort.
For this subset, imaging data was extracted if available (n=149); a model was trained on the full dataset (with imaging data only available for 10% of the cohort).
Evaluation was done on the test set for those with imaging data available, once with access to imaging data and once without access to imaging data (set to missing).

In [None]:
import pandas as pd
import os

In [None]:
# Root directories that are searched recursively for `*subgroup_results.csv` files.
# NOTE: these are absolute, machine-specific paths - adjust them before running elsewhere.
# Presumably the first holds the evaluation with imaging data accessible and the
# second the evaluation with imaging data set to missing - confirm against the export script.
imaging_available_subgroups_dir = '/Users/jk1/temp/opsum_prediction_output/transformer/3M_mrs02_with_imaging/with_imaging/with_imaging_vs_nullified_imaging_test_set_subgroups/imaging_available_subgroups'
imaging_set_to_missing_subgroups_dir = '/Users/jk1/temp/opsum_prediction_output/transformer/3M_mrs02_with_imaging/with_imaging/with_imaging_vs_nullified_imaging_test_set_subgroups/nullified_imaging_subgroup'

In [None]:
def _load_subgroup_results(results_dir):
    """Load every ``*subgroup_results.csv`` found anywhere below ``results_dir``.

    The directory tree is walked recursively; each file whose name ends with
    ``subgroup_results.csv`` is read with ``pd.read_csv``. All frames are
    concatenated in one call (the per-file row indices are kept as they are).

    Returns an empty DataFrame when no matching file is found.
    """
    frames = []
    for root, dirs, files in os.walk(results_dir):
        # os.walk lists entries in arbitrary (filesystem) order; sorting the
        # directory list in place and the file names makes the traversal, and
        # therefore the row order of the result, reproducible across machines.
        dirs.sort()
        for file in sorted(files):
            if file.endswith('subgroup_results.csv'):
                frames.append(pd.read_csv(os.path.join(root, file)))
    # pd.concat raises on an empty list, so fall back to an empty frame.
    return pd.concat(frames) if frames else pd.DataFrame()


# Collect the subgroup results of both evaluation runs.
imaging_available_all_subgroups_df = _load_subgroup_results(imaging_available_subgroups_dir)
imaging_set_to_missing_all_subgroups_df = _load_subgroup_results(imaging_set_to_missing_subgroups_dir)

In [None]:
# Keep only the rows whose `fold` equals the most frequent value of
# `selected_fold_on_cv` (the first one if several values tie), per run.
_fold_with_imaging = imaging_available_all_subgroups_df['selected_fold_on_cv'].mode()[0]
_in_fold_with_imaging = imaging_available_all_subgroups_df['fold'] == _fold_with_imaging
imaging_available_all_subgroups_df = imaging_available_all_subgroups_df[_in_fold_with_imaging]

_fold_imaging_missing = imaging_set_to_missing_all_subgroups_df['selected_fold_on_cv'].mode()[0]
_in_fold_imaging_missing = imaging_set_to_missing_all_subgroups_df['fold'] == _fold_imaging_missing
imaging_set_to_missing_all_subgroups_df = imaging_set_to_missing_all_subgroups_df[_in_fold_imaging_missing]

In [None]:
# Columns kept from the raw results: point estimate plus lower/upper CI bound
# for each test metric, followed by the subgroup bookkeeping columns.
selected_columns = [
    f'{metric}_test{suffix}'
    for metric in ('auc', 'matthews', 'accuracy', 'precision', 'recall',
                   'specificity', 'neg_pred_value')
    for suffix in ('', '_lower_ci', '_upper_ci')
] + ['outcome', 'subgroup_split', 'subgroup', 'n_pos_samples', 'n_total_samples']

In [None]:
# Restrict both result tables to the columns listed in `selected_columns`
# (a missing column raises a KeyError).
imaging_available_all_subgroups_df = imaging_available_all_subgroups_df.loc[:, selected_columns]
imaging_set_to_missing_all_subgroups_df = imaging_set_to_missing_all_subgroups_df.loc[:, selected_columns]

In [None]:
def _format_metric_with_ci(df, metric):
    """Format the first row's `metric` as 'value (lower-upper)', 3 decimals each.

    The bounds are read from the `<metric>_lower_ci` and `<metric>_upper_ci` columns.
    """
    point, lower, upper = (
        round(df[column].values[0], 3)
        for column in (metric, f'{metric}_lower_ci', f'{metric}_upper_ci')
    )
    return f'{point:.3f} ({lower:.3f}-{upper:.3f})'


def preprocess_df(df, subgroup_split, subgroup, n_pos_samples=None, total_samples=None):
    """Turn raw metric columns into a one-row, human-readable summary table.

    Only the first row of `df` is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `<metric>_test`, `<metric>_test_lower_ci` and
        `<metric>_test_upper_ci` columns for auc, matthews, accuracy,
        precision and recall. The specificity columns are optional.
    subgroup_split : value written to the 'Subgroup split' column.
    subgroup : value written to the 'Subgroup' column.
    n_pos_samples, total_samples : optional counts. When both are given, a
        'Number of events' column formatted as 'n (p%)' is added.

    Returns
    -------
    pandas.DataFrame
        A single row with every metric formatted as 'value (lower-upper)'.
        'Specificity' is 'NA' when `specificity_test` is not a column of `df`.

    Raises
    ------
    KeyError
        If a required metric column is missing.
    IndexError
        If `df` has no rows.
    """
    row = {
        'Subgroup': [subgroup],
        'Subgroup split': [subgroup_split],
        # report result rounded to 3 decimal places with 95% CI
        'ROC AUC': [_format_metric_with_ci(df, 'auc_test')],
        "Matthew's Coefficient": [_format_metric_with_ci(df, 'matthews_test')],
        'Accuracy': [_format_metric_with_ci(df, 'accuracy_test')],
        'Precision (positive predictive value)': [_format_metric_with_ci(df, 'precision_test')],
        'Recall (Sensitivity)': [_format_metric_with_ci(df, 'recall_test')],
    }

    if 'specificity_test' in df.columns:
        row['Specificity'] = [_format_metric_with_ci(df, 'specificity_test')]
    else:
        row['Specificity'] = ['NA']

    if (n_pos_samples is not None) and (total_samples is not None):
        if total_samples == 0:
            # No samples at all: a percentage is undefined, so report 'NA'
            # instead of dividing by zero.
            event_rate = 'NA'
        else:
            event_rate = f'{round(n_pos_samples / total_samples * 100, 1)}%'
        row['Number of events'] = [f'{int(n_pos_samples)} ({event_rate})']

    return pd.DataFrame(row)

In [None]:
def _summarise_subgroup_splits(results_df):
    """Build one formatted summary row per distinct `subgroup_split` value.

    For each split, the matching rows are passed to `preprocess_df` together
    with the first `subgroup`, `n_pos_samples` and `n_total_samples` values of
    that split. The resulting one-row tables are concatenated in a single call.

    Returns an empty DataFrame when `results_df` contains no splits.
    """
    summary_rows = []
    for split in results_df.subgroup_split.unique():
        split_df = results_df[results_df.subgroup_split == split]
        summary_rows.append(
            preprocess_df(
                split_df,
                split,
                split_df.subgroup.values[0],
                split_df.n_pos_samples.values[0],
                split_df.n_total_samples.values[0],
            )
        )
    # pd.concat raises on an empty list, so fall back to an empty frame.
    return pd.concat(summary_rows) if summary_rows else pd.DataFrame()


preprocessed_imaging_available_all_subgroups_df = _summarise_subgroup_splits(imaging_available_all_subgroups_df)
preprocessed_imaging_set_to_missing_all_subgroups_df = _summarise_subgroup_splits(imaging_set_to_missing_all_subgroups_df)

In [None]:
# Show the formatted per-split table for the run loaded from `imaging_available_subgroups_dir`.
preprocessed_imaging_available_all_subgroups_df

In [None]:
# Show the formatted per-split table for the run loaded from `imaging_set_to_missing_subgroups_dir`.
preprocessed_imaging_set_to_missing_all_subgroups_df

In [None]:
# Tag every row of each table with the imaging condition it was evaluated under.
for _table, _imaging_status in (
    (preprocessed_imaging_available_all_subgroups_df, 'available'),
    (preprocessed_imaging_set_to_missing_all_subgroups_df, 'unavailable'),
):
    _table['Imaging'] = _imaging_status

In [None]:
# Stack the 'with_imaging_available' split of both runs (imaging available
# first, imaging unavailable second) into one comparison table.
_split_of_interest = 'with_imaging_available'
_rows_imaging_available = preprocessed_imaging_available_all_subgroups_df[
    preprocessed_imaging_available_all_subgroups_df['Subgroup split'] == _split_of_interest
]
_rows_imaging_unavailable = preprocessed_imaging_set_to_missing_all_subgroups_df[
    preprocessed_imaging_set_to_missing_all_subgroups_df['Subgroup split'] == _split_of_interest
]
comparison_df = pd.concat([_rows_imaging_available, _rows_imaging_unavailable])

In [None]:
# Both subgroup columns are removed in place; the 'Imaging' column is kept.
comparison_df.drop(['Subgroup', 'Subgroup split'], axis='columns', inplace=True)

In [None]:
# Show the final comparison table (one row per imaging condition present).
comparison_df

In [None]:
# comparison_df.to_csv(os.path.join(imaging_available_subgroups_dir, 'imaging_available_vs_unavailable.csv'), index=False)