#### Notebook to analyse model on dataset of only patients MRI-negative and Histology confirmed

Support used for JAMA revisions

In [1]:
import os
import pandas as pd 
import sys
sys.path.append('/home/co-ripa1/rds/hpc-work/scripts/meld_classifier')
import meld_classifier.paths as paths


Setting MELD_DATA_PATH to /rds/project/kw350/rds-kw350-meld/meld_data/Data
Setting BASE_PATH to /rds/project/kw350/rds-kw350-meld/meld_data/Data
Setting EXPERIMENT_PATH to /rds/project/kw350/rds-kw350-meld/experiments/co-ripa1/
Setting FS_SUBJECTS_PATH to /rds/project/kw350/rds-kw350-meld/meld_data/Data
Setting BASE_PATH to /rds/project/kw350/rds-kw350-meld/meld_data/Data


## Create the dataset

In [21]:
# get the original dataset with train and test split
orig_df = pd.read_csv(os.path.join(paths.BASE_PATH, 'MELD_dataset_V6.csv'), index_col=0)

#get the list of patients mri neg and histo confirmed
patients_mrineghisto=  pd.read_csv(os.path.join(paths.BASE_PATH, 'list_patients_mrineg_histo.csv'))['ID']

In [26]:
# Extract only patients mrineg and histo confirmed

new_df = orig_df[orig_df['subject_id'].isin(patients_mrineghisto)]
new_df = new_df.loc[:, ~new_df.columns.str.contains('^Unnamed')]
new_df = new_df.reset_index(drop=True)
new_df



Unnamed: 0,subject_id,split
0,MELD2_H7_3T_FCD_007,trainval
1,MELD2_H7_3T_FCD_008,trainval
2,MELD_H10_3T_FCD_0001,trainval
3,MELD_H10_3T_FCD_0002,test
4,MELD_H10_3T_FCD_0003,trainval
...,...,...
130,MELD_H4_15T_FCD_0004,trainval
131,MELD_H4_15T_FCD_0005,trainval
132,MELD_H4_15T_FCD_0010,trainval
133,MELD_H4_3T_FCD_0007,test


In [28]:
#save
new_df.to_csv(os.path.join(paths.BASE_PATH, 'MELD_dataset_V6_mrineg&histo.csv'))

## Analyse the performances

In [2]:
sys.path.append('/home/co-ripa1/rds/hpc-work/scripts/meld_classifier')
from meld_classifier.meld_cohort import MeldCohort,MeldSubject
import h5py
import pandas as pd
import numpy as np
from meld_graph.evaluation import load_prediction

Setting EXPERIMENT_PATH to /rds/project/kw350/rds-kw350-meld/experiments_graph/kw350


In [4]:
experiment_dir = '/rds/project/kw350/rds-kw350-meld/experiments_graph/co-ripa1'
model ='24-08-01_MRIN_dcp'   
pred_file = os.path.join(experiment_dir,model,'s_0','fold_all','results_best_model', 'predictions.hdf5')
cohort= MeldCohort(hdf5_file_root='{site_code}_{group}_featurematrix_combat_6_kernels_noCombat.hdf5',
                            dataset='MELD_dataset_v6.csv')

with h5py.File(pred_file, "r") as f:
    subjects = list(f.keys())

In [5]:
df=pd.DataFrame()
subjects_dictionary={}
values={}
for si,subj in enumerate(subjects):
    if si%100==0:
        print(si)
    values['ID']=subj
    if "H101" in subj:
        s = MeldSubject(subj,cohort=cohort2)
    else:
        s = MeldSubject(subj,cohort=cohort)
    values['group']= True if s.group=='patient' else False
    labels_hemis = {}
    dists={}
    labels = np.zeros(len(cohort.cortex_label)*2)
    for hemi in ['lh','rh']:
        dists[hemi], labels_hemis[hemi] = s.load_feature_lesion_data(
                    features=['.on_lh.boundary_zone.mgh'], hemi=hemi, features_to_ignore=[]
                )
        if np.sum(dists[hemi])==0:
            dists[hemi] +=200
    labels = np.hstack([labels_hemis['lh'][cohort.cortex_mask],labels_hemis['rh'][cohort.cortex_mask]])
    borderzones = np.vstack([dists['lh'][cohort.cortex_mask,:],dists['rh'][cohort.cortex_mask,:]]).ravel()<20
    #load pred from graph classifier

    result_hemis = load_prediction(subj,pred_file, dset='prediction_clustered')
    result = np.hstack([result_hemis['lh'],result_hemis['rh']])
    values['model']='graph'
    
    #add detection with borderzone
    if labels.sum()>0:
        values['detected'] = np.logical_and(result, borderzones).any()
        clusters_in_borderzone = set(result[borderzones.astype('bool')])
        #remove the 0 cluster
        if 0 in clusters_in_borderzone:
            clusters_in_borderzone.remove(0)
        values['number TP clusters'] = len(clusters_in_borderzone)
        values['size_pred'] = len(result[borderzones.astype('bool')])
        values['IOU'] = (np.logical_and(result, labels)).sum() / (np.logical_or(result, labels)).sum() 
        values['IOU_plus'] = (np.logical_and(result, borderzones)).sum() / (np.logical_or(result, borderzones)).sum() 
    
    else:
        values['number TP clusters'] = 0
        values['IOU'] = np.nan
        values['IOU_plus'] = np.nan
    
    # add number of FP clusters : total clusters - TP clusters
    values['number FP clusters']=len(set(result_hemis['lh']))+len(set(result_hemis['rh']))-2-values['number TP clusters']
    df=pd.concat([df,pd.DataFrame([values])])
    
    
df = df.reset_index()
df.head()

0


Unnamed: 0,index,ID,group,model,detected,number TP clusters,size_pred,IOU,IOU_plus,number FP clusters
0,0,MELD_H10_3T_FCD_0002,True,graph,True,1,17009,0.599766,0.221706,0
1,0,MELD_H10_3T_FCD_0009,True,graph,False,0,20338,0.0,0.0,1
2,0,MELD_H10_3T_FCD_0016,True,graph,True,1,29340,0.397996,0.14925,0
3,0,MELD_H11_3T_FCD_0002,True,graph,True,1,14438,0.035077,0.193221,2
4,0,MELD_H11_3T_FCD_0006,True,graph,True,1,6713,0.031262,0.220445,1


In [8]:
def df_stats(df):
    sensitivity = np.mean(df['detected'][df['group']])
    specificity = (df['number FP clusters'][df['group']==0]==0).mean()
    total_detected = np.sum(df['number TP clusters'][df['group']])
    total_predicted = np.sum(df['number FP clusters'][df['group']])
    ppv = total_detected / (total_predicted + total_detected)
    return np.round(sensitivity,2),np.round(specificity,2),np.round(ppv,2)

# bootstrapped confidence intervals
def bootstrap_CI(df, n=10000, func=df_stats):
    """Calculate confidence intervals for a given function"""
    bootstrapped = []
    for i in range(n):
        bootstrapped.append(func(df.sample(len(df), replace=True)))
    return np.percentile(np.array(bootstrapped), [2.5, 97.5],axis=0)

In [9]:
print('all together')
dfsub = df.copy()
sensitivity, specificity, ppv = df_stats(dfsub)
print('sensitivity specificity ppv')
print(sensitivity,specificity,ppv)
print(bootstrap_CI(dfsub))

all together
sensitivity specificity ppv
0.78 nan 0.43
[[0.67  nan 0.36]
 [0.88  nan 0.51]]


In [11]:
len(df['ID'])

58