In [1]:
# Candidate seeds for the train/test split; random_seeds[run] is the value passed
# to hbr_data_split as `random_seed` in the data-preparation cell.
random_seeds = [42, 100, 0, 10, 12, 20, 50, 9, 30, 51]
# Index of the current run; it also names the NM output folder ('Run_<run>').
run = 0

In [2]:
# Cluster account name; used to build the home-directory paths below and in later cells.
username = 'meganorm-yverduyn'

# Root of the MEGaNorm code under the user's home directory.
package_path = f'/home/{username}/Code/MEGaNorm/'

# One entry per dataset: data root, task label and file ending of the recordings.
datasets = dict(
    CMI=dict(
        base_dir='/project/meganorm/Data/EEG_CMI/EEG_BIDS',  # final location
        # base_dir='/home/meganorm-yverduyn/Dev/BIDS_CMI',   # alternative location for testing
        task='task-eyesclosed',
        ending='eeg.set',
    ),
)

In [3]:
import os
# Change into the package root before importing; presumably this is what lets the
# project-local `utils`, `datasets` and `plots` packages below resolve — TODO confirm.
# NOTE(review): this changes the working directory for the rest of the notebook.
os.chdir(package_path)
from utils.parallel import submit_jobs, check_jobs_status, collect_results
from datasets.CMI import load_covariates_CMI
from utils.nm import hbr_data_split, estimate_centiles
from plots.plots import plot_nm_range_site, plot_comparison, plot_neurooscillochart, plot_age_dist2, plot_growthcharts, plot_quantile_gauge, box_plot_auc
from utils.nm import model_quantile_evaluation, calculate_oscilochart, prepare_test_data, cal_stats_for_gauge
from utils.IO import merge_datasets_with_regex, merge_fidp_demo, merge_datasets_with_glob
from datasets.mne_bids_conversion import make_demo_file_bids
import pandas as pd
import json
from pcntoolkit.normative_parallel import execute_nm, rerun_nm, collect_nm
import warnings
import pickle  
import numpy as np
from pcntoolkit.util.utils import z_to_abnormal_p, anomaly_detection_auc
from scipy.stats import false_discovery_control
# NOTE(review): silences every warning category for the remainder of the notebook.
warnings.filterwarnings("ignore")

In [4]:
def make_config(project, path=None):
    """Build the preprocessing / feature-extraction configuration dictionary.

    Parameters
    ----------
    project : str
        Project name; only used as the base name of the JSON file.
    path : str or None, optional
        Directory in which ``<project>.json`` is written. When ``None``
        (the default) nothing is written to disk.

    Returns
    -------
    dict
        The configuration dictionary (returned whether or not it was saved).

    Raises
    ------
    OSError
        If ``path`` is given but the JSON file cannot be opened for writing.
    """
    config = dict()

    # preprocessing ================================================================
    # Sensor layout; choices: all, lobe, None
    config["which_layout"] = "all"

    # which sensor type should be used
    # choices: meg, mag, grad, eeg, opm
    config["which_sensor"] = "eeg"
    # config['fs'] = 500

    # ICA configuration
    config['ica_n_component'] = 30
    config['ica_max_iter'] = 800
    # TODO: infomax was chosen because ICLabel is said to work best with it;
    # confirm this choice (fastica should also work).
    config['ica_method'] = "infomax"

    # lower and upper cutoff frequencies of the band-pass filter
    config['cutoffFreqLow'] = 1
    config['cutoffFreqHigh'] = 45

    config["resampling_rate"] = 1000
    config["digital_filter"] = True
    config["notch_filter"] = False

    config["apply_ica"] = True

    config["auto_ica_corr_thr"] = 0.8

    config["rereference_method"] = "average"

    # variance threshold across time
    config["mag_var_threshold"] = 4e-12
    config["grad_var_threshold"] = 4000e-13
    config["eeg_var_threshold"] = 40e-6
    # flatness threshold across time
    config["mag_flat_threshold"] = 10e-15
    config["grad_flat_threshold"] = 10e-15
    config["eeg_flat_threshold"] = 10e-6  # TODO: check this value with Mosi

    # segmentation =================================================================
    # start time (seconds) of the raw data to use; 0 keeps the beginning of the recording
    config['segments_tmin'] = 0
    # end time (seconds) of the raw data to use; the tiny negative value is meant
    # to keep the end of the recording rather than trim it
    config['segments_tmax'] = -0.000000001
    # length of EEG segments in seconds
    config['segments_length'] = 10
    # amount of overlap between EEG segments in seconds
    config['segments_overlap'] = 2

    # PSD ==========================================================================
    # Spectral estimation method
    config['psd_method'] = "welch"
    # amount of overlap between windows in Welch's method
    config['psd_n_overlap'] = 1
    config['psd_n_fft'] = 2
    # window size for the PSD (units are defined by the consumer — TODO confirm)
    config["psd_n_per_seg"] = 2

    # fooof analysis configuration =================================================
    # Desired frequency range to run FOOOF
    config['fooof_freqRangeLow'] = 3
    config['fooof_freqRangeHigh'] = 40
    # which mode should be used for fitting; choices (knee, fixed)
    config["aperiodic_mode"] = "knee"
    # [min, max] acceptable peak width in the fooof analysis
    config["fooof_peak_width_limits"] = [1.0, 12.0]
    # Absolute threshold for detecting peaks
    config['fooof_min_peak_height'] = 0
    # Relative threshold for detecting peaks
    config['fooof_peak_threshold'] = 2

    # feature extraction ===========================================================
    # Canonical frequency bands
    config['freq_bands'] = {
                            'Theta': (3, 8),
                            'Alpha': (8, 13),
                            'Beta': (13, 30),
                            'Gamma': (30, 40),
                            # 'Broadband': (3, 40)
                            }

    # Individualized frequency range around the main peak of each band
    config['individualized_band_ranges'] = {
                                            'Theta': (-2, 3),
                                            'Alpha': (-2, 3),  # TODO: change to (-4, 2)
                                            'Beta': (-8, 9),
                                            'Gamma': (-5, 5)
                                            }

    # least acceptable R squared of fitted models
    config['min_r_squared'] = 0.9

    # Which feature families to extract
    config['feature_categories'] = {
                                    "Offset": False,
                                    "Exponent": False,
                                    "Peak_Center": True,
                                    "Peak_Power": True,
                                    "Peak_Width": True,
                                    "Adjusted_Canonical_Relative_Power": True,
                                    "Adjusted_Canonical_Absolute_Power": False,
                                    "Adjusted_Individualized_Relative_Power": False,
                                    "Adjusted_Individualized_Absolute_Power": False,
                                    "OriginalPSD_Canonical_Relative_Power": True,
                                    "OriginalPSD_Canonical_Absolute_Power": False,
                                    "OriginalPSD_Individualized_Relative_Power": False,
                                    "OriginalPSD_Individualized_Absolute_Power": False,
                                    }

    config["fooof_res_save_path"] = None

    config["random_state"] = 42

    if path is not None:
        # The context manager closes the file even if json.dump raises.
        # Tuples (e.g. the band ranges) are serialised as JSON lists.
        with open(os.path.join(path, project + ".json"), "w") as out_file:
            json.dump(config, out_file, indent=6)

    return config

In [5]:
project = "CMI"
# `run` is intentionally not reassigned here: it is set once in the first cell,
# so the run chosen there is the one used for the NM folder below.
project_dir = f'/home/{username}/Results/{project}/'

# Script that is submitted once per subject for feature extraction.
mainParallel_path = os.path.join(package_path, 'src', 'mainParallel.py')

# Feature-extraction outputs, with separate sub-folders for logs and temporary files.
features_dir = os.path.join(project_dir, 'Features')
features_log_path = os.path.join(features_dir, 'log')
features_temp_path = os.path.join(features_dir, 'temp')

# Normative-modelling working directory for the current run.
nm_processing_dir = os.path.join(project_dir, 'NM', 'Run_' + str(run))

# Scheduler settings handed to submit_jobs.
job_configs = {'log_path': features_log_path, 'module': 'mne', 'time': '1:00:00', 'memory': '20GB',
               'partition': 'normal', 'core': 1, 'node': 1, 'batch_file_name': 'batch_job'}

# exist_ok=True makes the cell safely re-runnable and avoids the check-then-create race;
# creating the sub-folders also creates project_dir and features_dir.
for required_dir in (features_log_path, features_temp_path, nm_processing_dir):
    os.makedirs(required_dir, exist_ok=True)

# Build the configuration and save it as <project_dir>/<project>.json.
configs = make_config(project, project_dir)

subjects = merge_datasets_with_glob(datasets)



In [None]:
### Parallel feature extraction

# Configuration file written by make_config in the project-setup cell.
config_file = os.path.join(project_dir, project + '.json')

# Submit the jobs for all subjects, then check which of them failed.
start_time = submit_jobs(mainParallel_path, features_dir, subjects,
                features_temp_path, job_configs=job_configs, config_file=config_file)
failed_jobs = check_jobs_status(username, start_time)

# Subjects to retry. This assumes check_jobs_status returns keys of `subjects`,
# as the original lookup did.
failed_subjects = {failed_job: subjects[failed_job] for failed_job in failed_jobs}

# NOTE(review): there is no retry limit; a subject that always fails keeps this loop running.
while len(failed_jobs) > 0:
    # Re-run only the subjects that failed in the most recent round
    start_time = submit_jobs(mainParallel_path, features_dir, failed_subjects,
                features_temp_path, job_configs=job_configs, config_file=config_file)
    failed_jobs = check_jobs_status(username, start_time)
    # Refresh the retry set so subjects that have now succeeded are not resubmitted.
    failed_subjects = {failed_job: subjects[failed_job] for failed_job in failed_jobs}

In [None]:
# Collect the per-subject job outputs under the name 'all_features'.
# Presumably this should only be run once the feature-extraction jobs have finished — TODO confirm.
collect_results(features_dir, subjects, features_temp_path, file_name='all_features')

In [6]:
# Load the CMI covariates.
# TODO: this step is probably redundant; check whether it can be integrated with the next step.
CMI_covariates_path= "/project/meganorm/Data/EEG_CMI/info/covariates.csv"
# NOTE(review): despite its name, `save_dir` holds a file path (.tsv), and the same
# variable name is reused with a different value in the next cell.
save_dir = "/project/meganorm/Data/EEG_CMI/EEG_BIDS/covariates.tsv"
load_covariates_CMI(CMI_covariates_path, save_dir)


In [7]:
# Create participants_bids.tsv from the covariates.tsv written in the previous cell.
# NOTE(review): `file_dir` and `save_dir` hold file paths, not directories.
# NOTE(review): the positional 0 and 1 look like column indices (compare the `col_id`
# entries below) — confirm against the make_demo_file_bids signature.
# Each dict describes one output column: either a source column (`col_id`) with a value
# `mapping`, or a constant `single_value`.
file_dir = "/project/meganorm/Data/EEG_CMI/EEG_BIDS/covariates.tsv"
save_dir = "/project/meganorm/Data/EEG_CMI/EEG_BIDS/participants_bids.tsv"
make_demo_file_bids(file_dir, 
                        save_dir, 
                        0, 
                        1, 
                        {"col_name": "sex", "col_id": 2, "mapping":{0: "Male", 1: "Female"}, "single_value":None}, 
                        {"col_name": "site", "col_id": 3, "mapping":{0: "0", 1: "1", 2: "2", 3: "3", 4: "4"}, "single_value":None},
                        {"col_name": "eyes", "col_id": None, "mapping": None, "single_value":"eyes_closed"},
                        # Diagnosis labels are mapped to lower-case names without punctuation;
                        # "No Diagnosis Given" becomes "control".
                        {"col_name": "diagnosis", "col_id": 4, "mapping": {
                            "ADHD-Combined Type": "adhd combined type",
                            "Generalized Anxiety Disorder": "generalized anxiety disorder",
                            "ADHD-Inattentive Type": "adhd inattentive type",
                            "Specific Learning Disorder with Impairment in Reading": "specific learning disorder with impairment in reading",
                            "Disruptive Mood Dysregulation Disorder": "disruptive mood dysregulation disorder",
                            "Oppositional Defiant Disorder": "oppositional defiant disorder",
                            "Major Depressive Disorder": "major depressive disorder",
                            "Tourettes Disorder": "tourettes disorder",
                            "Other Specified Anxiety Disorder": "other specified anxiety disorder",
                            "Other Specified Attention-Deficit/Hyperactivity Disorder": "other specified attention deficit hyperactivity disorder",
                            "No Diagnosis Given": "control",
                            "Autism Spectrum Disorder": "autism spectrum disorder",
                            "Language Disorder": "language disorder",
                            "Specific Learning Disorder with Impairment in Mathematics": "specific learning disorder with impairment in mathematics",
                            "No Diagnosis Given: Incomplete Eval": "no diagnosis given incomplete eval",
                            "Separation Anxiety": "separation anxiety",
                            "Social (Pragmatic) Communication Disorder": "social pragmatic communication disorder",
                            "Provisional Tic Disorder": "provisional tic disorder",
                            "Social Anxiety (Social Phobia)": "social anxiety social phobia",
                            "Specific Phobia": "specific phobia",
                            "Borderline Intellectual Functioning": "borderline intellectual functioning",
                            "ADHD-Hyperactive/Impulsive Type": "adhd hyperactive impulsive type",
                            "Intellectual Disability-Moderate": "intellectual disability moderate",
                            "Intellectual Disability-Mild": "intellectual disability mild",
                            "Adjustment Disorders": "adjustment disorders",
                            "Bipolar I Disorder": "bipolar i disorder",
                            "Obsessive-Compulsive Disorder": "obsessive compulsive disorder",
                            "Conduct Disorder-Childhood-onset type": "conduct disorder childhood onset type",
                            "Selective Mutism": "selective mutism",
                            "Other Specified Depressive Disorder": "other specified depressive disorder",
                            "Unspecified Attention-Deficit/Hyperactivity Disorder": "unspecified attention deficit hyperactivity disorder",
                            "Other Specified Disruptive, Impulse-Control, and Conduct Disorder": "other specified disruptive impulse control and conduct disorder",
                            "Persistent Depressive Disorder (Dysthymia)": "persistent depressive disorder dysthymia",
                            "Other Specified Trauma- and Stressor-Related Disorder": "other specified trauma and stressor related disorder",
                            "Other Specified Tic Disorder": "other specified tic disorder",
                            "Posttraumatic Stress Disorder": "posttraumatic stress disorder",
                            "Excoriation (Skin-Picking) Disorder": "excoriation skin picking disorder",
                            "Substance/Medication-Induced Bipolar and Related Disorder": "substance medication induced bipolar and related disorder",
                            "Specific Learning Disorder with Impairment in Written Expression": "specific learning disorder with impairment in written expression",
                            "Enuresis": "enuresis",
                            "Major Neurocognitive Disorder Due to Epilepsy": "major neurocognitive disorder due to epilepsy",
                            "Speech Sound Disorder": "speech sound disorder",
                            "Encopresis": "encopresis",
                            "Bipolar II Disorder": "bipolar ii disorder",
                            "Intermittent Explosive Disorder": "intermittent explosive disorder",
                            "Persistent (Chronic) Motor or Vocal Tic Disorder": "persistent chronic motor or vocal tic disorder",
                            "Other Specified Neurodevelopmental Disorder": "other specified neurodevelopmental disorder",
                            "Unspecified Anxiety Disorder": "unspecified anxiety disorder",
                            "Other Specified Feeding or Eating Disorder": "other specified feeding or eating disorder",
                            "Cannabis Use Disorder": "cannabis use disorder",
                            "Bulimia Nervosa": "bulimia nervosa",
                            "Avoidant/Restrictive Food Intake Disorder": "avoidant restrictive food intake disorder",
                            # a single-space label is mapped to "unspecified"
                            " ": "unspecified",
                            "Reactive Attachment Disorder": "reactive attachment disorder",
                            "Unspecified Neurodevelopmental Disorder": "unspecified neurodevelopmental disorder",
                            "Agoraphobia": "agoraphobia",
                            "Depressive Disorder Due to Another Medical Condition": "depressive disorder due to another medical condition",
                            "Delirium due to another medical condition": "delirium due to another medical condition",
                            # same label as above but with a trailing space; mapped to the same value
                            "Specific Learning Disorder with Impairment in Reading ": "specific learning disorder with impairment in reading",
                            "Cyclothymic Disorder": "cyclothymic disorder",
                            "Schizophrenia": "schizophrenia",
                            "Delirium due to multiple etiologies": "delirium due to multiple etiologies",
                            "Gender Dysphoria in Adolescents and Adults": "gender dysphoria in adolescents and adults",
                            "Other Specified Obsessive-Compulsive and Related Disorder": "other specified obsessive compulsive and related disorder",
                            "Developmental Coordination Disorder": "developmental coordination disorder",
                            "Acute Stress Disorder": "acute stress disorder"}, "single_value":None})


In [None]:
### Data preparation for Normative Modeling
# Data roots and names of all configured datasets, in the order of the `datasets` dict.
base_dirs = [values["base_dir"] for values in datasets.values()]
dataset_names = list(datasets.keys())
# `data_patient` is returned separately and is used later by the anomaly-detection cells.
merged_data, data_patient = merge_fidp_demo(base_dirs, features_dir, dataset_names, include_patients=False)
# The split is seeded with random_seeds[run] and written to nm_processing_dir.
# NOTE(review): `random_seeds` is rebound to [0] in the anomaly-detection section further
# down, so re-running this cell after that one uses a different seed.
biomarker_num = hbr_data_split(merged_data, nm_processing_dir, drop_nans=True, batch_effects=['sex', 'site'], random_seed=random_seeds[run], train_split=0.99)
print(biomarker_num)


In [20]:
### Setting up NM configs

# Python interpreter passed to execute_nm.
python_path = '/project/meganorm/Software/Miniconda3/envs/test/bin/python'


def _hbr_variant(model_type, likelihood, linear_sigma, linear_shape):
    """Return one HBR configuration.

    All flags are the strings 'True'/'False' (not booleans); ``linear_shape`` is
    used for both 'linear_epsilon' and 'linear_delta'.
    """
    return {'model_type': model_type, 'likelihood': likelihood, 'linear_sigma': linear_sigma,
            'random_slope_mu': 'False', 'linear_epsilon': linear_shape, 'linear_delta': linear_shape}


def _nm_file(file_name):
    """Return the path of ``file_name`` inside the current NM processing directory."""
    return os.path.join(nm_processing_dir, file_name)


# Naming scheme: <homo|hetero>_<Gaussian|SHASH>_<linear|bspline>
hbr_configs = {
    'homo_Gaussian_linear': _hbr_variant('linear', 'Normal', 'False', 'False'),
    'homo_Gaussian_bspline': _hbr_variant('bspline', 'Normal', 'False', 'False'),
    'homo_SHASH_linear': _hbr_variant('linear', 'SHASHb', 'False', 'False'),
    'homo_SHASH_bspline': _hbr_variant('bspline', 'SHASHb', 'False', 'False'),
    'hetero_Gaussian_linear': _hbr_variant('linear', 'Normal', 'True', 'False'),
    'hetero_Gaussian_bspline': _hbr_variant('bspline', 'Normal', 'True', 'False'),
    'hetero_SHASH_linear': _hbr_variant('linear', 'SHASHb', 'True', 'True'),
    'hetero_SHASH_bspline': _hbr_variant('bspline', 'SHASHb', 'True', 'True'),
}

# Scaling, batching and output naming passed to execute_nm.
inscaler = 'None'
outscaler = 'minmax'
batch_size = 1
outputsuffix = '_estimate'

# Training responses and covariates.
respfile = _nm_file('y_train.pkl')
covfile = _nm_file('x_train.pkl')

# Test responses and covariates.
testrespfile_path = _nm_file('y_test.pkl')
testcovfile_path = _nm_file('x_test.pkl')

# Batch-effect files for the training and test sets.
trbefile = _nm_file('b_train.pkl')
tsbefile = _nm_file('b_test.pkl')

# Cluster resources requested for each job.
memory = '2gb'
duration = '5:00:00'
cluster_spec = 'slurm'

In [None]:
### Running NM

#for method in hbr_configs.keys():
method = 'hetero_SHASH_bspline'
method_cfg = hbr_configs[method]  # settings of the selected model variant

# Both directories are passed to execute_nm as strings ending in '/'.
processing_dir = os.path.join(nm_processing_dir, method) + '/'
nm_log_path = os.path.join(processing_dir, 'log') + '/'

# exist_ok=True keeps the cell re-runnable without a separate isdir check.
os.makedirs(processing_dir, exist_ok=True)
os.makedirs(nm_log_path, exist_ok=True)

# Submit the HBR estimation jobs. The flag-like values (e.g. savemodel,
# n_cores_per_batch) are passed as strings, exactly as before.
execute_nm(processing_dir, python_path,
            'NM', covfile, respfile, batch_size, memory, duration, alg='hbr',
            log_path=nm_log_path, binary=True, testcovfile_path=testcovfile_path,
            testrespfile_path=testrespfile_path, trbefile=trbefile, tsbefile=tsbefile,
            model_type=method_cfg['model_type'], likelihood=method_cfg['likelihood'],
            linear_sigma=method_cfg['linear_sigma'], random_slope_mu=method_cfg['random_slope_mu'],
            linear_epsilon=method_cfg['linear_epsilon'], linear_delta=method_cfg['linear_delta'],
            savemodel='True', inscaler=inscaler, outscaler=outscaler, outputsuffix=outputsuffix,
            interactive='auto', cluster_spec=cluster_spec, nuts_sampler="nutpie", n_cores_per_batch="2")



In [None]:
#collect_nm(processing_dir, "NM", collect=True, binary=True, batch_size=1)

In [None]:
### Evaluating quantiles using MACE

# NOTE(review): outputsuffix is 'estimate' here but '_estimate' in the NM config cell —
# confirm which form model_quantile_evaluation expects.
# NOTE(review): testcovfile_path, testrespfile_path and tsbefile are rebound to the
# 'clinicalpredict_' files by the anomaly-detection section below, so this cell must be
# run before that one to evaluate the regular test split.
mace, best_models, bio_ids = model_quantile_evaluation(hbr_configs, nm_processing_dir, testcovfile_path, 
                              testrespfile_path, tsbefile, biomarker_num, plot=False, outputsuffix='estimate')

#plot_comparison(nm_processing_dir, hbr_configs, biomarker_num)

In [None]:
## Plotting ranges
#for config in hbr_configs.keys():

# `method` is the model variant selected in the "Running NM" cell.
processing_path = os.path.join(nm_processing_dir, method)

# NOTE(review): batch_map lists three sites, while the participants file maps site
# codes 0-4 and the anomaly-detection cell filters on site == 3 — confirm the site list.
q = estimate_centiles(processing_path, biomarker_num, quantiles=[0.05, 0.25, 0.5, 0.75, 0.95],
                          batch_map={0:{'Male':0, 'Female':1}, 1:{'Site1':0,'Site2':1, 'Site3':2}}, 
                          age_range=[0, 25])
# NOTE(review): experiment_id=10 is not among the experiments described below.
plot_nm_range_site(processing_path, nm_processing_dir, experiment_id=10)

# Experiment log:
# exp 0: ap - periodic > 0 => 0 and hetero_SHASH_bspline
# exp 1: hetero_SHASH_bspline
# exp 2:  ap - periodic > 0 => 0 and hetero_SHASH_bspline and -inf are removed 

In [None]:
# plot age distribution for different sites and train/test/validation partitions
# The f-prefix is required so that {username} and {project} are substituted;
# without it the literal braces were passed on as part of the path.
base_dir = f"/home/{username}/Results/{project}/NM"
plot_age_dist2(base_dir)

ANOMALY DETECTION

In [None]:
# NOTE(review): this rebinds the notebook-level `random_seeds` from the first cell; only
# its length is used below, and the loop index `i` (not the seed value) selects the Run_<i> folder.
random_seeds = [0]
for i in range(len(random_seeds)):

    # Derive the folders of run i from the Run_0 paths built earlier.
    nm_processing_dir_temp = nm_processing_dir.replace("Run_0", f"Run_{i}")
    processing_dir_temp = processing_dir.replace("Run_0", f"Run_{i}")

    # Prepare the patient data (without the 'diagnosis' column) as an additional test set
    # whose files carry the 'clinicalpredict_' prefix.
    # NOTE(review): this overwrites the `biomarker_num` returned by hbr_data_split.
    prefix = "clinicalpredict_"
    biomarker_num = prepare_test_data(data_patient.drop('diagnosis', axis=1),
                                nm_processing_dir_temp, 
                                drop_nans=True, 
                                batch_effects=['sex', 'site'], 
                                prefix=prefix)
    

    # NOTE(review): these rebind the test-file globals set in the NM config cell, so any
    # earlier cell re-run afterwards will point at the clinical files instead.
    testrespfile_path = os.path.join(nm_processing_dir_temp, prefix + 'y_test.pkl')
    testcovfile_path = os.path.join(nm_processing_dir_temp, prefix + 'x_test.pkl')
    tsbefile = os.path.join(nm_processing_dir_temp, prefix + 'b_test.pkl')

In [None]:
def _load_pickle(path):
    """Return the object unpickled from ``path`` (only use on trusted, locally produced files)."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Result tables: one row per run, one column per entry of merged_data.columns[3:]
# (presumably the biomarker columns — TODO confirm).
feature_columns = merged_data.columns[3:]
df = pd.DataFrame(columns=feature_columns)          # FDR-corrected p-values
df_auc = pd.DataFrame(columns=feature_columns)      # AUC values

for i in range(len(random_seeds)):

    # Folders of run i, derived from the Run_0 paths.
    nm_processing_dir_temp = nm_processing_dir.replace("Run_0", f"Run_{i}")
    processing_dir_temp = processing_dir.replace("Run_0", f"Run_{i}")

    # Z-scores of the patients and of the healthy test set, plus the test batch effects.
    z_patient = _load_pickle(os.path.join(processing_dir_temp, "Z_clinicalpredict.pkl"))
    z_healthy = _load_pickle(os.path.join(processing_dir_temp, "Z_estimate.pkl"))
    b_healthy = _load_pickle(os.path.join(nm_processing_dir_temp, "b_test.pkl"))

    # Keep only the healthy test subjects from site 3.
    healthy_rows = np.where(b_healthy["site"] == 3)[0]
    z_healthy = z_healthy.iloc[healthy_rows, :]

    # Keep only patients labelled "adhd combined type". Row positions are taken from
    # data_patient after dropping rows with NaNs.
    # NOTE(review): this assumes the same rows survived in Z_clinicalpredict — confirm.
    data_patient = data_patient.dropna(axis=0)
    patient_rows = np.where(data_patient["diagnosis"] == "adhd combined type")[0]
    z_patient = z_patient.iloc[patient_rows, :]

    # Convert the z-scores to abnormality probabilities.
    p_patient = z_to_abnormal_p(z_patient)
    p_healthy = z_to_abnormal_p(z_healthy)

    # Stack patients first, then healthy subjects.
    p = np.concatenate([p_patient, p_healthy])
    print(p.shape)

    # Labels in the same order: 1 = patient, 0 = healthy.
    labels = np.concatenate([np.ones(p_patient.shape[0]), np.zeros(p_healthy.shape[0])])
    print(labels.shape)

    auc, p_val = anomaly_detection_auc(p, labels, n_permutation=1000)

    # False-discovery-rate correction of the p-values.
    p_val = false_discovery_control(p_val)

    df.loc[i] = p_val
    df_auc.loc[i] = auc