## Altered pipeline using cross_validate and base classifiers (no hyperparameters or search)

### Importing

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import time
import os
from os import path as op
from pathlib import Path
import yaml
from yaml import CLoader as Loader
from glob import glob
import matplotlib.pyplot as plt
import scipy.stats as ss
# MNE
import mne
from mne_bids import write_raw_bids, BIDSPath, update_sidecar_json
from mne_bids.stats import count_events
from mne import io, EvokedArray
from mne.decoding import Vectorizer, get_coef, LinearModel
# Scikit-learn
from sklearn.utils.fixes import loguniform
from sklearn.utils import compute_class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import PrecisionRecallDisplay, ConfusionMatrixDisplay, classification_report, make_scorer, balanced_accuracy_score, fbeta_score, precision_recall_curve, precision_score, recall_score, accuracy_score, roc_auc_score, f1_score, matthews_corrcoef, confusion_matrix
    # Classifiers
from sklearn import svm
from sklearn.svm import LinearSVC    
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier


mne.set_log_level(verbose='Warning')

In [8]:
# The two status labels every contrast discriminates between.
coi = ['Target', 'Standard']

# One contrast per Colour/Orientation/Type cell: the key names the cell and the
# value is that cell's [Target, Standard] pair of event names.
contrasts = {
    f'{colour}/{orientation}/{stim_type}': [
        f'{colour}/{orientation}/{stim_type}/{status}' for status in coi
    ]
    for orientation in ('Upright', 'Inverted')
    for colour in ('Neutral', 'Green')
    for stim_type in ('Faces', 'Silhouettes')
}

# Every per-cell event name (in contrast order) followed by the pooled labels.
conditions = [name for pair in contrasts.values() for name in pair] + list(coi)

# Pooled contrast across all cells; inserted last so it remains the final key.
contrasts['Target-Nontarget'] = list(coi)


### Yaml + Pathing

In [9]:
## YAML
bids_root = '../..'

# Study-level settings are read from config.yml at the BIDS root.
cfg_file = op.join(bids_root, 'config.yml')
with open(cfg_file, 'r') as f:
    # NOTE(review): CLoader is PyYAML's non-safe loader. That is acceptable for
    # a trusted local config, but switch to CSafeLoader if this file can ever
    # come from an untrusted source.
    config = yaml.load(f, Loader=Loader)

study_name = config['study_name']
task = config['task']
data_type = config['data_type']
eog = config['eog']
montage_fname = config['montage_fname']
n_jobs = 22

# Both config sections are lists of mappings; merge each list into one dict.
epoch_p = {k: v for d in config['preprocessing_settings']['epoch'] for k, v in d.items()}

cl_p = {k: v for d in config['classification'] for k, v in d.items()}

## Pathing
source_path = op.join(bids_root, 'derivatives', 'erp_preprocessing')

# Folder label is the held-out percentage. The previous
# `str(test_size)[-1] + '0_pct'` only worked for single-decimal fractions
# (0.25 became "50_pct"); this yields the same name for 0.1, 0.2, ... and the
# right one otherwise. Assumes test_size is a fraction in (0, 1).
test_pct = round(float(cl_p['test_size']) * 100)
derivatives_path = op.join(bids_root, 'derivatives', f'erp_classification_test_{test_pct}_pct')

out_path = op.join(derivatives_path, 'data')
report_path = op.join(derivatives_path, 'reports')
fig_path = op.join(derivatives_path, 'figures')
tab_path = op.join(derivatives_path, 'tables')

# exist_ok=True creates each folder only when missing, without the separate
# exists() check (which could race with a parallel run).
for _dir in (derivatives_path, out_path, report_path, fig_path, tab_path):
    Path(_dir).mkdir(parents=True, exist_ok=True)

epochs_suffix = '-epo.fif'

## Output files
out_file = op.join(tab_path, 'classification_overall_results.csv')
summary_file = op.join(tab_path, 'classification_accuracy_summary.csv')
plot_stem = op.join(fig_path, 'plot_')
fig_format = 'pdf'

### Instantiating classifiers and scoring metrics

In [10]:
# Shared preprocessing steps placed in front of every classifier in the pipeline.
scaler = StandardScaler()
vectorizer = Vectorizer()

# NOTE(review): the name `svm` rebinds the `sklearn.svm` module imported at the
# top of the file; after this line `svm` is the LinearSVC instance.
svm = LinearSVC(random_state=42, max_iter=5000, dual=True)
lda = LinearDiscriminantAnalysis()
rf = RandomForestClassifier(random_state=42, n_jobs=10)


# Display name -> estimator; the name is also used as the pipeline step name.
classifiers = {'SVM': svm,
               'LDA': lda,
               'RF': rf
              }

## SCORING
# Metrics handed to cross_validate. The F-beta scorers now pass zero_division=0
# like the precision/F1 scorers and the held-out F-beta calls, so a fold with
# no predicted positives scores 0 without emitting a warning.
# NOTE(review): 'ROC' wraps roc_auc_score in a plain make_scorer, so it
# presumably receives hard predictions rather than decision scores - confirm
# this is intended.
scoring = {'Prec': make_scorer(precision_score, zero_division=0),
           'Bal_Acc': make_scorer(balanced_accuracy_score),
           'Acc': make_scorer(accuracy_score),
           'Recall': make_scorer(recall_score),
           'ROC': make_scorer(roc_auc_score),
           'Matthews_Coef': make_scorer(matthews_corrcoef),
           'Fbeta_0.5': make_scorer(fbeta_score, beta=0.5, zero_division=0),
           'Fbeta_1.5': make_scorer(fbeta_score, beta=1.5, zero_division=0),
           'F1_score': make_scorer(f1_score, zero_division=0)
          }


### Subjects + Loading 'em in

In [None]:
## For Running All Participants' Data in Batch
prefix = 'sub-'
# Subject IDs are the names of the sub-* folders under the preprocessing
# derivatives. Taking the folder's basename (instead of the last 7 characters
# of the path) keeps IDs of any length intact; stray files are skipped.
subjects = sorted(op.basename(s)
                  for s in glob(op.join(source_path, prefix + '*'))
                  if op.isdir(s))
print("n subjects = ", len(subjects))


## Reading in data
# subject ID -> preloaded Epochs; indexed as epochs[subject] by later cells.
epochs = {}
print('Loading Subjects:', subjects)
for subject in subjects:
    raw_path = op.join(source_path, subject, 'eeg')
    # Sorted so the choice is deterministic if a folder holds several files.
    raw_subj = sorted(glob(op.join(raw_path, '*' + epochs_suffix)))
    if not raw_subj:
        # Fail with the offending folder rather than an IndexError from pop().
        raise FileNotFoundError(f'No *{epochs_suffix} file found in {raw_path}')

    epochs[subject] = mne.read_epochs(raw_subj[-1], proj=False, verbose=False, preload=True)

    # Correcting for presentation delay: move the time axis earlier by tshift
    # through the public API instead of editing private time attributes.
    epochs[subject].shift_time(-epoch_p['tshift'], relative=True)


In [None]:
# Tabulate how many epochs each participant contributes and save it as CSV.
# The counts come from the `epochs` dict loaded above. The earlier version
# re-read every file into a variable also named `epochs`, which replaced that
# dict and broke the `epochs[subject]` lookups in the batch loop.
epochlist = []

for subject in subjects:

    num_epochs = len(epochs[subject])

    print(f'Subject {subject} has: {num_epochs}')

    epochlist.append(pd.DataFrame({'subject':subject,
                                   '# epochs':num_epochs}, index=[0]))

epochdf = pd.concat(epochlist)
epochdf.to_csv('participant_epoch_data.csv')

### Batch Loop

In [None]:
%%time
%xmode Verbose

# Making the crossvalidation to be used in the RandomizedSearch
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for subject in subjects:
    print('\n-------\n\033[;40m' + subject + '\033[m')
    
    # Clearing out the saved data for each participant
    data_table = pd.DataFrame()
    data_table_list = []
    
    for contr, conds in contrasts.items():
        print('-------\n\033[94;40m' + contr + '\033[m')
        subj_epochs = epochs[subject][conds]
        
        # Create a list of labels from event codes mapped to event_id
        event_id_rev = dict(zip(subj_epochs.event_id.values(), subj_epochs.event_id.keys()))
        labels_all = [event_id_rev[e] for e in subj_epochs.events[:, 2]]
        labels_all = pd.DataFrame(labels_all)[0].str.split('/', expand=True).rename(columns={0:'Colour', 1:'Orientation', 2:'Type', 3:'Status', 4:'Location'} )
        label_map = {'Target':1, 'Standard':0}
        labels_all['labels'] = labels_all['Status'].map(label_map)
        labels = labels_all['labels']
        
        # Extract data from subj_epochs and vectorize 
        D = subj_epochs.get_data()
        
        # Create train-test split
        X_train, X_test, y_train, y_test = train_test_split(D, 
                                                            labels,
                                                            stratify=labels,
                                                            test_size=cl_p['test_size'], 
                                                            random_state=42,
                                                            shuffle=True
                                                           )

        # Classifier Loop
        for c_name, c in classifiers.items():
            print('-------\nRunning classifier: \033[1;91;40m' + c_name + '\033[m')

            # Making the Pipeline
            clf = Pipeline([('Vectorizer', vectorizer),
                             ('Scaler', scaler),
                             (c_name, c)                                 
                             ])

            # Cross validating
            cv_cv = cross_validate(clf, X_train, y_train, 
                                   cv=cv,
                                   scoring=scoring,
                                   return_train_score=True, # Determines if Training scores are included in .cv_results_
                                   n_jobs=5,
                                   error_score='raise' # For debugging purposes
                                   )

            print('Training Classifier')
            clf = clf.fit(X_train, y_train)

            print('Predicting...')
            y_pred = clf.predict(X_test)

            print('Scoring...')        
            print(classification_report(y_test, y_pred))

            # Confusion Matrix Generation and Visualization within the loop -> saved to csv as "[[TN, FN] [FP, TP]]"
            cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
            cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
            cmd.plot()
            plt.title(subject + '_' + str(contr).replace('/', '_') + '_' + c_name)
            plt.show()

            # Saving CV results to a DataFrame 
            results = pd.DataFrame(cv_cv)

            data_table_list.append(pd.DataFrame({'participant_id': subject,
                                              'Condition': contr,
                                              'Classifier': c_name,

                                              # Confusion_matrix saved in format: [[TN, FN] [FP, TP]]
                                              'Confusion_Matrix': str(cm),                                                 

                                              'CV_Train_Bal_Accuracy': results['train_Bal_Acc'].round(3) * 100,
                                              'CV_Test_Bal_Accuracy': results['test_Bal_Acc'].round(3) * 100,
                                              'Test_Bal_Accuracy': round(balanced_accuracy_score(y_test, y_pred), 3) * 100,

                                              'CV_Train_Accuracy': results['train_Acc'].round(3) * 100,
                                              'CV_Test_Accuracy': results['test_Acc'].round(3) * 100,
                                              'Test_Accuracy': round(accuracy_score(y_test, y_pred), 3) * 100, 

                                              'CV_Train_Precision': results['train_Prec'].round(3) * 100,
                                              'CV_Test_Precision': results['test_Prec'].round(3) * 100,                                                
                                              'Test_Precision': round(precision_score(y_test, y_pred, zero_division=0), 3) * 100,    

                                              'CV_Train_Matthews_coef': results['train_Matthews_Coef'].round(3),
                                              'CV_Test_Matthews_coef': results['test_Matthews_Coef'].round(3),
                                              'Matthews_Coef': round(matthews_corrcoef(y_test, y_pred), 3),

                                              'CV_Train_Recall': results['train_Recall'].round(3) * 100,
                                              'CV_Test_Recall': results['test_Recall'].round(3) * 100,
                                              'Test_recall': round(recall_score(y_test, y_pred), 3) * 100,

                                              'CV_Train_Fbeta_0.5': results['train_Fbeta_0.5'].round(3),
                                              'CV_Train_Fbeta_0.5': results['train_Fbeta_0.5'].round(3),
                                              'Fbeta_0.5': round(fbeta_score(y_test, y_pred, beta = 0.5, zero_division=0), 3),

                                              'CV_Train_Fbeta_1.5': results['train_Fbeta_1.5'].round(3),
                                              'CV_Test_Fbeta_1.5': results['test_Fbeta_1.5'].round(3),
                                              'Fbeta_1.5': round(fbeta_score(y_test, y_pred, beta = 1.5, zero_division=0), 3),

                                              'CV_Train_F1': results['train_F1_score'].round(3),
                                              'CV_Test_F1': results['test_F1_score'].round(3),
                                              'F1_score': round(f1_score(y_test, y_pred, zero_division=0), 3),

                                              'CV_Train_ROC_AUC': results['train_ROC'].round(3),
                                              'CV_Test_ROC_AUC': results['test_ROC'].round(3),                                             
                                              'Test_ROC_AUC': round(roc_auc_score(y_test, y_pred), 3),

                                              'Mean Fit Time': results['fit_time'].round(3),
                                              'Mean Score Time': results['score_time'].round(3)
                                             }, index=[0]
                                            )
                               )

    # Saving Data to CSV Per Participant
    data_table = pd.concat(data_table_list)
    data_table.to_csv(f'[Directory] {str(subject)} new_Data.csv')