---
# Intertrial-Variability: Classification and Correlation

In [None]:
import sys; sys.path.insert(1, '../')
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from hyperopt import hp, Trials, fmin, tpe, STATUS_OK
import os
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from pathlib import Path
from scripts.util import hyperopt_train_test, f

---
## Define Functions

In [None]:
# Hyperparameter search space handed to hyperopt's fmin: each value is
# sampled uniformly between 0.0001 and 100.
# NOTE(review): the classifiers in this notebook are built with
# kernel='linear'; per the scikit-learn SVC docs gamma only applies to the
# rbf/poly/sigmoid kernels - confirm whether it needs to be searched at all.
space4svm = dict(
    C=hp.uniform('C', 0.0001, 100),
    gamma=hp.uniform('gamma', 0.0001, 100),
)

# scoring helper used by the hyperparameter search
def hyperopt_train_test(params):
    """Return the mean leave-one-out accuracy of a linear SVC built from `params`.

    `params` is unpacked as keyword arguments into `SVC`. The training data
    is not passed in: the function reads the module-level names `cv_set`
    (features) and `y_cv` (labels), so both must be set before it is called.
    """
    # the scaler is a pipeline step so that it is fitted as part of the
    # cross-validation instead of on the full cv_set beforehand
    svm_pipeline = make_pipeline(StandardScaler(), SVC(kernel='linear', **params))
    fold_scores = cross_val_score(
        svm_pipeline, cv_set, y_cv, cv=LeaveOneOut(), scoring='accuracy'
    )
    return fold_scores.mean()
    
def f(params):
    """Objective for the hyperparameter search.

    The loss is the negated accuracy from `hyperopt_train_test`, so that
    minimising the loss maximises the accuracy.
    """
    return {'loss': -hyperopt_train_test(params), 'status': STATUS_OK}

## Read Feature Table

In [None]:
# load the feature table from ../processed and show its first rows
df = pd.read_csv(Path("..", "processed", "dataframe_asd_2.csv"))
df.head()

In [None]:
# empty accuracy table: one column per single metric, plus a final column
# for the classifier that uses all metrics together
results_eval = pd.DataFrame(
    columns=[
        'ITV_ROI',
        'ETV-slope',
        'ETV-variability',
        'ETV-variability_detrended',
        'ITV-ratio',
    ] + ['All Metrics']
)

# Classification

In [None]:
# Settings shared by the classification cells below.
# n_crossvals: number of times the complete procedure (hyperparameter search
# followed by leave-one-out evaluation) is repeated for each feature set.
n_crossvals = 20  # how often the hyperparameter optimization + LOO CV is repeated
# max_evals: evaluation budget passed to hyperopt's fmin for every search.
max_evals = 60  # Number of iterations for hyperparameter optimization

## Get accuracies for all metrics separately

In [None]:
# classification labels: 0 for the 'Control' group, 1 for everyone else
y = np.array([0 if i == 'Control' else 1 for i in df['Group']])
# metrics to get accuracy from
metric = ['ITV_ROI','ETV-slope','ETV-variability','ETV-variability_detrended','ITV-ratio']

# one list of n_crossvals accuracy scores per metric
evals = []

# loop through all metrics
for m in metric:
    # accuracy of every repetition for this metric
    metric_eval = []
    # feature values of this metric, one entry per participant
    X = df[m].to_numpy()
    # repeat the nested leave-one-out procedure n_crossvals times
    for i in range(n_crossvals):
        # 1 = correct / 0 = wrong prediction for every held-out participant
        evaluation_set = []
        for evalu_index in range(len(X)):
            # training fold: every participant except the held-out one.
            # cv_set and y_cv have to be module-level names and have to stay
            # unscaled, because hyperopt_train_test reads them directly and
            # does its own scaling inside its pipeline.
            cv_set = np.delete(X, evalu_index).reshape(-1, 1)
            y_cv = np.delete(y, evalu_index)
            # held-out participant as a single-row feature matrix
            evalu = X[evalu_index].reshape(1, -1)
            y_true = y[evalu_index]

            # get best parameters
            trials = Trials()
            best = fmin(f, space4svm, algo=tpe.suggest, max_evals=max_evals, trials=trials)

            # Scaler + classifier as one pipeline: the scaler is fitted on the
            # training fold only and then applied to the held-out sample.
            # (Previously the results of fit_transform/transform were
            # discarded, so the SVC was trained and tested on unscaled data.)
            clf = make_pipeline(StandardScaler(), SVC(kernel='linear', **best))
            clf.fit(cv_set, y_cv)
            y_pred = clf.predict(evalu)
            # save whether the prediction matches the true label
            evaluation_set.append(int(y_pred[0] == y_true))
        # accuracy over all folds of this repetition
        eval_score = sum(evaluation_set) / len(evaluation_set)
        metric_eval.append(eval_score)
    evals.append(metric_eval)
    # save median over the repetitions of each metric's loo-cv
    results_eval.loc['LOO Score (Eval)', m] = np.median(metric_eval)

## Get accuracies for all metrics combined & average distances to hyperplane

In [None]:
# do the same procedure as above but for all metrics instead of just one
# also accumulate the decision-function values (distances) of the training folds
y = np.array([0 if i == 'Control' else 1 for i in df['Group']])
metric = ['ITV_ROI','ETV-slope','ETV-variability','ETV-variability_detrended','ITV-ratio']
# running sum of decision-function values per participant
results_distances = np.zeros([len(y)])
X = df[metric].to_numpy()
participant_indices = np.arange(len(y))

# one accuracy score per repetition
evals_all = []
for i in range(n_crossvals):
    # reset for every repetition so that each score only reflects its own
    # leave-one-out run (previously the list kept growing across repetitions,
    # which made every score a cumulative average)
    evaluation_set = []
    for evalu_index in range(len(X)):
        # training fold: every participant except the held-out one.
        # cv_set and y_cv have to be module-level names and have to stay
        # unscaled, because hyperopt_train_test reads them directly and does
        # its own scaling inside its pipeline.
        train_idx = np.delete(participant_indices, evalu_index)
        cv_set = X[train_idx]
        y_cv = y[train_idx]
        y_true = y[evalu_index]

        # get best parameters
        trials = Trials()
        best = fmin(f, space4svm, algo=tpe.suggest, max_evals=max_evals, trials=trials)

        # Scaler + classifier as one pipeline: the scaler is fitted on the
        # training fold only and then applied to whatever is passed in later.
        # (Previously the results of fit_transform/transform were discarded,
        # so the SVC was trained and tested on unscaled data.)
        clf = make_pipeline(StandardScaler(), SVC(kernel='linear', **best))
        clf.fit(cv_set, y_cv)
        # get distances of cv set
        dist = clf.decision_function(cv_set)
        # add distances to the running sum of the participants in this fold
        results_distances[train_idx] += dist
        # predict the held-out participant
        evalu = X[evalu_index].reshape(1, -1)
        y_pred = clf.predict(evalu)
        evaluation_set.append(int(y_pred[0] == y_true))

    # accuracy over all folds of this repetition
    eval_score = sum(evaluation_set) / len(evaluation_set)
    evals_all.append(eval_score)

# save median over the repetitions
results_eval.loc['LOO Score (Eval)', 'All Metrics'] = np.median(evals_all)

# Hyperplane Distance

* Calculate the distance between each participant and the hyperplane of the SVM

In [None]:
# get mean distances & save to dataframe
# divide the summed distances by the number of times they were accumulated:
# each participant is part of the training fold in (len(y) - 1) leave-one-out
# splits per repetition, and the procedure is repeated n_crossvals times
# (use n_crossvals instead of a hard-coded 20 so the mean stays correct when
# the setting is changed)
final_dists = results_distances / ((len(y) - 1) * n_crossvals)

results_dists = pd.DataFrame(columns=['Distance'])
for sub_idx, sub in enumerate(df['Subject']):
    results_dists.loc[sub, 'Distance'] = final_dists[sub_idx]


# Classification Results:
print("Classification Results")
display(results_eval)

print("Distances")
display(results_dists)

# save all accuracies df as csv
# results_eval.to_csv('results_eval_acc.csv')
# save distances
# results_dists.to_csv('results_dists_34.csv')

# Correlation between metrics and AQ/EQ

In [None]:
# metrics and quotients that are correlated pairwise
metric = ['ITV_ROI','ETV-slope','ETV-variability','ETV-variability_detrended','ITV-ratio']
Q = ['AQ','EQ']

# one column per metric/quotient pair (e.g. 'ITV_ROI/AQ'); the rows 'r' and
# 'p' are added while the table is filled
results_correlations = pd.DataFrame(
    columns=[f'{name}/{quot_name}' for name in metric for quot_name in Q]
)

for m in metric:
    for q in Q:
        # Pearson correlation of the metric with AQ or EQ
        coef, p_value = pearsonr(df[m], df[q])
        pair = f'{m}/{q}'
        results_correlations.loc['r', pair] = coef
        results_correlations.loc['p', pair] = p_value

display(results_correlations)

# Correlation between Hyperplane distance and AQ/EQ

In [None]:
# quotients that the hyperplane distances are correlated with
quotient = ['AQ', 'EQ']

# result table with one column per quotient; rows 'r' and 'p' are added below
results_dist_correlation = pd.DataFrame(columns=[f'Distance/{name}' for name in quotient])

# the distances do not change between quotients, so look them up once
dist = results_dists['Distance']

for q in quotient:
    # Pearson correlation between the distances and the quotient
    coef, p_value = pearsonr(dist, df[q])
    results_dist_correlation.loc['r', f'Distance/{q}'] = coef
    results_dist_correlation.loc['p', f'Distance/{q}'] = p_value
display(results_dist_correlation)