## Leave one out classification
This file runs a leave-one-out classification in each of the clusters to check whether the discriminative performance within each cluster is better than the classification performance on the whole dataset.

The process goes like this: given a fixed clustering, we perform leave-one-out classification tests in each of the clusters, for the following problems and with the following classifiers:
* In the AD/LMCI, CN/AD and LMCI/CN tasks.
* Using a linear SVM, a decision tree, Gaussian naive Bayes and an RBF SVM.

We use leave one out validation because other forms of validation, such as 10-fold CV, would not work, as we do not have enough data in some of the clusters/problems to work with.

This is not very informative on its own, because the sample sizes of the labels are very disparate. The classes need to be weighted in some way.

In [1]:
# Include and load packages, config files

import numpy as np
import simlr_ad
import pandas as pd
from utils.data_utils import load_all_data
from utils.utils import compute_simlr, feat_ranking

# Parameters of the procedure
clusters = 3                                            # Number of clusters (passed to compute_simlr, iterated as 1..clusters below)
rd_seed = 1714                                          # Random seed for experiment replication

# Paths
existing_cluster = True                               # Compute the clustering again or use an existing one
cluster_path = "results/extendeddata_cluster/cluster_data.csv"   # Path of the existing cluster, if applicable
covariate_path = "data/useddata_homo_abeta_plasma_meta.csv"                 # Path of the covariate data frame (.csv)
feature_path = "data/UCSDVOL.csv"                     # Path of the feature data frame (.csv)

# Load both tables together with the names of their columns of interest.
covariate_data, cov_names, feature_data, feature_names = load_all_data(covariate_path, feature_path)
# Attach the diagnosis label (DX_bl, presumably the baseline diagnosis) so the
# classification cells can select samples by label.
# Assumes both frames hold the same subjects in the same row order - TODO confirm.
feature_data['DX'] = covariate_data.DX_bl.values

if existing_cluster:
    # Load existent
    c_data = pd.read_csv(cluster_path)
else:
    # Compute base clustering.
    # The original code passed `covariate_data_new`, which is never defined
    # and raised NameError; the covariates loaded above are what is available.
    y_b, S, F, ydata, alpha = compute_simlr(
        np.array(covariate_data[cov_names]), clusters)
    # NOTE(review): the cells below read `c_data.C` with labels 1..clusters,
    # but this branch never builds `c_data`. Presumably `y_b` holds the
    # cluster assignments - confirm its format (0- or 1-based) and build
    # `c_data` from it before relying on this branch.



We need to define two loops:
* For each cluster,
* For each possible problem in the cluster: AD/LMCI, CN/AD and LMCI/CN

And, in each of the iterations, do a leave-one-out classification procedure with each of the classifiers:
* linear SVM
* decision tree
* Gaussian naive Bayes
* RBF SVM

In [13]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_validate
from sklearn import linear_model, svm, ensemble
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

def specificity_score(y_true, y_pred):
    """Return the specificity TN / (TN + FP) of a binary prediction.

    Labels must be 0 and 1; label 0 is the negative class. The confusion
    matrix is always built for labels [0, 1], so a label that is absent
    from the inputs still gets a row/column.
    """
    # First row of the matrix: samples whose true label is 0.
    (true_neg, false_pos), _ = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return true_neg / (false_pos + true_neg)

def sensitivity_score(y_true, y_pred):
    """Return the sensitivity TP / (TP + FN) of a binary prediction.

    Labels must be 0 and 1; label 1 is the positive class. The confusion
    matrix is always built for labels [0, 1], so a label that is absent
    from the inputs still gets a row/column.
    """
    # Second row of the matrix: samples whose true label is 1.
    false_neg, true_pos = confusion_matrix(y_true, y_pred, labels=[0, 1])[1]
    return true_pos / (true_pos + false_neg)

def non_score(y_true, y_pred):
    """Return the first predicted label, ignoring `y_true`.

    This is not a real metric: it is wrapped with `make_scorer` so that the
    prediction made on a held-out sample can be read back from the
    cross-validation results.
    """
    first_prediction = y_pred[0]
    return first_prediction

# With leave-one-out every test fold holds exactly one sample, so a "scorer"
# that returns y_pred[0] makes cross_validate hand back the prediction for
# each sample under the key 'test_pred'.
scoring_dict = {'pred': make_scorer(non_score)}

# Classification problems as (label coded 0, label coded 1). Sensitivity is
# therefore the recall of the second label and specificity that of the first.
probs = [('AD', 'LMCI'), ('CN', 'AD'), ('LMCI', 'CN')]

# (display name, estimator) pairs evaluated on every problem. cross_validate
# clones the estimator for each fold, so the instances can be shared.
# rd_seed fixes the estimators that have internal randomness so that a re-run
# reproduces the same numbers.
classifiers = [
    ('linear SVM',
     svm.LinearSVC(class_weight='balanced', random_state=rd_seed)),
    ('DecisionTreeClassifier',
     DecisionTreeClassifier(max_depth=5, class_weight='balanced',
                            random_state=rd_seed)),
    ('Naive Bayes', GaussianNB()),
    ('RBF SVM', svm.SVC(class_weight='balanced')),
]

loo = LeaveOneOut()

for c in range(1, clusters + 1):
    # Select data clusters
    data_c = feature_data[c_data.C.values == c]
    for p in probs:
        print('Results for cluster: ' + str(c))
        print('For the classification problem of ' + p[0] + ' vs ' + p[1])
        # Feature rows of each of the two labels of this problem
        x_1 = data_c[data_c.DX.values == p[0]]
        x_2 = data_c[data_c.DX.values == p[1]]
        x_1 = x_1[feature_names].values.tolist()
        x_2 = x_2[feature_names].values.tolist()
        X = x_1 + x_2
        print(p[0] + ' samples: ' + str(len(x_1)))
        print(p[1] + ' samples: ' + str(len(x_2)))
        # p[0] samples are labelled 0 and p[1] samples 1, in the order of X
        Y = np.concatenate((np.zeros(len(x_1), dtype=np.float64),
                            np.ones(len(x_2), dtype=np.float64)))

        for clf_name, clf in classifiers:
            scores = cross_validate(clf, X, Y, cv=loo, scoring=scoring_dict,
                                    return_train_score=False)
            # One entry per sample: the label predicted when it was held out
            predictions = scores['test_pred']

            acc = accuracy_score(Y, predictions)
            sens = sensitivity_score(Y, predictions)
            spec = specificity_score(Y, predictions)

            print('Accuracy of ' + clf_name + ': ' + str(acc))
            print('Sensitivity of ' + clf_name + ': ' + str(sens))
            print('Specifitity of ' + clf_name + ': ' + str(spec))


Results for cluster: 1
For the classification problem of AD vs LMCI
AD samples: 20
LMCI samples: 68
Accuracy of linear SVM: 0.7272727272727273
Sensitivity of linear SVM: 0.75
Specifitity of linear SVM: 0.65
Accuracy of DecisionTreeClassifier: 0.7272727272727273
Sensitivity of DecisionTreeClassifier: 0.8088235294117647
Specifitity of DecisionTreeClassifier: 0.45
Accuracy of Naive Bayes: 0.7613636363636364
Sensitivity of Naive Bayes: 0.8382352941176471
Specifitity of Naive Bayes: 0.5
Accuracy of RBF SVM: 0.7272727272727273
Sensitivity of RBF SVM: 0.8088235294117647
Specifitity of RBF SVM: 0.45
Results for cluster: 1
For the classification problem of CN vs AD
CN samples: 24
AD samples: 20
Accuracy of linear SVM: 0.8181818181818182
Sensitivity of linear SVM: 0.8
Specifitity of linear SVM: 0.8333333333333334
Accuracy of DecisionTreeClassifier: 0.7954545454545454
Sensitivity of DecisionTreeClassifier: 0.8
Specifitity of DecisionTreeClassifier: 0.7916666666666666
Accuracy of Naive Bayes: 0.84

In [14]:
## For all the clusters
# Same leave-one-out evaluation as the per-cluster cell, run on the whole
# data set so the results can be compared against each cluster.

# Classification problems as (label coded 0, label coded 1). Sensitivity is
# therefore the recall of the second label and specificity that of the first.
probs = [('AD', 'LMCI'), ('CN', 'AD'), ('LMCI', 'CN')]

# (display name, estimator) pairs evaluated on every problem. cross_validate
# clones the estimator for each fold, so the instances can be shared.
# rd_seed fixes the estimators that have internal randomness so that a re-run
# reproduces the same numbers.
classifiers = [
    ('linear SVM',
     svm.LinearSVC(class_weight='balanced', random_state=rd_seed)),
    ('DecisionTreeClassifier',
     DecisionTreeClassifier(max_depth=5, class_weight='balanced',
                            random_state=rd_seed)),
    ('Naive Bayes', GaussianNB()),
    ('RBF SVM', svm.SVC(class_weight='balanced')),
]

data_c = feature_data
# Print the number of clusters explicitly; the original relied on the loop
# variable `c` leaked from the per-cluster cell, which only happened to equal
# `clusters` and is undefined if that cell has not been run.
print('Results for all the clusters: ' + str(clusters))

loo = LeaveOneOut()

for p in probs:
    print('For the classification problem of ' + p[0] + ' vs ' + p[1])
    # Feature rows of each of the two labels of this problem
    x_1 = data_c[data_c.DX.values == p[0]]
    x_2 = data_c[data_c.DX.values == p[1]]
    x_1 = x_1[feature_names].values.tolist()
    x_2 = x_2[feature_names].values.tolist()
    X = x_1 + x_2
    print(p[0] + ' samples: ' + str(len(x_1)))
    print(p[1] + ' samples: ' + str(len(x_2)))
    # p[0] samples are labelled 0 and p[1] samples 1, in the order of X
    Y = np.concatenate((np.zeros(len(x_1), dtype=np.float64),
                        np.ones(len(x_2), dtype=np.float64)))

    for clf_name, clf in classifiers:
        scores = cross_validate(clf, X, Y, cv=loo, scoring=scoring_dict)
        # One entry per sample: the label predicted when it was held out
        predictions = scores['test_pred']

        acc = accuracy_score(Y, predictions)
        sens = sensitivity_score(Y, predictions)
        spec = specificity_score(Y, predictions)

        print('Accuracy of ' + clf_name + ': ' + str(acc))
        print('Sensitivity of ' + clf_name + ': ' + str(sens))
        print('Specifitity of ' + clf_name + ': ' + str(spec))


Results for all the clusters: 3
For the classification problem of AD vs LMCI
AD samples: 85
LMCI samples: 161
Accuracy of linear SVM: 0.6341463414634146
Sensitivity of linear SVM: 0.6645962732919255
Specifitity of linear SVM: 0.5764705882352941
Accuracy of DecisionTreeClassifier: 0.5528455284552846
Sensitivity of DecisionTreeClassifier: 0.5962732919254659
Specifitity of DecisionTreeClassifier: 0.47058823529411764
Accuracy of Naive Bayes: 0.6585365853658537
Sensitivity of Naive Bayes: 0.7577639751552795
Specifitity of Naive Bayes: 0.47058823529411764
Accuracy of RBF SVM: 0.6382113821138211
Sensitivity of RBF SVM: 0.6770186335403726
Specifitity of RBF SVM: 0.5647058823529412
For the classification problem of CN vs AD
CN samples: 52
AD samples: 85
Accuracy of linear SVM: 0.8467153284671532
Sensitivity of linear SVM: 0.8470588235294118
Specifitity of linear SVM: 0.8461538461538461
Accuracy of DecisionTreeClassifier: 0.8321167883211679
Sensitivity of DecisionTreeClassifier: 0.83529411764705