In [50]:
import pandas as pd
import numpy as np
import warnings

# Load the ERP dataset and split it into a feature frame (`data`), the class
# labels (`targets`) and the list of feature names (`labels`).
erp_raw = pd.read_csv('../data/ERP_data.csv')

# 'Phenotype' is the class label to predict.
targets = erp_raw['Phenotype']

# 'Subject' and 'Phenotype' are not features; drop them by name and build a new
# frame rather than mutating the loaded one in place.
data = erp_raw.drop(['Subject', 'Phenotype'], axis=1)

# Take the feature names from the feature frame itself so that `labels[j]`
# always names column j of `data`, whatever the column order in the CSV is.
# (Deleting the first two names by position only works if 'Subject' and
# 'Phenotype' happen to be the first two columns.)
labels = list(data.columns)

In [51]:
from sklearn import preprocessing
from sklearn import feature_selection
from sklearn import cross_validation

# Number of cross-validation folds.
folds = 10

# Fill missing values with an Imputer in its default configuration, then
# standardise the features. From here on `data` is the array returned by
# `preprocessing.scale`, no longer the loaded DataFrame.
# NOTE(review): both steps are fitted on the complete dataset, i.e. before any
# train/test split, so the statistics they learn include the rows that are later
# held out for testing. Consider moving them into the per-model pipelines.
imp = preprocessing.Imputer()
data = preprocessing.scale(imp.fit_transform(data, targets))


def _anova_top_k(k):
    """Return an unfitted selector keeping the `k` best features by ANOVA F-score."""
    return feature_selection.SelectKBest(feature_selection.f_classif, k=k)


# One univariate selector per downstream classifier, each with its own k.
# NOTE(review): the k values look like the result of an earlier tuning step that
# is not shown here - confirm.
anova_selection_logreg = _anova_top_k(21)
anova_selection_svm = _anova_top_k(16)
anova_selection_neighbors = _anova_top_k(22)
anova_selection_rf = _anova_top_k(11)
anova_selection_gboost = _anova_top_k(12)

In [52]:
from sklearn import linear_model, svm, neighbors, ensemble, naive_bayes
from sklearn import pipeline
from sklearn import metrics

# Seed shared by the randomised estimators below so that a re-run of the
# notebook rebuilds the same forests / boosted ensembles.
RANDOM_STATE = 0

# Level-0 (base) classifiers.
# NOTE(review): the hyper-parameters look like the result of an earlier tuning
# step that is not shown here - confirm.
logreg_model = linear_model.LogisticRegression(C=.001)
svm_model = svm.SVC(C=0.66)
neighbors_model = neighbors.KNeighborsClassifier(n_neighbors=13)
naive_bayes_model = naive_bayes.GaussianNB()
rf_model = ensemble.RandomForestClassifier(
    n_estimators=20, min_samples_split=3, min_samples_leaf=2,
    random_state=RANDOM_STATE)
gboost_model = ensemble.GradientBoostingClassifier(
    min_samples_split=35, learning_rate=0.5, n_estimators=110,
    random_state=RANDOM_STATE)

# Level-1 (stacking) candidates that combine the base models' predictions.
stack_logreg_model = linear_model.LogisticRegression(C=0.1)
stack_gboost_model = ensemble.GradientBoostingClassifier(random_state=RANDOM_STATE)

# Naive Bayes uses the same number of features as k-NN, but gets its own
# selector object: a single estimator instance placed in two pipelines would be
# refitted by whichever pipeline was trained last, silently changing the other.
anova_selection_naive_bayes = feature_selection.SelectKBest(
    feature_selection.f_classif, k=anova_selection_neighbors.k)

# Each base model is preceded by its own ANOVA feature selector. Because the
# selector is part of the pipeline it is refitted on whatever data the pipeline
# is fitted on.
pipes = [
    pipeline.make_pipeline(anova_selection_logreg, logreg_model),
    pipeline.make_pipeline(anova_selection_svm, svm_model),
    pipeline.make_pipeline(anova_selection_neighbors, neighbors_model),
    pipeline.make_pipeline(anova_selection_naive_bayes, naive_bayes_model),
    pipeline.make_pipeline(anova_selection_rf, rf_model),
    pipeline.make_pipeline(anova_selection_gboost, gboost_model),
]

# Encoder used to turn predicted class names into integer meta-features.
le = preprocessing.LabelEncoder()

# Repeated stratified k-fold evaluation of the stacked ensemble.
# Each list below receives one entry per repetition; the entry is computed from
# the predictions pooled over all folds of that repetition.
accuracies = []
precisions = []
recalls = []
fscores = []
confusions = []

# Installs a global "ignore" filter: no warnings are displayed from here on.
warnings.filterwarnings("ignore")

n_repeats = 200

# Fit the encoder once, on the true labels, so every base model's predictions
# share one class -> integer mapping. Refitting it on each prediction vector
# renumbers the classes whenever a model predicts only one of them.
le.fit(targets)

# Plain array view of the labels: `train` / `test` are positional indices, so
# index positionally instead of relying on the Series index.
y_all = np.asarray(targets)

for i in range(n_repeats):
    # Seeding the shuffle with the repetition index gives every repetition a
    # different partition while keeping the whole experiment repeatable.
    cross_val = cross_validation.StratifiedKFold(
        targets, n_folds=folds, shuffle=True, random_state=i)
    y_truth = []
    y_preds = []
    for train, test in cross_val:
        X_train, y_train, y_test = data[train], y_all[train], y_all[test]

        # Level 0: fit every base pipeline on the training rows, predict all
        # rows, and encode the predictions. Result: one column per base model.
        pred_features = np.transpose(
            [le.transform(clf.fit(X_train, y_train).predict(data)) for clf in pipes])
        y_truth.extend(y_test)

        # Level 1: fit the meta-model on the training rows of the prediction
        # matrix and predict the held-out rows.
        # NOTE(review): the training rows of `pred_features` are predictions the
        # base models made for the very samples they were fitted on. Consider
        # out-of-fold predictions for training the meta-model.
        preds = stack_logreg_model.fit(pred_features[train], y_train).predict(pred_features[test])
        y_preds.extend(preds)

    # Score the pooled out-of-fold predictions of this repetition; 'AD' is the
    # positive class for precision / recall / F1.
    accuracy = metrics.accuracy_score(y_truth, y_preds)
    precision, recall, fscore, support = metrics.precision_recall_fscore_support(
        y_truth, y_preds, average='binary', pos_label='AD')
    confusion = metrics.confusion_matrix(y_truth, y_preds, labels=['HC', 'AD'])
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    fscores.append(fscore)
    confusions.append(confusion)

# Summary over all repetitions: each line pairs a format template with the
# values substituted into it. Scalar metrics are averaged over repetitions; the
# confusion matrices are averaged element-wise (axis=0).
summary_lines = [
    ("Accuracy: {0}, with std: {1}", (np.mean(accuracies), np.std(accuracies))),
    ("Precision: {0}", (np.mean(precisions),)),
    ("Recall: {0}", (np.mean(recalls),)),
    ("F1 Score: {0}", (np.mean(fscores),)),
    ("Confusion Matrix:\n   HC     AD\n{0}", (np.mean(confusions, axis=0),)),
]
for template, values in summary_lines:
    print(template.format(*values))

Accuracy: 0.6856532663316582, with std: 0.024186976831265118
Precision: 0.7160774065244155
Recall: 0.6111616161616162
F1 Score: 0.6589781958656528
Confusion Matrix:
   HC     AD
[[ 75.94   24.06 ]
 [ 38.495  60.505]]
