In [13]:
import pandas as pd
import numpy as np
import warnings

data = pd.read_csv('../data/ERP_data.csv')
labels = list(data.columns.values)
del labels[0]
del labels[0]

targets = data['Phenotype']
del data['Subject']
del data['Phenotype']

In [14]:
from sklearn import preprocessing
from sklearn import feature_selection
from sklearn import cross_validation

folds = 10

imp = preprocessing.Imputer()
data = imp.fit_transform(data, targets)
data = preprocessing.scale(data)

big_feature_data = feature_selection.SelectKBest(feature_selection.f_classif, k=22).fit_transform(data, targets)
med_feature_data = feature_selection.SelectKBest(feature_selection.f_classif, k=16).fit_transform(data, targets)
small_feature_data = feature_selection.SelectKBest(feature_selection.f_classif, k=12).fit_transform(data, targets)

In [15]:
from sklearn import linear_model, svm, neighbors, ensemble, naive_bayes, discriminant_analysis
from sklearn import pipeline
from sklearn import metrics

logreg_model = linear_model.LogisticRegression(C=.001)
svm_model = svm.SVC(C=0.66)
neighbors_model = neighbors.KNeighborsClassifier(n_neighbors=13)
naive_bayes_model = naive_bayes.GaussianNB()
lda_model = discriminant_analysis.LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.2)
rf_model = ensemble.RandomForestClassifier(n_estimators=20, min_samples_split=3, min_samples_leaf=2)
adaboost_model = ensemble.AdaBoostClassifier()
gboost_model = ensemble.GradientBoostingClassifier(min_samples_split=35, learning_rate=0.5)
stack_logreg_model = linear_model.LogisticRegression(C=0.1)
# stack_gboost_model = ensemble.GradientBoostingClassifier()

pairs = []
pairs.append((big_feature_data, logreg_model))
pairs.append((med_feature_data, svm_model))
pairs.append((big_feature_data, neighbors_model))
pairs.append((big_feature_data, naive_bayes_model))
pairs.append((big_feature_data, lda_model))
pairs.append((small_feature_data, rf_model))
pairs.append((small_feature_data, adaboost_model))
pairs.append((small_feature_data, gboost_model))
le = preprocessing.LabelEncoder()

accuracies = []
precisions = []
recalls = []
fscores = []
confusions = []
warnings.filterwarnings("ignore")
for i in range(200):
    cross_val = cross_validation.StratifiedKFold(targets, n_folds=folds, shuffle=True)
    y_truth = []
    y_preds = []
    for train, test in cross_val:
        y_train, y_test = targets[train], targets[test]
        pred_features = np.transpose([le.fit_transform(clf.fit(dataset[train], y_train).predict(dataset)) 
                                      for dataset, clf in pairs])
        y_truth.extend(y_test)
        preds = stack_logreg_model.fit(pred_features[train], y_train).predict(pred_features[test])
        y_preds.extend(preds)
    
    accuracy = metrics.accuracy_score(y_truth, y_preds)
    precision, recall, fscore, support = metrics.precision_recall_fscore_support(
        y_truth, y_preds, average='binary', pos_label='AD')
    confusion = metrics.confusion_matrix(y_truth, y_preds, labels=['HC', 'AD'])
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    fscores.append(fscore)
    confusions.append(confusion)

print("Accuracy: {0}, with std: {1}".format(np.mean(accuracies), np.std(accuracies)))
print("Precision: {0}".format(np.mean(precisions)))
print("Recall: {0}".format(np.mean(recalls)))
print("F1 Score: {0}".format(np.mean(fscores)))
print("Confusion Matrix:\n   HC     AD\n{0}".format(np.mean(confusions, axis=0)))

Accuracy: 0.736859296482412, with std: 0.01702752159014164
Precision: 0.7520812452210499
Recall: 0.7034848484848485
F1 Score: 0.7267108922193721
Confusion Matrix:
   HC     AD
[[ 76.99   23.01 ]
 [ 29.355  69.645]]
