In [44]:
import pandas as pd
import numpy as np
from IPython.display import HTML
import warnings

# Load the ERP table. Two columns are not features: 'Subject' is dropped
# outright and 'Phenotype' is kept separately as the classification target
# (the evaluation cell scores it with the class names 'HC' and 'AD').
raw = pd.read_csv('../data/ERP_data.csv')

# Target column, one entry per row of the table.
targets = raw['Phenotype']

# Feature matrix: everything except the two non-feature columns. Dropping by
# name returns a new frame, so the loaded table is not mutated.
data = raw.drop(['Subject', 'Phenotype'], axis=1)

# Feature names are read from the frame *after* the drop, so labels[i] always
# names data column i regardless of where 'Subject'/'Phenotype' sit in the CSV
# (previously the first two names were removed by position).
labels = list(data.columns)

In [45]:
from sklearn import preprocessing, feature_selection, cross_validation, linear_model

# --- Tunables ---------------------------------------------------------------
folds = 10                  # stratified CV folds (reused by the evaluation cell)
n_anova_features = 21       # features kept by the univariate ANOVA filter
n_rfe_repeats = 50          # independent RFECV runs whose selections are averaged
selection_threshold = 0.5   # keep a feature if chosen in at least this share of runs

# NOTE(review): imputation, scaling and both selection stages below are fitted
# on every row of `data`. Any cross-validated score computed afterwards on the
# reduced matrix has therefore already "seen" its held-out rows during
# preprocessing/selection and is likely optimistic. Wrapping these steps in a
# Pipeline evaluated inside the CV loop would remove that leakage.

# --- Preprocessing ----------------------------------------------------------
# Fill missing values with Imputer's default strategy, then standardise each
# column with preprocessing.scale. Both return plain NumPy arrays.
imp = preprocessing.Imputer()
data = imp.fit_transform(data, targets)
data = preprocessing.scale(data)

# --- Stage 1: univariate ANOVA F-test filter ---------------------------------
anova_selection = feature_selection.SelectKBest(
    feature_selection.f_classif, k=n_anova_features)
features_selected = anova_selection.fit(data, targets).get_support(indices=True)
# Keep the names of the surviving columns (the Series index holds the original
# column positions).
labels = pd.Series(labels)[features_selected]
print(labels)
data = anova_selection.transform(data)
data = pd.DataFrame(data, columns=labels)

# --- Stage 2: repeated recursive feature elimination -------------------------
clf = linear_model.LogisticRegression(C=.001)

features_selected = []
for i in range(n_rfe_repeats):
    # A distinct but fixed seed per repetition: each run shuffles the folds
    # differently, yet the whole cell gives the same answer on every re-run.
    cross_val = cross_validation.StratifiedKFold(
        targets, n_folds=folds, shuffle=True, random_state=i)
    rfecv = feature_selection.RFECV(clf, cv=cross_val)
    rfecv.fit(data, targets)
    features_selected.append(rfecv.get_support())

# Fraction of runs in which each feature was retained by RFECV.
feature_probs = np.mean(np.asarray(features_selected).astype(float), axis=0)
print(feature_probs)
features_selected = (feature_probs >= selection_threshold).nonzero()[0]
# Re-index the names 0..k-1 so they can be picked by position, then reduce the
# frame to the columns that passed the vote.
labels = pd.Series(np.asarray(labels))[features_selected]
print(labels)
data = data[labels]

4         P200DAFz
22         P3aDACz
32         P3bTAPz
34         P3aDAPz
40        P200DAF3
58         P3aDAP3
64        P200DAF4
82         P3aDAP4
87     N100SAvgAFz
92     P200DAvgAFz
96      P3aDAvgAFz
100    N100SAvgACz
109     P3aDAvgACz
122     P3aDAvgAPz
131    P200DAvgAF3
135     P3aDAvgAF3
148     P3aDAvgAP3
157    P200DAvgAF4
161     P3aDAvgAF4
174     P3aDAvgAP4
175        Missing
dtype: object
[ 0.5   1.    0.82  1.    1.    1.    1.    1.    1.    0.98  0.7   1.    1.
  1.    1.    0.92  1.    1.    1.    1.    1.  ]
0        P200DAFz
1         P3aDACz
2         P3bTAPz
3         P3aDAPz
4        P200DAF3
5         P3aDAP3
6        P200DAF4
7         P3aDAP4
8     N100SAvgAFz
9     P200DAvgAFz
10     P3aDAvgAFz
11    N100SAvgACz
12     P3aDAvgACz
13     P3aDAvgAPz
14    P200DAvgAF3
15     P3aDAvgAF3
16     P3aDAvgAP3
17    P200DAvgAF4
18     P3aDAvgAF4
19     P3aDAvgAP4
20        Missing
dtype: object


In [46]:
from sklearn import metrics

# Repeated stratified cross-validation of `clf` on the selected features.
# Each repetition reshuffles the folds, collects out-of-fold predictions for
# every row, and scores them against `targets`; the metrics are then averaged
# over all repetitions.
n_eval_repeats = 200

accuracies = []
precisions = []
recalls = []
fscores = []
confusions = []

# NOTE(review): this silences *all* warnings for the rest of the session, not
# just the ones raised inside the loop below.
warnings.filterwarnings("ignore")
for i in range(n_eval_repeats):
    # Seed each repetition with its index so the splits differ between
    # repetitions but the reported numbers are reproducible across re-runs.
    cross_val = cross_validation.StratifiedKFold(
        targets, n_folds=folds, shuffle=True, random_state=i)
    preds = cross_validation.cross_val_predict(clf, data, targets, cv=cross_val)

    accuracy = metrics.accuracy_score(targets, preds)
    # 'AD' is treated as the positive class; the fourth return value (support)
    # is not used, so only the first three are unpacked.
    precision, recall, fscore = metrics.precision_recall_fscore_support(
        targets, preds, average='binary', pos_label='AD')[:3]
    # Rows are true classes and columns are predicted classes, ordered HC, AD.
    confusion = metrics.confusion_matrix(targets, preds, labels=['HC', 'AD'])

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    fscores.append(fscore)
    confusions.append(confusion)

print("Accuracy: {0}, with std: {1}".format(np.mean(accuracies), np.std(accuracies)))
print("Precision: {0}".format(np.mean(precisions)))
print("Recall: {0}".format(np.mean(recalls)))
print("F1 Score: {0}".format(np.mean(fscores)))
# Element-wise mean of the per-repetition confusion matrices.
print("Confusion Matrix:\n   HC     AD\n{0}".format(np.mean(confusions, axis=0)))

Accuracy: 0.7077135678391958, with std: 0.005941082153965935
Precision: 0.6848915962626434
Recall: 0.763989898989899
F1 Score: 0.7222659406855653
Confusion Matrix:
   HC     AD
[[ 65.2    34.8  ]
 [ 23.365  75.635]]
