In [23]:
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut,GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.ensemble import RandomForestClassifier,IsolationForest
from sklearn.neural_network import MLPClassifier
import joblib


In [24]:
# Worker count later passed as n_jobs to the grid searches
nJobs = 12


def _readCsv(fileName):
    """Load a comma-delimited numeric text file as a NumPy array."""
    return np.loadtxt(fileName, delimiter=',')


# First dataset (John*2.csv files): the All / Acc / Eda feature matrices,
# the labels, and the groups (which labeled time segment each row of the
# feature matrix comes from).
featuresAll = _readCsv('JohnAll2.csv')
featuresAcc = _readCsv('JohnAcc2.csv')
featuresEda = _readCsv('JohnEda2.csv')
labels = _readCsv('JohnLabels2.csv')
groups = _readCsv('JohnGroups2.csv')

# Second dataset (Nick*3.csv files), loaded in the same layout.
NickAll = _readCsv('NickAll3.csv')
NickAcc = _readCsv('NickAcc3.csv')
NickEda = _readCsv('NickEda3.csv')
NickLabels = _readCsv('NickLabels3.csv')
NickGroups = _readCsv('NickGroups3.csv')

# Splitter that holds out one group per fold (leave-one-group-out CV)
cv = LeaveOneGroupOut()

In [25]:
# Isolation Forest
# Parameter tuning by grid search: for each feature set, pick the number of
# trees (10, 20, ..., 200) with the best leave-one-group-out ROC AUC.
#
# The target is 1-labels because the 'roc_auc' scorer ranks rows by
# decision_function, and IsolationForest returns LOWER values for more
# abnormal rows. Presumably labels == 1 marks the anomalous class, so after
# flipping, the positive class is the one expected to score higher - confirm.
#
# NOTE: IsolationForest() is created without random_state, so the selected
# n_estimators can differ from run to run.
IFparameters = {'n_estimators': 10*np.arange(1,21)}

# Options shared by the three searches. Everything after param_grid is passed
# by keyword: scikit-learn 1.0+ makes these parameters keyword-only, so the
# previous positional 'roc_auc' raises TypeError there. Keyword form also
# works on older releases.
IFsearchOptions = dict(scoring='roc_auc', n_jobs=nJobs, cv=cv, refit=True,
                       verbose=1)

# `groups` is likewise passed by keyword; it is what LeaveOneGroupOut uses to
# build the folds.
IFgsAll = GridSearchCV(IsolationForest(), IFparameters, **IFsearchOptions)
IFgsAll.fit(featuresAll, 1-labels, groups=groups)
bestNumTreesAll = IFgsAll.best_params_['n_estimators']

IFgsAcc = GridSearchCV(IsolationForest(), IFparameters, **IFsearchOptions)
IFgsAcc.fit(featuresAcc, 1-labels, groups=groups)
bestNumTreesAcc = IFgsAcc.best_params_['n_estimators']

IFgsEda = GridSearchCV(IsolationForest(), IFparameters, **IFsearchOptions)
IFgsEda.fit(featuresEda, 1-labels, groups=groups)
bestNumTreesEda = IFgsEda.best_params_['n_estimators']

In [27]:
# One out-of-fold decision score per row of `labels`, for each feature set.
IFpredAll, IFpredAcc, IFpredEda = (np.zeros(np.shape(labels)) for _ in range(3))
# Buffers sized like NickLabels; they are not written or read again in this block.
IFpredNickAll, IFpredNickAcc, IFpredNickEda = (
    np.zeros(np.shape(NickLabels)) for _ in range(3))

# Leave-one-group-out loop: fit on the training rows of each split with the
# tuned tree count, then store decision_function values for the held-out rows.
for train, test in cv.split(featuresAll, labels, groups):

    IFAll = IsolationForest(n_estimators=bestNumTreesAll).fit(featuresAll[train])
    IFpredAll[test] = IFAll.decision_function(featuresAll[test])

    IFAcc = IsolationForest(n_estimators=bestNumTreesAcc).fit(featuresAcc[train])
    IFpredAcc[test] = IFAcc.decision_function(featuresAcc[test])

    IFEda = IsolationForest(n_estimators=bestNumTreesEda).fit(featuresEda[train])
    IFpredEda[test] = IFEda.decision_function(featuresEda[test])


# ROC AUC (not accuracy) of the pooled out-of-fold decision scores against
# 1-labels, one line per feature set, followed by the grid-search winner.
# Each line is tagged with the feature set it reports; previously all three
# were printed as "ALL".
print('IF AUC ALL: %f (%s)' % (roc_auc_score(1-labels,IFpredAll),IFgsAll.best_params_))
print('IF AUC ACC: %f (%s)' % (roc_auc_score(1-labels,IFpredAcc),IFgsAcc.best_params_))
print('IF AUC EDA: %f (%s)' % (roc_auc_score(1-labels,IFpredEda),IFgsEda.best_params_))

IF AUC ALL: 0.848566 ({'n_estimators': 80})
IF AUC ALL: 0.828674 ({'n_estimators': 130})
IF AUC ALL: 0.638183 ({'n_estimators': 130})


In [28]:
# Cross-dataset check: fit each forest on every row of the first dataset
# (featuresAll / featuresAcc / featuresEda) using the tuned tree counts, then
# score the matching Nick feature matrix.
IFAll = IsolationForest(n_estimators=bestNumTreesAll).fit(featuresAll)
IFNickPredAll = IFAll.decision_function(NickAll)

IFAcc = IsolationForest(n_estimators=bestNumTreesAcc).fit(featuresAcc)
IFNickPredAcc = IFAcc.decision_function(NickAcc)

IFEda = IsolationForest(n_estimators=bestNumTreesEda).fit(featuresEda)
IFNickPredEda = IFEda.decision_function(NickEda)

# ROC AUC of those scores against 1-NickLabels, reported next to the
# grid-search parameters that were used.
for _fmt, _scores, _search in (
        ('IF AUC ALL PREDICTION: %f (%s)', IFNickPredAll, IFgsAll),
        ('IF AUC ACC PREDICTION: %f (%s)', IFNickPredAcc, IFgsAcc),
        ('IF AUC EDA PREDICTION: %f (%s)', IFNickPredEda, IFgsEda)):
    print(_fmt % (roc_auc_score(1-NickLabels, _scores), _search.best_params_))

IF AUC ALL PREDICTION: 0.816378 ({'n_estimators': 80})
IF AUC ACC PREDICTION: 0.857826 ({'n_estimators': 130})
IF AUC EDA PREDICTION: 0.476188 ({'n_estimators': 130})


In [29]:
# Repeat the tuning with the roles swapped: grid-search the number of trees
# on the Nick data (leave-one-group-out over NickGroups, ROC AUC against
# 1-NickLabels).
#
# `scoring` and `groups` are passed by keyword. scikit-learn 1.0+ makes them
# keyword-only, so the previous positional form raises TypeError there; the
# keyword form also works on older releases.
IFgsAll2 = GridSearchCV(IsolationForest(), IFparameters,
                        scoring='roc_auc', n_jobs=nJobs, cv=cv, refit=True,
                        verbose=1)
IFgsAll2.fit(NickAll, 1-NickLabels, groups=NickGroups)
bestNumTreesAll2 = IFgsAll2.best_params_['n_estimators']

IFgsAcc2 = GridSearchCV(IsolationForest(), IFparameters,
                        scoring='roc_auc', n_jobs=nJobs, cv=cv, refit=True,
                        verbose=1)
IFgsAcc2.fit(NickAcc, 1-NickLabels, groups=NickGroups)
bestNumTreesAcc2 = IFgsAcc2.best_params_['n_estimators']

IFgsEda2 = GridSearchCV(IsolationForest(), IFparameters,
                        scoring='roc_auc', n_jobs=nJobs, cv=cv, refit=True,
                        verbose=1)
IFgsEda2.fit(NickEda, 1-NickLabels, groups=NickGroups)
bestNumTreesEda2 = IFgsEda2.best_params_['n_estimators']

# Fresh leave-one-group-out splitter for the evaluation loop below
cv = LeaveOneGroupOut()

# One out-of-fold decision score per row of NickLabels, for each feature set.
IFpredAll2, IFpredAcc2, IFpredEda2 = (
    np.zeros(np.shape(NickLabels)) for _ in range(3))
# Buffers sized like `labels`; they are not written or read again in this block.
IFpredNickAll2, IFpredNickAcc2, IFpredNickEda2 = (
    np.zeros(np.shape(labels)) for _ in range(3))

# For every split: fit on the training rows with the tuned tree count and
# keep the decision_function values of the held-out rows.
for train, test in cv.split(NickAll, NickLabels, NickGroups):
    IFAll2 = IsolationForest(n_estimators=bestNumTreesAll2).fit(NickAll[train])
    IFpredAll2[test] = IFAll2.decision_function(NickAll[test])

    IFAcc2 = IsolationForest(n_estimators=bestNumTreesAcc2).fit(NickAcc[train])
    IFpredAcc2[test] = IFAcc2.decision_function(NickAcc[test])

    IFEda2 = IsolationForest(n_estimators=bestNumTreesEda2).fit(NickEda[train])
    IFpredEda2[test] = IFEda2.decision_function(NickEda[test])

# ROC AUC of the pooled out-of-fold scores against 1-NickLabels.
print('IF AUC ALL: %f (%s)' % (roc_auc_score(1-NickLabels, IFpredAll2),
                               IFgsAll2.best_params_))
print('IF AUC ACC: %f (%s)' % (roc_auc_score(1-NickLabels, IFpredAcc2),
                               IFgsAcc2.best_params_))
print('IF AUC EDA: %f (%s)' % (roc_auc_score(1-NickLabels, IFpredEda2),
                               IFgsEda2.best_params_))

# Grid-search objects configured with refit=False. They are only constructed
# here; this block never calls .fit on them.
# NOTE(review): these names differ from IFgsAll2 / IFgsAcc2 / IFgsEda2 only by
# the capital "G" - confirm whether they are still needed.
#
# `scoring` is passed by keyword (keyword-only in scikit-learn 1.0+, where the
# old positional form raises TypeError), and n_jobs uses the notebook-wide
# nJobs setting instead of a second hard-coded 12.
IFGsAll2 = GridSearchCV(IsolationForest(), IFparameters,
                        scoring='roc_auc', n_jobs=nJobs, cv=cv, refit=False,
                        verbose=1)
IFGsAcc2 = GridSearchCV(IsolationForest(), IFparameters,
                        scoring='roc_auc', n_jobs=nJobs, cv=cv, refit=False,
                        verbose=1)
IFGsEda2 = GridSearchCV(IsolationForest(), IFparameters,
                        scoring='roc_auc', n_jobs=nJobs, cv=cv, refit=False,
                        verbose=1)


# Reverse cross-dataset check: fit each forest on every row of the Nick data
# with the tree counts tuned on it, then score the other dataset's matching
# feature matrix (featuresAll / featuresAcc / featuresEda).
# Despite the "Nick" in their names, the IFNickPred*2 arrays hold scores for
# the rows of features*, so they are compared against 1-labels below.
IFAll2 = IsolationForest(n_estimators=bestNumTreesAll2).fit(NickAll)
IFNickPredAll2 = IFAll2.decision_function(featuresAll)

IFAcc2 = IsolationForest(n_estimators=bestNumTreesAcc2).fit(NickAcc)
IFNickPredAcc2 = IFAcc2.decision_function(featuresAcc)

IFEda2 = IsolationForest(n_estimators=bestNumTreesEda2).fit(NickEda)
IFNickPredEda2 = IFEda2.decision_function(featuresEda)

# ROC AUC per feature set, with the grid-search parameters that were used.
for _fmt, _scores, _search in (
        ('IF AUC ALL PREDICTION: %f (%s)', IFNickPredAll2, IFgsAll2),
        ('IF AUC ACC PREDICTION: %f (%s)', IFNickPredAcc2, IFgsAcc2),
        ('IF AUC EDA PREDICTION: %f (%s)', IFNickPredEda2, IFgsEda2)):
    print(_fmt % (roc_auc_score(1-labels, _scores), _search.best_params_))

Fitting 8 folds for each of 20 candidates, totalling 160 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  28 tasks      | elapsed:    1.4s
[Parallel(n_jobs=12)]: Done 160 out of 160 | elapsed:   17.2s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 8 folds for each of 20 candidates, totalling 160 fits


[Parallel(n_jobs=12)]: Done  28 tasks      | elapsed:    1.3s
[Parallel(n_jobs=12)]: Done 160 out of 160 | elapsed:   16.9s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 8 folds for each of 20 candidates, totalling 160 fits


[Parallel(n_jobs=12)]: Done  28 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 137 out of 160 | elapsed:    7.5s remaining:    1.2s
[Parallel(n_jobs=12)]: Done 160 out of 160 | elapsed:    9.0s finished


IF AUC ALL: 0.860900 ({'n_estimators': 190})
IF AUC ACC: 0.852808 ({'n_estimators': 40})
IF AUC EDA: 0.581001 ({'n_estimators': 40})




IF AUC ALL PREDICTION: 0.851119 ({'n_estimators': 190})
IF AUC ACC PREDICTION: 0.815140 ({'n_estimators': 40})
IF AUC EDA PREDICTION: 0.679289 ({'n_estimators': 40})
