In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy.stats import uniform
import os
from os import listdir, getcwd
import re
import sys
import warnings
import sklearn
import math
from scipy import stats
from IPython.display import clear_output

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, LeaveOneGroupOut, StratifiedKFold
from sklearn.feature_selection import VarianceThreshold, SelectFpr, SelectFwe, RFECV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.externals import joblib
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.linear_model import ElasticNetCV, LogisticRegression, SGDClassifier, LogisticRegressionCV, LarsCV, LassoLarsCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

### IMPORT CROSS-SITE TEST RESULTS

In [2]:
# Parent folder of the FOLD_* directories: the working directory with "RESULTS" removed.
parent_dir = re.sub("RESULTS","",getcwd())

# (variable-name prefix, variable-name suffix, CSV file) of the tables read once per fold.
# Each table is stored in the global named prefix + <fold number> + suffix.
fold_tables = [
    ("id_test_", "", "/id_test.csv"),                                                      # IDs OF TEST CASES
    ("true_outcome_test_", "", "/true_outcome_test.csv"),                                  # TRUE OBSERVED OUTCOME
    ("probabilistic_prediction_test_", "_wra", "/probabilistic_prediction_test_wra.csv"),  # PROBABILISTIC PREDICTIONS WRA
    ("categorical_outcome_test_", "_wra", "/categorical_outcome_test_wra.csv"),            # CATEGORICAL PREDICTIONS (BEST BALANCED ACCURACY) WRA
    ("roc_test_", "_wra", "/roc_test_wra.csv"),                                            # ROC WRA
    ("t_best_balanced_accuracy_", "_wra", "/t_best_balanaced_accuracy_wra.csv"),           # THRESHOLD FOR BEST BALANCED ACCURACY WRA (file name spelled "balanaced")
    ("t_moving_results_", "_wra", "/t_moving_results_wra.csv"),                            # THRESHOLDS FOR OTHER SENSITIVITY LEVELS WRA
]

for direct in listdir(parent_dir):
    if "FOLD" not in direct:
        continue

    print(direct)

    # DEFINE CORRECT PATH FOR IMPORTING
    directory_this = parent_dir+direct+"/files"

    # The digits of the directory name identify the fold in every variable name
    fold = re.sub("[^0-9]", "", direct)

    # IMPORT THE PER-FOLD TABLES (all CSV files are read without a header row)
    for prefix, suffix, csv_name in fold_tables:
        globals()[prefix+fold+suffix] = pd.read_csv(directory_this+csv_name, sep=',',header=None)

    # Column 0 of the moving-threshold table is the sensitivity level, column 1 its threshold
    moving = globals()["t_moving_results_"+fold+"_wra"]

    # EXTRACT THRESHOLD FOR OTHER SENSITIVITY LEVELS OF TEST CASES WRA
    for i in range(moving.shape[0]):
        level = str(moving.iloc[i,0])
        globals()["t_sensitivity_"+level+"_"+fold+"_wra"] = moving.loc[i,1]

    # IMPORT CATEGORICAL PREDICTIONS FOR OTHER SENSITIVITY LEVELS OF TEST CASES WRA
    for i in range(moving.shape[0]):
        level = str(moving.iloc[i,0])
        globals()["categor_outcome_sensitivity_"+level+"_"+fold+"_wra"] = pd.read_csv(directory_this+"/categor_moving_sensitivity_outcome_test_"+level+"_wra.csv", sep=',',header=None)
        

    
    

FOLD_1
FOLD_2
FOLD_3
FOLD_4
FOLD_5


### CHECK THAT ALL CASES HAVE BEEN TESTED (N TOT = 550)

In [3]:
objects = list(globals())

def count_fold_rows(pattern):
    """Return the total number of rows over all globals whose full name matches `pattern`.

    pattern : regular expression that has to match the whole variable name.
    """
    return sum(len(globals()[obj]) for obj in objects if re.fullmatch(pattern, obj))

# The per-fold tables are named <kind>_<fold number>[_wra]. Matching the whole name (rather than
# a substring) leaves out the pooled "*_complete*" / per-level arrays that the merge cell creates
# later, so this check reports the same totals when it is re-run later in the session.
totals = []

# CHECK IDs
summa = count_fold_rows("id_test_[0-9]*")
totals.append(summa)
print(summa)

# CHECK TRUE OBSERVED
summa = count_fold_rows("true_outcome_test_[0-9]*")
totals.append(summa)
print(summa)

# CHECK PROBABILISTIC PREDICTIONS WRA
summa = count_fold_rows("probabilistic_prediction_test_[0-9]*_wra")
totals.append(summa)
print(summa)

# CHECK CATEGORICAL PREDICTIONS (BEST BALANCED ACCURACY) WRA
summa = count_fold_rows("categorical_outcome_test_[0-9]*_wra")
totals.append(summa)
print(summa)

# CHECK CATEGORICAL PREDICTIONS (SENSITIVITY LEVELS) WRA
# The sensitivity levels are read from fold 1 (column 0 of its moving-threshold table)
for i in range(globals()["t_moving_results_1_wra"].shape[0]):
    level = str(globals()["t_moving_results_1_wra"].loc[i,0])
    # re.escape: the level contains a "." that must be matched literally
    summa = count_fold_rows("categor_outcome_sensitivity_"+re.escape(level)+"_[0-9]*_wra")
    totals.append(summa)
    print(summa)

# Every kind of result must cover the same number of test cases
assert len(set(totals)) == 1, "result tables cover different numbers of test cases: "+str(totals)

550
550
550
550
550
550
550
550
550
550
550
550


### MERGE THE TEST RESULTS, AND CHECK THAT THE DIMENSION IS CORRECT (N=550)

In [4]:
objects = list(globals())

def fold_variables(pattern):
    """Return, in creation order, the names of the globals whose full name matches `pattern`."""
    return [obj for obj in objects if re.fullmatch(pattern, obj)]

def pool_folds(pattern):
    """Concatenate the flattened tables of all globals matching `pattern` into one 1-D array."""
    # The leading empty float array gives the same dtype promotion as growing the result
    # with np.append starting from an empty list.
    return np.concatenate([np.array([])] + [np.ravel(globals()[obj]) for obj in fold_variables(pattern)])

# Per-fold tables are named <kind>_<fold number>[_wra]. Matching the whole name (rather than a
# substring) keeps the pooled arrays created by this very cell out of the merge, so re-running
# the cell does not append the pooled data to itself.

# MERGE IDs
id_test_complete = pool_folds("id_test_[0-9]*")
print(id_test_complete.shape)

# MERGE THE FOLD NUMBER (one label per test case, taken from the name of its id table)
fold_number = np.concatenate([np.array([])] + [np.repeat(re.sub("id_test_","",obj), globals()[obj].shape[0]) for obj in fold_variables("id_test_[0-9]*")])
print(fold_number.shape)

# MERGE TRUE OBSERVED
true_outcome_complete = pool_folds("true_outcome_test_[0-9]*")
print(true_outcome_complete.shape)

# MERGE PROBABILISTIC PREDICTIONS WRA
probabilistic_prediction_complete_wra = pool_folds("probabilistic_prediction_test_[0-9]*_wra")
print(probabilistic_prediction_complete_wra.shape)

# MERGE CATEGORICAL PREDICTIONS (BEST BALANCED ACCURACY) WRA
categorical_outcome_complete_wra = pool_folds("categorical_outcome_test_[0-9]*_wra")
print(categorical_outcome_complete_wra.shape)

# MERGE CATEGORICAL PREDICTIONS (SENSITIVITY LEVELS) WRA
# The sensitivity levels are read from fold 1 (column 0 of its moving-threshold table)
for i in range(globals()["t_moving_results_1_wra"].shape[0]):
    level = str(globals()["t_moving_results_1_wra"].loc[i,0])
    # re.escape: the level contains a "." that must be matched literally
    pooled = pool_folds("categor_outcome_sensitivity_"+re.escape(level)+"_[0-9]*_wra")
    globals()["categor_outcome_sensitivity_"+level+"_wra"] = pooled
    print(pooled.shape)

# AVERAGE THRESHOLDS (BEST BALANCED ACCURACY) WRA
fold_thresholds = [globals()[obj].iloc[0,0] for obj in fold_variables("t_best_balanced_accuracy_[0-9]*_wra")]
t_average_best_balanced_wra = sum(fold_thresholds, 0.) / len(fold_thresholds)
print(t_average_best_balanced_wra)

# AVERAGE THRESHOLDS (SENSITIVITY LEVELS) WRA
for i in range(globals()["t_moving_results_1_wra"].shape[0]):
    level = str(globals()["t_moving_results_1_wra"].loc[i,0])
    fold_thresholds = [globals()[obj] for obj in fold_variables("t_sensitivity_"+re.escape(level)+"_[0-9]*_wra")]
    globals()["t_average_sensitivity_"+level+"_wra"] = sum(fold_thresholds) / len(fold_thresholds)
    print(globals()["t_average_sensitivity_"+level+"_wra"])


(550,)
(550,)
(550,)
(550,)
(550,)
(550,)
(550,)
(550,)
(550,)
(550,)
(550,)
(550,)
(550,)
0.58874
0.25338
0.29436
0.32068
0.35074
0.45856
0.50886
0.58214
0.62716


# TEST IT!

# WEIGHTED RANK AVERAGE

In [5]:
# WHOLE TEST SAMPLE

def report_categorical_results(title, y_true, y_pred):
    """Print and return the classification metrics of binary predictions.

    title  : header line printed before the metrics
    y_true : 1-D numpy array of observed outcomes; entries equal to 1 are positives,
             entries equal to 0 are negatives
    y_pred : 1-D numpy array of predicted classes aligned with y_true; entries equal to 1
             are predicted positives, entries equal to 0 are predicted negatives

    Returns the tuple (sensitivity, specificity, positive predictive value,
    negative predictive value).
    """
    true_pos = np.sum((y_pred == 1) & (y_true == 1))
    true_neg = np.sum((y_pred == 0) & (y_true == 0))

    sens = true_pos / np.sum(y_true == 1)
    spec = true_neg / np.sum(y_true == 0)
    ppv = true_pos / np.sum(y_pred == 1)
    npv = true_neg / np.sum(y_pred == 0)

    print(title)
    print("BALANCED ACCURACY:                   "+ str((sens+spec)/2))
    print("F1-SCORE:                            "+ str(2*ppv*sens/(ppv+sens)))
    print("SENSITIVITY/RECALL:                  "+ str(sens))
    print("SPECIFICITY:                         "+ str(spec))
    print("POSITIVE PREDICTIVE VALUE/PRECISION: "+ str(ppv))
    print("NEGATIVE PREDICTIVE VALUE:           "+ str(npv))
    print("     ")
    return sens, spec, ppv, npv

print("### WHOLE TEST SAMPLE ")
print(" ")

prediction_BEST_MODEL = probabilistic_prediction_complete_wra
y_test = true_outcome_complete

# AUROC averaged over the folds
objects = list(globals())
fold_rocs = [globals()[obj] for obj in objects if "roc_test" in obj and "wra" in obj]
mean_roc = sum(fold_rocs) / len(fold_rocs)
# .item() extracts the single value explicitly (and fails if the table holds more than one),
# instead of relying on the implicit conversion of a 2-D array to a float
mean_roc = float(np.asarray(mean_roc).item())

print("AVERAGE TEST AUCROC SCORE: "+str(mean_roc))
print("POOLED TEST AUCROC SCORE: "+str(roc_auc_score(y_true = y_test, y_score=prediction_BEST_MODEL)))
print("     ")
print("_______________________________________________")
print("     ")

# Sensitivity levels, read from fold 1 (column 0 of its moving-threshold table)
levels = [str(globals()["t_moving_results_1_wra"].loc[i,0]) for i in range(globals()["t_moving_results_1_wra"].shape[0])]

print("TEST CATEGORICAL THRESHOLD APPLIED WITHIN TRAIN SPLIT")
print(" ")

sens_test, spec_test, ppv_test, npv_test = report_categorical_results("RESULTS OF BEST BALANCED ACCURACY", y_test, categorical_outcome_complete_wra)

for level in levels:
    sens_test, spec_test, ppv_test, npv_test = report_categorical_results("RESULTS OF SENSITIVITY LEVEL "+level, y_test, globals()["categor_outcome_sensitivity_"+level+"_wra"])

print("TEST CATEGORICAL WITH AVERAGE THRESHOLD APPLIED AFTER POOLING OF CONTINUOUS WRA PREDICTIONS")
print(" ")

# A case is predicted positive when its pooled prediction reaches the fold-averaged threshold
t = t_average_best_balanced_wra
sens_test, spec_test, ppv_test, npv_test = report_categorical_results("RESULTS OF BEST BALANCED ACCURACY", y_test, (prediction_BEST_MODEL >= t).astype(int))

for level in levels:
    t = globals()["t_average_sensitivity_"+level+"_wra"]
    sens_test, spec_test, ppv_test, npv_test = report_categorical_results("RESULTS OF SENSITIVITY LEVEL "+level, y_test, (prediction_BEST_MODEL >= t).astype(int))

### WHOLE TEST SAMPLE 
 
AVERAGE TEST AUCROC SCORE: 0.8842278900391307
POOLED TEST AUCROC SCORE: 0.880315209732
     
_______________________________________________
     
TEST CATEGORICAL THRESHOLD APPLIED WITHIN TRAIN SPLIT
 
RESULTS OF BEST BALANCED ACCURACY
BALANCED ACCURACY:                   0.785515019916
F1-SCORE:                            0.724220623501
SENSITIVITY/RECALL:                  0.766497461929
SPECIFICITY:                         0.804532577904
POSITIVE PREDICTIVE VALUE/PRECISION: 0.686363636364
NEGATIVE PREDICTIVE VALUE:           0.860606060606
     
RESULTS OF SENSITIVITY LEVEL 1.0
BALANCED ACCURACY:                   0.694345781625
F1-SCORE:                            0.645799011532
SENSITIVITY/RECALL:                  0.994923857868
SPECIFICITY:                         0.393767705382
POSITIVE PREDICTIVE VALUE/PRECISION: 0.478048780488
NEGATIVE PREDICTIVE VALUE:           0.992857142857
     
RESULTS OF SENSITIVITY LEVEL 0.99
BALANCED ACCURACY:                 

In [6]:
# EXPORT THE POOLED RESULTS, ONE VALUE PER LINE
# ndarray.tofile with its default sep="" dumps the raw bytes of the array, which is not a CSV;
# savetxt with fmt="%s" writes the fold labels as text like the other exports.
np.savetxt('fold_number.csv', fold_number, fmt="%s", delimiter=",")
np.savetxt('true_outcome_complete.csv', true_outcome_complete, delimiter=",")
np.savetxt('probabilistic_prediction_complete_wra.csv', probabilistic_prediction_complete_wra, delimiter=",")
np.savetxt('categorical_outcome_complete_wra.csv', categorical_outcome_complete_wra, delimiter=",")

# POOLED UNIVARIATE FEATURE IMPORTANCE

### IMPORT

In [7]:
# Parent folder of the FOLD_* directories: the working directory with "RESULTS" removed.
parent_dir = re.sub("RESULTS","",getcwd())

# NOTE: the folds are visited in plain listdir order on purpose. The pooled univariate
# predictions are later scored against true_outcome_complete, whose folds were imported in
# listdir order as well, so both imports have to walk the folds in the same order.
for direct in listdir(parent_dir):
    if "FOLD" in direct:

        print(direct)

        # DEFINE CORRECT PATH FOR IMPORTING
        directory_this = parent_dir+direct+"/univariate_feature_importance"
        for file in listdir(directory_this):
            if "prediction_test" in file:

                # IMPORT UNIVARIATE FEATURE PREDICTIONS OF THAT TEST FOLD
                # The global is named after the file without its extension. The dot is escaped
                # and the pattern anchored, so only a trailing ".csv" is removed (an unescaped
                # ".csv" would also strip e.g. "_csv" from the middle of a name).
                globals()[re.sub(r"\.csv$","",file)] = pd.read_csv(directory_this+"/"+file, sep=',',header=None)

FOLD_1
FOLD_2
FOLD_3
FOLD_4
FOLD_5


### MERGE POOLED UNIVARIATE FEATURE IMPORTANCE PREDICTION

In [8]:
objects = list(globals())

prefix = "prediction_test_UNIVARIATE_FEATURE_IMPORTANCE_"

# Per-fold prediction tables only. The pooled arrays stored by this cell are named
# prefix + "complete_" + feature and contain the prefix too; leaving them out keeps a re-run
# from producing "complete_*" pseudo-features and from appending the pooled data to itself.
fold_objects = [obj for obj in objects if prefix in obj and prefix+"complete_" not in obj]

# FIND VARIABLE NAMES
# Every "_<digit>" is stripped from the variable name, which removes the fold number but also
# turns "PCA_1_..." into "PCA_..."; the loop below restores that part of the name.
feature_names = [re.sub(prefix,"",re.sub("_[0-9]","",obj)) for obj in fold_objects]
feature_names = list(np.unique(feature_names))

# MERGE PREDICTIONS FOR EACH FEATURE
for feature in feature_names:
    if "PCA" in feature:
        feature = re.sub("PCA_","PCA_1_",feature)

    # NOTE(review): tables are selected by substring, so a feature whose name is contained in
    # another feature's name would pool both - the printed count exposes this, verify it.
    matching = [obj for obj in fold_objects if feature in obj]

    # The leading empty float array gives the same dtype promotion as growing with np.append
    prediction_this = np.concatenate([np.array([])] + [np.ravel(np.array(globals()[obj])) for obj in matching])
    summa = sum(len(globals()[obj]) for obj in matching)

    globals()[prefix+"complete_"+feature] = prediction_this
    print("feature: ", feature, " ",summa)

feature:  AGE   550
feature:  CDRSB   550
feature:  DX_bl   550
feature:  FAQ   550
feature:  LDELTOTAL   550
feature:  MMSE   550
feature:  PCA_1_ADAS   550
feature:  PCA_1_RAVLT_forgetting   550
feature:  PTEDUCAT   550
feature:  PTGENDER   550
feature:  PTMARRY_Divorced   550
feature:  PTMARRY_Married   550
feature:  PTMARRY_Never married   550
feature:  PTMARRY_Widowed   550
feature:  RAVLT_immediate   550
feature:  RAVLT_learning   550
feature:  TRABSCOR   550


In [9]:
y_test = true_outcome_complete

# CALCULATE THE POOLED AUROC OF EVERY FEATURE
records = []
for feature in feature_names:
    # feature_names holds the PCA features without their "_1" part; restore it to get the
    # name under which the pooled predictions were stored
    if "PCA" in feature:
        feature = re.sub("PCA_","PCA_1_",feature)

    # CALCULATE THE AUROC
    roc_this = roc_auc_score(y_true = y_test, y_score = globals()["prediction_test_UNIVARIATE_FEATURE_IMPORTANCE_complete_"+feature] )
    records.append((feature, roc_this))

# CREATE DATAFRAME OF RESULTS
# Built once from the collected rows: no placeholder rows that could survive in the table,
# and Pooled_AUC is a numeric column instead of an object column seeded with strings.
UNIVARIATE_FEATURE_IMPORTANCE = pd.DataFrame(records, columns=["FEATURE","Pooled_AUC"])

# PRINT THE RESULTS
display(UNIVARIATE_FEATURE_IMPORTANCE.sort_values(by="Pooled_AUC", ascending=False))
    

Unnamed: 0,FEATURE,Pooled_AUC
6,PCA_1_ADAS,0.808933
14,RAVLT_immediate,0.777124
3,FAQ,0.776887
4,LDELTOTAL,0.76956
15,RAVLT_learning,0.707094
1,CDRSB,0.696546
7,PCA_1_RAVLT_forgetting,0.685236
5,MMSE,0.678197
2,DX_bl,0.657662
16,TRABSCOR,0.657634


In [13]:
# EXPORT FOR BOOTSTRAP CI CALCULATION IN R
# One CSV per pooled univariate prediction array, named "probabilistic_" + variable name.
# The loop variable is not called `object`: at notebook level that would replace the builtin
# `object` with a string for the rest of the session.
for obj in list(globals()):
    if "prediction_test_UNIVARIATE_FEATURE_IMPORTANCE_complete_" in obj:
        np.savetxt("probabilistic_"+obj+".csv", globals()[obj], delimiter=",")