In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import KFold, RepeatedStratifiedKFold, StratifiedKFold
from collections import Counter

In [None]:
# Feature descriptors, in order: APAAC, PAAC, AAC and DPC.
# Every CSV is read without a header row and its first column is discarded.
X1, X2, X3, X4 = (
    pd.read_csv(csv_path, header=None).iloc[:, 1:].values
    for csv_path in (
        'Features/Sodium_APAAC.csv',
        'Features/Sodium_PAAC.csv',
        'Features/Sodium_AAC.csv',
        'Features/Sodium_DPC.csv',
    )
)

# Labels: 492 ones followed by 492 zeros.
# NOTE(review): assumes each feature file lists the 492 positive samples first,
# then the 492 negatives - confirm against how the CSVs were generated.
pos_labels = np.full(492, 1.0)
neg_labels = np.full(492, 0.0)
y = np.concatenate([pos_labels, neg_labels])

In [None]:
# Join the four descriptor matrices column-wise into one feature matrix.
X = np.concatenate([X1, X2, X3, X4], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split: 20% of the samples go to the test set, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=67,
)

In [None]:
# Display the shapes of the training and test feature matrices as a sanity check.
X_train.shape, X_test.shape

In [None]:
# #NTXPred data
# N1=pd.read_csv('Data/Sodium/NTXPred/PREDNa/Features/NTXPred_CKSAAGP.csv', header=None).iloc[:,1:].values
# N2=pd.read_csv('Data/Sodium/NTXPred/PREDNa/Features/NTXPred_PAAC.csv', header=None).iloc[:,1:].values
# N3=pd.read_csv('Data/Sodium/NTXPred/PREDNa/Features/NTXPred_APAAC.csv', header=None).iloc[:,1:].values
# N4=pd.read_csv('Data/Sodium/NTXPred/PREDNa/Features/NTXPred_GTPC.csv', header=None).iloc[:,1:].values

# #Labels
# y1= np.ones(244)
# y2=np.zeros(244)

In [None]:
# train=np.concatenate((N1, N2, N3, N4), axis=1, out=None)
# y=np.concatenate((y1,y2),axis=0)

In [None]:
# #Keep almost all the samples in the training set to check CV results 
# X_train2, X_test2, y_train2, y_test2= train_test_split(train, y, test_size=0.005, random_state=67, stratify=y)

In [None]:
def evaluate_model_test(model, X_test, y_test):
    """Score an already-fitted classifier on a held-out set.

    The model must expose ``predict`` and ``predict_proba``. The metrics use
    scikit-learn's binary defaults, and column 1 of ``predict_proba`` is
    treated as the positive-class score.

    Parameters
    ----------
    model : fitted estimator
    X_test : feature matrix of the held-out samples
    y_test : true labels of the held-out samples

    Returns
    -------
    dict with keys
        'acc', 'prec', 'rec', 'f1', 'mcc' : scalar scores on the hard predictions
        'fpr', 'tpr'                      : ROC curve coordinates
        'auc'                             : area under the ROC curve
        'cm'                              : confusion matrix (rows = true, cols = predicted)
        'sen', 'spec'                     : sensitivity and specificity read off 'cm'
    """
    from sklearn import metrics

    # Hard predictions are computed once and shared by every threshold metric
    # (the previous version called model.predict a second time for MCC).
    y_pred = model.predict(X_test)

    # Accuracy, precision, recall, F1 and Matthews correlation coefficient
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    mcc = metrics.matthews_corrcoef(y_test, y_pred)

    # ROC curve and area under it, from the positive-class probabilities
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)

    # Confusion matrix; with two classes cm[0, 0] = TN, cm[0, 1] = FP,
    # cm[1, 0] = FN and cm[1, 1] = TP.
    cm = metrics.confusion_matrix(y_test, y_pred)
    spec = cm[0, 0] / (cm[0, 0] + cm[0, 1])
    sen = cm[1, 1] / (cm[1, 0] + cm[1, 1])

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'mcc': mcc,
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm, 'sen': sen, 'spec': spec}

In [None]:
from sklearn import metrics
def evaluate_model_train(model, X_train, y_train):
    """Cross-validate ``model`` on the training data and summarise the folds.

    Runs one pass of 5-fold ``KFold`` (no shuffling). In every fold the model
    is fitted on the training part and all metrics are computed from a single
    ``predict`` / ``predict_proba`` call on the validation part. Previously the
    same folds were refitted in four separate ``cross_val_score`` runs plus a
    manual loop.

    Side effect: after cross-validation ``model`` is refitted on the *whole*
    of ``X_train`` / ``y_train``. Earlier it was left fitted on the last
    fold's training subset only, so later test-set evaluation silently used a
    model trained on 80% of the training data.

    Parameters
    ----------
    model : estimator with ``fit``, ``predict`` and ``predict_proba``
    X_train, y_train : arrays that support integer-array indexing

    Returns
    -------
    dict with keys
        'prec_train', 'recall_train', 'f1_train' : mean per-fold precision / recall / F1
        'cm'    : element-wise mean of the per-fold confusion matrices
        'mcc'   : mean per-fold Matthews correlation coefficient
        'acc', 'sen', 'spec' : accuracy, sensitivity, specificity read off 'cm'
        'AUC'   : mean per-fold ROC AUC
        'score' : array of the per-fold accuracies
    """
    # NOTE(review): the Optuna objectives use cv=5, which scikit-learn resolves
    # to StratifiedKFold for classifiers, whereas this is plain KFold.
    cv = KFold(n_splits=5)

    fold_cms, fold_acc, fold_prec, fold_rec = [], [], [], []
    fold_f1, fold_mcc, fold_auc = [], [], []

    for train_index, test_index in cv.split(X_train, y_train):
        X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

        model.fit(X_train_fold, y_train_fold)
        y_pred_fold = model.predict(X_test_fold)
        y_proba_fold = model.predict_proba(X_test_fold)[:, 1]

        fold_cms.append(metrics.confusion_matrix(y_test_fold, y_pred_fold))
        fold_acc.append(metrics.accuracy_score(y_test_fold, y_pred_fold))
        fold_prec.append(metrics.precision_score(y_test_fold, y_pred_fold))
        fold_rec.append(metrics.recall_score(y_test_fold, y_pred_fold))
        fold_f1.append(metrics.f1_score(y_test_fold, y_pred_fold))
        fold_mcc.append(metrics.matthews_corrcoef(y_test_fold, y_pred_fold))
        fold_auc.append(metrics.roc_auc_score(y_test_fold, y_proba_fold))

    # Aggregate once, after all folds have been scored.
    cm = np.mean(fold_cms, axis=0)
    total = cm.sum()
    accuracy = (cm[0, 0] + cm[1, 1]) / total
    specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
    sensitivity = cm[1, 1] / (cm[1, 0] + cm[1, 1])

    # Leave the caller with a model trained on all available training data.
    model.fit(X_train, y_train)

    return {'prec_train': np.mean(fold_prec), 'recall_train': np.mean(fold_rec),
            'f1_train': np.mean(fold_f1), 'cm': cm, 'mcc': np.mean(fold_mcc),
            'acc': accuracy, 'sen': sensitivity, 'spec': specificity,
            'AUC': np.mean(fold_auc), 'score': np.array(fold_acc)}

# Random Forest

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
def RF_objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of a random forest.

    Samples ``n_estimators``, ``max_depth``, ``max_leaf_nodes`` and
    ``min_samples_split`` from the trial and scores the resulting forest on
    the notebook-level ``X_train`` / ``y_train``.

    Every sampled hyperparameter is passed to the model. ``max_leaf_nodes``
    used to be sampled but never used, so the value stored in ``best_params``
    (and forwarded to the final classifier) had never been evaluated.
    """
    n_estimators = trial.suggest_int("n_estimators", 100, 2000)
    max_depth = trial.suggest_int('max_depth', 1, 80)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 1000)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)

    # Candidate model for this trial
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_leaf_nodes=max_leaf_nodes,
        min_samples_split=min_samples_split,
        n_jobs=2,
    )

    score = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
    return score.mean()

# Run the hyperparameter search (maximising CV accuracy)
RF_study = optuna.create_study(direction='maximize')
RF_study.optimize(RF_objective, n_trials=20)

In [None]:
# Build an (unfitted) random forest from the best Optuna trial's hyperparameters.
optimized_RF=RandomForestClassifier(**RF_study.best_params)

In [None]:
# Evaluate Model on Training data (5-fold cross-validation)
train_eval = evaluate_model_train(optimized_RF, X_train, y_train)
# Report the same set of metrics as the other models: precision was missing
# here and the accuracy was printed twice.
print("Confusion Matrix is: ", train_eval['cm'])
print('Accuracy : ', train_eval['acc'])
print('Sensitivity : ', train_eval['sen'])
print('Specificity : ', train_eval['spec'])
print("Mean of Matthews Correlation Coefficient is: ", train_eval['mcc'])
print("The Precision value is: ", train_eval['prec_train'])
print("The Recall value is: ", train_eval['recall_train'])
print("The F1 score is: ", train_eval['f1_train'])
print('The area under curve is:', train_eval['AUC'])

In [None]:
# Score the random forest on the held-out test set
dtc_eval = evaluate_model_test(optimized_RF, X_test, y_test)
# Print each metric as "<caption> <value>"
for caption, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(caption, dtc_eval[metric_key])

# Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import optuna
def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of an ExtraTreesClassifier."""
    # Hyperparameters sampled for this trial
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 2000),
        max_depth=trial.suggest_int('max_depth', 10, 90),
        max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 15, 100),
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
    )

    # Cross-validate a fresh classifier built from the sampled values
    candidate = ExtraTreesClassifier(**search_space)
    fold_accuracies = cross_val_score(candidate, X_train, y_train, cv=5, scoring="accuracy")
    return fold_accuracies.mean()


# Search for the accuracy-maximising configuration
etc_study = optuna.create_study(direction='maximize')
etc_study.optimize(objective, n_trials=20)

# Unfitted classifier configured with the best trial's hyperparameters
optimized_etc = ExtraTreesClassifier(**etc_study.best_params)

In [None]:
# Cross-validated scores of the Extra Trees model on the training data
train_eval = evaluate_model_train(optimized_etc, X_train, y_train)
for caption, metric_key in (
    ("Confusion Matrix is:\n", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Matthews Correlation Coefficient is: ", 'mcc'),
    ("Precision value is: ", 'prec_train'),
    ("Recall value is: ", 'recall_train'),
    ('The area under curve is:', 'AUC'),
    ("F1 score is: ", 'f1_train'),
):
    print(caption, train_eval[metric_key])

In [None]:
# Score the Extra Trees model on the held-out test set
dtc_eval = evaluate_model_test(optimized_etc, X_test, y_test)
# Print each metric as "<caption> <value>"
for caption, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(caption, dtc_eval[metric_key])

# XGB

In [None]:
from xgboost import XGBClassifier
import optuna
def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of an XGBClassifier.

    Changes from the previous search space:
    * ``learning_rate`` is sampled log-uniformly from [0.001, 1.0]. It used to
      be uniform on [0.001, 10], so roughly 90% of the sampled values lay
      above XGBoost's documented [0, 1] range for ``eta``.
    * ``eval_metric`` is the binary ``'logloss'`` rather than the multiclass
      ``'mlogloss'``, since the labels here are 0/1.
    """
    params = {
        # NOTE(review): an upper bound of 370 for max_depth is unusually large.
        'max_depth': trial.suggest_int('max_depth', 1, 370),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'gamma': trial.suggest_float('gamma', 1e-8, 10.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0),
    }

    # Cross-validate a fresh classifier built from the sampled values
    xgb_model = XGBClassifier(**params, eval_metric='logloss')
    score = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring="accuracy")
    return score.mean()

# Run the hyperparameter search (maximising CV accuracy)
XGB_study = optuna.create_study(direction='maximize')
XGB_study.optimize(objective, n_trials=20)
# The final model gets the same fixed eval_metric that was used during tuning.
optimized_XGB = XGBClassifier(**XGB_study.best_params, eval_metric='logloss')

In [None]:
# Cross-validated scores of the XGBoost model on the training data
train_eval = evaluate_model_train(optimized_XGB, X_train, y_train)

for caption, metric_key in (
    ("Confusion Matrix is:\n", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Matthews Correlation Coefficient is: ", 'mcc'),
    ("Precision value is: ", 'prec_train'),
    ("Recall value is: ", 'recall_train'),
    ("F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'AUC'),
):
    print(caption, train_eval[metric_key])

In [None]:
# Score the XGBoost model on the held-out test set
dtc_eval = evaluate_model_test(optimized_XGB, X_test, y_test)

# Print each metric as "<caption> <value>"
for caption, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(caption, dtc_eval[metric_key])

# LGBM

In [None]:
import lightgbm as lgbm
import optuna
def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of an LGBMClassifier.

    Changes from the previous search space:
    * the deprecated ``trial.suggest_uniform`` calls are replaced by
      ``trial.suggest_float``, which the other objectives already use;
    * ``learning_rate`` is sampled log-uniformly from [0.0001, 1.0] instead of
      uniformly up to 10, where almost every draw was far too large for a
      boosting shrinkage rate.
    """
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        # NOTE(review): LightGBM appears to apply `subsample` only when
        # `subsample_freq` > 0, so this value may have no effect - confirm.
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        # Fixed (not sampled), so it does not appear in study.best_params.
        'random_state': 0
    }

    # Cross-validate a fresh classifier built from the sampled values
    lgbm_model = lgbm.LGBMClassifier(**params)
    score = cross_val_score(lgbm_model, X_train, y_train, cv=5, scoring="accuracy")
    return score.mean()


# Run the hyperparameter search (maximising CV accuracy)
lgbm_study = optuna.create_study(direction='maximize')
lgbm_study.optimize(objective, n_trials=30)

# best_params holds only the sampled values; re-apply the fixed seed so the
# final model matches the configuration that was actually tuned.
optimized_lgbm = lgbm.LGBMClassifier(**lgbm_study.best_params, random_state=0)

In [None]:
# Cross-validated scores of the LightGBM model on the training data
train_eval = evaluate_model_train(optimized_lgbm, X_train, y_train)
for caption, metric_key in (
    ("Confusion Matrix is: ", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Mean of Matthews Correlation Coefficient is: ", 'mcc'),
    ("The Precision value is: ", 'prec_train'),
    ("The Recall value is: ", 'recall_train'),
    ("The F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'AUC'),
):
    print(caption, train_eval[metric_key])

In [None]:
# Score the LightGBM model on the held-out test set
dtc_eval = evaluate_model_test(optimized_lgbm, X_test, y_test)
# Print each metric as "<caption> <value>"
for caption, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(caption, dtc_eval[metric_key])

# SVM Classifier

In [None]:
# SVC hyperparameter search with Optuna
from sklearn.svm import SVC
def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of an SVC.

    Samples the regularisation strength ``C`` and the kernel type.
    """
    candidate = SVC(
        C=trial.suggest_float('C', 1e0, 1e2),
        kernel=trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf']),
    )
    fold_accuracies = cross_val_score(candidate, X_train, y_train, cv=5, scoring="accuracy")
    return fold_accuracies.mean()


# Search for the accuracy-maximising configuration
svm_study = optuna.create_study(direction='maximize')
svm_study.optimize(objective, n_trials=100)

In [None]:
# Build the final (unfitted) SVC from the best trial's C and kernel.
# probability=True is set because the evaluation helpers call predict_proba.
optimized_svm =SVC(**svm_study.best_params, probability=True)

In [None]:
# Cross-validated scores of the SVM model on the training data
train_eval = evaluate_model_train(optimized_svm, X_train, y_train)
for caption, metric_key in (
    ("Confusion Matrix is: ", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Mean of Matthews Correlation Coefficient is: ", 'mcc'),
    ("The Precision value is: ", 'prec_train'),
    ("The Recall value is: ", 'recall_train'),
    ("The F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'AUC'),
):
    print(caption, train_eval[metric_key])

In [None]:
# Score the SVM model on the held-out test set
dtc_eval = evaluate_model_test(optimized_svm, X_test, y_test)
# Print each metric as "<caption> <value>"
for caption, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(caption, dtc_eval[metric_key])

# Ensemble Model

In [None]:
# NaII-Pred: soft-voting ensemble of the tuned RF, XGB, SVM and LGBM models
from sklearn.ensemble import VotingClassifier
en_clf = VotingClassifier(
    estimators=[
        ('RF', optimized_RF),
        ('XGB', optimized_XGB),
        ("SVM", optimized_svm),
        ('LGBM', optimized_lgbm),
    ],
    voting='soft',
)

In [None]:
# Cross-validated scores of the voting ensemble on the training data
train_eval = evaluate_model_train(en_clf, X_train, y_train)
for caption, metric_key in (
    ("Confusion Matrix is:\n", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Matthews Correlation Coefficient is: ", 'mcc'),
    ("Precision value is: ", 'prec_train'),
    ("Recall value is: ", 'recall_train'),
    ("F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'AUC'),
):
    print(caption, train_eval[metric_key])

In [None]:
# Score the voting ensemble on the held-out test set
dtc_eval = evaluate_model_test(en_clf, X_test, y_test)
# Print each metric as "<caption> <value>"
for caption, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(caption, dtc_eval[metric_key])