In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import RepeatedStratifiedKFold

# Import TF dataset for first-layer model

In [None]:
# First-layer training features: ProtBert embeddings of the TF and NTF sets.
# The CSVs are read without a header row and the first column is dropped
# (presumably a sequence identifier -- TODO confirm); the rest are features.
X1 = pd.read_csv(
    'Prot-Bert/TF_Training_Embedding_ProtBert.csv', header=None
).iloc[:, 1:].to_numpy()
X2 = pd.read_csv(
    'Prot-Bert/NTF_Training_Embedding_ProtBert.csv', header=None
).iloc[:, 1:].to_numpy()
# TF rows first, NTF rows second -- the label vector must follow this order.
X_train = np.vstack([X1, X2])


In [None]:
# Labels for the first-layer training set: 413 positives (1.0) followed by
# 416 negatives (0.0). These counts must equal the row counts of the two
# training CSVs, in the order they were stacked into X_train.
pos_labels = np.full(413, 1.0)
neg_labels = np.full(416, 0.0)
y_train = np.hstack((pos_labels, neg_labels))

In [None]:
# First-layer independent (hold-out) features, loaded the same way as the
# training set: no header row, first column dropped, TF rows before NTF rows.
# Note that X1 / X2 are reused here and no longer hold the training arrays.
X1 = pd.read_csv(
    'Prot-Bert/TF_Ind_Embedding_ProtBert.csv', header=None
).iloc[:, 1:].to_numpy()
X2 = pd.read_csv(
    'Prot-Bert/NTF_Ind_Embedding_ProtBert.csv', header=None
).iloc[:, 1:].to_numpy()
X_test = np.vstack([X1, X2])


In [None]:
# Labels for the first-layer independent set: 106 positives (1.0) followed by
# 106 negatives (0.0), matching the order in which X_test was stacked.
pos_labels = np.full(106, 1.0)
neg_labels = np.full(106, 0.0)
y_test = np.hstack((pos_labels, neg_labels))

In [None]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=67, stratify=y)

In [None]:
# The label vectors are built from hard-coded class counts, so fail loudly if
# they do not line up with the number of rows actually read from the CSVs.
assert X_train.shape[0] == y_train.shape[0], "X_train and y_train have different numbers of rows"
assert X_test.shape[0] == y_test.shape[0], "X_test and y_test have different numbers of rows"
X_train.shape, X_test.shape

In [None]:
def evaluate_model_test(model, X_test, y_test):
    """Score an already-fitted binary classifier on a hold-out set.

    Class labels are obtained by thresholding the positive-class probability
    at 0.5, and *every* label-based metric (accuracy, precision, recall, F1,
    MCC, confusion matrix) is computed from those same labels so the reported
    numbers are mutually consistent.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X_test : feature matrix of the hold-out samples.
    y_test : true labels (0 = negative, 1 = positive).

    Returns
    -------
    dict with keys ``acc``, ``prec``, ``rec``, ``f1``, ``mcc``, ``fpr``,
    ``tpr``, ``auc``, ``cm``, ``sen`` and ``spec``.
    """
    from sklearn import metrics

    # Score the hold-out set once; the probabilities feed both the ROC
    # metrics and the thresholded labels.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(float)

    # Calculate accuracy, precision, recall and f1-score
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)

    # Calculate ROC curve and area under curve (AUC)
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)

    # MCC on the thresholded labels. model.predict() is deliberately not used:
    # for some estimators (e.g. SVC with probability=True) it can disagree
    # with predict_proba, which would make MCC inconsistent with the
    # confusion matrix reported alongside it.
    mcc = metrics.matthews_corrcoef(y_test, y_pred)

    # Confusion matrix: rows are true classes, columns are predicted classes.
    cm = metrics.confusion_matrix(y_test, y_pred)
    spec = cm[0, 0] / (cm[0, 0] + cm[0, 1])  # TN / (TN + FP)
    sen = cm[1, 1] / (cm[1, 0] + cm[1, 1])   # TP / (FN + TP)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'mcc': mcc,
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm, 'sen': sen, 'spec': spec}

In [None]:
from sklearn import metrics
def evaluate_model_train(model, X_train, y_train):
    """Report 5-fold stratified cross-validation metrics for ``model``.

    The model is fitted once per fold and all metrics of a fold are computed
    from that single fit, so accuracy, precision, recall, F1, MCC, AUC and the
    confusion matrix all describe the same fitted model.

    Parameters
    ----------
    model : unfitted (or re-fittable) binary classifier exposing ``fit``,
        ``predict`` and ``predict_proba``. It is fitted in place.
    X_train, y_train : NumPy arrays (indexed with integer index arrays).

    Returns
    -------
    dict with
      * ``score``      - array of per-fold accuracies,
      * ``prec_train``, ``recall_train``, ``f1_train`` - means over folds,
      * ``cm``         - element-wise mean of the per-fold confusion matrices,
      * ``acc``, ``sen``, ``spec`` - derived from the mean confusion matrix,
      * ``mcc``, ``auc`` - means of the (rounded) per-fold values,
      * ``mcc_list``, ``auc_list``, ``Sen_list``, ``Spec_list`` - per-fold
        values rounded to 3 decimals.

    Side effect
    -----------
    After cross-validation ``model`` is refitted on the full ``X_train`` so
    that a following hold-out evaluation uses a model trained on all of the
    training data rather than on the last fold's 4/5 subset.
    """
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

    conf_matrix_list_of_arrays = []
    acc_list, prec_list, rec_list, f1_list = [], [], [], []
    mcc_array = []
    AUC_list = []
    Sen_list = []
    Spec_list = []

    for train_index, test_index in cv.split(X_train, y_train):
        X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

        model.fit(X_train_fold, y_train_fold)
        # Predict once per fold and reuse the result for every metric.
        y_pred_fold = model.predict(X_test_fold)
        y_score_fold = model.predict_proba(X_test_fold)[:, 1]

        # Confusion matrix: rows are true classes, columns are predictions.
        fold_cm = confusion_matrix(y_test_fold, y_pred_fold)
        conf_matrix_list_of_arrays.append(fold_cm)

        # Per-fold specificity / sensitivity (previously never filled in,
        # so callers always received empty lists).
        Spec_list.append(round(fold_cm[0, 0] / (fold_cm[0, 1] + fold_cm[0, 0]), 3))
        Sen_list.append(round(fold_cm[1, 1] / (fold_cm[1, 0] + fold_cm[1, 1]), 3))

        acc_list.append(metrics.accuracy_score(y_test_fold, y_pred_fold))
        prec_list.append(metrics.precision_score(y_test_fold, y_pred_fold))
        rec_list.append(metrics.recall_score(y_test_fold, y_pred_fold))
        f1_list.append(metrics.f1_score(y_test_fold, y_pred_fold))

        mcc_array.append(round(matthews_corrcoef(y_test_fold, y_pred_fold), 3))
        AUC_list.append(round(metrics.roc_auc_score(y_test_fold, y_score_fold), 3))

    score = np.array(acc_list)
    prec_train = np.mean(prec_list)
    recall_train = np.mean(rec_list)
    f1_train = np.mean(f1_list)

    auc = np.mean(AUC_list)
    mcc = np.mean(mcc_array, axis=0)

    # Aggregate accuracy / specificity / sensitivity from the mean matrix.
    cm = np.mean(conf_matrix_list_of_arrays, axis=0)
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    specificity = cm[0, 0] / (cm[0, 1] + cm[0, 0])
    sensitivity = cm[1, 1] / (cm[1, 0] + cm[1, 1])

    # Leave the estimator trained on all training samples (see docstring).
    model.fit(X_train, y_train)

    return {'prec_train': prec_train, 'recall_train': recall_train,
            'f1_train': f1_train, 'cm': cm, 'mcc': mcc, 'acc': accuracy,
            'sen': sensitivity, 'spec': specificity, 'auc': auc, 'score': score,
            'mcc_list': mcc_array, 'auc_list': AUC_list, 'Sen_list': Sen_list, 'Spec_list': Spec_list}

# Random Forest

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier

# CV splitter shared by the hyper-parameter searches that follow.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=24)


def RF_objective(trial):
    """Optuna objective: mean CV accuracy of a random forest on the first-layer training set.

    Every sampled hyper-parameter is passed to the model. `max_leaf_nodes`
    used to be sampled but ignored, although it was later applied to the final
    model through `RF_study.best_params`, so the tuned and final models differed.
    """
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 60),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 1000),
        'min_samples_split': trial.suggest_int("min_samples_split", 2, 20),
    }

    ## Create Model
    model = RandomForestClassifier(**params, n_jobs=2)

    score = cross_val_score(model, X_train, y_train, cv=cv, scoring="accuracy")
    return score.mean()


# Execute optuna and set hyperparameters
RF_study = optuna.create_study(direction='maximize')
# The keyword is `n_trials`; the previous `n_trails` raised a TypeError.
RF_study.optimize(RF_objective, n_trials=200)

In [None]:
# Random forest configured with the best hyper-parameters found by the study.
optimized_RF = RandomForestClassifier().set_params(**RF_study.best_params)

In [None]:
# Evaluate Model on Training data (5-fold cross-validation)
train_eval = evaluate_model_train(optimized_RF, X_train, y_train)
print("Confusion Matrix is: ", train_eval['cm'])
print ('Accuracy : ', train_eval['acc'])
print('Sensitivity : ', train_eval['sen'])
print('Specificity : ', train_eval['spec'])
print("Mean of Matthews Correlation Coefficient is: ", train_eval['mcc'])
print("The Acc value from CM is: ", train_eval['acc'])
# Precision was missing from this report although every other model prints it.
print("The Precision value is: ", train_eval['prec_train'])
print("The Recall value is: ", train_eval['recall_train'])
print("The F1 score is: ", train_eval['f1_train'])
print('The area under curve is:', train_eval['auc'])
print('5 accuracies: ', train_eval['score'])
# Per-fold series kept under model-specific names for later comparison.
Acc_rf=train_eval['score']
Sen_rf=train_eval['Sen_list']
Spec_rf=train_eval['Spec_list']
MCC_rf=train_eval['mcc_list']
AUC_rf=train_eval['auc_list']

In [None]:
# Hold-out report for the tuned random forest. The model must already be
# fitted; the preceding evaluate_model_train call fits it in place.
dtc_eval = evaluate_model_test(optimized_RF, X_test, y_test)

# (label, result key) pairs, printed in this order.
for label, key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(label, dtc_eval[key])

# Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import optuna


def objective(trial):
    """Optuna objective: mean CV accuracy of an Extra Trees model on the first-layer training set.

    Uses the notebook-level `cv` splitter defined in the random-forest cell.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'max_depth': trial.suggest_int('max_depth', 10, 90),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 15, 100),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
    }

    # Cross-validate the candidate configuration
    etc_model = ExtraTreesClassifier(**params)
    score = cross_val_score(etc_model, X_train, y_train, cv=cv, scoring="accuracy")
    return score.mean()


# Execute optuna and set hyperparameters
etc_study = optuna.create_study(direction='maximize')
# The keyword is `n_trials`; the previous `n_trails` raised a TypeError.
etc_study.optimize(objective, n_trials=200)

optimized_etc = ExtraTreesClassifier(**etc_study.best_params)

In [None]:
# Cross-validated report for the tuned Extra Trees model (first-layer data).
train_eval = evaluate_model_train(optimized_etc, X_train, y_train)

# (label, result key) pairs, printed in this order.
for label, key in (
    ("Confusion Matrix is:\n", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Matthews Correlation Coefficient is: ", 'mcc'),
    ("Precision value is: ", 'prec_train'),
    ("Recall value is: ", 'recall_train'),
    ('The area under curve is:', 'auc'),
    ("F1 score is: ", 'f1_train'),
):
    print(label, train_eval[key])

# Per-fold series kept under model-specific names for later comparison.
Acc_etc, Sen_etc, Spec_etc, MCC_etc, AUC_etc = (
    train_eval[name] for name in ('score', 'Sen_list', 'Spec_list', 'mcc_list', 'auc_list')
)

In [None]:
# Hold-out report for the tuned Extra Trees model. The model must already be
# fitted; the preceding evaluate_model_train call fits it in place.
dtc_eval = evaluate_model_test(optimized_etc, X_test, y_test)

# (label, result key) pairs, printed in this order.
for label, key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(label, dtc_eval[key])

# XGB

In [None]:
from xgboost import XGBClassifier
import optuna


def objective(trial):
    """Optuna objective: mean CV accuracy of an XGBoost model on the first-layer training set.

    Uses the notebook-level `cv` splitter (as every other objective in this
    notebook does) instead of a bare `cv=5`, so all models are tuned on the
    same shuffled stratified folds.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 400),
        # NOTE(review): the upper bound of 10.0 is unusually large for a
        # boosting learning rate -- confirm the intended search range.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 10.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'gamma': trial.suggest_float('gamma', 1e-8, 10.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0),
    }

    # Binary task: 'logloss' is the matching metric ('mlogloss' is the
    # multiclass variant).
    xgb_model = XGBClassifier(**params, eval_metric='logloss')
    score = cross_val_score(xgb_model, X_train, y_train, cv=cv, scoring="accuracy")
    return score.mean()


# Execute optuna and set hyperparameters
XGB_study = optuna.create_study(direction='maximize')
# The keyword is `n_trials`; the previous `n_trails` raised a TypeError.
XGB_study.optimize(objective, n_trials=200)
optimized_XGB = XGBClassifier(**XGB_study.best_params)

In [None]:
# Cross-validated report for the tuned XGBoost model (first-layer data).
train_eval = evaluate_model_train(optimized_XGB, X_train, y_train)

# (label, result key) pairs, printed in this order.
for label, key in (
    ("Confusion Matrix is:\n", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Matthews Correlation Coefficient is: ", 'mcc'),
    ("Precision value is: ", 'prec_train'),
    ("Recall value is: ", 'recall_train'),
    ("F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'auc'),
):
    print(label, train_eval[key])

# Per-fold series kept under model-specific names for later comparison.
Acc_xgb, Sen_xgb, Spec_xgb, MCC_xgb, AUC_xgb = (
    train_eval[name] for name in ('score', 'Sen_list', 'Spec_list', 'mcc_list', 'auc_list')
)

In [None]:
# Hold-out report for the tuned XGBoost model. The model must already be
# fitted; the preceding evaluate_model_train call fits it in place.
dtc_eval = evaluate_model_test(optimized_XGB, X_test, y_test)

# (label, result key) pairs, printed in this order.
for label, key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(label, dtc_eval[key])

# LGBM

In [None]:
import lightgbm as lgbm
import optuna


def objective(trial):
    """Optuna objective: mean CV accuracy of a LightGBM model on the first-layer training set.

    Uses the notebook-level `cv` splitter defined in the random-forest cell.
    """
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        # NOTE(review): the upper bound of 10 is unusually large for a
        # boosting learning rate -- confirm the intended search range.
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        # suggest_float replaces the deprecated suggest_uniform (same
        # uniform sampling over [low, high]).
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': 0
    }

    # Cross-validate the candidate configuration
    lgbm_model = lgbm.LGBMClassifier(**params)
    score = cross_val_score(lgbm_model, X_train, y_train, cv=cv, scoring="accuracy")
    return score.mean()


# Execute optuna and set hyperparameters
lgbm_study = optuna.create_study(direction='maximize')
# The keyword is `n_trials`; the previous `n_trails` raised a TypeError.
lgbm_study.optimize(objective, n_trials=200)

# best_params only holds the sampled values, so the fixed seed used while
# tuning has to be passed again for the final model to match.
optimized_lgbm = lgbm.LGBMClassifier(**lgbm_study.best_params, random_state=0)

In [None]:
# Cross-validated report for the tuned LightGBM model (first-layer data).
train_eval = evaluate_model_train(optimized_lgbm, X_train, y_train)

# (label, result key) pairs, printed in this order.
for label, key in (
    ("Confusion Matrix is: ", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Mean of Matthews Correlation Coefficient is: ", 'mcc'),
    ("The Precision value is: ", 'prec_train'),
    ("The Recall value is: ", 'recall_train'),
    ("The F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'auc'),
):
    print(label, train_eval[key])

# Per-fold series kept under model-specific names for later comparison.
Acc_lgbm, Sen_lgbm, Spec_lgbm, MCC_lgbm, AUC_lgbm = (
    train_eval[name] for name in ('score', 'Sen_list', 'Spec_list', 'mcc_list', 'auc_list')
)

In [None]:
# Hold-out report for the tuned LightGBM model. The model must already be
# fitted; the preceding evaluate_model_train call fits it in place.
dtc_eval = evaluate_model_test(optimized_lgbm, X_test, y_test)

# (label, result key) pairs, printed in this order.
for label, key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(label, dtc_eval[key])

# AdaBoost

In [None]:
# from sklearn.ensemble import AdaBoostClassifier
# def objective(trial):
    
#     params = {
#                 'n_estimators':trial.suggest_int('n_estimators',50,500),
#                 'learning_rate': trial.suggest_float('learning_rate', 0.1,2.5,step = 0.0000005),
#                 'algorithm':'SAMME.R', 
#                 'random_state':47
#             }
    
    
#     # Fit the model
#     abc_model = AdaBoostClassifier(**params)
    
#     score = cross_val_score(abc_model, X_train, y_train, cv=cv, scoring="accuracy")
#     accuracy_mean = score.mean()

#     return accuracy_mean


# #Execute optuna and set hyperparameters
# abc_study = optuna.create_study(direction='maximize')
# abc_study.optimize(objective, n_trails=200)

# optimized_abc =AdaBoostClassifier(**abc_study.best_params)

In [None]:
# # Evaluate Model on Training data
# train_eval = evaluate_model_train(optimized_abc, X_train, y_train)
# print("Confusion Matrix is: ", train_eval['cm'])
# print ('Accuracy : ', train_eval['acc'])
# print('Sensitivity : ', train_eval['sen'])
# print('Specificity : ', train_eval['spec'])
# print("Mean of Matthews Correlation Coefficient is: ", train_eval['mcc'])
# print("The Precision value is: ", train_eval['prec_train'])
# print("The Recall value is: ", train_eval['recall_train'])
# print("The F1 score is: ", train_eval['f1_train'])
# print('The area under curve is:', train_eval['auc'])
# AAindex_abc=train_eval['score']

In [None]:
# # Evaluate Model on Testing data
# dtc_eval = evaluate_model_test(optimized_abc, X_test, y_test)
# # Print result
# print('Accuracy:', dtc_eval['acc'])
# print('Precision:', dtc_eval['prec'])
# print('Recall:', dtc_eval['rec'])
# print('F1 Score:', dtc_eval['f1'])
# print('Area Under Curve:', dtc_eval['auc'])
# print('Sensitivity : ', dtc_eval['sen'])
# print('Specificity : ', dtc_eval['spec'])
# print('MCC Score : ', dtc_eval['mcc'])
# print('Confusion Matrix:\n', dtc_eval['cm'])

# CatBoost

In [None]:
# from catboost import CatBoostClassifier
# def objective(trial):
#     params = {
#             "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5),
#             "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
#             "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
#             "depth": trial.suggest_int("depth", 1, 12),
#             "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#             "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
#             ),
#             "used_ram_limit": "3gb",
#         }

# #     if param["bootstrap_type"] == "Bayesian":
# #         param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
# #     elif param["bootstrap_type"] == "Bernoulli":
# #         param["subsample"] = trial.suggest_float("subsample", 0.1, 1)


#     # Fit the model
#     cat_model = CatBoostClassifier(**params)

#     score = cross_val_score(cat_model, X_train, y_train, cv=cv, scoring="accuracy")
#     accuracy_mean = score.mean()

#     return accuracy_mean


# #Execute optuna and set hyperparameters
# cat_study = optuna.create_study(direction='maximize')
# cat_study.optimize(objective, n_trials=5)

# optimized_cat =CatBoostClassifier(**cat_study.best_params)


In [None]:
# # Evaluate Model on Training data
# train_eval = evaluate_model_train(optimized_cat, X_train, y_train)
# print("Confusion Matrix is: ", train_eval['cm'])
# print ('Accuracy : ', train_eval['acc'])
# print('Sensitivity : ', train_eval['sen'])
# print('Specificity : ', train_eval['spec'])
# print("Mean of Matthews Correlation Coefficient is: ", train_eval['mcc'])
# print("The Precision value is: ", train_eval['prec_train'])
# print("The Recall value is: ", train_eval['recall_train'])
# print("The F1 score is: ", train_eval['f1_train'])
# print('The area under curve is:', train_eval['auc'])
# cbc=train_eval['score']
# Sen_cbc=train_eval['Sen_list']
# Spec_cbc=train_eval['Spec_list']
# MCC_cbc=train_eval['mcc_list']
# AUC_cbc=train_eval['auc_list']

In [None]:
# # Evaluate Model on Testing data
# dtc_eval = evaluate_model_test(optimized_cat, X_test, y_test)
# # Print result
# print('Accuracy:', dtc_eval['acc'])
# print('Precision:', dtc_eval['prec'])
# print('Recall:', dtc_eval['rec'])
# print('F1 Score:', dtc_eval['f1'])
# print('Area Under Curve:', dtc_eval['auc'])
# print('Sensitivity : ', dtc_eval['sen'])
# print('Specificity : ', dtc_eval['spec'])
# print('MCC Score : ', dtc_eval['mcc'])
# print('Confusion Matrix:\n', dtc_eval['cm'])

In [None]:
# import seaborn as sns
# box= pd.DataFrame({1:rf, 2:xgb, 3:etc, 4:lgbm, 5:cbc})
# # boxplot=sns.boxplot(data=box_AAindex, width=0.5)
# # boxplot.set_xlabel("AAindex", fontsize=14)
# # boxplot.set_ylabel("Accuracy", fontsize=14)
# # plt.show()
# box=pd.DataFrame(box)
# box.to_csv('Box_SCPAAC_Accuracies.csv')

# SVM Classifier

In [None]:
# for Optuna
from sklearn.svm import SVC


def objective(trial):
    """Optuna objective: mean CV accuracy of an SVM on the first-layer training set.

    Samples the regularisation strength `C` in [1, 100] and the kernel type,
    and uses the notebook-level `cv` splitter defined in the random-forest cell.
    """
    # C
    svc_c = trial.suggest_float('C', 1e0, 1e2)
    # kernel
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])
    # SVC
    clf = SVC(C=svc_c, kernel=kernel)
    score = cross_val_score(clf, X_train, y_train, cv=cv, scoring="accuracy")
    return score.mean()


# Execute optuna and set hyperparameters
svm_study = optuna.create_study(direction='maximize')
# The keyword is `n_trials`; the previous `n_trails` raised a TypeError.
svm_study.optimize(objective, n_trials=200)



In [None]:
# SVM with the best hyper-parameters found by the study; probability=True is
# required because the evaluation helpers call predict_proba.
optimized_svm = SVC(probability=True).set_params(**svm_study.best_params)

In [None]:
# Cross-validated report for the tuned SVM (first-layer data).
train_eval = evaluate_model_train(optimized_svm, X_train, y_train)

# (label, result key) pairs, printed in this order.
for label, key in (
    ("Confusion Matrix is: ", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Mean of Matthews Correlation Coefficient is: ", 'mcc'),
    ("The Precision value is: ", 'prec_train'),
    ("The Recall value is: ", 'recall_train'),
    ("The F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'auc'),
):
    print(label, train_eval[key])

# Per-fold series kept under model-specific names for later comparison.
Acc_svm, Sen_svm, Spec_svm, MCC_svm, AUC_svm = (
    train_eval[name] for name in ('score', 'Sen_list', 'Spec_list', 'mcc_list', 'auc_list')
)

In [None]:
# Hold-out report for the tuned SVM. The model must already be fitted; the
# preceding evaluate_model_train call fits it in place.
dtc_eval = evaluate_model_test(optimized_svm, X_test, y_test)

# (label, result key) pairs, printed in this order.
for label, key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(label, dtc_eval[key])

In [None]:
# import seaborn as sns


# box_ACC= pd.DataFrame({1:Acc_rf, 2:Acc_xgb, 3:Acc_etc, 4:Acc_lgbm, 5:Acc_svm})
# box_Sen= pd.DataFrame({1:Sen_rf, 2:Sen_xgb, 3:Sen_etc, 4:Sen_lgbm, 5:Sen_svm})
# box_Spec= pd.DataFrame({1:Spec_rf, 2:Spec_xgb, 3:Spec_etc, 4:Spec_lgbm, 5:Spec_svm})
# box_MCC= pd.DataFrame({1:MCC_rf, 2:MCC_xgb, 3:MCC_etc, 4:MCC_lgbm, 5:MCC_svm})
# box_AUC= pd.DataFrame({1:AUC_rf, 2:AUC_xgb, 3:AUC_etc, 4:AUC_lgbm, 5:AUC_svm})

# boxplot=sns.boxplot(data=box_ACC, width=0.5)
# boxplot.set_xlabel("PAAC", fontsize=14)
# boxplot.set_ylabel("Accuracy", fontsize=14)
# plt.show()

In [None]:
# #Saving the models
# import pickle
# pickle.dump(optimized_RF, open('Models/Optimized_RF_PAAC.pkl', 'wb'))
# pickle.dump(optimized_XGB, open('Models/Optimized_XGB_PAAC.pkl', 'wb'))
# pickle.dump(optimized_etc, open('Models/Optimized_etc_PAAC.pkl', 'wb'))
# pickle.dump(optimized_lgbm, open('Models/Optimized_lgbm_PAAC.pkl', 'wb'))
# pickle.dump(optimized_svm, open('Models/Optimized_SVM_PAAC.pkl', 'wb'))

# Import TFPM dataset for second-layer model 

In [None]:
# Second-layer training features: ProtBert embeddings of the TFPM and TFPNM
# sets. No header row; the first column is dropped and the rest are features.
X1 = pd.read_csv(
    'Prot-Bert/TFPM/TFPM_Training_Embedding_ProtBert.csv', header=None
).iloc[:, 1:].to_numpy()
X2 = pd.read_csv(
    'Prot-Bert/TFPM/TFPNM_Training_Embedding_ProtBert.csv', header=None
).iloc[:, 1:].to_numpy()
# TFPM rows first, TFPNM rows second -- the label vector must follow this order.
X_train2 = np.vstack([X1, X2])

In [None]:
# Labels for the second-layer training set: 146 positives (1.0) followed by
# 146 negatives (0.0), matching the order in which X_train2 was stacked.
pos_labels = np.full(146, 1.0)
neg_labels = np.full(146, 0.0)
y_train2 = np.hstack((pos_labels, neg_labels))

In [None]:
# Second-layer independent (hold-out) features, loaded the same way as the
# training set: no header row, first column dropped, TFPM rows before TFPNM.
X1 = pd.read_csv(
    'Prot-Bert/TFPM/TFPM_Ind_Embedding_ProtBert.csv', header=None
).iloc[:, 1:].to_numpy()
X2 = pd.read_csv(
    'Prot-Bert/TFPM/TFPNM_Ind_Embedding_ProtBert.csv', header=None
).iloc[:, 1:].to_numpy()
X_test2 = np.vstack([X1, X2])


In [None]:
# Labels for the second-layer independent set: 69 positives (1.0) followed by
# 37 negatives (0.0), matching the order in which X_test2 was stacked.
pos_labels = np.full(69, 1.0)
neg_labels = np.full(37, 0.0)
y_test2 = np.hstack((pos_labels, neg_labels))

In [None]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=67, stratify=y)

In [None]:
# The label vectors are built from hard-coded class counts, so fail loudly if
# they do not line up with the number of rows actually read from the CSVs.
assert X_train2.shape[0] == y_train2.shape[0], "X_train2 and y_train2 have different numbers of rows"
assert X_test2.shape[0] == y_test2.shape[0], "X_test2 and y_test2 have different numbers of rows"
X_train2.shape, X_test2.shape, y_train2.shape, y_test2.shape

In [None]:
# X1=pd.read_csv('Prot-Bert/TFPM/Features/Binding_Training_PAAC.csv', header=None).iloc[:,1:].values

In [None]:
# pos_labels = np.ones(146)
# neg_labels = np.zeros(146)
# y1 = np.concatenate((pos_labels,neg_labels),axis=0)

In [None]:
# X1.shape, y1.shape

# Random Forest

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier

# CV splitter shared by the second-layer hyper-parameter searches that follow.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=24)


def RF_objective(trial):
    """Optuna objective: mean CV accuracy of a random forest on the second-layer training set.

    Every sampled hyper-parameter is passed to the model. `max_leaf_nodes`
    used to be sampled but ignored, although it was later applied to the final
    model through `RF_study.best_params`, so the tuned and final models differed.
    """
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 60),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 1000),
        'min_samples_split': trial.suggest_int("min_samples_split", 2, 20),
    }

    ## Create Model
    model = RandomForestClassifier(**params, n_jobs=2)

    score = cross_val_score(model, X_train2, y_train2, cv=cv, scoring="accuracy")
    return score.mean()


# Execute optuna and set hyperparameters
RF_study = optuna.create_study(direction='maximize')
# The keyword is `n_trials`; the previous `n_trails` raised a TypeError.
RF_study.optimize(RF_objective, n_trials=200)

In [None]:
# Random forest configured with the best hyper-parameters found by the
# second-layer study (replaces the first-layer optimized_RF).
optimized_RF = RandomForestClassifier().set_params(**RF_study.best_params)

In [None]:
# Evaluate Model on Training data (5-fold cross-validation, second layer)
train_eval = evaluate_model_train(optimized_RF, X_train2, y_train2)
print("Confusion Matrix is: ", train_eval['cm'])
print ('Accuracy : ', train_eval['acc'])
print('Sensitivity : ', train_eval['sen'])
print('Specificity : ', train_eval['spec'])
print("Mean of Matthews Correlation Coefficient is: ", train_eval['mcc'])
print("The Acc value from CM is: ", train_eval['acc'])
# Precision was missing from this report although every other model prints it.
print("The Precision value is: ", train_eval['prec_train'])
print("The Recall value is: ", train_eval['recall_train'])
print("The F1 score is: ", train_eval['f1_train'])
print('The area under curve is:', train_eval['auc'])
print('5 accuracies: ', train_eval['score'])
# Per-fold series kept under model-specific names for later comparison.
Acc_rf=train_eval['score']
Sen_rf=train_eval['Sen_list']
Spec_rf=train_eval['Spec_list']
MCC_rf=train_eval['mcc_list']
AUC_rf=train_eval['auc_list']

In [None]:
# Second-layer hold-out report for the tuned random forest. The model must
# already be fitted; the preceding evaluate_model_train call fits it in place.
dtc_eval = evaluate_model_test(optimized_RF, X_test2, y_test2)

# (label, result key) pairs, printed in this order.
for label, key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(label, dtc_eval[key])

# Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import optuna


def objective(trial):
    """Optuna objective: mean CV accuracy of an Extra Trees model on the second-layer training set.

    Uses the notebook-level `cv` splitter defined in the random-forest cell.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'max_depth': trial.suggest_int('max_depth', 10, 90),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 15, 100),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
    }

    # Cross-validate the candidate configuration
    etc_model = ExtraTreesClassifier(**params)
    score = cross_val_score(etc_model, X_train2, y_train2, cv=cv, scoring="accuracy")
    return score.mean()


# Execute optuna and set hyperparameters
etc_study = optuna.create_study(direction='maximize')
# The keyword is `n_trials`; the previous `n_trails` raised a TypeError.
etc_study.optimize(objective, n_trials=200)

optimized_etc = ExtraTreesClassifier(**etc_study.best_params)

In [None]:
# Cross-validated report for the tuned Extra Trees model (second-layer data).
train_eval = evaluate_model_train(optimized_etc, X_train2, y_train2)

# (label, result key) pairs, printed in this order.
for label, key in (
    ("Confusion Matrix is:\n", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Matthews Correlation Coefficient is: ", 'mcc'),
    ("Precision value is: ", 'prec_train'),
    ("Recall value is: ", 'recall_train'),
    ('The area under curve is:', 'auc'),
    ("F1 score is: ", 'f1_train'),
):
    print(label, train_eval[key])

# Per-fold series kept under model-specific names for later comparison.
Acc_etc, Sen_etc, Spec_etc, MCC_etc, AUC_etc = (
    train_eval[name] for name in ('score', 'Sen_list', 'Spec_list', 'mcc_list', 'auc_list')
)

In [None]:
# Second-layer hold-out report for the tuned Extra Trees model. The model must
# already be fitted; the preceding evaluate_model_train call fits it in place.
dtc_eval = evaluate_model_test(optimized_etc, X_test2, y_test2)

# (label, result key) pairs, printed in this order.
for label, key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(label, dtc_eval[key])

# XGB

In [None]:
from xgboost import XGBClassifier
import optuna


def objective(trial):
    """Optuna objective: mean CV accuracy of an XGBoost model on the second-layer training set.

    Uses the notebook-level `cv` splitter defined in the random-forest cell.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 400),
        # NOTE(review): the upper bound of 10.0 is unusually large for a
        # boosting learning rate -- confirm the intended search range.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 10.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'gamma': trial.suggest_float('gamma', 1e-8, 10.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0),
    }

    # Binary task: 'logloss' is the matching metric ('mlogloss' is the
    # multiclass variant).
    xgb_model = XGBClassifier(**params, eval_metric='logloss')
    score = cross_val_score(xgb_model, X_train2, y_train2, cv=cv, scoring="accuracy")
    return score.mean()


# Execute optuna and set hyperparameters
XGB_study = optuna.create_study(direction='maximize')
# The keyword is `n_trials`; the previous `n_trails` raised a TypeError.
XGB_study.optimize(objective, n_trials=200)
optimized_XGB = XGBClassifier(**XGB_study.best_params)

In [None]:
# Cross-validated report for the tuned XGBoost model (second-layer data).
train_eval = evaluate_model_train(optimized_XGB, X_train2, y_train2)

# (label, result key) pairs, printed in this order.
for label, key in (
    ("Confusion Matrix is:\n", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Matthews Correlation Coefficient is: ", 'mcc'),
    ("Precision value is: ", 'prec_train'),
    ("Recall value is: ", 'recall_train'),
    ("F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'auc'),
):
    print(label, train_eval[key])

# Per-fold series kept under model-specific names for later comparison.
Acc_xgb, Sen_xgb, Spec_xgb, MCC_xgb, AUC_xgb = (
    train_eval[name] for name in ('score', 'Sen_list', 'Spec_list', 'mcc_list', 'auc_list')
)

In [None]:
# Second-layer hold-out report for the tuned XGBoost model. The model must
# already be fitted; the preceding evaluate_model_train call fits it in place.
dtc_eval = evaluate_model_test(optimized_XGB, X_test2, y_test2)

# (label, result key) pairs, printed in this order.
for label, key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(label, dtc_eval[key])

# LGBM

In [None]:
import lightgbm as lgbm
import optuna
def objective(trial):
    """Define the objective function"""
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 100), 
        'max_depth': trial.suggest_int('max_depth', 1, 100), 
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 10), 
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000), 
        #'objective': 'multiclass', 
       # 'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100), 
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0), 
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
        'random_state': 0
    }


    # Fit the model
    lgbm_model = lgbm.LGBMClassifier(**params)
    score = cross_val_score(lgbm_model, X_train2, y_train2, cv=cv, scoring="accuracy")
    accuracy_mean = score.mean()

    return accuracy_mean


# Run the Optuna search, maximising the mean cross-validated accuracy.
lgbm_study = optuna.create_study(direction='maximize')
# Study.optimize takes `n_trials`; the misspelt `n_trails` keyword raised a TypeError.
lgbm_study.optimize(objective, n_trials=200)

# best_params contains only the sampled values, so the fixed seed used inside
# the objective is passed again to keep the final model consistent with the search.
optimized_lgbm = lgbm.LGBMClassifier(**lgbm_study.best_params, random_state=0)

In [None]:
# Evaluate the tuned LightGBM model on the training data (X_train2 / y_train2).
train_eval = evaluate_model_train(optimized_lgbm, X_train2, y_train2)

# Report each summary metric as one "label value" line.
for metric_label, metric_key in (
    ("Confusion Matrix is: ", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Mean of Matthews Correlation Coefficient is: ", 'mcc'),
    ("The Precision value is: ", 'prec_train'),
    ("The Recall value is: ", 'recall_train'),
    ("The F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'auc'),
):
    print(metric_label, train_eval[metric_key])

# Keep the 'score' and '*_list' entries under LightGBM-specific names.
Acc_lgbm = train_eval['score']
Sen_lgbm = train_eval['Sen_list']
Spec_lgbm = train_eval['Spec_list']
MCC_lgbm = train_eval['mcc_list']
AUC_lgbm = train_eval['auc_list']

In [None]:
# Score the tuned LightGBM model on the testing data (X_test2 / y_test2).
dtc_eval = evaluate_model_test(optimized_lgbm, X_test2, y_test2)

# Report each metric from the returned dict as one "label value" line.
for metric_label, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(metric_label, dtc_eval[metric_key])

# AdaBoost

In [None]:
# from sklearn.ensemble import AdaBoostClassifier
# def objective(trial):
    
#     params = {
#                 'n_estimators':trial.suggest_int('n_estimators',50,500),
#                 'learning_rate': trial.suggest_float('learning_rate', 0.1,2.5,step = 0.0000005),
#                 'algorithm':'SAMME.R', 
#                 'random_state':47
#             }
    
    
#     # Fit the model
#     abc_model = AdaBoostClassifier(**params)
    
#     score = cross_val_score(abc_model, X_train, y_train, cv=cv, scoring="accuracy")
#     accuracy_mean = score.mean()

#     return accuracy_mean


# #Execute optuna and set hyperparameters
# abc_study = optuna.create_study(direction='maximize')
# abc_study.optimize(objective, n_trails=200)

# optimized_abc =AdaBoostClassifier(**abc_study.best_params)

In [None]:
# # Evaluate Model on Training data
# train_eval = evaluate_model_train(optimized_abc, X_train, y_train)
# print("Confusion Matrix is: ", train_eval['cm'])
# print ('Accuracy : ', train_eval['acc'])
# print('Sensitivity : ', train_eval['sen'])
# print('Specificity : ', train_eval['spec'])
# print("Mean of Matthews Correlation Coefficient is: ", train_eval['mcc'])
# print("The Precision value is: ", train_eval['prec_train'])
# print("The Recall value is: ", train_eval['recall_train'])
# print("The F1 score is: ", train_eval['f1_train'])
# print('The area under curve is:', train_eval['auc'])
# AAindex_abc=train_eval['score']

In [None]:
# # Evaluate Model on Testing data
# dtc_eval = evaluate_model_test(optimized_abc, X_test, y_test)
# # Print result
# print('Accuracy:', dtc_eval['acc'])
# print('Precision:', dtc_eval['prec'])
# print('Recall:', dtc_eval['rec'])
# print('F1 Score:', dtc_eval['f1'])
# print('Area Under Curve:', dtc_eval['auc'])
# print('Sensitivity : ', dtc_eval['sen'])
# print('Specificity : ', dtc_eval['spec'])
# print('MCC Score : ', dtc_eval['mcc'])
# print('Confusion Matrix:\n', dtc_eval['cm'])

# CatBoost

In [None]:
# from catboost import CatBoostClassifier
# def objective(trial):
#     params = {
#             "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5),
#             "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
#             "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
#             "depth": trial.suggest_int("depth", 1, 12),
#             "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#             "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
#             ),
#             "used_ram_limit": "3gb",
#         }

# #     if param["bootstrap_type"] == "Bayesian":
# #         param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
# #     elif param["bootstrap_type"] == "Bernoulli":
# #         param["subsample"] = trial.suggest_float("subsample", 0.1, 1)


#     # Fit the model
#     cat_model = CatBoostClassifier(**params)

#     score = cross_val_score(cat_model, X_train, y_train, cv=cv, scoring="accuracy")
#     accuracy_mean = score.mean()

#     return accuracy_mean


# #Execute optuna and set hyperparameters
# cat_study = optuna.create_study(direction='maximize')
# cat_study.optimize(objective, n_trials=5)

# optimized_cat =CatBoostClassifier(**cat_study.best_params)


In [None]:
# # Evaluate Model on Training data
# train_eval = evaluate_model_train(optimized_cat, X_train, y_train)
# print("Confusion Matrix is: ", train_eval['cm'])
# print ('Accuracy : ', train_eval['acc'])
# print('Sensitivity : ', train_eval['sen'])
# print('Specificity : ', train_eval['spec'])
# print("Mean of Matthews Correlation Coefficient is: ", train_eval['mcc'])
# print("The Precision value is: ", train_eval['prec_train'])
# print("The Recall value is: ", train_eval['recall_train'])
# print("The F1 score is: ", train_eval['f1_train'])
# print('The area under curve is:', train_eval['auc'])
# cbc=train_eval['score']
# Sen_cbc=train_eval['Sen_list']
# Spec_cbc=train_eval['Spec_list']
# MCC_cbc=train_eval['mcc_list']
# AUC_cbc=train_eval['auc_list']

In [None]:
# # Evaluate Model on Testing data
# dtc_eval = evaluate_model_test(optimized_cat, X_test, y_test)
# # Print result
# print('Accuracy:', dtc_eval['acc'])
# print('Precision:', dtc_eval['prec'])
# print('Recall:', dtc_eval['rec'])
# print('F1 Score:', dtc_eval['f1'])
# print('Area Under Curve:', dtc_eval['auc'])
# print('Sensitivity : ', dtc_eval['sen'])
# print('Specificity : ', dtc_eval['spec'])
# print('MCC Score : ', dtc_eval['mcc'])
# print('Confusion Matrix:\n', dtc_eval['cm'])

In [None]:
# import seaborn as sns
# box= pd.DataFrame({1:rf, 2:xgb, 3:etc, 4:lgbm, 5:cbc})
# # boxplot=sns.boxplot(data=box_AAindex, width=0.5)
# # boxplot.set_xlabel("AAindex", fontsize=14)
# # boxplot.set_ylabel("Accuracy", fontsize=14)
# # plt.show()
# box=pd.DataFrame(box)
# box.to_csv('Box_SCPAAC_Accuracies.csv')

# SVM Classifier

In [None]:
# for Optuna
from sklearn.svm import SVC
def objective(trial):
    """Optuna objective for the SVM: mean cross-validated accuracy of one trial.

    Samples the regularisation strength ``C`` and the kernel from ``trial``,
    then scores an ``SVC`` built from them on ``X_train2`` / ``y_train2`` with
    the notebook-level ``cv`` splitter.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy over the cross-validation folds.
    """
    # Sampled in the same order as before: C first, then the kernel.
    search_space = {
        'C': trial.suggest_float('C', 1e0, 1e2),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf']),
    }
    fold_accuracies = cross_val_score(
        SVC(**search_space), X_train2, y_train2, cv=cv, scoring="accuracy"
    )
    return fold_accuracies.mean()


# Run the Optuna search for the SVM, maximising the mean cross-validated accuracy.
svm_study = optuna.create_study(direction='maximize')
# Study.optimize takes `n_trials`; the misspelt `n_trails` keyword raised a TypeError.
svm_study.optimize(objective, n_trials=200)



In [None]:
# Build the final SVM from the best trial. probability=True enables
# predict_proba, which evaluate_model_test calls on the model.
optimized_svm = SVC(probability=True, **svm_study.best_params)

In [None]:
# Evaluate the tuned SVM on the training data (X_train2 / y_train2).
train_eval = evaluate_model_train(optimized_svm, X_train2, y_train2)

# Report each summary metric as one "label value" line.
for metric_label, metric_key in (
    ("Confusion Matrix is: ", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Mean of Matthews Correlation Coefficient is: ", 'mcc'),
    ("The Precision value is: ", 'prec_train'),
    ("The Recall value is: ", 'recall_train'),
    ("The F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'auc'),
):
    print(metric_label, train_eval[metric_key])

# Keep the 'score' and '*_list' entries under SVM-specific names.
Acc_svm = train_eval['score']
Sen_svm = train_eval['Sen_list']
Spec_svm = train_eval['Spec_list']
MCC_svm = train_eval['mcc_list']
AUC_svm = train_eval['auc_list']

In [None]:
# Score the tuned SVM on the testing data (X_test2 / y_test2).
dtc_eval = evaluate_model_test(optimized_svm, X_test2, y_test2)

# Report each metric from the returned dict as one "label value" line.
for metric_label, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(metric_label, dtc_eval[metric_key])

# Import Unbalanced dataset to check the generalizability of the TFProtBert method

In [None]:
# Load the full set of NTF (negative) embeddings; the train/test split happens in the next cell.
# NOTE(review): .iloc[:,1:] drops the first CSV column — presumably an identifier; confirm.
X=pd.read_csv('Prot-Bert/NTF_6444_Embedding_ProtBert.csv', header=None).iloc[:,1:].values
# One zero label per loaded row, instead of a hard-coded 6444, so the labels
# always match the number of samples actually read from the file.
y = np.zeros(X.shape[0])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the NTF (negative) samples for testing, with a fixed seed.
# NOTE(review): y is all zeros here, so stratify=y has a single class to stratify on.
X_NTF_train, X_NTF_test, y_NTF_train, y_NTF_test= train_test_split(X, y, test_size=0.2, random_state=67, stratify=y)

In [None]:
# Sanity check: display the shapes of the NTF train and test partitions.
X_NTF_train.shape, X_NTF_test.shape

In [None]:
# Reload the TF (positive) embeddings for the training and independent sets
# (the same CSV files read at the top of the notebook).
# NOTE(review): .iloc[:,1:] drops the first CSV column — presumably an identifier; confirm.
train_pos=pd.read_csv('Prot-Bert/TF_Training_Embedding_ProtBert.csv', header=None).iloc[:,1:].values
test_pos=pd.read_csv('Prot-Bert/TF_Ind_Embedding_ProtBert.csv', header=None).iloc[:,1:].values

In [None]:
# Stack the TF (positive) rows on top of the NTF (negative) rows for each split:
# TF training + NTF train partition, and TF independent + NTF test partition.
# This rebinds X_train / X_test, which earlier in the notebook held the balanced dataset.
X_train=np.concatenate((train_pos, X_NTF_train), axis=0) 
X_test=np.concatenate((test_pos, X_NTF_test), axis=0)

In [None]:
# One positive label per TF training row, instead of a hard-coded 413, so the
# labels always line up with the rows of train_pos stacked into X_train.
y_pos_train=np.ones(train_pos.shape[0])
# Positives first, then the NTF (zero) labels — same order as the rows of X_train.
y_train=np.concatenate((y_pos_train, y_NTF_train), axis=0)

In [None]:
# One positive label per TF independent-set row, instead of a hard-coded 106, so
# the labels always line up with the rows of test_pos stacked into X_test.
y_pos_test=np.ones(test_pos.shape[0])
# Positives first, then the NTF (zero) labels — same order as the rows of X_test.
y_test=np.concatenate((y_pos_test, y_NTF_test), axis=0)

In [None]:
# Sanity check: display the shapes of the label vectors and feature matrices
# of the unbalanced train/test sets.

y_train.shape, y_test.shape, X_train.shape, X_test.shape

In [None]:
from collections import Counter
# Display the per-class sample counts of the train and test labels.
Counter(y_train), Counter(y_test)

# Random Forest

In [None]:
import optuna
# Stratified 5-fold splitter (a single repeat, fixed seed), used as `cv` by the
# cross_val_score calls in the objectives below.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=24) 
from sklearn.ensemble import RandomForestClassifier
def RF_objective(trial):
    """Optuna objective for Random Forest: mean cross-validated accuracy of one trial.

    Samples a hyperparameter set from ``trial``, builds a
    ``RandomForestClassifier`` with it and scores it with ``cross_val_score``
    on ``X_train`` / ``y_train`` using the notebook-level ``cv`` splitter.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy over the cross-validation folds (the study maximises it).
    """
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 60)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 1000)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)

    # max_leaf_nodes used to be sampled but never handed to the model, so the
    # value recorded for each trial had no effect on its score; pass it through.
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_leaf_nodes=max_leaf_nodes,
        min_samples_split=min_samples_split,
        n_jobs=2,
    )

    score = cross_val_score(model, X_train, y_train, cv=cv, scoring="accuracy")
    accuracy_mean = score.mean()
    return accuracy_mean

# Run the Optuna search for Random Forest, maximising the mean cross-validated accuracy.
RF_study = optuna.create_study(direction='maximize')
# Study.optimize takes `n_trials`; the misspelt `n_trails` keyword raised a TypeError.
RF_study.optimize(RF_objective, n_trials=200)

In [None]:
# Hyperparameters are written out by hand here rather than read from
# RF_study.best_params.
params = dict(
    n_estimators=209,
    max_depth=10,
    max_leaf_nodes=60,
    min_samples_split=2,
)
optimized_RF = RandomForestClassifier(**params)

In [None]:
# Evaluate the Random Forest model on the training data (X_train / y_train).
train_eval = evaluate_model_train(optimized_RF, X_train, y_train)

# Report each summary metric as one "label value" line
# (accuracy is intentionally listed twice, as in the original report).
for metric_label, metric_key in (
    ("Confusion Matrix is: ", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Mean of Matthews Correlation Coefficient is: ", 'mcc'),
    ("The Acc value from CM is: ", 'acc'),
    ("The Recall value is: ", 'recall_train'),
    ("Precision value is: ", 'prec_train'),
    ("The F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'auc'),
    ('5 accuracies: ', 'score'),
):
    print(metric_label, train_eval[metric_key])

# Keep the 'score' and '*_list' entries under Random-Forest-specific names.
Acc_rf = train_eval['score']
Sen_rf = train_eval['Sen_list']
Spec_rf = train_eval['Spec_list']
MCC_rf = train_eval['mcc_list']
AUC_rf = train_eval['auc_list']

In [None]:
# Score the Random Forest model on the testing data (X_test / y_test).
dtc_eval = evaluate_model_test(optimized_RF, X_test, y_test)

# Report each metric from the returned dict as one "label value" line.
for metric_label, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(metric_label, dtc_eval[metric_key])

# Extra Tree Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import optuna
def objective(trial):
    """Optuna objective for Extra Trees: mean cross-validated accuracy of one trial.

    Samples the forest size, depth, leaf limit and split criterion from
    ``trial``, then scores an ``ExtraTreesClassifier`` built from them on
    ``X_train`` / ``y_train`` with the notebook-level ``cv`` splitter.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy over the cross-validation folds.
    """
    # Sampled in the same order as before.
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    max_depth = trial.suggest_int('max_depth', 10, 90)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 15, 100)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])

    candidate = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_leaf_nodes=max_leaf_nodes,
        criterion=criterion,
    )
    fold_accuracies = cross_val_score(
        candidate, X_train, y_train, cv=cv, scoring="accuracy"
    )
    return fold_accuracies.mean()


# Run the Optuna search for Extra Trees, maximising the mean cross-validated accuracy.
etc_study = optuna.create_study(direction='maximize')
# Study.optimize takes `n_trials`; the misspelt `n_trails` keyword raised a TypeError.
etc_study.optimize(objective, n_trials=200)

# Build the final model from the best trial's sampled hyperparameters.
optimized_etc =ExtraTreesClassifier(**etc_study.best_params)

In [None]:
# Evaluate the tuned Extra Trees model on the training data (X_train / y_train).
train_eval = evaluate_model_train(optimized_etc, X_train, y_train)

# Report each summary metric as one "label value" line.
for metric_label, metric_key in (
    ("Confusion Matrix is:\n", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Matthews Correlation Coefficient is: ", 'mcc'),
    ("Precision value is: ", 'prec_train'),
    ("Recall value is: ", 'recall_train'),
    ('The area under curve is:', 'auc'),
    ("F1 score is: ", 'f1_train'),
):
    print(metric_label, train_eval[metric_key])

# Keep the 'score' and '*_list' entries under Extra-Trees-specific names.
Acc_etc = train_eval['score']
Sen_etc = train_eval['Sen_list']
Spec_etc = train_eval['Spec_list']
MCC_etc = train_eval['mcc_list']
AUC_etc = train_eval['auc_list']

In [None]:
# Score the tuned Extra Trees model on the testing data (X_test / y_test).
dtc_eval = evaluate_model_test(optimized_etc, X_test, y_test)

# Report each metric from the returned dict as one "label value" line.
for metric_label, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(metric_label, dtc_eval[metric_key])

# XGB

In [None]:
from xgboost import XGBClassifier
#cv = RepeatedStratifiedKFold(n_splits=5)
import optuna
def objective(trial):
    """Optuna objective for XGBoost: mean cross-validated accuracy of one trial.

    Samples one hyperparameter set from ``trial``, builds an ``XGBClassifier``
    with it and scores it with ``cross_val_score`` on ``X_train`` / ``y_train``
    using the notebook-level ``cv`` splitter.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy over the cross-validation folds (the study maximises it).
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 400),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 10.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'gamma': trial.suggest_float('gamma', 1e-8, 10.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0),
    }

    # The labels are binary (ones and zeros), so use the binary 'logloss'
    # metric rather than its multiclass variant 'mlogloss'.
    xgb_model = XGBClassifier(**params, eval_metric='logloss')
    # Use the shared `cv` splitter, like the other objectives in this notebook,
    # instead of a separate integer fold count.
    score = cross_val_score(xgb_model, X_train, y_train, cv=cv, scoring="accuracy")
    accuracy_mean = score.mean()
    return accuracy_mean
# Run the Optuna search for XGBoost, maximising the mean cross-validated accuracy.
XGB_study = optuna.create_study(direction='maximize')
# Study.optimize takes `n_trials`; the misspelt `n_trails` keyword raised a TypeError.
XGB_study.optimize(objective, n_trials=200)
# Build the final model from the best trial's sampled hyperparameters.
optimized_XGB =XGBClassifier(**XGB_study.best_params)

In [None]:
# Evaluate the tuned XGBoost model on the training data (X_train / y_train).
train_eval = evaluate_model_train(optimized_XGB, X_train, y_train)

# Report each summary metric as one "label value" line.
for metric_label, metric_key in (
    ("Confusion Matrix is:\n", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Matthews Correlation Coefficient is: ", 'mcc'),
    ("Precision value is: ", 'prec_train'),
    ("Recall value is: ", 'recall_train'),
    ("F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'auc'),
):
    print(metric_label, train_eval[metric_key])

# Keep the 'score' and '*_list' entries under XGBoost-specific names.
Acc_xgb = train_eval['score']
Sen_xgb = train_eval['Sen_list']
Spec_xgb = train_eval['Spec_list']
MCC_xgb = train_eval['mcc_list']
AUC_xgb = train_eval['auc_list']

In [None]:
# Score the tuned XGBoost model on the testing data (X_test / y_test).
dtc_eval = evaluate_model_test(optimized_XGB, X_test, y_test)

# Report each metric from the returned dict as one "label value" line.
for metric_label, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(metric_label, dtc_eval[metric_key])

# LGBM

In [None]:
import lightgbm as lgbm
import optuna
def objective(trial):
    """Optuna objective for LightGBM: mean cross-validated accuracy of one trial.

    Samples one hyperparameter set from ``trial``, builds an ``LGBMClassifier``
    with it and scores it with ``cross_val_score`` on ``X_train`` / ``y_train``
    using the notebook-level ``cv`` splitter.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy over the cross-validation folds (the study maximises it).
    """
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        #'objective': 'multiclass',
        # 'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        # suggest_float replaces the deprecated suggest_uniform (same bounds,
        # same uniform sampling) and matches the calls used elsewhere in this file.
        # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0
        # (its default is 0) — confirm whether row bagging was intended here.
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        # Fixed seed: not sampled, so it does not appear in study.best_params.
        'random_state': 0
    }

    # Score the candidate model with cross-validation.
    lgbm_model = lgbm.LGBMClassifier(**params)
    score = cross_val_score(lgbm_model, X_train, y_train, cv=cv, scoring="accuracy")
    accuracy_mean = score.mean()

    return accuracy_mean


# Run the Optuna search, maximising the mean cross-validated accuracy.
lgbm_study = optuna.create_study(direction='maximize')
# Study.optimize takes `n_trials`; the misspelt `n_trails` keyword raised a TypeError.
lgbm_study.optimize(objective, n_trials=200)

# best_params contains only the sampled values, so the fixed seed used inside
# the objective is passed again to keep the final model consistent with the search.
optimized_lgbm = lgbm.LGBMClassifier(**lgbm_study.best_params, random_state=0)

In [None]:
# Evaluate the tuned LightGBM model on the training data (X_train / y_train).
train_eval = evaluate_model_train(optimized_lgbm, X_train, y_train)

# Report each summary metric as one "label value" line.
for metric_label, metric_key in (
    ("Confusion Matrix is: ", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Mean of Matthews Correlation Coefficient is: ", 'mcc'),
    ("The Precision value is: ", 'prec_train'),
    ("The Recall value is: ", 'recall_train'),
    ("The F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'auc'),
):
    print(metric_label, train_eval[metric_key])

# Keep the 'score' and '*_list' entries under LightGBM-specific names.
Acc_lgbm = train_eval['score']
Sen_lgbm = train_eval['Sen_list']
Spec_lgbm = train_eval['Spec_list']
MCC_lgbm = train_eval['mcc_list']
AUC_lgbm = train_eval['auc_list']

In [None]:
# Score the tuned LightGBM model on the testing data (X_test / y_test).
dtc_eval = evaluate_model_test(optimized_lgbm, X_test, y_test)

# Report each metric from the returned dict as one "label value" line.
for metric_label, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(metric_label, dtc_eval[metric_key])

# AdaBoost

In [None]:
# from sklearn.ensemble import AdaBoostClassifier
# def objective(trial):
    
#     params = {
#                 'n_estimators':trial.suggest_int('n_estimators',50,500),
#                 'learning_rate': trial.suggest_float('learning_rate', 0.1,2.5,step = 0.0000005),
#                 'algorithm':'SAMME.R', 
#                 'random_state':47
#             }
    
    
#     # Fit the model
#     abc_model = AdaBoostClassifier(**params)
    
#     score = cross_val_score(abc_model, X_train, y_train, cv=cv, scoring="accuracy")
#     accuracy_mean = score.mean()

#     return accuracy_mean


# #Execute optuna and set hyperparameters
# abc_study = optuna.create_study(direction='maximize')
# abc_study.optimize(objective, n_trails=200)

# optimized_abc =AdaBoostClassifier(**abc_study.best_params)

In [None]:
# # Evaluate Model on Training data
# train_eval = evaluate_model_train(optimized_abc, X_train, y_train)
# print("Confusion Matrix is: ", train_eval['cm'])
# print ('Accuracy : ', train_eval['acc'])
# print('Sensitivity : ', train_eval['sen'])
# print('Specificity : ', train_eval['spec'])
# print("Mean of Matthews Correlation Coefficient is: ", train_eval['mcc'])
# print("The Precision value is: ", train_eval['prec_train'])
# print("The Recall value is: ", train_eval['recall_train'])
# print("The F1 score is: ", train_eval['f1_train'])
# print('The area under curve is:', train_eval['auc'])
# AAindex_abc=train_eval['score']

In [None]:
# # Evaluate Model on Testing data
# dtc_eval = evaluate_model_test(optimized_abc, X_test, y_test)
# # Print result
# print('Accuracy:', dtc_eval['acc'])
# print('Precision:', dtc_eval['prec'])
# print('Recall:', dtc_eval['rec'])
# print('F1 Score:', dtc_eval['f1'])
# print('Area Under Curve:', dtc_eval['auc'])
# print('Sensitivity : ', dtc_eval['sen'])
# print('Specificity : ', dtc_eval['spec'])
# print('MCC Score : ', dtc_eval['mcc'])
# print('Confusion Matrix:\n', dtc_eval['cm'])

# CatBoost

In [None]:
# from catboost import CatBoostClassifier
# def objective(trial):
#     params = {
#             "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5),
#             "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
#             "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
#             "depth": trial.suggest_int("depth", 1, 12),
#             "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#             "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
#             ),
#             "used_ram_limit": "3gb",
#         }

# #     if param["bootstrap_type"] == "Bayesian":
# #         param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
# #     elif param["bootstrap_type"] == "Bernoulli":
# #         param["subsample"] = trial.suggest_float("subsample", 0.1, 1)


#     # Fit the model
#     cat_model = CatBoostClassifier(**params)

#     score = cross_val_score(cat_model, X_train, y_train, cv=cv, scoring="accuracy")
#     accuracy_mean = score.mean()

#     return accuracy_mean


# #Execute optuna and set hyperparameters
# cat_study = optuna.create_study(direction='maximize')
# cat_study.optimize(objective, n_trials=5)

# optimized_cat =CatBoostClassifier(**cat_study.best_params)


In [None]:
# # Evaluate Model on Training data
# train_eval = evaluate_model_train(optimized_cat, X_train, y_train)
# print("Confusion Matrix is: ", train_eval['cm'])
# print ('Accuracy : ', train_eval['acc'])
# print('Sensitivity : ', train_eval['sen'])
# print('Specificity : ', train_eval['spec'])
# print("Mean of Matthews Correlation Coefficient is: ", train_eval['mcc'])
# print("The Precision value is: ", train_eval['prec_train'])
# print("The Recall value is: ", train_eval['recall_train'])
# print("The F1 score is: ", train_eval['f1_train'])
# print('The area under curve is:', train_eval['auc'])
# cbc=train_eval['score']
# Sen_cbc=train_eval['Sen_list']
# Spec_cbc=train_eval['Spec_list']
# MCC_cbc=train_eval['mcc_list']
# AUC_cbc=train_eval['auc_list']

In [None]:
# # Evaluate Model on Testing data
# dtc_eval = evaluate_model_test(optimized_cat, X_test, y_test)
# # Print result
# print('Accuracy:', dtc_eval['acc'])
# print('Precision:', dtc_eval['prec'])
# print('Recall:', dtc_eval['rec'])
# print('F1 Score:', dtc_eval['f1'])
# print('Area Under Curve:', dtc_eval['auc'])
# print('Sensitivity : ', dtc_eval['sen'])
# print('Specificity : ', dtc_eval['spec'])
# print('MCC Score : ', dtc_eval['mcc'])
# print('Confusion Matrix:\n', dtc_eval['cm'])

In [None]:
# import seaborn as sns
# box= pd.DataFrame({1:rf, 2:xgb, 3:etc, 4:lgbm, 5:cbc})
# # boxplot=sns.boxplot(data=box_AAindex, width=0.5)
# # boxplot.set_xlabel("AAindex", fontsize=14)
# # boxplot.set_ylabel("Accuracy", fontsize=14)
# # plt.show()
# box=pd.DataFrame(box)
# box.to_csv('Box_SCPAAC_Accuracies.csv')

# SVM Classifier

In [None]:
# for Optuna
from sklearn.svm import SVC
def objective(trial):
    """Optuna objective for the SVM: mean cross-validated accuracy of one trial.

    Samples the regularisation strength ``C`` and the kernel from ``trial``,
    then scores an ``SVC`` built from them on ``X_train`` / ``y_train`` with
    the notebook-level ``cv`` splitter.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean accuracy over the cross-validation folds.
    """
    # Sampled in the same order as before: C first, then the kernel.
    search_space = {
        'C': trial.suggest_float('C', 1e0, 1e2),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf']),
    }
    fold_accuracies = cross_val_score(
        SVC(**search_space), X_train, y_train, cv=cv, scoring="accuracy"
    )
    return fold_accuracies.mean()


# Run the Optuna search for the SVM, maximising the mean cross-validated accuracy.
svm_study = optuna.create_study(direction='maximize')
# Study.optimize takes `n_trials`; the misspelt `n_trails` keyword raised a TypeError.
svm_study.optimize(objective, n_trials=200)



In [None]:
# Build the final SVM from the best trial. probability=True enables
# predict_proba, which evaluate_model_test calls on the model.
optimized_svm = SVC(probability=True, **svm_study.best_params)

In [None]:
# Evaluate the tuned SVM on the training data (X_train / y_train).
train_eval = evaluate_model_train(optimized_svm, X_train, y_train)

# Report each summary metric as one "label value" line.
for metric_label, metric_key in (
    ("Confusion Matrix is: ", 'cm'),
    ('Accuracy : ', 'acc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ("Mean of Matthews Correlation Coefficient is: ", 'mcc'),
    ("The Precision value is: ", 'prec_train'),
    ("The Recall value is: ", 'recall_train'),
    ("The F1 score is: ", 'f1_train'),
    ('The area under curve is:', 'auc'),
):
    print(metric_label, train_eval[metric_key])

# Keep the 'score' and '*_list' entries under SVM-specific names.
Acc_svm = train_eval['score']
Sen_svm = train_eval['Sen_list']
Spec_svm = train_eval['Spec_list']
MCC_svm = train_eval['mcc_list']
AUC_svm = train_eval['auc_list']

In [None]:
# Score the tuned SVM on the testing data (X_test / y_test).
dtc_eval = evaluate_model_test(optimized_svm, X_test, y_test)

# Report each metric from the returned dict as one "label value" line.
for metric_label, metric_key in (
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Area Under Curve:', 'auc'),
    ('Sensitivity : ', 'sen'),
    ('Specificity : ', 'spec'),
    ('MCC Score : ', 'mcc'),
    ('Confusion Matrix:\n', 'cm'),
):
    print(metric_label, dtc_eval[metric_key])

In [None]:
# import seaborn as sns


# box_ACC= pd.DataFrame({1:Acc_rf, 2:Acc_xgb, 3:Acc_etc, 4:Acc_lgbm, 5:Acc_svm})
# box_Sen= pd.DataFrame({1:Sen_rf, 2:Sen_xgb, 3:Sen_etc, 4:Sen_lgbm, 5:Sen_svm})
# box_Spec= pd.DataFrame({1:Spec_rf, 2:Spec_xgb, 3:Spec_etc, 4:Spec_lgbm, 5:Spec_svm})
# box_MCC= pd.DataFrame({1:MCC_rf, 2:MCC_xgb, 3:MCC_etc, 4:MCC_lgbm, 5:MCC_svm})
# box_AUC= pd.DataFrame({1:AUC_rf, 2:AUC_xgb, 3:AUC_etc, 4:AUC_lgbm, 5:AUC_svm})

# boxplot=sns.boxplot(data=box_ACC, width=0.5)
# boxplot.set_xlabel("PAAC", fontsize=14)
# boxplot.set_ylabel("Accuracy", fontsize=14)
# plt.show()

In [None]:
# #Saving the models
# import pickle
# pickle.dump(optimized_RF, open('Models/Optimized_RF_PAAC.pkl', 'wb'))
# pickle.dump(optimized_XGB, open('Models/Optimized_XGB_PAAC.pkl', 'wb'))
# pickle.dump(optimized_etc, open('Models/Optimized_etc_PAAC.pkl', 'wb'))
# pickle.dump(optimized_lgbm, open('Models/Optimized_lgbm_PAAC.pkl', 'wb'))
# pickle.dump(optimized_svm, open('Models/Optimized_SVM_PAAC.pkl', 'wb'))