In [70]:
#Bibliotecas
import pandas as pd
from pandas import DataFrame
import numpy as np
from numpy import linalg #SVD
import screed 
import itertools
import matplotlib.pyplot as plt
import random
import statistics
from collections import Counter
import pickle

random_seed = 600
np.random.seed(random_seed)
random.seed(random_seed)

import os
os.environ['PYTHONHASHSEED'] = '0'

import tensorflow as tf
tf.random.set_seed(random_seed)

import keras
from keras.layers import Input, Dense
from keras.models import Model
from keras.datasets import mnist

from keras.models import Sequential
from keras.layers import Conv1D
from keras.layers import Dropout
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import LSTM
from keras.utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam
from keras.optimizers import Nadam

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

%matplotlib inline
#---------------

In [81]:
def print_metrics(metrics, db_choosen, model_name):
    print('-'*50)
    db = "No Feature" if db_choosen == 0 else "Full"
    print("Database: ", db," ", model_name," \n")
    print('-'*50)
    if (db_choosen == 0):
        print('Model Selection: Onehot | k-mers Sparse Matrix | Autoencoder')
    elif (db_choosen == 1):
        print('Model Selection: Onehot | k-mers Sparse Matrix | Autoencoder | (AAC - DPC - PCP)')
    print('-'*50)
    print('Accuracy Training: ', statistics.mean(metrics["accuracy_train"])*100)
    print('Accuracy Testing: ', statistics.mean(metrics["accuracy_test"])*100)
    print('Precision: ', statistics.mean(metrics["precision"])*100)
    print('Sensitivity: ', statistics.mean(metrics["sensitivity"])*100) 
    print('Specificity: ', statistics.mean(metrics["specificity"])*100)
    print('f_1 Score: ', statistics.mean(metrics["f1"])*100)
    print('MCC: ', statistics.mean(metrics["mcc"])*100) 
    print('AUC Score: ', statistics.mean(metrics["auc"])*100) 
    print('MSE: ', statistics.mean(metrics["mse"]))
    print('Mis-Classification: ', statistics.mean(metrics["misc"])) 

    #Mostrar más decimales en DF
    pd.set_option("display.precision", 15)

    #-------------------------------------------
    metrics_model = [metrics["accuracy_train"], metrics["accuracy_test"], metrics["precision"], metrics["sensitivity"], metrics["specificity"], metrics["mcc"], metrics["auc"], metrics["f1"], metrics["misc"]]
    metrics_m = pd.DataFrame(metrics_model, columns = ['1', '2','3', '4', '5'],index = ['Accuracy Training','Accuracy Test', 'Precision', 'Sensitivity', 'Specificity', 'MCC', 'AUC Score', 'f_1 Score','Mis-Classification'])
    print(metrics_m)

def confusion_metrics(conf_matrix):
    # Guardar la matriz de confusión y dividirla en 4 piezas
    TP = conf_matrix[1][1]
    TN = conf_matrix[0][0]
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]

    # Calcular Precisión
    conf_accuracy = (float (TP+TN) / float(TP + TN + FP + FN))

    # Calcular mis-classification
    conf_misclassification = 1- conf_accuracy

    # Calcular Sensitivity
    conf_sensitivity = (TP / float(TP + FN))
    # Calcular Specificity
    conf_specificity = (TN / float(TN + FP))

    # Calcular la Precisión
    conf_precision = (TP / float(TP + FP))
    # Calcular f_1 score
    conf_f1 = 2 * ((conf_precision * conf_sensitivity) / (conf_precision + conf_sensitivity))

    precision = conf_precision
    sensitivity = conf_sensitivity
    specificity = conf_specificity

    return precision, sensitivity, specificity, conf_f1, conf_misclassification

def create_model_lstm(input_shape, n_outputs):
    #opt = tf.keras.optimizers.Adam(learning_rate=0.01)
    model = Sequential()
    model.add(LSTM(4, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_model_cnn(input_shape, n_outputs):
    opt = tf.keras.optimizers.Nadam(learning_rate=0.001)
    model = Sequential()
    model.add(Conv1D(filters=100, kernel_size=3, activation='softsign', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(64, activation='softsign'))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model

def change_index(df):
    #Cambiar indices de filas y columnas por valores numéricos
    total_rows_df = df.shape[0]
    df.index = np.arange(0, total_rows_df)

    total_columns_df = df.shape[1]
    df.columns = np.arange(0, total_columns_df)
    
    return df

def build_datasets(df_model):
    X = df_model.iloc[:, :-1]

    y = df_model.iloc[:,-1]

    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)

    return X, y

In [82]:
def model_selection(db_choosen, db):

    path_nf = 'Databases/NoFeature/nf_polarizability.txt'
    path_full = 'Databases/Full/full_polarizability.txt'
    
    df_nf= pd.read_csv(path_nf, sep=" ", header=None)
    df_full = pd.read_csv(path_full, sep=" ", header=None)

    if(db_choosen == 0):
        df_model = df_nf
        path_choosen = path_nf
    elif(db_choosen == 1):
        df_model = df_full
        path_choosen = path_full

    df_model = change_index(df_model)

    X, y = build_datasets(df_model)

    #Counter(y).keys() # equals to list(set(words))
    #Counter(y).values() # counts the elements' frequency
    print("\n")
    print("_"*70)
    print("_"*70)
    print("\n")
    #print("Dataset Shape: X = %s, Y = %s" %(X.shape, y.shape))
    #print("Positives Samples = %i, Negatives Samples = %i" %(list(Counter(y).values())[0] , list(Counter(y).values())[1]))
    
    X_main, X_testing, y_main, y_testing = train_test_split(X, y, test_size=0.1, random_state=42)
    
    y_main = y_main.to_numpy()
    
    #print("\nDataset Train Shape (90): X = %s, Y = %s" %(X_main.shape, y_main.shape))
    #print("Dataset Test Shape (10): X = %s, Y = %s" %(X_testing.shape, y_testing.shape))
    #print("\nPositives Samples Train = %i, Negatives Samples = %i" %(list(Counter(y_main).values())[0] , list(Counter(y_main).values())[1]))
    #print(y.shape)
    #print("Positives Samples Test = %i, Negatives Samples = %i" %(list(Counter(y_testing).values())[0] , list(Counter(y_testing).values())[1]))

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    kf.get_n_splits(X_main)

    batch_size = 32
    epoch = 10
    verbose = 0

    count = 1

    dict_metrics = { "RF" : {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                             'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},
                      "SVM": {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                              'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},
                      "LSTM": {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                               'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},
                      "CNN": {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                              'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},                
                    }

    for train_index, test_index in kf.split(X_main):

        X_train, X_test = X_main[train_index], X_main[test_index]
        y_train, y_test = y_main[train_index], y_main[test_index]

        #______________________________________________________________________________________________________
        #RF---------------------------------------------------------------------------------------------------
        #model_rf = RandomForestClassifier(max_depth=1, random_state = 42)
        model_rf = RandomForestClassifier(max_depth=2, max_features=2, random_state = 42)
        model_rf.fit(X_train, y_train)
        
        filename = 'SavedModel/'+db+'/rf_'+str(count)+'.sav'
        pickle.dump(model_rf, open(filename, 'wb'))

        # Testing Accuracy
        dict_metrics["RF"]["accuracy_train"].append(model_rf.score(X_train, y_train))
        dict_metrics["RF"]["accuracy_test"].append(model_rf.score(X_test, y_test))

        # Predicciones
        new_predictions = model_rf.predict(X_test)
        y_pred = new_predictions
        y_t = y_test
        
        # MSE
        dict_metrics["RF"]["mse"].append(mean_squared_error(y_t, y_pred))

        # Cálculo de métricas
        cm = metrics.confusion_matrix(y_t, y_pred)
        precision, sensitivity, specificity, conf_f1, conf_misclassification = confusion_metrics(cm)

        dict_metrics["RF"]["precision"].append(precision)
        dict_metrics["RF"]["sensitivity"].append(sensitivity)
        dict_metrics["RF"]["specificity"].append(specificity)
        dict_metrics["RF"]["f1"].append(conf_f1)
        dict_metrics["RF"]["misc"].append(conf_misclassification)

        #ROC and AUC Score
        auc_score = roc_auc_score(y_t, y_pred)
        dict_metrics["RF"]["auc"].append(auc_score)

        #MCC
        mcc_score = matthews_corrcoef(y_t, y_pred)
        dict_metrics["RF"]["mcc"].append(mcc_score)
    
        #______________________________________________________________________________________________________
        #SVM---------------------------------------------------------------------------------------------------
        #model_svm = svm.SVC(kernel='linear', verbose = False) # Linear Kernel
        model_svm = svm.SVC(kernel='rbf', gamma='auto', C=1, verbose = False) # Linear Kernel
        model_svm.fit(X_train, y_train)
        
        filename = 'SavedModel/'+db+'/svm_'+str(count)+'.sav'
        pickle.dump(model_svm, open(filename, 'wb'))

        # Testing Accuracy
        dict_metrics["SVM"]["accuracy_train"].append(model_svm.score(X_train, y_train))
        dict_metrics["SVM"]["accuracy_test"].append(model_svm.score(X_test, y_test))

        # Predicciones
        new_predictions = model_svm.predict(X_test)
        y_pred = new_predictions
        y_t = y_test
        
        # MSE
        dict_metrics["SVM"]["mse"].append(mean_squared_error(y_t, y_pred))

        # Cálculo de métricas
        cm = metrics.confusion_matrix(y_t, y_pred)
        precision, sensitivity, specificity, conf_f1, conf_misclassification = confusion_metrics(cm)

        dict_metrics["SVM"]["precision"].append(precision)
        dict_metrics["SVM"]["sensitivity"].append(sensitivity)
        dict_metrics["SVM"]["specificity"].append(specificity)
        dict_metrics["SVM"]["f1"].append(conf_f1)
        dict_metrics["SVM"]["misc"].append(conf_misclassification)

        #ROC and AUC Score
        auc_score = roc_auc_score(y_t, y_pred)
        dict_metrics["SVM"]["auc"].append(auc_score)

        #MCC
        mcc_score = matthews_corrcoef(y_t, y_pred)
        dict_metrics["SVM"]["mcc"].append(mcc_score)

        #______________________________________________________________________________________________________
        #LSTM--------------------------------------------------------------------------------------------------
        X_train_lstm = np.expand_dims(X_train, axis=2)
        X_test_lstm = np.expand_dims(X_test, axis=2)
        y_train_lstm = to_categorical(y_train, 2)
        y_test_lstm = to_categorical(y_test, 2)

        n_timesteps, n_features, n_outputs = X_train_lstm.shape[0], X_train_lstm.shape[1], y_train_lstm.shape[1]
        input_shape = (n_features,1)

        model_lstm = create_model_lstm(input_shape, n_outputs)
        model_lstm.fit(X_train_lstm,y_train_lstm,verbose=0)
        
        model_lstm.save('SavedModel/'+db+'/lstm_'+str(count)+'.h5')

        (_, lstm_acc_train) = model_lstm.evaluate(X_train_lstm, y_train_lstm, verbose=0)
        (_, lstm_acc_test) = model_lstm.evaluate(X_test_lstm, y_test_lstm, verbose=0)
        dict_metrics["LSTM"]["accuracy_train"].append(lstm_acc_train)
        dict_metrics["LSTM"]["accuracy_test"].append(lstm_acc_test)

        # Predicciones
        new_predictions = model_lstm.predict(X_test_lstm, batch_size=batch_size, verbose=0)
        y_pred = np.argmax(new_predictions, axis=1)
        y_t = np.argmax(y_test_lstm, axis=1)
        
        # MSE
        dict_metrics["LSTM"]["mse"].append(mean_squared_error(y_t, y_pred))

        # Cálculo de métricas
        cm = metrics.confusion_matrix(y_t, y_pred)
        precision, sensitivity, specificity, conf_f1, conf_misclassification = confusion_metrics(cm)

        dict_metrics["LSTM"]["precision"].append(precision)
        dict_metrics["LSTM"]["sensitivity"].append(sensitivity)
        dict_metrics["LSTM"]["specificity"].append(specificity)
        dict_metrics["LSTM"]["f1"].append(conf_f1)
        dict_metrics["LSTM"]["misc"].append(conf_misclassification)

        #ROC and AUC Score
        auc_score = roc_auc_score(y_t, y_pred)
        dict_metrics["LSTM"]["auc"].append(auc_score)

        #MCC
        mcc_score = matthews_corrcoef(y_t, y_pred)
        dict_metrics["LSTM"]["mcc"].append(mcc_score)

        #______________________________________________________________________________________________________
        #CNN---------------------------------------------------------------------------------------------------
        X_train_cnn = np.expand_dims(X_train, axis=2)
        X_test_cnn = np.expand_dims(X_test, axis=2)
        y_train_cnn = to_categorical(y_train, 2)
        y_test_cnn = to_categorical(y_test, 2)

        n_timesteps, n_features, n_outputs = X_train_cnn.shape[0], X_train_cnn.shape[1], y_train_cnn.shape[1]
        input_shape = (n_features,1)

        model_cnn = create_model_cnn(input_shape, n_outputs)
        model_cnn.fit(X_train_cnn,y_train_cnn,batch_size=150,verbose=0)
        
        model_cnn.save('SavedModel/'+db+'/cnn_'+str(count)+'.h5')

        (_, cnn_acc_train) = model_cnn.evaluate(X_train_cnn, y_train_cnn, verbose=0)
        (_, cnn_acc_test) = model_cnn.evaluate(X_test_cnn, y_test_cnn, verbose=0)
        dict_metrics["CNN"]["accuracy_train"].append(cnn_acc_train)
        dict_metrics["CNN"]["accuracy_test"].append(cnn_acc_test)

        # Predicciones
        new_predictions = model_cnn.predict(X_test_cnn, batch_size=batch_size, verbose=0)
        y_pred = np.argmax(new_predictions, axis=1)
        y_t = np.argmax(y_test_cnn, axis=1)
        
        # MSE
        dict_metrics["CNN"]["mse"].append(mean_squared_error(y_t, y_pred))

        # Cálculo de métricas
        cm = metrics.confusion_matrix(y_t, y_pred)
        precision, sensitivity, specificity, conf_f1, conf_misclassification = confusion_metrics(cm)

        dict_metrics["CNN"]["precision"].append(precision)
        dict_metrics["CNN"]["sensitivity"].append(sensitivity)
        dict_metrics["CNN"]["specificity"].append(specificity)
        dict_metrics["CNN"]["f1"].append(conf_f1)
        dict_metrics["CNN"]["misc"].append(conf_misclassification)

        #ROC and AUC Score
        auc_score = roc_auc_score(y_t, y_pred)
        dict_metrics["CNN"]["auc"].append(auc_score)

        #MCC
        mcc_score = matthews_corrcoef(y_t, y_pred)
        dict_metrics["CNN"]["mcc"].append(mcc_score)
        
        count = count + 1
        
    return dict_metrics  

In [83]:
choosen=0 # 0: No Feature, 1: Full

db = "nf" 

dict_metrics = model_selection(choosen, db)

# RF-----------------------------------------------------
print_metrics(dict_metrics["RF"], choosen, model_name = "Random Forest")

# SVM----------------------------------------------------
print_metrics(dict_metrics["SVM"], choosen, model_name = "Support Vector Machine")

# LSTM---------------------------------------------------
print_metrics(dict_metrics["LSTM"], choosen, model_name = "LSTM")

# CNN----------------------------------------------------
print_metrics(dict_metrics["CNN"], choosen, model_name = "CNN")



______________________________________________________________________
______________________________________________________________________


--------------------------------------------------
Database:  No Feature   Random Forest  

--------------------------------------------------
Model Selection: Onehot | k-mers Sparse Matrix | Autoencoder
--------------------------------------------------
Accuracy Training:  97.8556655687728
Accuracy Testing:  96.4802844214609
Precision:  97.64833145774868
Sensitivity:  95.38931297709924
Specificity:  97.71674705329264
f_1 Score:  96.43778423988009
MCC:  93.11627434338439
AUC Score:  96.55303001519594
MSE:  0.03519715578539108
Mis-Classification:  0.035197155785391065
                                    1                  2                  3  \
Accuracy Training   0.979835013748854  0.975252062328139  0.976168652612282   
Accuracy Test       0.956043956043956  0.978021978021978  0.959706959706960   
Precision           1.000000000000000  0.96

In [84]:
choosen=1 # 0: No Feature, 1: Full

db = "full" 

dict_metrics = model_selection(choosen, db)

# RF-----------------------------------------------------
print_metrics(dict_metrics["RF"], choosen, model_name = "Random Forest")

# SVM----------------------------------------------------
print_metrics(dict_metrics["SVM"], choosen, model_name = "Support Vector Machine")

# LSTM---------------------------------------------------
print_metrics(dict_metrics["LSTM"], choosen, model_name = "LSTM")

# CNN----------------------------------------------------
print_metrics(dict_metrics["CNN"], choosen, model_name = "CNN")



______________________________________________________________________
______________________________________________________________________


--------------------------------------------------
Database:  Full   Random Forest  

--------------------------------------------------
Model Selection: Onehot | k-mers Sparse Matrix | Autoencoder | (AAC - DPC - PCP)
--------------------------------------------------
Accuracy Training:  92.9986435806784
Accuracy Testing:  92.59534583063996
Precision:  94.69032803021788
Sensitivity:  90.3555070883315
Specificity:  94.86841303001421
f_1 Score:  92.42140149537691
MCC:  85.35867401595758
AUC Score:  92.61196005917286
MSE:  0.07404654169360052
Mis-Classification:  0.0740465416936005
                                    1                  2                  3  \
Accuracy Training   0.933088909257562  0.930339138405133  0.928505957836847   
Accuracy Test       0.904761904761905  0.941391941391941  0.923076923076923   
Precision           0.959677419

# Model Testing

## No Feature

In [85]:
db = 'nf'

dict_metrics = { "RF" : {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                             'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},
                  "SVM": {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                          'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},
                  "LSTM": {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                           'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},
                  "CNN": {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                          'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},                
                }

path_nf = 'Databases/NoFeature/nf_polarizability.txt'
#path_full = 'Databases/Full/full_polarizability.txt'

df_model= pd.read_csv(path_nf, sep=" ", header=None)
#df_full = pd.read_csv(path_full, sep=" ", header=None)

df_model = change_index(df_model)

X, y = build_datasets(df_model)

X_main, X_testing, y_main, y_testing = train_test_split(X, y, test_size=0.1, random_state=42)

model_names = ['rf', 'svm', 'lstm', 'cnn']
model_count = 1

for model_name in model_names:
    for count in range (5):
        if model_count<3:
            filename = 'SavedModel/'+db+'/'+model_name+'_'+str(count+1)+'.sav'
            new_model = pickle.load(open(filename, 'rb'))
            
            # Testing Accuracy
            dict_metrics[model_name.upper()]["accuracy_train"].append(0)
            dict_metrics[model_name.upper()]["accuracy_test"].append(new_model.score(X_testing, y_testing))

            # Predicciones
            new_predictions = new_model.predict(X_testing)
            y_pred = new_predictions
            y_t = y_testing

            # MSE
            dict_metrics[model_name.upper()]["mse"].append(mean_squared_error(y_t, y_pred))

            # Cálculo de métricas
            cm = metrics.confusion_matrix(y_t, y_pred)
            precision, sensitivity, specificity, conf_f1, conf_misclassification = confusion_metrics(cm)

            dict_metrics[model_name.upper()]["precision"].append(precision)
            dict_metrics[model_name.upper()]["sensitivity"].append(sensitivity)
            dict_metrics[model_name.upper()]["specificity"].append(specificity)
            dict_metrics[model_name.upper()]["f1"].append(conf_f1)
            dict_metrics[model_name.upper()]["misc"].append(conf_misclassification)

            #ROC and AUC Score
            auc_score = roc_auc_score(y_t, y_pred)
            dict_metrics[model_name.upper()]["auc"].append(auc_score)

            #MCC
            mcc_score = matthews_corrcoef(y_t, y_pred)
            dict_metrics[model_name.upper()]["mcc"].append(mcc_score)
        else:
            filename = 'SavedModel/'+db+'/'+model_name+'_'+str(count+1)+'.h5'
            new_model = keras.models.load_model(filename)
            
            X_test_sequential = np.expand_dims(X_testing, axis=2)
            y_test_sequential = to_categorical(y_testing, 2)
            
            # Testing Accuracy
            (_, cnn_acc_test) = new_model.evaluate(X_test_sequential, y_test_sequential, verbose=0)
            dict_metrics[model_name.upper()]["accuracy_train"].append(0)
            dict_metrics[model_name.upper()]["accuracy_test"].append(cnn_acc_test)

            # Predicciones
            new_predictions = new_model.predict(X_test_sequential)
            y_pred = np.argmax(new_predictions, axis=1)
            y_t = np.argmax(y_test_sequential, axis=1)

            # MSE
            dict_metrics[model_name.upper()]["mse"].append(mean_squared_error(y_t, y_pred))

            # Cálculo de métricas
            cm = metrics.confusion_matrix(y_t, y_pred)
            precision, sensitivity, specificity, conf_f1, conf_misclassification = confusion_metrics(cm)

            dict_metrics[model_name.upper()]["precision"].append(precision)
            dict_metrics[model_name.upper()]["sensitivity"].append(sensitivity)
            dict_metrics[model_name.upper()]["specificity"].append(specificity)
            dict_metrics[model_name.upper()]["f1"].append(conf_f1)
            dict_metrics[model_name.upper()]["misc"].append(conf_misclassification)

            #ROC and AUC Score
            auc_score = roc_auc_score(y_t, y_pred)
            dict_metrics[model_name.upper()]["auc"].append(auc_score)

            #MCC
            mcc_score = matthews_corrcoef(y_t, y_pred)
            dict_metrics[model_name.upper()]["mcc"].append(mcc_score)
            
    model_count = model_count + 1

In [86]:
choosen=0 # 0: No Feature, 1: Full

db = "nf" 

# RF-----------------------------------------------------
print_metrics(dict_metrics["RF"], choosen, model_name = "Random Forest")

# SVM----------------------------------------------------
print_metrics(dict_metrics["SVM"], choosen, model_name = "Support Vector Machine")

# LSTM---------------------------------------------------
print_metrics(dict_metrics["LSTM"], choosen, model_name = "LSTM")

# CNN----------------------------------------------------
print_metrics(dict_metrics["CNN"], choosen, model_name = "CNN")

--------------------------------------------------
Database:  No Feature   Random Forest  

--------------------------------------------------
Model Selection: Onehot | k-mers Sparse Matrix | Autoencoder
--------------------------------------------------
Accuracy Training:  0
Accuracy Testing:  96.84210526315789
Precision:  97.48417721518987
Sensitivity:  96.3157894736842
Specificity:  97.36842105263158
f_1 Score:  96.84377777947815
MCC:  93.79151993466269
AUC Score:  96.84210526315789
MSE:  0.031578947368421054
Mis-Classification:  0.03157894736842104
                                    1                  2                  3  \
Accuracy Training   0.000000000000000  0.000000000000000  0.000000000000000   
Accuracy Test       0.973684210526316  0.953947368421053  0.960526315789474   
Precision           1.000000000000000  0.936708860759494  0.937500000000000   
Sensitivity         0.947368421052632  0.973684210526316  0.986842105263158   
Specificity         1.000000000000000  0.93421

## Full

In [87]:
db = 'full'

dict_metrics = { "RF" : {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                             'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},
                  "SVM": {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                          'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},
                  "LSTM": {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                           'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},
                  "CNN": {'accuracy_train':[], 'accuracy_test':[], 'accuracy_test_ext':[], 'precision':[], 'sensitivity':[], 'specificity':[], 
                          'f1':[], 'auc':[], 'mcc':[], 'mse':[], 'misc':[]},                
                }

#path_nf = 'Databases/NoFeature/nf_polarizability.txt'
path_full = 'Databases/Full/full_polarizability.txt'

#df_model= pd.read_csv(path_nf, sep=" ", header=None)
df_model = pd.read_csv(path_full, sep=" ", header=None)

df_model = change_index(df_model)

X, y = build_datasets(df_model)

X_main, X_testing, y_main, y_testing = train_test_split(X, y, test_size=0.1, random_state=42)

model_names = ['rf', 'svm', 'lstm', 'cnn']
model_count = 1

for model_name in model_names:
    for count in range (5):
        if model_count<3:
            filename = 'SavedModel/'+db+'/'+model_name+'_'+str(count+1)+'.sav'
            new_model = pickle.load(open(filename, 'rb'))
            
            # Testing Accuracy
            dict_metrics[model_name.upper()]["accuracy_train"].append(0)
            dict_metrics[model_name.upper()]["accuracy_test"].append(new_model.score(X_testing, y_testing))

            # Predicciones
            new_predictions = new_model.predict(X_testing)
            y_pred = new_predictions
            y_t = y_testing

            # MSE
            dict_metrics[model_name.upper()]["mse"].append(mean_squared_error(y_t, y_pred))

            # Cálculo de métricas
            cm = metrics.confusion_matrix(y_t, y_pred)
            precision, sensitivity, specificity, conf_f1, conf_misclassification = confusion_metrics(cm)

            dict_metrics[model_name.upper()]["precision"].append(precision)
            dict_metrics[model_name.upper()]["sensitivity"].append(sensitivity)
            dict_metrics[model_name.upper()]["specificity"].append(specificity)
            dict_metrics[model_name.upper()]["f1"].append(conf_f1)
            dict_metrics[model_name.upper()]["misc"].append(conf_misclassification)

            #ROC and AUC Score
            auc_score = roc_auc_score(y_t, y_pred)
            dict_metrics[model_name.upper()]["auc"].append(auc_score)

            #MCC
            mcc_score = matthews_corrcoef(y_t, y_pred)
            dict_metrics[model_name.upper()]["mcc"].append(mcc_score)
        else:
            filename = 'SavedModel/'+db+'/'+model_name+'_'+str(count+1)+'.h5'
            new_model = keras.models.load_model(filename)
            
            X_test_sequential = np.expand_dims(X_testing, axis=2)
            y_train_sequential = to_categorical(y_testing, 2)
            
            # Testing Accuracy
            (_, cnn_acc_test) = new_model.evaluate(X_test_sequential, y_train_sequential, verbose=0)
            dict_metrics[model_name.upper()]["accuracy_train"].append(0)
            dict_metrics[model_name.upper()]["accuracy_test"].append(cnn_acc_test)

            # Predicciones
            new_predictions = new_model.predict(X_test_sequential)
            y_pred = np.argmax(new_predictions, axis=1)
            y_t = np.argmax(y_train_sequential, axis=1)

            # MSE
            dict_metrics[model_name.upper()]["mse"].append(mean_squared_error(y_t, y_pred))

            # Cálculo de métricas
            cm = metrics.confusion_matrix(y_t, y_pred)
            precision, sensitivity, specificity, conf_f1, conf_misclassification = confusion_metrics(cm)

            dict_metrics[model_name.upper()]["precision"].append(precision)
            dict_metrics[model_name.upper()]["sensitivity"].append(sensitivity)
            dict_metrics[model_name.upper()]["specificity"].append(specificity)
            dict_metrics[model_name.upper()]["f1"].append(conf_f1)
            dict_metrics[model_name.upper()]["misc"].append(conf_misclassification)

            #ROC and AUC Score
            auc_score = roc_auc_score(y_t, y_pred)
            dict_metrics[model_name.upper()]["auc"].append(auc_score)

            #MCC
            mcc_score = matthews_corrcoef(y_t, y_pred)
            dict_metrics[model_name.upper()]["mcc"].append(mcc_score)
            
    model_count = model_count + 1

In [88]:
choosen=1 # 0: No Feature, 1: Full

# RF-----------------------------------------------------
print_metrics(dict_metrics["RF"], choosen, model_name = "Random Forest")

# SVM----------------------------------------------------
print_metrics(dict_metrics["SVM"], choosen, model_name = "Support Vector Machine")

# LSTM---------------------------------------------------
print_metrics(dict_metrics["LSTM"], choosen, model_name = "LSTM")

# CNN----------------------------------------------------
print_metrics(dict_metrics["CNN"], choosen, model_name = "CNN")

--------------------------------------------------
Database:  Full   Random Forest  

--------------------------------------------------
Model Selection: Onehot | k-mers Sparse Matrix | Autoencoder | (AAC - DPC - PCP)
--------------------------------------------------
Accuracy Training:  0
Accuracy Testing:  91.97368421052632
Precision:  93.72263993316625
Sensitivity:  90.0
Specificity:  93.94736842105263
f_1 Score:  91.80304752677351
MCC:  84.0466850383767
AUC Score:  91.97368421052632
MSE:  0.08026315789473684
Mis-Classification:  0.08026315789473684
                                    1                  2                  3  \
Accuracy Training   0.000000000000000  0.000000000000000  0.000000000000000   
Accuracy Test       0.907894736842105  0.927631578947368  0.921052631578947   
Precision           0.942857142857143  0.933333333333333  0.921052631578947   
Sensitivity         0.868421052631579  0.921052631578947  0.921052631578947   
Specificity         0.947368421052632  0.93421

In [None]:
new_model = keras.models.load_model('path_to_my_model.h5')

In [116]:
model_rf = RandomForestClassifier(max_depth=2, max_features=2, min_samples_split = 10, min_samples_leaf = 2, bootstrap=True, oob_score=True,criterion='entropy',random_state = 42, n_jobs=-1)
model_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=2, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [117]:
model_rf.score(X_test, y_test)

0.993421052631579

# RF

In [57]:
test = []
train = []
test_ext = []
mse = []
path_nf = 'Databases/NoFeature/nf_polarizability.txt'

df_nf= pd.read_csv(path_nf, sep=" ", header=None)

df_model = df_nf

df_model = change_index(df_model)

X, y = build_datasets(df_model)

X_main, X_testing, y_main, y_testing = train_test_split(X, y, test_size=0.1, random_state=42)
y_main = y_main.to_numpy()

kf = KFold(n_splits=5, random_state=42, shuffle=True)

kf.get_n_splits(X_main)

for train_index, test_index in kf.split(X_main):

    X_train, X_test = X_main[train_index], X_main[test_index]
    y_train, y_test = y_main[train_index], y_main[test_index]
    
    #model_rf = RandomForestClassifier(max_depth=2, max_features=2, min_samples_split = 10, min_samples_leaf = 2, bootstrap=True, oob_score=True,criterion='entropy',random_state = 42, n_jobs=-1)
    model_rf = RandomForestClassifier(max_depth=2, max_features=2, random_state = 42)
    model_rf.fit(X_train, y_train)
    
    train.append(model_rf.score(X_train, y_train))
    test.append(model_rf.score(X_test, y_test))
    test_ext.append(model_rf.score(X_testing, y_testing))
    
    # Predicciones
    new_predictions = model_rf.predict(X_test)
    y_pred = new_predictions
    y_t = y_test
    
    #print(y_pred.shape, y_t.shape)
    
    # MSE
    from sklearn.metrics import mean_squared_error
    mse.append(mean_squared_error(y_t, y_pred))


In [58]:
statistics.mean(train), statistics.mean(test), statistics.mean(test_ext)

(0.9785566556877281, 0.9648028442146089, 0.968421052631579)

# SVM

In [61]:
test = []
train = []
test_ext = []
path_nf = 'Databases/NoFeature/nf_polarizability.txt'

df_nf= pd.read_csv(path_nf, sep=" ", header=None)

df_model = df_nf

df_model = change_index(df_model)

X, y = build_datasets(df_model)

X_main, X_testing, y_main, y_testing = train_test_split(X, y, test_size=0.1, random_state=42)
y_main = y_main.to_numpy()

kf = KFold(n_splits=5, random_state=42, shuffle=True)

kf.get_n_splits(X_main)

for train_index, test_index in kf.split(X_main):

    X_train, X_test = X_main[train_index], X_main[test_index]
    y_train, y_test = y_main[train_index], y_main[test_index]
    
    #model_rf = RandomForestClassifier(max_depth=2, max_features=2, min_samples_split = 10, min_samples_leaf = 2, bootstrap=True, oob_score=True,criterion='entropy',random_state = 42, n_jobs=-1)
    #model_svm = svm.SVC(kernel='poly', C=10, degree=10, gamma='auto', probability=True,verbose = False) # Linear Kernel
    model_svm = svm.SVC(kernel='rbf', gamma='auto', C=1, verbose = False) # Linear Kernel
    model_svm.fit(X_train, y_train)
    
    train.append(model_svm.score(X_train, y_train))
    test.append(model_svm.score(X_test, y_test))
    test_ext.append(model_svm.score(X_testing, y_testing))


In [62]:
statistics.mean(train), statistics.mean(test), statistics.mean(test_ext)

(1.0, 0.9684847015729369, 0.9631578947368421)

# LSTM

In [222]:
test = []
train = []
test_ext = []
path_nf = 'Databases/NoFeature/nf_polarizability.txt'

df_nf= pd.read_csv(path_nf, sep=" ", header=None)

df_model = df_nf

df_model = change_index(df_model)

X, y = build_datasets(df_model)

X_main, X_testing, y_main, y_testing = train_test_split(X, y, test_size=0.1, random_state=42)

X_testing_lstm = np.expand_dims(X_testing, axis=2)
y_testing_lstm = to_categorical(y_testing, 2)

kf = KFold(n_splits=5, random_state=42, shuffle=True)

kf.get_n_splits(X_main)

for train_index, test_index in kf.split(X_main):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    X_train_lstm = np.expand_dims(X_train, axis=2)
    X_test_lstm = np.expand_dims(X_test, axis=2)
    y_train_lstm = to_categorical(y_train, 2)
    y_test_lstm = to_categorical(y_test, 2)

    n_timesteps, n_features, n_outputs = X_train_lstm.shape[0], X_train_lstm.shape[1], y_train_lstm.shape[1]
    input_shape = (n_features,1)

    model_lstm = create_model_lstm(input_shape, n_outputs)
    model_lstm.fit(X_train_lstm,y_train_lstm,verbose=0)
    
    (_, lstm_accuracy) = model_lstm.evaluate(X_train_lstm, y_train_lstm, verbose=0)
    train.append(lstm_accuracy)
    
    (_, lstm_accuracy) = model_lstm.evaluate(X_test_lstm, y_test_lstm, verbose=0)
    test.append(lstm_accuracy)
    
    (_, lstm_accuracy) = model_lstm.evaluate(X_testing_lstm, y_testing_lstm, verbose=0)
    test_ext.append(lstm_accuracy)

In [223]:
statistics.mean(train), statistics.mean(test), statistics.mean(test_ext)

(0.9373193264007569, 0.9347500562667846, 0.9407894849777222)

# CNN

In [65]:
test = []
train = []
test_ext = []
path_nf = 'Databases/NoFeature/nf_polarizability.txt'

df_nf= pd.read_csv(path_nf, sep=" ", header=None)

df_model = df_nf

df_model = change_index(df_model)

X, y = build_datasets(df_model)

X_main, X_testing, y_main, y_testing = train_test_split(X, y, test_size=0.1, random_state=42)
y_main = y_main.to_numpy()

X_testing_cnn = np.expand_dims(X_testing, axis=2)
y_testing_cnn = to_categorical(y_testing, 2)

kf = KFold(n_splits=5, random_state=42, shuffle=True)

kf.get_n_splits(X_main)

for train_index, test_index in kf.split(X_main):

    X_train, X_test = X_main[train_index], X_main[test_index]
    y_train, y_test = y_main[train_index], y_main[test_index]
    
    X_train_cnn = np.expand_dims(X_train, axis=2)
    X_test_cnn = np.expand_dims(X_test, axis=2)
    y_train_cnn = to_categorical(y_train, 2)
    y_test_cnn = to_categorical(y_test, 2)
    
    n_timesteps, n_features, n_outputs = X_train_cnn.shape[0], X_train_cnn.shape[1], y_train_cnn.shape[1]
    input_shape = (n_features,1)

    model_cnn = create_model_cnn(input_shape, n_outputs)
    model_cnn.fit(X_train_cnn,y_train_cnn,batch_size=150,verbose=0)
    
    (_, cnn_accuracy) = model_cnn.evaluate(X_train_cnn, y_train_cnn, verbose=0)
    train.append(cnn_accuracy)
    
    (_, cnn_accuracy) = model_cnn.evaluate(X_test_cnn, y_test_cnn, verbose=0)
    test.append(cnn_accuracy)
    
    (_, cnn_accuracy) = model_cnn.evaluate(X_testing_cnn, y_testing_cnn, verbose=0)
    test_ext.append(cnn_accuracy)

In [66]:
statistics.mean(train), statistics.mean(test), statistics.mean(test_ext)

(0.9871705889701843, 0.9692146062850953, 0.967105257511139)