In [1]:
#Bibliotecas
import pandas as pd
from pandas import DataFrame
import numpy as np
from numpy import linalg #SVD
import screed 
import itertools
import matplotlib.pyplot as plt
import random
import statistics
from collections import Counter
import pickle

# Seed Python's `random`, NumPy and TensorFlow with the same value.
random_seed = 600
np.random.seed(random_seed)
random.seed(random_seed)

import os
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so assigning it here
# presumably does not change hashing in this already-running kernel (only in processes
# spawned afterwards) -- confirm whether it should be exported before launching Jupyter.
os.environ['PYTHONHASHSEED'] = '0'

import tensorflow as tf
tf.random.set_seed(random_seed)

import keras
from keras.layers import Input, Dense
from keras.models import Model
from keras.datasets import mnist

from keras.models import Sequential
from keras.layers import Conv1D
from keras.layers import Dropout
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import LSTM
from keras.utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam
from keras.optimizers import Nadam

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

%matplotlib inline
#---------------

### Funciones

In [34]:
# IMPRIMIR RESULTADOS DE MÉTRICAS----------------------------------------------------------------
def print_metrics(metrics, model_name, db_selection, folder):
    """Print the mean of every metric followed by a per-fold table.

    Parameters
    ----------
    metrics : dict
        Maps a metric name to a list holding one value per fold. The keys read are
        "accuracy_train", "accuracy_test", "precision", "sensitivity", "specificity",
        "f1", "mcc", "auc", "mse" and "misc". Inside this function the name shadows
        the `sklearn.metrics` module imported at file level.
    model_name : str
        Label printed next to the database name.
    db_selection : int or str
        Database selector, converted with ``int()``: 0 is reported as "No Feature",
        any other value as "Full".
    folder : str
        Not used; kept so that existing calls keep working.

    Returns
    -------
    None
        Everything is written to stdout.

    Raises
    ------
    statistics.StatisticsError
        If any of the metric lists is empty.
    """
    # Normalise once so that the header and the "Model Selection" line always agree
    # (previously "0" printed the "No Feature" header but no selection line).
    selection = int(db_selection)

    db = "No Feature" if selection == 0 else "Full"
    print("Database: ", db," ", model_name," \n")
    print('-'*50)
    if selection == 0:
        print('Model Selection: Onehot | k-mers Sparse Matrix | Autoencoder')
    elif selection == 1:
        print('Model Selection: Onehot | k-mers Sparse Matrix | Autoencoder | (AAC - DPC - PCP)')
    print('-'*50)

    # (summary label, table label, key in `metrics`, shown as a percentage?)
    rows = [
        ('Accuracy Training', 'Accuracy Training', 'accuracy_train', True),
        ('Accuracy Testing', 'Accuracy Test', 'accuracy_test', True),
        ('Precision', 'Precision', 'precision', True),
        ('Sensitivity', 'Sensitivity', 'sensitivity', True),
        ('Specificity', 'Specificity', 'specificity', True),
        ('f_1 Score', 'f_1 Score', 'f1', True),
        ('MCC', 'MCC', 'mcc', True),
        ('AUC Score', 'AUC Score', 'auc', True),
        ('MSE', 'MSE', 'mse', False),
        ('Mis-Classification', 'Mis-Classification', 'misc', False),
    ]

    for summary_label, _, key, as_percent in rows:
        mean_value = statistics.mean(metrics[key])
        print(summary_label + ': ', mean_value*100 if as_percent else mean_value)

    # Show more decimals when printing DataFrames. This changes a global pandas
    # option, so it also affects frames displayed after this call.
    pd.set_option("display.precision", 15)

    # One column per fold, named '1', '2', ...; the number of folds is taken from the
    # data instead of being fixed to five.
    per_fold = [metrics[key] for _, _, key, _ in rows]
    n_folds = max(len(values) for values in per_fold)
    metrics_m = pd.DataFrame(per_fold,
                             columns=[str(fold) for fold in range(1, n_folds + 1)],
                             index=[table_label for _, table_label, _, _ in rows])
    # Every row except the last two (MSE and Mis-Classification) is a percentage.
    metrics_m.iloc[:-2] = metrics_m.iloc[:-2]*100

    print(metrics_m)

# Construcción de los Datasets----------------------------------------------------------------
def build_datasets(df_model):
    """Split a frame into standardized features and raw labels.

    Every column except the last one is treated as a feature and the last column as
    the class label. The features are standardized with a `StandardScaler` that is
    fitted on the very frame passed in, i.e. on all of its rows.

    Parameters
    ----------
    df_model : pandas.DataFrame
        Frame whose last column holds the class.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the scaler's output for the feature columns and
        ``y`` is the untouched last column.
    """
    features = df_model.iloc[:, :-1]
    labels = df_model.iloc[:, -1]

    # fit_transform() fits the scaler and transforms the same data in one call.
    return StandardScaler().fit_transform(features), labels

def confusion_metrics(conf_matrix):
    """Derive classification metrics from a 2x2 confusion matrix.

    The matrix is indexed as ``conf_matrix[true][predicted]`` with class 1 as the
    positive class, so ``[0][0]`` is TN, ``[0][1]`` FP, ``[1][0]`` FN and ``[1][1]`` TP.

    A ratio whose denominator is zero (for example precision when nothing was
    predicted positive) is reported as 0.0 instead of raising ``ZeroDivisionError``
    or producing NaN.

    Parameters
    ----------
    conf_matrix : 2x2 nested sequence or array
        Counts per (true class, predicted class).

    Returns
    -------
    tuple of float
        ``(precision, sensitivity, specificity, f1, misclassification)``.

    Raises
    ------
    ValueError
        If the matrix contains no samples at all.
    """
    # Split the confusion matrix into its four cells.
    TP = conf_matrix[1][1]
    TN = conf_matrix[0][0]
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]

    total = TP + TN + FP + FN
    if total == 0:
        raise ValueError("confusion matrix contains no samples")

    def _ratio(numerator, denominator):
        # An empty denominator means the metric is undefined; report it as 0.0.
        return float(numerator) / float(denominator) if denominator else 0.0

    # Accuracy and its complement, the misclassification rate.
    conf_accuracy = _ratio(TP + TN, total)
    conf_misclassification = 1 - conf_accuracy

    # Sensitivity (recall of the positive class) and specificity.
    conf_sensitivity = _ratio(TP, TP + FN)
    conf_specificity = _ratio(TN, TN + FP)

    # Precision and F1 score (harmonic mean of precision and sensitivity).
    conf_precision = _ratio(TP, TP + FP)
    conf_f1 = 2 * _ratio(conf_precision * conf_sensitivity, conf_precision + conf_sensitivity)

    return conf_precision, conf_sensitivity, conf_specificity, conf_f1, conf_misclassification

def create_model(input_shape, n_outputs): # Convolutional Neural Network
    """Build and compile the 1-D convolutional classifier.

    Architecture: three stacked `Conv1D` layers (100 filters each, kernel sizes 3, 4
    and 5, ReLU), max pooling with pool size 2, dropout of 0.5, flatten, a 64-unit
    ReLU dense layer and a sigmoid output layer with `n_outputs` units. The model is
    compiled with binary cross-entropy, Adam (learning rate 0.001) and accuracy.

    Parameters
    ----------
    input_shape : tuple
        Shape handed to every `Conv1D` layer as `input_shape`.
    n_outputs : int
        Number of units in the output layer.

    Returns
    -------
    The compiled `Sequential` model.
    """
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    cnn = Sequential()
    # Same convolution block three times, only the kernel width grows.
    for kernel_width in (3, 4, 5):
        cnn.add(Conv1D(filters=100, kernel_size=kernel_width, activation='relu',
                       input_shape=input_shape))
    cnn.add(MaxPooling1D(pool_size=2))
    cnn.add(Dropout(0.5))
    cnn.add(Flatten())
    cnn.add(Dense(64, activation='relu'))
    cnn.add(Dense(n_outputs, activation='sigmoid'))

    cnn.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return cnn

def change_index(df):
    """Relabel rows and columns of `df` with consecutive integers starting at 0.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose row and column labels are replaced.

    Returns
    -------
    pandas.DataFrame
        `df` itself, now labelled 0..n_rows-1 and 0..n_columns-1.
    """
    n_rows, n_columns = df.shape

    # Replace both axes' labels by their positional numbers.
    df.index = np.arange(n_rows)
    df.columns = np.arange(n_columns)

    return df


### Dataset de Entrenamiento (80% de los datos originales): Creación del Modelo y Obtención de Métricas

In [36]:
def _record_prediction_metrics(store, model, X, y_onehot, **predict_kwargs):
    """Append the prediction-based metrics of `model` on (X, y_onehot) to `store`.

    `store` is one of the per-alphabet metric dictionaries; its "mse", "precision",
    "sensitivity", "specificity", "f1", "misc", "auc" and "mcc" lists each receive one
    new value. `predict_kwargs` are forwarded unchanged to `model.predict`.
    """
    new_predictions = model.predict(X, **predict_kwargs)
    y_pred = np.argmax(new_predictions, axis=1)
    y_t = np.argmax(y_onehot, axis=1)

    # MSE between the true and the predicted class indices.
    store["mse"].append(mean_squared_error(y_t, y_pred))

    # Metrics derived from the confusion matrix.
    cm = metrics.confusion_matrix(y_t, y_pred)
    precision, sensitivity, specificity, conf_f1, conf_misclassification = confusion_metrics(cm)
    store["precision"].append(precision)
    store["sensitivity"].append(sensitivity)
    store["specificity"].append(specificity)
    store["f1"].append(conf_f1)
    store["misc"].append(conf_misclassification)

    # NOTE(review): the AUC is computed from hard class labels, not from the predicted
    # scores -- confirm this is intended before comparing it with other AUC values.
    store["auc"].append(roc_auc_score(y_t, y_pred))

    # Matthews correlation coefficient.
    store["mcc"].append(matthews_corrcoef(y_t, y_pred))


def ACP(alphabeth, db_selection):
    """Train the CNN with 5-fold cross-validation and score it on a held-out split.

    The selected database is read, re-indexed and standardized. 10% of it is set
    aside as the testing split; the remaining 90% goes through a shuffled 5-fold
    cross-validation. The model of every fold is saved to
    ``SavedModel/FinalModel/<db>/<alphabeth>_<fold>.h5`` and afterwards reloaded and
    evaluated on the testing split.

    Parameters
    ----------
    alphabeth : str
        Key of the returned dictionaries and prefix of the saved model files. The
        input file names do not depend on it (they always point to the DipoleMoment
        files).
    db_selection : int
        0 selects the "NoFeature" database, 1 the "Full" database.

    Returns
    -------
    tuple of dict
        ``(dict_metrics, dict_metrics_test)``: cross-validation metrics and testing
        split metrics, each shaped ``{alphabeth: {metric_name: [one value per fold]}}``.
        In the testing dictionary "accuracy_train" is filled with zeros, and
        "accuracy_test_ext" is left empty in both.

    Raises
    ------
    ValueError
        If `db_selection` is neither 0 nor 1.
    """
    # Select the dataset and read only that one file ==============================
    if db_selection == 0:
        path_alphabet = 'Databases/NoFeature/nf_DipoleMoment.txt'
        db = 'NoFeature'
    elif db_selection == 1:
        path_alphabet = 'Databases/Full/full_DipoleMoment.txt'
        db = 'Full'
    else:
        raise ValueError("db_selection must be 0 (NoFeature) or 1 (Full), got %r" % (db_selection,))

    df_model = change_index(pd.read_csv(path_alphabet, sep=" ", header=None))

    # NOTE(review): build_datasets fits its scaler on all rows, i.e. before the
    # split below, so the held-out rows contribute to the scaling statistics.
    X, y = build_datasets(df_model)

    # Training (90%) / Testing (10%) split ========================================
    # The file read above is the training dataset (80% of the original data, the
    # other 20% being the independent dataset). It is split again here so the model
    # can be trained and tested before being validated on the independent dataset.
    X_main, X_testing, y_main, y_testing = train_test_split(X, y, test_size=0.1, random_state=42)
    y_main = y_main.to_numpy()

    # Model -------------------------------------------------------------------------
    epochs = 50
    batch_size = 50
    verbose = 0
    n_splits = 5  # drives both the cross-validation and the reload loop below

    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

    # Dictionaries that collect the metric values, one list entry per fold ==========
    metric_keys = ['accuracy_train', 'accuracy_test', 'accuracy_test_ext',
                   'precision', 'sensitivity', 'specificity', 'f1', 'auc',
                   'mcc', 'mse', 'misc']
    dict_metrics = {alphabeth : {key: [] for key in metric_keys}}
    dict_metrics_test = {alphabeth : {key: [] for key in metric_keys}}

    # Make sure the target directory exists so model.save() cannot fail on a fresh
    # checkout.
    model_dir = 'SavedModel/FinalModel/'+db
    os.makedirs(model_dir, exist_ok=True)

    # Train the model with 5-fold cross-validation ==================================
    for count, (train_index, test_index) in enumerate(kf.split(X_main), start=1):

        # Training / validation parts of this fold
        X_train, X_test = X_main[train_index], X_main[test_index]
        y_train, y_test = y_main[train_index], y_main[test_index]

        # Add a trailing axis of length 1 for Conv1D and one-hot encode the classes
        X_train = np.expand_dims(X_train, axis=2)
        X_test = np.expand_dims(X_test, axis=2)
        y_train = to_categorical(y_train, 2)
        y_test = to_categorical(y_test, 2)

        n_features, n_outputs = X_train.shape[1], y_train.shape[1]
        input_shape = (n_features,1)

        # Build and fit the CNN
        model = create_model(input_shape, n_outputs)
        model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=verbose)

        # Save the model of this fold
        filename = model_dir+'/'+alphabeth+'_'+str(count)+'.h5'
        model.save(filename)

        # Accuracy on the training and validation parts
        (_, acc_train) = model.evaluate(X_train, y_train, verbose=0)
        (_, acc_test) = model.evaluate(X_test, y_test, verbose=0)
        dict_metrics[alphabeth]["accuracy_train"].append(acc_train)
        dict_metrics[alphabeth]["accuracy_test"].append(acc_test)

        # Remaining metrics from the validation predictions
        _record_prediction_metrics(dict_metrics[alphabeth], model, X_test, y_test,
                                   batch_size=batch_size, verbose=0)

    #================================================================================
    # Testing
    # Each saved fold model is now scored on the unseen testing split (10% of the
    # training dataset).
    X_testing = np.expand_dims(X_testing, axis=2)
    y_testing = to_categorical(y_testing, 2)

    for count in range(1, n_splits + 1):
        # Load the model saved for this fold
        filename = model_dir+'/'+alphabeth+'_'+str(count)+'.h5'
        new_model = keras.models.load_model(filename)

        # Testing accuracy; there is no training accuracy here, so 0 is stored
        (_, acc_test) = new_model.evaluate(X_testing, y_testing, verbose=0)
        dict_metrics_test[alphabeth]["accuracy_train"].append(0)
        dict_metrics_test[alphabeth]["accuracy_test"].append(acc_test)

        _record_prediction_metrics(dict_metrics_test[alphabeth], new_model, X_testing, y_testing)

    return dict_metrics, dict_metrics_test

#### Obtener Métricas de Entrenamiento y Testing

In [37]:
# Run ACP() for both database variants (0 and 1) and print the cross-validation
# metrics followed by the metrics on the testing split.
alphabeth = 'dipolemoment'
db = [0,1]

for db_selection in db:
    dict_metrics, dict_metrics_test = ACP(alphabeth, db_selection)

    # `folder` only labels the call; print_metrics does not read it.
    print_metrics(dict_metrics[alphabeth], alphabeth, db_selection, folder='Training')
    print_metrics(dict_metrics_test[alphabeth], alphabeth, db_selection, folder='Testing')

Database:  No Feature   dipolemoment  

--------------------------------------------------
Model Selection: Onehot | k-mers Sparse Matrix | Autoencoder
--------------------------------------------------
Accuracy Training:  100.0
Accuracy Testing:  99.48717951774597
Precision:  99.71325796505653
Sensitivity:  99.2660850599782
Specificity:  99.69924812030075
f_1 Score:  99.48824293985584
MCC:  98.97595990926243
AUC Score:  99.48266659013947
MSE:  0.005128205128205128
Mis-Classification:  0.00512820512820511
                                      1      2                    3  \
Accuracy Training   100.000000000000000  100.0  100.000000000000000   
Accuracy Test        99.267399311065674  100.0   99.267399311065674   
Precision            99.285714285714292  100.0  100.000000000000000   
Sensitivity          99.285714285714292  100.0   98.473282442748086   
Specificity          99.248120300751879  100.0  100.000000000000000   
f_1 Score            99.285714285714292  100.0   99.23076923076

### Dataset Independiente (20% de los datos Originales)

In [38]:
def independent(alphabeth, db_selection):
    """Score the saved fold models on the independent dataset.

    The selected independent file is read, re-indexed and standardized, then each of
    the five models stored as ``SavedModel/FinalModel/<db>/<alphabeth>_<fold>.h5`` is
    loaded and evaluated on it.

    Parameters
    ----------
    alphabeth : str
        Key of the returned dictionary and prefix of the model files to load. The
        input file names do not depend on it.
    db_selection : int
        0 selects the "NoFeature" database, 1 the "Full" database.

    Returns
    -------
    dict
        ``{alphabeth: {metric_name: [one value per model]}}``. "accuracy_train" is
        filled with zeros and "accuracy_test_ext" stays empty.

    Raises
    ------
    ValueError
        If `db_selection` is neither 0 nor 1.
    """
    # Pick the dataset and read only that one file.
    if db_selection == 0:
        path_alphabet = 'Databases/NoFeature/ind_nf_dipoleMoment.txt'
        db = 'NoFeature'
    elif db_selection == 1:
        path_alphabet = 'Databases/Full/ind_full_dipolemoment.txt'
        db = 'Full'
    else:
        raise ValueError("db_selection must be 0 (NoFeature) or 1 (Full), got %r" % (db_selection,))

    df_model = change_index(pd.read_csv(path_alphabet, sep=" ", header=None))

    # NOTE(review): build_datasets fits a new scaler on the independent data, so these
    # rows are not scaled with the statistics the models were trained with -- confirm
    # this is intended.
    X_ind, y_ind = build_datasets(df_model)

    metric_keys = ['accuracy_train', 'accuracy_test', 'accuracy_test_ext',
                   'precision', 'sensitivity', 'specificity', 'f1', 'auc',
                   'mcc', 'mse', 'misc']
    dict_metrics_ind = {alphabeth : {key: [] for key in metric_keys}}
    store = dict_metrics_ind[alphabeth]

    # Add a trailing axis of length 1 for Conv1D and one-hot encode the classes.
    X_ind = np.expand_dims(X_ind, axis=2)
    y_ind = to_categorical(y_ind, 2)
    y_t = np.argmax(y_ind, axis=1)

    n_models = 5  # one saved model per cross-validation fold

    for fold in range(1, n_models + 1):
        filename = 'SavedModel/FinalModel/'+db+'/'+alphabeth+'_'+str(fold)+'.h5'
        new_model = keras.models.load_model(filename)

        # Accuracy on the independent data; there is no training accuracy, so 0 is stored
        (_, acc_test) = new_model.evaluate(X_ind, y_ind, verbose=0)
        store["accuracy_train"].append(0)
        store["accuracy_test"].append(acc_test)

        # Predictions
        new_predictions = new_model.predict(X_ind)
        y_pred = np.argmax(new_predictions, axis=1)

        # MSE between the true and the predicted class indices
        store["mse"].append(mean_squared_error(y_t, y_pred))

        # Metrics derived from the confusion matrix
        cm = metrics.confusion_matrix(y_t, y_pred)
        precision, sensitivity, specificity, conf_f1, conf_misclassification = confusion_metrics(cm)

        store["precision"].append(precision)
        store["sensitivity"].append(sensitivity)
        store["specificity"].append(specificity)
        store["f1"].append(conf_f1)
        store["misc"].append(conf_misclassification)

        # NOTE(review): the AUC is computed from hard class labels, not from the
        # predicted scores -- confirm this is intended.
        store["auc"].append(roc_auc_score(y_t, y_pred))

        # Matthews correlation coefficient
        store["mcc"].append(matthews_corrcoef(y_t, y_pred))

    return dict_metrics_ind

In [39]:
# Run independent() for both database variants (0 and 1) and print the resulting
# metrics.
alphabeth = 'dipolemoment'
db = [0,1]

for db_selection in db:
    dict_metrics_ind = independent(alphabeth, db_selection)

    # `folder` only labels the call; print_metrics does not read it.
    print_metrics(dict_metrics_ind[alphabeth], alphabeth, db_selection, folder='Independent')

Database:  No Feature   dipolemoment  

--------------------------------------------------
Model Selection: Onehot | k-mers Sparse Matrix | Autoencoder
--------------------------------------------------
Accuracy Training:  0
Accuracy Testing:  96.99999928474426
Precision:  95.0623450497375
Sensitivity:  99.1578947368421
Specificity:  94.84210526315789
f_1 Score:  97.06511660558593
MCC:  94.09122633457706
AUC Score:  97.0
MSE:  0.03
Mis-Classification:  0.02999999999999998
                                     1                   2  \
Accuracy Training    0.000000000000000   0.000000000000000   
Accuracy Test       96.052628755569458  97.894734144210815   
Precision           93.969849246231149  96.428571428571431   
Sensitivity         98.421052631578945  99.473684210526315   
Specificity         93.684210526315795  96.315789473684205   
f_1 Score           96.143958868894600  97.927461139896394   
MCC                 92.208768761783361  95.837271500683130   
AUC Score           96.0526