In [1]:
import pandas as pd
import numpy as np
import tensorflow.keras.backend as K
from tensorflow import keras
import tensorflow as tf
from numpy import mean
from numpy import std
from tensorflow.keras.metrics import binary_accuracy
from sklearn.metrics import roc_auc_score, fbeta_score, recall_score, precision_score, accuracy_score

In [2]:
def fbeta2(y_true, y_pred, threshold_shift=0):
    """Batch-level F-beta score with beta = 2 (recall weighted above precision).

    True positives, false positives and false negatives are pooled over every
    element of the inputs, because each ``K.sum`` reduces over all axes.

    Args:
        y_true: Ground-truth labels. Presumably 0/1 values with the same shape
            as ``y_pred`` -- TODO confirm against the caller.
        y_pred: Predicted scores; clipped to [0, 1] before thresholding.
        threshold_shift: Added to the scores before rounding, so the effective
            decision threshold is ``0.5 - threshold_shift``.

    Returns:
        The F2 score computed from the pooled counts.
    """
    beta = 2

    # Guard against final-layer activations that can leave the [0, 1] range.
    y_pred = K.clip(y_pred, 0, 1)

    # Binarise the scores, moving the decision threshold away from 0.5 if
    # requested. The extra clip keeps the result in {0, 1} even when a large
    # shift pushes a value past 1.5 (which would otherwise round to 2).
    y_pred_bin = K.clip(K.round(y_pred + threshold_shift), 0, 1)

    # Epsilon on tp keeps precision and recall defined when nothing is predicted
    # or nothing is positive.
    tp = K.sum(K.round(y_true * y_pred_bin)) + K.epsilon()
    fp = K.sum(K.round(K.clip(y_pred_bin - y_true, 0, 1)))
    # False negatives must be counted from the *binarised* predictions, exactly
    # like tp and fp; using the raw scores here ignored `threshold_shift` and
    # dropped positives scored at exactly 0.5 from both tp and fn.
    fn = K.sum(K.round(K.clip(y_true - y_pred_bin, 0, 1)))

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    beta_squared = beta ** 2
    return (beta_squared + 1) * (precision * recall) / (beta_squared * precision + recall + K.epsilon())

In [3]:
def f2(y_true, y_pred):
    """Mean F2 score, computed separately along axis 0 and then averaged.

    Scores are rounded to hard 0/1 predictions, the confusion counts are
    summed over axis 0 (so, assuming inputs shaped (batch, labels), there is
    one count per label -- TODO confirm shape), and the resulting F2 values
    are averaged with ``K.mean``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted scores; rounded to the nearest integer before use.

    Returns:
        The mean of the F2 values, with any NaN entry replaced by 0.
    """
    eps = K.epsilon()
    hard_pred = K.round(y_pred)

    # Confusion counts, reduced over axis 0 only.
    true_pos = K.sum(K.cast(y_true * hard_pred, 'float'), axis=0)
    false_pos = K.sum(K.cast((1 - y_true) * hard_pred, 'float'), axis=0)
    false_neg = K.sum(K.cast(y_true * (1 - hard_pred), 'float'), axis=0)

    precision = true_pos / (true_pos + false_pos + eps)
    recall = true_pos / (true_pos + false_neg + eps)

    # F-beta with beta = 2: (1 + 4) * p * r / (4 * p + r).
    f2_scores = 5 * precision * recall / (4 * precision + recall + eps)
    # Replace any NaN entry with 0 before averaging.
    f2_scores = tf.where(tf.math.is_nan(f2_scores), tf.zeros_like(f2_scores), f2_scores)
    return K.mean(f2_scores)

In [4]:

# Restore the saved model. The custom metric functions are passed by name via
# `custom_objects` so that `load_model` can look them up.
mload = keras.models.load_model(
    'final_all_human_model.h5',
    custom_objects=dict(f2=f2, fbeta2=fbeta2),
)

In [5]:
# Load the labelled E. coli table that was exported from R.
df = pd.read_csv("ProcessedLabelled_Ecoli.txt", low_memory=False)

# Target matrix: every column from 'X20' through 'X95'
# (a label-based `.loc` slice includes both end columns).
y = df.loc[:, 'X20':'X95'].to_numpy()

In [7]:
# Show the loaded frame; for a table this size pandas renders a truncated
# preview (first and last rows, elided columns) rather than every row.
df

Unnamed: 0,SeqCharge,X20,X25,X30,X35,X40,X45,X50,X55,X60,...,X80,X85,X90,X95,maxCV,Charge,ModSequence,Length,Sequence,LabelSequence
0,3QSVEADINGLRR,0,0,0,0,0,0,0,1,1,...,0,0,0,0,60,3,QSVEADINGLRR,12,QSVEADINGLRR,111000000
1,2ANELLINVK,0,0,0,0,0,0,1,0,0,...,0,0,0,0,50,2,ANELLINVK,9,ANELLINVK,1000000000
2,2IGDYAGIK,0,0,0,0,0,0,0,0,1,...,0,0,0,0,65,2,IGDYAGIK,8,IGDYAGIK,11110000
3,2VVGLSTLPEIYEK,0,0,0,0,1,1,0,0,0,...,0,0,0,0,45,2,VVGLSTLPEIYEK,13,VVGLSTLPEIYEK,110000000000
4,2QQIGVVGMAVMGR,0,0,0,0,0,1,1,0,0,...,0,0,0,0,45,2,QQIGVVGMAVMGR,13,QQIGVVGMAVMGR,11000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40962,3SSImVGEVDATTASGIHGLADENEDIR,0,0,0,0,1,1,1,0,0,...,0,0,0,0,45,3,SSImVGEVDATTASGIHGLADENEDIR,27,SSIMVGEVDATTASGIHGLADENEDIR,111000000000
40963,2EQAYQWVEEGK,0,0,0,0,1,1,1,0,0,...,0,0,0,0,50,2,EQAYQWVEEGK,11,EQAYQWVEEGK,111000000000
40964,3IDNAASVIALQWLQLHHQALK,0,0,0,1,0,0,0,0,0,...,0,0,0,0,35,3,IDNAASVIALQWLQLHHQALK,21,IDNAASVIALQWLQLHHQALK,1000000000000
40965,4IDNAASVIALQWLQLHHQALK,0,0,0,0,0,0,1,0,0,...,0,0,0,0,50,4,IDNAASVIALQWLQLHHQALK,21,IDNAASVIALQWLQLHHQALK,1000000000


In [13]:
### Encode every 'SeqCharge' string as a fixed-length row of integer character ids.

# Vocabulary actually present in this dataset: every distinct character across
# all sequences, plus the 'END' token used for padding. It is printed for
# inspection and used below to validate the static mapping.
seq = df['SeqCharge']
vocab = set(''.join([str(i) for i in seq]))
vocab.add('END')
len_vocab = len(vocab)
print(len_vocab)
cv = df['maxCV']

## The character -> id mapping is static rather than rebuilt with
## `enumerate(vocab)`, whose order depends on set iteration order.
## NOTE(review): presumably these ids must match the ones the loaded model was
## trained with -- confirm before editing.
char_index = {'2': 0, '3': 1, 'F': 2, 'a': 3, 'E': 4, 'T': 5, 'M': 6, '5': 7, 'm': 8, 'R': 9, 'END': 10, 'V': 11, 'A': 12, 'K': 13, 'I': 14, 'G': 15, 'W': 16, 'P': 17, 'Q': 18, 'D': 19, '4': 20, 'C': 21, 'N': 22, 'L': 23, 'S': 24, 'Y': 25, 'H': 26}

# Longest sequence in this dataset. Lengths are measured on str(...) so they
# agree with the str(...) conversion used for encoding below.
# NOTE(review): because maxlen comes from the data itself, the truncation below
# never removes anything; if the model expects a fixed input length, maxlen
# should come from the model instead -- confirm.
maxlen = max([len(str(s)) for s in seq])
print(char_index)
print(maxlen)

# Fail early, with the full list, if the data contains characters the static
# mapping does not know (otherwise the encoding dies on the first one with a
# bare KeyError).
unknown_chars = vocab - set(char_index)
if unknown_chars:
    raise KeyError(f"characters missing from char_index: {sorted(unknown_chars)}")

# Truncate to maxlen, map each character to its id, then right-pad shorter
# sequences with the 'END' id so every row has exactly maxlen entries.
pad_id = char_index["END"]
x_name = [str(i)[0:maxlen] for i in seq]
x = np.asarray([
    [char_index[ch] for ch in name] + [pad_id] * (maxlen - len(name))
    for name in x_name
])

26
{'2': 0, '3': 1, 'F': 2, 'a': 3, 'E': 4, 'T': 5, 'M': 6, '5': 7, 'm': 8, 'R': 9, 'END': 10, 'V': 11, 'A': 12, 'K': 13, 'I': 14, 'G': 15, 'W': 16, 'P': 17, 'Q': 18, 'D': 19, '4': 20, 'C': 21, 'N': 22, 'L': 23, 'S': 24, 'Y': 25, 'H': 26}
51


In [14]:
# Sanity check on the encoded input: one row per sequence, `maxlen` columns.
x.shape

(40967, 51)

In [15]:
# Evaluate the loaded model on the encoded sequences and pair each returned
# value with the name the model reports for it.
results = mload.evaluate(x, y)
results_dict = {name: value for name, value in zip(mload.metrics_names, results)}
results_dict



{'loss': 0.206258657928423,
 'acc': 0.90269303,
 'prec': 0.65036684,
 'recall': 0.548703,
 'auc': 0.9357526,
 'f2': 0.3716709,
 'fbeta2': 0.56453085}