In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.layers import Layer
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import GlobalMaxPool1D, Dense, Dropout, Conv1D, BatchNormalization
import tensorflow as tf
import re

In [None]:
def define_model_1():
    """Build and compile the single-output (sigmoid) sequence classifier.

    The network takes an input of shape ``(3321, 20)`` and feeds it to three
    parallel ``Conv1D`` branches (256 filters each, kernel widths 32, 16 and
    8, ``same`` padding).  Each branch is followed by batch normalisation,
    global max pooling and dropout (rate 0.5).  The three pooled vectors are
    concatenated and passed through dense layers of 64 and 16 ReLU units to a
    single sigmoid unit.

    Returns:
        A ``Model`` compiled with binary cross-entropy (probabilities, not
        logits), Adam at learning rate 1e-4 and the ``accuracy`` metric.
    """
    seq_len, n_channels = 3321, 20
    n_outputs, out_activation = 1, 'sigmoid'

    inputs = Input((seq_len, n_channels))

    def conv_branch(kernel_width):
        # Conv -> BatchNorm -> global max pool -> dropout, all on the shared input.
        h = Conv1D(256, kernel_width, activation='relu', strides=1, padding='same')(inputs)
        h = BatchNormalization()(h)
        h = GlobalMaxPool1D()(h)
        return Dropout(0.5)(h)

    # Branches are created and concatenated in the order 32, 16, 8.
    pooled = [conv_branch(width) for width in (32, 16, 8)]

    hidden = tf.keras.layers.Concatenate()(pooled)
    for units in (64, 16):
        hidden = Dense(units, activation='relu')(hidden)
    outputs = Dense(n_outputs, activation=out_activation)(hidden)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        metrics=['accuracy'],
    )
    return model

In [None]:
def define_model_2():
    """Build and compile the six-class (softmax) sequence classifier.

    The network takes an input of shape ``(3321, 20)`` and feeds it to three
    parallel ``Conv1D`` branches (256 filters each, kernel widths 32, 16 and
    8, ``same`` padding).  Each branch is followed by batch normalisation,
    global max pooling and dropout (rate 0.5).  The three pooled vectors are
    concatenated and passed through dense layers of 64 and 16 ReLU units to a
    six-unit softmax layer.

    Returns:
        A ``Model`` compiled with categorical cross-entropy (probabilities,
        not logits), Adam at learning rate 1e-4 and the ``accuracy`` metric.
    """
    seq_len, n_channels = 3321, 20
    n_classes, out_activation = 6, 'softmax'

    inputs = Input((seq_len, n_channels))

    def conv_branch(kernel_width):
        # Conv -> BatchNorm -> global max pool -> dropout, all on the shared input.
        h = Conv1D(256, kernel_width, activation='relu', strides=1, padding='same')(inputs)
        h = BatchNormalization()(h)
        h = GlobalMaxPool1D()(h)
        return Dropout(0.5)(h)

    # Branches are created and concatenated in the order 32, 16, 8.
    pooled = [conv_branch(width) for width in (32, 16, 8)]

    hidden = tf.keras.layers.Concatenate()(pooled)
    for units in (64, 16):
        hidden = Dense(units, activation='relu')(hidden)
    outputs = Dense(n_classes, activation=out_activation)(hidden)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        # A softmax head yields one distribution over mutually exclusive
        # classes, so the matching loss is categorical cross-entropy
        # (one-hot targets) rather than a per-unit binary cross-entropy.
        # The loss only matters for training/evaluation; weights loaded with
        # load_weights() and predict() are unaffected by it.
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        metrics=['accuracy'],
    )
    return model

In [None]:
def get_feature(seq_matrix):
    """One-hot encode an amino-acid sequence.

    Each residue is mapped to its index in the fixed 20-letter alphabet
    ``A C D E F G H I K L M N P Q R S T V W Y`` and then to the matching row
    of a 20x20 identity matrix.

    Args:
        seq_matrix: Iterable of single residue letters (e.g. a ``str``).
            Letters must be upper-case members of the alphabet above.

    Returns:
        list: One float32 vector of length 20 per residue, in input order.
        An empty input yields an empty list.

    Raises:
        KeyError: If a residue is not one of the 20 letters.
    """
    ind_to_char = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    char_to_ind = {char: i for i, char in enumerate(ind_to_char)}
    # Integer encoding (explicit integer dtype so an empty sequence still indexes cleanly)
    integer_encoded = np.asarray([char_to_ind[char] for char in seq_matrix], dtype=np.intp)
    # Build the identity matrix once and gather every row in a single
    # indexing operation instead of constructing a new matrix per residue.
    identity = np.eye(len(ind_to_char), dtype=np.float32)
    return list(identity[integer_encoded])

In [None]:
def seq_padding(data, maxlen=3321):
    """Zero-pad an encoded sequence along its first axis up to ``maxlen`` rows.

    Args:
        data: Two-dimensional array-like (rows x features); rows are appended
            at the end, the second axis is left untouched.  ``len(data)`` is
            expected to be at most ``maxlen`` -- longer inputs would request
            a negative pad width and are not supported.
        maxlen: Number of rows the padded result should have.  Defaults to
            3321.

    Returns:
        list: A one-element list holding the padded tensor.
    """
    rows_to_add = maxlen - len(data)
    padded = tf.pad(data, [[0, rows_to_add], [0, 0]])
    return [padded]

In [None]:
def run_predict(id_seqs):
    id = []
    is_hsp = []
    prob = []
    is_hsp20 = []
    is_hsp40 = []
    is_hsp60 = []
    is_hsp70 = []
    is_hsp90 = []
    is_hsp100 = []
    seq_len = len(id_seqs)
    for i in range(seq_len):
        record = id_seqs[i]
        if i % 2 == 0:
            if record.startswith('>'):
                continue
            else:
                return 0, "format error, must in fasta format"
        if i % 2 == 1:
            if len(record) == 0:
                return 2, "Please enter the sequence of 20 amino acids"
            if len(record) <= 3321:
                seq = record.upper()
                for seq_one in seq:
                    if seq_one in 'ACDEFGHIKLMNPQRSTVWY':
                        continue
                    else:
                        return 2, "Please enter the sequence of 20 amino acids"
                fea_df = get_feature(seq)
                fea_padding = seq_padding(fea_df)
                feature = np.array(fea_padding)
                model_1 = define_model_1()
                model_1.load_weights('HSP/models/hsp_model.h5')
                res_1 = model_1.predict(feature)
                id.append(id_seqs[i - 1].split('>')[-1])
                prob.append(np.round(np.squeeze(res_1),4))
                if res_1 > 0.5:
                    is_hsp.append("True")
                    model_2 = define_model_2()
                    model_2.load_weights(models/hsp_class_model.h5')
                    res_2 = model_2.predict(feature)
                    for k in range(len(res_2)):
                        res_2[k][np.argmax(res_2[k])] = 1
                        res_2[k][res_2[k] < 1] = 0
                        str_res_2 = res_2.astype(str)
                        if str_res_2[0][0] == "1.0":
                            is_hsp20.append("True")
                        else:
                            is_hsp20.append("-")
                        if str_res_2[0][1] == "1.0":
                            is_hsp40.append("True")
                        else:
                            is_hsp40.append("-")
                        if str_res_2[0][2] == "1.0":
                            is_hsp60.append("True")
                        else:
                            is_hsp60.append("-")
                        if str_res_2[0][3] == "1.0":
                            is_hsp70.append("True")
                        else:
                            is_hsp70.append("-")
                        if str_res_2[0][4] == "1.0":
                            is_hsp90.append("True")
                        else:
                            is_hsp90.append("-")
                        if str_res_2[0][5] == "1.0":
                            is_hsp100.append("True")
                        else:
                            is_hsp100.append("-")
                else:
                    is_hsp.append("False")
                    is_hsp20.append("-")
                    is_hsp40.append("-")
                    is_hsp60.append("-")
                    is_hsp70.append("-")
                    is_hsp90.append("-")
                    is_hsp100.append("-")
            else:
                id.append(id_seqs[i - 1].split('>')[-1])
                is_hsp.append("-")
                prob.append("Sequence length must be <= 3321")
                is_hsp20.append("-")
                is_hsp40.append("-")
                is_hsp60.append("-")
                is_hsp70.append("-")
                is_hsp90.append("-")
                is_hsp100.append("-")
    res_df = pd.DataFrame(
        columns=['id', 'is_hsp', 'prob', 'is_hsp20', 'is_hsp40', 'is_hsp60', 'is_hsp70', 'is_hsp90', 'is_hsp100'])
    res_df.id = id
    pro = []
    for z in prob:
        pro.append(str(z))
    res_df.is_hsp = is_hsp
    res_df.prob = pro
    res_df.is_hsp20 = is_hsp20
    res_df.is_hsp40 = is_hsp40
    res_df.is_hsp60 = is_hsp60
    res_df.is_hsp70 = is_hsp70
    res_df.is_hsp90 = is_hsp90
    res_df.is_hsp100 = is_hsp100
    return 1, res_df.to_json(orient="records")

In [None]:
if __name__ == '__main__':
    # Path of the FASTA file to score.  run_predict() reads its argument as
    # alternating header / sequence strings, so the file is loaded into a
    # list of lines first; each record must occupy exactly two lines
    # (a '>' header followed by the whole sequence on one line).
    data = "fastafile"
    with open(data) as fasta_handle:
        # Drop surrounding whitespace/newlines and skip blank lines.
        fasta_lines = [line.strip() for line in fasta_handle if line.strip()]
    predict_result = run_predict(fasta_lines)
    print(predict_result)