# 加载环境

In [1]:
from keras.models import load_model
import matplotlib.pyplot as plt
import numpy as np
from Bio import SeqIO

Using TensorFlow backend.


# 模型载入

In [2]:
# Load the saved Keras model from disk; it is used further down via model.predict.
model = load_model(r'./model/toy.h5')

# 数据载入

In [3]:
# Tab-separated text file; the first two fields of every line are treated as
# protein names and looked up in the FASTA-derived dictionary.
test_file = r'./data/test.txt'
# FASTA file parsed with SeqIO to obtain the sequence for each protein name.
fasta_file = r'./data/test.fasta'

# 超参数

In [4]:
# NOTE(review): WINDOWS_SIZE and STEP are not referenced by any cell in the
# reviewed portion of this notebook — confirm whether they are still needed.
WINDOWS_SIZE = 256
STEP = 128
# Each protein sequence is padded with '0' or truncated to exactly LENGTH
# characters before encoding.
LENGTH = 256

# 数据整理

In [5]:
# Parse the FASTA file once, then derive parallel lists of record names and
# sequence strings from the parsed records.
records = list(SeqIO.parse(fasta_file, "fasta"))
names = [str(record.name) for record in records]
sequences = [str(record.seq) for record in records]
# Lookup table: record name -> sequence string. If a name occurs more than
# once, the last occurrence wins (standard dict(zip(...)) behaviour).
protein_dict = dict(zip(names, sequences))

In [6]:
# Read the protein-pair file. Each non-empty line is tab-separated; field 0 and
# field 1 are the names of the two proteins of a pair. For every pair we keep
# both the names (pro_1 / pro_2) and the sequences looked up in protein_dict
# (pro_1_seq / pro_2_seq). The four lists stay index-aligned.
pro_1 = []
pro_2 = []
pro_1_seq = []
pro_2_seq = []
with open(test_file, 'r') as test:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in test:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no pair; previously they raised IndexError on field 1.
            continue
        line_list = line.split("\t")
        pro_1.append(line_list[0])
        pro_2.append(line_list[1])
        # A name missing from the FASTA file still raises KeyError here.
        pro_1_seq.append(protein_dict[line_list[0]])
        pro_2_seq.append(protein_dict[line_list[1]])

In [7]:
# Sanity check: number of first-partner sequences collected from the pair file.
len(pro_1_seq)

65

In [8]:
# Sanity check: number of second-partner sequences; should match the count above.
len(pro_2_seq)

65

In [9]:
# NOTE(review): random_squences_1 / random_squences_2 are initialised but not
# used in the visible cells — kept in case a later cell reads them.
random_squences_1 = []
random_squences_2 = []
# Force every sequence to exactly LENGTH characters: longer sequences keep only
# their first LENGTH characters, shorter ones are right-padded with '0'
# (a character toOneHot encodes as an all-zero vector).
squences_1 = [seq[:LENGTH].ljust(LENGTH, '0') for seq in pro_1_seq]
squences_2 = [seq[:LENGTH].ljust(LENGTH, '0') for seq in pro_2_seq]
# From here on pro_1_seq / pro_2_seq refer to the fixed-length versions.
pro_1_seq = squences_1
pro_2_seq = squences_2

# 数据合并

In [10]:
# Join the two sequences of each pair into one string, with the character '1'
# between them as a separator (toOneHot encodes '1' as an all-ones vector).
pro_dual = [
    str(first) + '1' + str(pro_2_seq[idx])
    for idx, first in enumerate(pro_1_seq)
]

In [11]:
def toOneHot(base):
    if base == "C":
        base_one_hot = [1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "S":
        base_one_hot = [0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "T":
        base_one_hot = [0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "P":
        base_one_hot = [0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "A":
        base_one_hot = [0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "G":
        base_one_hot = [0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "N":
        base_one_hot = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "D":
        base_one_hot = [0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "E":
        base_one_hot = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "Q":
        base_one_hot = [0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0]
    elif base == "H":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0]
    elif base == "R":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0]
    elif base == "K":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
    elif base == "M":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0]
    elif base == "I":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0]
    elif base == "L":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0]
    elif base == "V":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]
    elif base == "F":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0]
    elif base == "Y":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
    elif base == "W":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]
    elif base == "1":
        base_one_hot = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]     #分隔符   
    else:
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    return base_one_hot

In [12]:
# One-hot encode every concatenated pair string. Each string becomes a matrix
# with one row per character and 20 columns, which is then transposed so the
# stored matrix has 20 rows and one column per character.
result_list = []
for pair_string in pro_dual:
    encoded_rows = np.array([toOneHot(symbol) for symbol in pair_string])
    result_list.append(encoded_rows.T)
# Stack the per-pair matrices into a single array (one entry per pair).
X = np.array(result_list)

In [13]:
# Reshape the encoded batch to (n_pairs, 2*LENGTH+1, 20, 1) float32 for the model;
# 2*LENGTH+1 is the length of a pair string (two padded sequences plus the separator).
# NOTE(review): each per-pair matrix was transposed to (20, 2*LENGTH+1) before
# stacking, so this reshape reinterprets the memory layout rather than swapping
# axes back — a row of X_4D is not the one-hot vector of a single character.
# Presumably the model was trained on the same layout; confirm before changing.
X_4D=X.reshape(X.shape[0],LENGTH*2+1,20,1).astype('float32')

In [14]:
# Run the loaded model on the encoded batch. The displayed result has one row
# per input pair and two columns.
# NOTE(review): which column is the positive class is not visible here — confirm.
predicted_Probaility = model.predict(X_4D)

In [15]:
# Display the raw prediction array returned by the model.
predicted_Probaility

array([[6.2843945e-08, 9.9999988e-01],
       [1.3232706e-01, 8.6767292e-01],
       [1.4069061e-01, 8.5930938e-01],
       [9.9947172e-01, 5.2832282e-04],
       [9.9999881e-01, 1.1819361e-06],
       [9.9996996e-01, 3.0069526e-05],
       [4.8092835e-02, 9.5190722e-01],
       [1.7442338e-02, 9.8255771e-01],
       [3.4308575e-02, 9.6569145e-01],
       [1.0000000e+00, 9.1665633e-12],
       [8.8931823e-01, 1.1068174e-01],
       [5.6057461e-02, 9.4394255e-01],
       [1.0000000e+00, 4.8583069e-12],
       [9.9856806e-01, 1.4319452e-03],
       [1.0000000e+00, 3.9971484e-09],
       [2.1751144e-04, 9.9978250e-01],
       [8.0514508e-01, 1.9485487e-01],
       [1.4349839e-02, 9.8565012e-01],
       [9.9012214e-01, 9.8778121e-03],
       [9.9735439e-01, 2.6456157e-03],
       [9.6252209e-01, 3.7477892e-02],
       [2.1324191e-02, 9.7867578e-01],
       [5.4180396e-01, 4.5819607e-01],
       [5.8951050e-01, 4.1048947e-01],
       [1.0000000e+00, 3.6417941e-08],
       [4.1558291e-03, 9.