# 加载环境

In [1]:
from Bio import SeqIO
import numpy as np
import random
import re

from keras.utils import np_utils

Using TensorFlow backend.


# 超参数

In [2]:
SEED = 727
LENGTH = 512

# 数据载入

In [3]:
ppi_file = r'.\data\HIPPIE_v2.1.tsv'
fasta_file = r'.\data\HUMAN_uniport.fasta'

In [4]:
records = list(SeqIO.parse(fasta_file, "fasta"))
names = []
sequences = []
for seq in records:
    names.append(str(seq.name))
    sequences.append(str(seq.seq))
protein_dict = dict(zip(names,sequences))

# 数据筛选

In [5]:
with open(ppi_file, 'r') as ppi:
    pro_1 = []
    pro_2 = []
    pro_1_seq = []
    pro_2_seq = []
    raw_pro_1 = []
    raw_pro_2 = []
    for line in ppi.readlines():
        line_list = line.strip().split("\t")
        raw_pro_1.append(line_list[0])
        raw_pro_2.append(line_list[2])
        if re.search(r'_HUMAN', line_list[0]) and re.search(r'_HUMAN', line_list[2]) and float(line_list[4]) >= 0.72 and line_list[0] in protein_dict and line_list[2] in protein_dict:
            if len(protein_dict[line_list[0]]) <= LENGTH and len(protein_dict[line_list[2]]) <= LENGTH:
                pro_1.append(line_list[0])
                pro_2.append(line_list[2])
                pro_1_seq.append(protein_dict[line_list[0]])
                pro_2_seq.append(protein_dict[line_list[2]])

# 随机对照

In [6]:
random.seed(SEED)
number_random_seqences = len(pro_1)
random_pro_1 = []
random_pro_2 = []
random_pro_1_seq = []
random_pro_2_seq = []
raw_pro_1_2 = []
raw_pro_2_1 = []
for i in range(len(raw_pro_1)):
    raw_pro_1_2.append(raw_pro_1[i] + raw_pro_2[i])
    raw_pro_2_1.append(raw_pro_2[i] + raw_pro_1[i])
count = 0
while (count < number_random_seqences):
    pro_1_index = random.randint(0,len(names))
    pro_2_index = random.randint(0,len(names))
    random_pro_1_2 = []
    for i in range(len(random_pro_1)):
        random_pro_1_2.append(random_pro_1[i] + random_pro_2[i]) 
    if  random_pro_1_2 in raw_pro_1_2 or random_pro_1_2 in raw_pro_2_1:
        pass
    else:
        if len(protein_dict[names[pro_1_index]]) <= LENGTH and len(protein_dict[names[pro_2_index]]) <= LENGTH:

            random_pro_1_seq.append(protein_dict[names[pro_1_index]])
            random_pro_2_seq.append(protein_dict[names[pro_2_index]])
            count += 1

In [7]:
len(pro_1_seq)

26533

In [8]:
len(random_pro_1_seq)

26533

# 数据合并

In [9]:
squences_1 = []
squences_2 = []
random_squences_1 = []
random_squences_2 = []
for seq in pro_1_seq:
    if len(seq) < LENGTH:
        squences_1.append("".join(list(seq) + (['0'] * (LENGTH - len(seq)))))
    else:
        squences_1.append(str(seq[0:LENGTH]))
for seq in pro_2_seq:
    if len(seq) < LENGTH:
        squences_2.append("".join(list(seq) + (['0'] * (LENGTH - len(seq)))))
    else:
        squences_2.append(str(seq[0:LENGTH]))
for seq in random_pro_1_seq:
    if len(seq) < LENGTH:
        random_squences_1.append("".join(list(seq) + (['0'] * (LENGTH - len(seq)))))
    else:
        random_squences_1.append(str(seq[0:LENGTH]))
for seq in random_pro_2_seq:
    if len(seq) < LENGTH:
        random_squences_2.append("".join(list(seq) + (['0'] * (LENGTH - len(seq)))))
    else:
        random_squences_2.append(str(seq[0:LENGTH]))
pro_1_seq = squences_1
pro_2_seq = squences_2
random_pro_1_seq = random_squences_1
random_pro_2_seq = random_squences_2

In [10]:
pro_dual = []
for i in range(len(pro_1_seq)):
    pro_dual.append(str(pro_1_seq[i]) + '1' + str(pro_2_seq[i]))
    pro_dual.append(str(pro_2_seq[i]) + '1' + str(pro_1_seq[i]))
for i in range(len(random_pro_1_seq)):
    pro_dual.append(str(random_pro_1_seq[i]) + '1' + str(random_pro_2_seq[i]))
    pro_dual.append(str(random_pro_2_seq[i]) + '1' + str(random_pro_1_seq[i]))

# 数据编码

In [11]:
def toOneHot(base):
    if base == "C":
        base_one_hot = [1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "S":
        base_one_hot = [0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "T":
        base_one_hot = [0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "P":
        base_one_hot = [0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "A":
        base_one_hot = [0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "G":
        base_one_hot = [0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "N":
        base_one_hot = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "D":
        base_one_hot = [0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "E":
        base_one_hot = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0]
    elif base == "Q":
        base_one_hot = [0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0]
    elif base == "H":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0]
    elif base == "R":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0]
    elif base == "K":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
    elif base == "M":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0]
    elif base == "I":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0]
    elif base == "L":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0]
    elif base == "V":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]
    elif base == "F":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0]
    elif base == "Y":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
    elif base == "W":
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]
    elif base == "1":
        base_one_hot = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]     #分隔符   
    else:
        base_one_hot = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    return base_one_hot

In [12]:
squence_list = []
result_list = []
for seq in pro_dual:
    seq_list = list(seq)
    for base in seq_list:
        squence_list.append(toOneHot(base))
    squence_array = np.array(squence_list)
    result_list.append(np.transpose(squence_array))
    squence_list = []
X = np.array(result_list)

MemoryError: 

In [13]:
len(X)

18104

# 标签标记

In [14]:
Y_list = ["1"] * len(pro_1_seq) * 2 + ["0"] * len(random_pro_1_seq) * 2
Y = np.array(Y_list)
Y = np.transpose(Y)

# 垃圾回收

In [15]:
del(records, names, sequences, protein_dict, 
    pro_1, pro_2, pro_1_seq, pro_2_seq,
    random_pro_1, random_pro_2, random_pro_1_seq, random_pro_2_seq,
    squences_1, squences_2, random_squences_1, random_squences_2,
    pro_dual, squence_list, result_list, seq_list, Y_list)

# 数据分割

In [16]:
np.random.seed(SEED)
msk = np.random.rand(len(Y)) < 0.8
X_train = X[msk]
Y_train = Y[msk]
X_test = X[~msk]
Y_test = Y[~msk]
print("number of data for train: " + str(len(X_train)))
print("number of data for test: " + str(len(X_test)))
print("number of label for train: " + str(len(Y_train)))
print("number of label for test: " + str(len(Y_test)))

number of data for train: 14522
number of data for test: 3582
number of label for train: 14522
number of label for test: 3582


# 格式整理

In [17]:
X_train4D=X_train.reshape(X_train.shape[0],LENGTH*2+1,20,1).astype('float32')
X_test4D=X_test.reshape(X_test.shape[0],LENGTH*2+1,20,1).astype('float32')

In [18]:
Y_train_One_Hot = np_utils.to_categorical(Y_train)
Y_test_One_Hot = np_utils.to_categorical(Y_test)

# 垃圾回收

In [19]:
del(X, Y, X_train, X_test, Y_train)

# 保存数据

In [20]:
np.save("./data/X_train4D.npy",X_train4D)
np.save("./data/X_test4D.npy",X_test4D)
np.save("./data/Y_train_One_Hot.npy",Y_train_One_Hot)
np.save("./data/Y_test_One_Hot.npy",Y_test_One_Hot)
np.save("./data/Y_test_One_Hot.npy",Y_test_One_Hot)
np.save("./data/Y_test.npy",Y_test)