# Main

## Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
import pickle as pkl

import torch
import torch.nn as nn
from torch.nn.functional import one_hot 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from TorchCRF import CRF
import tqdm

# Set to True to write the intermediate artifacts (processed CSV files and the
# protease-name -> id pickle) to disk; the cells below check this flag before saving.
SAVE = False

## Process Data

In [2]:
# Load the raw substrate table and strip the single quotes wrapped around its string fields.
# The recorded run emitted a warning pointing at this read_csv call -- presumably pandas'
# mixed-type DtypeWarning (TODO confirm). low_memory=False parses the file in one pass so
# every column gets a single, consistently inferred dtype.
data = pd.read_csv(
    "./Data/Substrate_search.txt",
    sep='\t',
    header=None,
    encoding='utf-8',
    low_memory=False,
)
# Only string cells carry quotes; numeric cells and NaN are passed through untouched.
data = data.map(lambda value: value.strip("'") if isinstance(value, str) else value)

if SAVE is True:
    data.to_csv("./Data/Substrate_search_processed.csv", sep='\t', index=False, header=False, encoding="utf-8")
data.head(5)

  data = pd.read_csv("./Data/Substrate_search.txt", sep='\t', header=None, encoding='utf-8')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,CLE0166975,A01.001,ac-Phe-Tyr(I2),ac-Phe+Tyr(I2),-,-,Ac,Phe,TyI,-,...,,,pepsin A,,,,,,synthetic,
1,CLE0166506,A01.001,alcohol dehydrogenase,peptide-Ala107+Val-peptide,Arg,Thr,Ile,Ala,Val,Asn,...,107.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,
2,CLE0166510,A01.001,alcohol dehydrogenase,peptide-Ala119+Ile-peptide,Thr,Thr,Thr,Ala,Ile,Leu,...,119.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,
3,CLE0166517,A01.001,alcohol dehydrogenase,peptide-Ala178+Tyr-peptide,Gly,Val,Thr,Ala,Tyr,Thr,...,178.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,
4,CLE0166523,A01.001,alcohol dehydrogenase,peptide-Ala218+Cys-peptide,Pro,Ser,Leu,Ala,Cys,Ala,...,218.0,Drosophila melanogaster,pepsin A,,,1-256,NT,VT,non-physiological,


In [3]:
# Keep only the protease identifier and the eight peptide residue columns.
# NOTE(review): the processed CSV is only written by the previous cell when SAVE is True,
# so with SAVE = False it must already exist on disk for this cell to run.
# low_memory=False avoids the warning the recorded run shows for this read_csv call
# (presumably a mixed-type DtypeWarning -- TODO confirm).
data = pd.read_csv(
    "./Data/Substrate_search_processed.csv",
    sep='\t',
    header=None,
    encoding='utf-8',
    low_memory=False,
)

# Column 1 holds the protease code; columns 4..11 hold the eight residue positions.
protease_peptide = pd.concat((data[[1]], data.iloc[:, 4:12]), axis=1)
protease_peptide.columns = ["protease"] + list(range(8))  # residue positions are named 0..7
protease_peptide = protease_peptide.fillna("?")  # missing residues become the "?" token

if SAVE is True:
    protease_peptide.to_csv("./Data/Protease_Peptides.csv", sep='\t', header=True, index=False)
protease_peptide.head(5)

  data = pd.read_csv("./Data/Substrate_search_processed.csv", sep='\t', header=None, encoding='utf-8')


Unnamed: 0,protease,0,1,2,3,4,5,6,7
0,A01.001,-,-,Ac,Phe,TyI,-,-,-
1,A01.001,Arg,Thr,Ile,Ala,Val,Asn,Tyr,Thr
2,A01.001,Thr,Thr,Thr,Ala,Ile,Leu,Asp,Phe
3,A01.001,Gly,Val,Thr,Ala,Tyr,Thr,Val,Asn
4,A01.001,Pro,Ser,Leu,Ala,Cys,Ala,Glu,Asn


## Pro2Pep

In [4]:
# Build the input tensor x.

# Assign every distinct protease name a consecutive integer id, in order of first appearance.
proname2id = {
    name: idx for idx, name in enumerate(protease_peptide["protease"].unique())
}

# Translate each row's protease name into its id.
proname_x = protease_peptide["protease"]
protease_ids = np.fromiter(
    (proname2id[name] for name in proname_x),
    dtype=np.int32,
    count=len(proname_x),
)

# One-hot encode the ids (one row per sample, one column per protease id).
x = one_hot(torch.from_numpy(protease_ids).long())

# Optionally persist the name -> id mapping.
if SAVE is True:
    with open("./Cache/proname2id.pkl", "wb+") as f:
        pkl.dump(proname2id, f)

In [5]:
# Build the target tensor y.

pepname_y = protease_peptide.iloc[:, 1:].values

# Vocabulary of lower-cased residue tokens. The "-" placeholder is pinned to id 0, so it
# takes the first one-hot slot, and "?" (the fill value for missing residues) to id 1;
# every other token receives the next free id in order of first appearance.
pepname2id = {"-": 0, "?": 1}
for token in pepname_y.ravel():
    pepname2id.setdefault(token.lower(), len(pepname2id))

# Encode every residue as its id, keeping the (rows, positions) layout of the table.
token_ids = np.fromiter(
    (pepname2id[token.lower()] for token in pepname_y.ravel()),
    dtype=np.int32,
    count=pepname_y.size,
).reshape(pepname_y.shape)

# One-hot encode along a new trailing axis.
y = one_hot(torch.from_numpy(token_ids).long())

In [6]:
# Build the Dataset and DataLoader
class Pro2Pep_Dataset(Dataset):
    """Pairs the entry at each position of ``x`` with the entry at the same position of ``y``."""

    def __init__(self, x, y) -> None:
        """Store the two aligned tensors; nothing is copied or validated."""
        self.x, self.y = x, y

    def __len__(self) -> int:
        """Number of samples, taken from the first dimension of ``x``."""
        return self.x.size(0)

    def __getitem__(self, index) -> tuple:
        """Return the ``(x[index], y[index])`` pair."""
        sample = self.x[index]
        target = self.y[index]
        return sample, target

# Wrap the encoded tensors in a dataset and serve them as shuffled mini-batches of 32.
pro2pep_dataset = Pro2Pep_Dataset(x=x, y=y)
pro2pep_dataloader = DataLoader(
    dataset=pro2pep_dataset,
    batch_size=32,
    shuffle=True,
)

In [14]:
# Build the Pro2Pep model
class Pro2Pep(nn.Module):
    """Generate a fixed-length sequence of per-position class distributions from one input vector.

    The input is embedded by a linear layer, then a single-step bidirectional LSTM is
    unrolled ``seq_length`` times, each step consuming the previous step's output.
    A final linear layer plus softmax turns every step into a distribution over
    ``output_size`` classes.

    Args:
        input_size: Width of the input feature vector.
        hidden_size: Embedding width. Must be a positive even number because the two
            LSTM directions each contribute ``hidden_size // 2`` features.
        output_size: Number of classes predicted at every sequence position.
        device: Stored on the instance for backward compatibility. ``forward`` no
            longer reads it; state tensors follow the input's device and dtype.
        seq_length: Number of sequence positions to generate.

    Raises:
        ValueError: If ``hidden_size`` is not a positive even integer.
    """

    def __init__(self, input_size, hidden_size, output_size, device, seq_length=8) -> None:
        super().__init__()
        # An odd width would make the bidirectional output (2 * (hidden_size // 2)) narrower
        # than the LSTM input and fc2 expect, which otherwise only surfaces as a shape
        # error inside forward().
        if hidden_size <= 0 or hidden_size % 2 != 0:
            raise ValueError(f"hidden_size must be a positive even integer, got {hidden_size}")

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size // 2, batch_first=True, bidirectional=True)
        self.fc2 = nn.Linear(hidden_size, output_size)

        self.seq_length = seq_length
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.device = device

    def forward(self, x: torch.Tensor):
        """Map ``x`` of shape (batch, input_size) to probabilities of shape (batch, seq_length, output_size)."""
        batch_size = x.size(0)
        step = F.relu(self.fc1(x)).unsqueeze(1)  # (batch, 1, hidden_size)

        # Allocate directly next to the activations (same device and dtype) instead of on
        # a device remembered from construction time, which goes stale if the module is moved.
        outputs = step.new_zeros((batch_size, self.seq_length, self.hidden_size))
        state_shape = (2, batch_size, self.hidden_size // 2)  # 2 = forward + backward direction
        h = step.new_zeros(state_shape)
        c = step.new_zeros(state_shape)

        # Generate position by position: each step's output is the next step's input.
        for position in range(self.seq_length):
            step, (h, c) = self.lstm(step, (h, c))
            # squeeze(1) drops only the length-1 time axis, so a batch of one keeps its batch axis.
            outputs[:, position, :] = step.squeeze(1)

        return F.softmax(self.fc2(outputs), dim=2)


# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the network from the encoded tensors rather than from hard-coded widths, so the
# model keeps matching the data if the vocabularies change:
#   x is (n_samples, n_proteases), y is (n_samples, seq_length, n_residue_tokens).
n_proteases = x.size(-1)
seq_length, n_residue_tokens = y.size(1), y.size(2)
model = Pro2Pep(n_proteases, 256, n_residue_tokens, device, seq_length=seq_length).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
model.train()

epochs = 100
losses = []  # one loss value per mini-batch, in training order
with tqdm.trange(epochs, desc='Training', unit='epoch') as pbar:
    for epoch in pbar:
        for batch_x, batch_y in pro2pep_dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()

            pred = model(batch_x.float())
            # NOTE(review): binary cross-entropy is applied to softmax outputs against
            # one-hot targets. A categorical cross-entropy is the conventional choice for
            # this; left unchanged so results stay comparable with earlier runs.
            loss = F.binary_cross_entropy(pred, batch_y.float())
            loss.backward()
            optimizer.step()

            # .item() already copies the scalar to the host; read it once and reuse it.
            loss_value = loss.item()
            pbar.set_postfix({'loss': loss_value})
            losses.append(loss_value)

Training: 100%|██████████| 100/100 [39:06<00:00, 23.47s/epoch, loss=0.0137]


In [None]:
# Persist only the learned parameters (the state dict), not the module object itself.
pro2pep_weights = model.state_dict()
torch.save(pro2pep_weights, "./Model/Pro2Pep256.pt")

In [24]:
# Debug check: show the device of the last batch left over from the training loop.
# Casting to float does not move a tensor, so the device is read from batch_x directly.
batch_x.device

device(type='cuda', index=0)