# Main

## Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
import pickle as pkl

import torch
import torch.nn as nn
from torch.nn.functional import one_hot 
from torch.utils.data import Dataset, DataLoader

from TorchCRF import CRF

# Master switch for persisting artefacts: when True, the cells below write the
# cleaned tables under ./Data and the protease-id mapping under ./Cache.
SAVE = False

## Process Data

In [None]:
# Strip the single quotes that wrap string fields in the raw export.
def _strip_quotes(cell):
    """Return *cell* without leading/trailing single quotes; non-strings pass through."""
    if not isinstance(cell, str):
        return cell
    return cell.strip("'")


data = pd.read_csv(
    "./Data/Substrate_search.txt",
    sep="\t",
    header=None,
    encoding="utf-8",
)
data = data.map(_strip_quotes)

# Persist the cleaned table only when saving is switched on.
if SAVE is True:
    data.to_csv(
        "./Data/Substrate_search_processed.csv",
        sep="\t",
        index=False,
        header=False,
        encoding="utf-8",
    )
data.head(5)

In [2]:
# Keep only the protease name and the eight peptide-chain columns.
data = pd.read_csv(
    "./Data/Substrate_search_processed.csv",
    sep="\t",
    header=None,
    encoding="utf-8",
)

# Column 1 becomes "protease"; columns 4..11 become peptide positions 0..7.
protease_column = data[[1]]
residue_columns = data.iloc[:, 4:12]
protease_peptide = pd.concat([protease_column, residue_columns], axis=1)
protease_peptide.columns = ["protease", *range(8)]

# Missing cells are marked with "?" so every entry is a string token.
protease_peptide = protease_peptide.fillna("?")

# Persist the reduced table only when saving is switched on.
if SAVE is True:
    protease_peptide.to_csv(
        "./Data/Protease_Peptides.csv",
        sep="\t",
        header=True,
        index=False,
    )
protease_peptide.head(5)

  data = pd.read_csv("./Data/Substrate_search_processed.csv", sep='\t', header=None, encoding='utf-8')


Unnamed: 0,protease,0,1,2,3,4,5,6,7
0,A01.001,-,-,Ac,Phe,TyI,-,-,-
1,A01.001,Arg,Thr,Ile,Ala,Val,Asn,Tyr,Thr
2,A01.001,Thr,Thr,Thr,Ala,Ile,Leu,Asp,Phe
3,A01.001,Gly,Val,Thr,Ala,Tyr,Thr,Val,Asn
4,A01.001,Pro,Ser,Leu,Ala,Cys,Ala,Glu,Asn


## Pro2Pep

In [3]:
# Build the model input x.

# Give every distinct protease name a consecutive integer id, numbered in
# order of first appearance in the table.
proname2id = {
    proname: idx
    for idx, proname in enumerate(protease_peptide["protease"].unique())
}

# Translate the protease column into its id sequence.
proname_x = protease_peptide["protease"]
x = np.fromiter(
    (proname2id[proname] for proname in proname_x),
    dtype=np.int32,
    count=len(proname_x),
)
x = torch.from_numpy(x).long()

# One-hot encode the ids (class count is inferred from the largest id).
x = one_hot(x)

# Cache the name -> id mapping only when saving is switched on.
if SAVE is True:
    with open("./Cache/proname2id.pkl", "wb+") as cache_file:
        pkl.dump(proname2id, cache_file)

In [18]:
# Build the training targets y.

# Raw residue names: every column after "protease", one row per substrate.
pepname_y = protease_peptide.iloc[:, 1:].values

# Lower-case the table once (row-major) so the vocabulary pass and the
# encoding pass below see exactly the same keys.
pepname_lower = [pepname.lower() for pepname in pepname_y.flatten()]

# Map residue names to integer ids. The placeholders are fixed up front:
# "-" is 0, so it occupies the first one-hot slot, and "?" (the fill value
# used for missing cells) is 1. Other names get the next free id in order
# of first appearance.
pepname2id = {"-": 0, "?": 1}
for pepname in pepname_lower:
    # len() is evaluated before insertion, so it is always the next free id;
    # setdefault leaves names that are already known untouched.
    pepname2id.setdefault(pepname, len(pepname2id))

# Encode every cell as its id and restore the original 2-D layout.
y = np.fromiter(
    (pepname2id[pepname] for pepname in pepname_lower),
    dtype=np.int32,
    count=len(pepname_lower),
).reshape(pepname_y.shape)
y = torch.from_numpy(y).long()

# Pin the one-hot width to the vocabulary size instead of letting it be
# inferred from the largest id present, so the encoding always matches
# pepname2id even if the highest ids do not occur in the data.
y = one_hot(y, num_classes=len(pepname2id))

In [27]:
# Dataset wrapper that pairs encoded inputs with their targets.
class Pro2Pep_Dataset(Dataset):
    """Index-aligned view over an input tensor ``x`` and a target tensor ``y``."""

    def __init__(self, x, y) -> None:
        """Store the two tensors as given; no copy or validation is performed."""
        self.x, self.y = x, y

    def __getitem__(self, index) -> tuple:
        """Return the ``(x[index], y[index])`` pair."""
        sample = self.x[index]
        target = self.y[index]
        return sample, target

    def __len__(self) -> int:
        """Return the number of samples, i.e. the size of ``x`` along dim 0."""
        return self.x.size(0)

# Wrap the encoded tensors and serve them in shuffled mini-batches of 32.
pro2pep_dataset = Pro2Pep_Dataset(x=x, y=y)
pro2pep_dataloader = DataLoader(
    dataset=pro2pep_dataset,
    batch_size=32,
    shuffle=True,
)

In [None]:
# 构建Pro2Pep模型
class Pro2Pep(nn.Module):
    def __init__(self, input_size, hidden_size, output_size) -> None:
        super(Pro2Pep, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.crf = CRF(output_size)
    
    def forward(self, x: torch.Tensor):
        output = self.fc1(x)
        output, h = self.lstm(output)
