# Extract Unique Peptides

## Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import sklearn
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizerFast, BertModel


# Local project modules
from extract_features import load_features, load_pred_features
from models import BioNN, BioDeepNN, BioResNet
from Model_Training import PeptidesDataLoader

# Notebook-wide settings
SAVE = True
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Model

In [2]:
# Load the BERT encoder and the Bio model, then restore both from the weight
# files stored under ./Model.
checkpoint = 'unikei/bert-base-proteins'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

bert_model = BertModel.from_pretrained(checkpoint).to(device)
# 10350 is the width of the first Linear layer and labels_num=18 the width of
# the last one (see the module repr printed by this cell).
bio_model = BioDeepNN(10350, labels_num=18).to(device)

# Overwrite each module's parameters with its saved state dict and switch it
# to eval mode (disables dropout) before inference.
for module, weights_path in (
    (bert_model, './Model/bert_model.pth'),
    (bio_model, './Model/bio_model.pth'),
):
    module.load_state_dict(torch.load(weights_path, map_location=device))
    module.eval()

bio_model

Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BioDeepNN(
  (nn1): Linear(in_features=10350, out_features=512, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (nn2): Linear(in_features=512, out_features=512, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (nn3): Linear(in_features=512, out_features=512, bias=True)
  (dropout3): Dropout(p=0.5, inplace=False)
  (nn4): Linear(in_features=512, out_features=256, bias=True)
  (dropout4): Dropout(p=0.5, inplace=False)
  (nn5): Linear(in_features=256, out_features=18, bias=True)
)

## Load Data

In [3]:
# Load the raw data (this cell saves its result to ./Cache, so it does not need
# to be run again afterwards).
# Assumes every column after the first holds one residue of an 8-residue
# peptide -- confirm against the CSV.
PEPTIDE_LENGTH = 8

data = pd.read_csv("./Data/Protease_Peptides.csv", sep="\t")
# Drop rows that contain the "-" placeholder in any residue column.
# .copy() detaches the filtered frame from the raw one, so the .iloc assignment
# further down is a plain write and not a write to a slice of another frame.
complete_rows = (data.iloc[:, 1:] != "-").sum(axis=1) == PEPTIDE_LENGTH
data = data[complete_rows].copy()
# 38727 rows, of which 26794 peptides are distinct; several proteases may map
# to the same peptide.

# Convert three-letter amino-acid codes to one-letter codes.
# The table has no header: column 3 is used as the (case-insensitive) lookup key
# and column 2 as the replacement value.
amino_table = pd.read_csv("./Data/amino_table.csv", header=None, sep="\t")
amino_3_to_1 = dict(zip(amino_table[3].str.lower(), amino_table[2]))

data.iloc[:, 1:] = data.iloc[:, 1:].map(lambda residue: amino_3_to_1[residue.lower()])

# Write the peptides file that extract_features reads to build the features.
peptides = pd.DataFrame(["".join(residues) for residues in data.iloc[:, 1:].to_numpy()])
peptides.to_csv("./Cache/to_predict_peptides.csv", index=False, header=None)
peptides.head()

Unnamed: 0,0
0,FGDLSVTY
1,LGEFLRTH
2,FTSDYSKY
3,QGTFTSDY
4,AQDFVQWL


In [3]:
def build_pred_inputs(mmp_id, sequence_csv, batch_size=512):
    """Build the prediction inputs for one MMP.

    Parameters
    ----------
    mmp_id : int
        Identifier forwarded to ``load_pred_features``.
    sequence_csv : str
        Header-less CSV whose first column holds the peptide sequences.
    batch_size : int, default 512
        Batch size handed to ``PeptidesDataLoader``.

    Returns
    -------
    tuple
        ``(peptides, features_x, dataloader)``: the peptide strings as a list,
        the feature matrix as a float tensor, and an unshuffled loader over both.
    """
    peptides = pd.read_csv(sequence_csv, header=None).iloc[:, 0].values.tolist()

    # Load all features; the first column (the peptides) is not kept.
    features_x = load_pred_features(mmp_id).iloc[:, 1:].values
    features_x = torch.from_numpy(features_x).float()

    # The loader expects labels; they are never read during prediction, so a
    # zero array shaped like the features is passed as a placeholder.
    dataloader = PeptidesDataLoader(
        peptides,
        features_x,
        np.zeros_like(features_x),
        batch_size=batch_size,
        shuffle=False,
    )
    return peptides, features_x, dataloader


# Peptides unique to MMP3
mmp3_original_unique_peptides, mmp3_features_x, mmp3_dataloader = build_pred_inputs(
    3, "./MMP3_unique_sequence.csv"
)

# Peptides unique to MMP9
mmp9_original_unique_peptides, mmp9_features_x, mmp9_dataloader = build_pred_inputs(
    9, "./MMP9_unique_sequence.csv"
)

Extracting binary features: 1944it [00:00, 457971.63it/s]
Extracting cksaap features: 1944it [00:00, 57928.10it/s]
Extracting binary features: 134it [00:00, 164868.51it/s]
Extracting cksaap features: 134it [00:00, 32110.88it/s]


In [8]:
# Predict the per-label outputs of the Bio model for each peptide set.
regular_coefficient = 6.82  # must match the regular_coefficient used during training


def predict_scores(dataloader, scale=regular_coefficient):
    """Run the BERT + Bio pipeline over ``dataloader`` and return scaled outputs.

    Uses the notebook-level ``tokenizer``, ``bert_model``, ``bio_model`` and
    ``device``. Each batch is tokenized, embedded with BERT, flattened to one
    row per peptide, concatenated with the batch's feature tensor and passed
    through the Bio model.

    Parameters
    ----------
    dataloader : iterable
        Yields ``(peptides, features, labels)`` batches; labels are ignored.
    scale : float
        Factor the raw model outputs are multiplied by.

    Returns
    -------
    numpy.ndarray
        All batch outputs stacked along axis 0 and multiplied by ``scale``.
    """
    batch_outputs = []
    with torch.no_grad():
        for peptides_batch, features_batch, _ in dataloader:
            tokens = tokenizer(peptides_batch, return_tensors='pt').to(device)
            # Flatten the per-token embeddings into a single vector per peptide.
            embedded = bert_model(**tokens).last_hidden_state.view(len(peptides_batch), -1)
            bio_input = torch.cat([embedded, features_batch.to(device)], dim=1)
            batch_outputs.append(bio_model(bio_input).to("cpu"))
    return torch.cat(batch_outputs, dim=0).detach().numpy() * scale


pred_mmp3 = predict_scores(mmp3_dataloader)
pred_mmp9 = predict_scores(mmp9_dataloader)

# Save the predictions
np.save("./Result/pred_mmp3.npy", pred_mmp3)
np.save("./Result/pred_mmp9.npy", pred_mmp9)

In [16]:
# Remove column 2 from the MMP3 predictions (presumably MMP3's own output
# column -- confirm against the label order used in training).
# The width check makes the cell idempotent: the model emits 18 columns
# (labels_num=18), so once the column is gone a re-run leaves the array alone
# instead of silently dropping a second column.
if pred_mmp3.shape[1] == 18:
    pred_mmp3 = np.delete(pred_mmp3, 2, axis=1)
pred_mmp3.shape

(1944, 17)

In [22]:
# Number of peptides (rows) with at least one positive value in pred_mmp3.
(pred_mmp3 > 0).any(axis=1).sum()

1349

In [None]:
1944  # NOTE(review): leftover scratch value in a never-executed cell; equals the row count of pred_mmp3 (1944) -- consider deleting this cell

In [23]:
# Remove column 5 from the MMP9 predictions (presumably MMP9's own output
# column -- confirm against the label order used in training).
# The width check makes the cell idempotent: the model emits 18 columns
# (labels_num=18), so once the column is gone a re-run leaves the array alone
# instead of silently dropping a second column.
if pred_mmp9.shape[1] == 18:
    pred_mmp9 = np.delete(pred_mmp9, 5, axis=1)
pred_mmp9.shape

(134, 17)

In [24]:
# Number of peptides (rows) with at least one positive value in pred_mmp9.
(pred_mmp9 > 0).any(axis=1).sum()

67