# Extract Unique Peptides

## Import Libraries

In [2]:
# import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import sklearn
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizerFast, BertModel


# Import project-local modules
from extract_features import load_features, load_pred_features
from models import BioNN, BioDeepNN, BioResNet
from Model_Training import PeptidesDataLoader

# Constants
SAVE = True
# torch device type strings are lowercase: torch.device("CPU") raises a RuntimeError,
# so the fallback used when CUDA is unavailable must be spelled "cpu".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Model

In [5]:
# Load the BERT encoder and the Bio model, then overwrite their weights with the
# locally saved state dicts.
checkpoint = 'unikei/bert-base-proteins'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# written on a GPU machine still loads when only the CPU is available.
bert_model = BertModel.from_pretrained(checkpoint).to(device)
bert_model.load_state_dict(torch.load('./Model/bert_model.pth', map_location=device))
bert_model.eval()

# NOTE(review): the recorded output of this cell is a RuntimeError. bio_model.pth holds
# nn1/nn2/nn3 weights of shape (512, 10350), (256, 512) and (18, 256), while BioDeepNN()
# expects nn1..nn5 with a 768-wide input. The checkpoint was presumably saved from a
# different architecture or with other constructor arguments - confirm against models.py.
bio_model = BioDeepNN().to(device)
bio_model.load_state_dict(torch.load('./Model/bio_model.pth', map_location=device))
bio_model.eval()

Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Error(s) in loading state_dict for BioDeepNN:
	Missing key(s) in state_dict: "nn4.weight", "nn4.bias", "nn5.weight", "nn5.bias". 
	size mismatch for nn1.weight: copying a param with shape torch.Size([512, 10350]) from checkpoint, the shape in current model is torch.Size([512, 768]).
	size mismatch for nn2.weight: copying a param with shape torch.Size([256, 512]) from checkpoint, the shape in current model is torch.Size([512, 512]).
	size mismatch for nn2.bias: copying a param with shape torch.Size([256]) from checkpoint, the shape in current model is torch.Size([512]).
	size mismatch for nn3.weight: copying a param with shape torch.Size([18, 256]) from checkpoint, the shape in current model is torch.Size([512, 512]).
	size mismatch for nn3.bias: copying a param with shape torch.Size([18]) from checkpoint, the shape in current model is torch.Size([512]).

## Load Data

In [34]:
# Load the protease/peptide table (tab-separated).
data = pd.read_csv("./Data/Protease_Peptides.csv", sep="\t")
# Keep only rows in which every residue column (all columns after the first) is filled;
# "-" marks a missing residue. Using .all() avoids hard-coding the number of residue
# columns, and .copy() makes the filtered frame safe to assign into below.
data = data[(data.iloc[:, 1:] != "-").all(axis=1)].copy()
# 38727 rows (row count noted by the original author)

# Build a lookup from lower-cased three-letter amino-acid codes to one-letter codes.
# amino_table.csv has no header row; column 3 is read as the three-letter code and
# column 2 as the one-letter code.
amino_table = pd.read_csv("./Data/amino_table.csv", header=None, sep="\t")
amino_3_to_1 = dict(zip(amino_table[3].str.lower(), amino_table[2]))

# Convert every residue cell; an unknown code raises KeyError instead of being kept silently.
data.iloc[:, 1:] = data.iloc[:, 1:].map(lambda x: amino_3_to_1[x.lower()])
data.head()

Unnamed: 0,protease,0,1,2,3,4,5,6,7
0,A01.003,F,G,D,L,S,V,T,Y
1,A01.003,L,G,E,F,L,R,T,H
2,A01.003,F,T,S,D,Y,S,K,Y
4,A01.003,Q,G,T,F,T,S,D,Y
5,A01.003,A,Q,D,F,V,Q,W,L


In [46]:
# Write the peptides file (one peptide string per line, no header or index) so that
# extract_features can compute features from it.
residue_rows = data.iloc[:, 1:].to_numpy()
peptides = pd.DataFrame(["".join(row) for row in residue_rows])
peptides.to_csv("./Cache/to_predict_peptides.csv", index=False, header=False)
peptides.head()

Unnamed: 0,0
0,FGDLSVTY
1,LGEFLRTH
2,FTSDYSKY
3,QGTFTSDY
4,AQDFVQWL


In [3]:
# Load all features for the peptides to be predicted.
features_x = load_pred_features().iloc[:, 1:].values  # drop the leading peptides column
features_x = torch.from_numpy(features_x)

# `peptides` is a DataFrame after the previous cell but a plain list once this cell has
# run; unwrapping it only while it is still a DataFrame keeps the cell re-runnable.
if isinstance(peptides, pd.DataFrame):
    peptides = peptides.iloc[:, 0].tolist()
# The prediction loop concatenates each batch of peptide embeddings with the matching
# batch of feature rows, so the two inputs must have the same number of entries.
assert len(peptides) == len(features_x), "peptide/feature row counts differ"
# 512 is presumably the batch size - TODO confirm against PeptidesDataLoader.
dataloader = PeptidesDataLoader(peptides, features_x, 512, shuffle=False)

NameError: name 'peptides' is not defined

In [None]:
# Predict the per-MMP results for the test set.
regular_coefficient = 6.82  # must match the regular_coefficient used during training
batch_outputs = []
with torch.no_grad():
    for batch_peptides, batch_features in dataloader:
        batch_tokens = tokenizer(batch_peptides, return_tensors='pt').to(device)
        hidden_states = bert_model(**batch_tokens).last_hidden_state
        # Flatten each peptide's embedding output into a single row.
        flat_embeddings = hidden_states.view(len(batch_peptides), -1)
        model_input = torch.cat([flat_embeddings, batch_features.to(device)], dim=1)

        batch_outputs.append(bio_model(model_input).to("cpu"))
# Stack all batches and rescale the outputs by the training-time coefficient.
test_pred = torch.cat(batch_outputs, dim=0).detach().numpy() * regular_coefficient