In [9]:
# import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, mean_squared_error, mean_absolute_error

from catboost import CatBoostRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizerFast, BertModel


# 导入其他文件
from extract_features import load_features
from models import BioNN, BioDeepNN, BioResNet
from Model_Training import PeptidesDataLoader

# Run-wide settings
SAVE = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the preprocessed peptide table from disk
data = pd.read_csv("./Data/processed_peptides10.csv")

# First column holds the amino-acid sequences; keep them as a plain list of strings
peptides = data.iloc[:, 0].values.tolist()

# Pre-extracted features: every column except the first one is used
feature_table = load_features()
features_x = feature_table.iloc[:, 1:].values

# Remaining columns of the peptide table are the MMP targets.
# They are divided by the largest absolute value found anywhere in the target matrix;
# the unscaled matrix stays available so this step can be re-run safely.
raw_mmp_y = data.iloc[:, 1:].values
regular_coefficient = np.abs(raw_mmp_y).max()
all_mmp_y = raw_mmp_y / regular_coefficient

# Tokenizer that belongs to the pretrained protein BERT checkpoint
checkpoint = 'unikei/bert-base-proteins'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

# K-fold cross-validation harness.
# NOTE: in its current form this is a debugging scaffold -- the three `break`
# statements at the bottom stop execution after the forward pass of the first
# batch of the first epoch of the first fold. No loss, backward pass or
# optimizer step is executed here.
N_SPLITS = 5  # single source of truth for the fold count (KFold and result arrays)
model_class = BioResNet
features = features_x
y = all_mmp_y
kf = KFold(n_splits=N_SPLITS, random_state=33, shuffle=True)
# Per-fold, per-target result buffers. They are only allocated here, never filled.
rg_errors = np.zeros((N_SPLITS, y.shape[1], 3))
cl_errors = np.zeros((N_SPLITS, y.shape[1], 4))  # metrics: auc, f1, precision, recall
for i, (train_id, test_id) in enumerate(kf.split(features)):
    # Slice this fold's sequences, features and targets; arrays become float32 tensors.
    train_peptides = [peptides[x] for x in train_id]
    train_features = torch.from_numpy(features[train_id]).float()
    train_y = torch.from_numpy(y[train_id]).float()

    test_peptides = [peptides[x] for x in test_id]
    test_features = torch.from_numpy(features[test_id]).float()
    test_y = torch.from_numpy(y[test_id]).float()

    train_dataloader = PeptidesDataLoader(train_peptides, train_features, train_y, 512, shuffle=True)
    test_dataloader = PeptidesDataLoader(test_peptides, test_features, test_y, 512, shuffle=False)

    # Fresh models for every fold.
    bert_model = BertModel.from_pretrained(checkpoint).to(device)
    # Input width = flattened BERT output + hand-crafted features.
    # Presumably 768 is the BERT hidden size and 10 the token count per peptide -- TODO confirm.
    bio_model = model_class(768 * 10 + features.shape[1]).to(device)

    # Optimizer: small learning rate for the BERT body, larger one for the
    # pooler dense layer and the downstream model.
    # NOTE(review): the forward pass below reads `last_hidden_state`, not
    # `pooler_output`, so the pooler parameters may never receive gradients --
    # confirm whether they belong in the optimizer at all.
    train_bert_params = [id(bert_model.pooler.dense.bias), id(bert_model.pooler.dense.weight)]
    # Materialised as a list (not a one-shot `filter` iterator) so it can be
    # inspected or reused after the optimizer has consumed it.
    bert_params = [p for p in bert_model.parameters() if id(p) not in train_bert_params]
    optimizer = optim.Adam(
        [
            {"params": bert_params, "lr": 1e-6},
            {"params": bert_model.pooler.dense.bias, "lr": 1e-3},
            {"params": bert_model.pooler.dense.weight, "lr": 1e-3},
            {"params": bio_model.parameters(), "lr": 1e-3}
        ], lr=1e-3
    )

    criterion = nn.MSELoss()

    # Training loop
    bert_model.train()
    bio_model.train()
    train_epochs = 100
    loss_track = []
    for epoch in tqdm(range(train_epochs), total=train_epochs):
        loss_track_epoch = []
        for peptides_epoch, features_epoch, labels_epoch in train_dataloader:
            optimizer.zero_grad()

            # No padding is requested, so this presumably relies on every
            # peptide in the batch tokenizing to the same length -- TODO confirm.
            tokens_epoch = tokenizer(peptides_epoch, return_tensors='pt')
            input_ids = tokens_epoch["input_ids"].to(device)
            attention_mask = tokens_epoch["attention_mask"].to(device)
            token_type_ids = tokens_epoch["token_type_ids"].to(device)
            # Flatten the per-token embeddings into one vector per peptide.
            bert_output = bert_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            ).last_hidden_state.view(len(peptides_epoch), -1)
            break  # debug: stop after the first batch
        break  # debug: stop after the first epoch
    break  # debug: stop after the first fold

Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/100 [00:00<?, ?it/s]


In [18]:
# Display the first entry of the peptide list as a quick sanity check.
peptides[0]

'ASGGMGNK'

In [15]:
# Reload the preprocessed peptide table into `data`.
# NOTE(review): both the pandas import and this read_csv call repeat statements
# from the first cell of the notebook; this cell looks redundant on a clean
# top-to-bottom run -- confirm before removing.
import pandas as pd
data = pd.read_csv("./Data/processed_peptides10.csv")

In [17]:
# Display the cached peptide table for inspection (the result is not stored).
# NOTE(review): in the rendered output the column header is itself a
# peptide-like string ("ASGGMGNK") next to an "Unnamed: 0" index column, which
# suggests a data row may have been written or read as the header -- confirm
# how ./Cache/peptides.csv is produced and whether `header=None` / `index_col=0`
# is needed here.
pd.read_csv("./Cache/peptides.csv")

Unnamed: 0,ASGGMGNK
0,KIYNYDCE
1,KIYDYDCE
2,KIYDLDCE
3,ASGGLGNK
4,DTYRYIDY
...,...
18403,YPLHLQYN
18404,YPLSLRSL
18405,YPRNIGGQ
18406,YVRHLINN


In [14]:
# Tokenize the entire peptide list in a single call and display the returned
# PyTorch tensors (input_ids, token_type_ids, attention_mask).
# NOTE(review): no padding or truncation is requested, so this presumably
# relies on every peptide tokenizing to the same length -- confirm.
tokenizer(peptides, return_tensors='pt')

{'input_ids': tensor([[ 2,  6, 29,  ..., 44, 46,  3],
        [ 2, 14, 47,  ..., 42, 32,  3],
        [ 2, 14, 47,  ..., 42, 32,  3],
        ...,
        [ 2, 27, 33,  ..., 28, 40,  3],
        [ 2, 27, 38,  ..., 44, 44,  3],
        [ 2, 27, 35,  ..., 38, 43,  3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}