In [1]:
# import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, mean_squared_error, mean_absolute_error

from catboost import CatBoostRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizerFast, BertModel


# import project-local modules
from extract_features import load_features, load_features_by_name
from models import BioNN, BioDeepNN, BioResNet, LSTMFilter, CNNFilter, LSTMEncoder
from Model_Training import PeptidesDataLoader, BertDataLoader

# Notebook-wide configuration
SAVE = True
# Use the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data loading and preprocessing
data = pd.read_csv("./Data/processed_peptides10.csv")  # raw table read from disk

# First column: amino-acid sequences (per the original author's note: a list of peptide strings)
peptides = data.iloc[:, 0].values.tolist()

# Pre-extracted feature groups; the first column of the returned table is dropped
# (presumably an identifier/sequence column — TODO confirm against extract_features).
features_x = (
    load_features_by_name(["binary", "aac", "knn"])
    .iloc[:, 1:]
    .values
)

# Targets ("mmp y" in the original note): every column after the first.
all_mmp_y = data.iloc[:, 1:].values
# Divide by the largest absolute target value; the coefficient stays in scope,
# presumably so predictions can be mapped back to the original scale.
regular_coefficient = np.abs(all_mmp_y).max()
all_mmp_y = all_mmp_y / regular_coefficient

# Tokenizer of the Bert-Base-Protein checkpoint
checkpoint = 'unikei/bert-base-proteins'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

# # Define the function that validates model performance
# def validate_dl(model_class: nn.Module, peptides: list, features: np.array, y: np.array):
#     # Validate the model with 5-fold cross-validation
#     kf = KFold(n_splits=5, random_state=33, shuffle=True)
#     rg_errors = np.zeros((5, y.shape[1], 3))
#     cl_errors = np.zeros((5, y.shape[1], 4))  # metrics: auc, f1, precision, recall
#     tokens = tokenizer(peptides, return_tensors='pt')
#     input_ids, attention_mask, token_type_ids = tokens["input_ids"], tokens["attention_mask"], tokens["token_type_ids"]
#     for i, (train_id, test_id) in enumerate(kf.split(features)):
#         train_input_ids, train_attention_mask, train_token_type_ids, train_features, train_y = input_ids[train_id], attention_mask[train_id], token_type_ids[train_id], torch.from_numpy(features[train_id]).float(), torch.from_numpy(y[train_id]).float()
#         test_input_ids, test_attention_mask, test_token_type_ids, test_features, test_y = input_ids[test_id], attention_mask[test_id], token_type_ids[test_id], torch.from_numpy(features[test_id]).float(), torch.from_numpy(y[test_id]).float()

#         train_dataloader = BertDataLoader(train_input_ids, train_attention_mask, train_token_type_ids, train_features, train_y, 512, shuffle=True)
#         test_dataloader = BertDataLoader(test_input_ids, test_attention_mask, test_token_type_ids, test_features, test_y, 512, shuffle=False)

#         # Initialise the models
#         bert_model = BertModel.from_pretrained(checkpoint).to(device)
#         filter_model = LSTMFilter(768, 16).to(device)  # original note: output is 256-dimensional
#         bio_model = model_class(320 + features.shape[1]).to(device)
#         # Set up the optimizer and criterion
#         train_bert_params = [id(bert_model.pooler.dense.bias), id(bert_model.pooler.dense.weight)]
#         bert_params = filter(lambda p: id(p) not in train_bert_params, bert_model.parameters())

In [2]:
# Load the pretrained protein BERT encoder and the LSTM filter onto the selected device.
bert_model = BertModel.from_pretrained(checkpoint).to(device)
filter_model = LSTMFilter(768, 16).to(device)  # original note: output is 256-dimensional — TODO confirm against LSTMFilter

# Split the BERT parameters into two groups (no optimizer or criterion is created here).
# `train_bert_params` holds the ids of the pooler's dense weight and bias; the
# load-time warning reports these as newly initialised rather than restored from
# the checkpoint, and they are left out of `bert_params`.
train_bert_params = [id(bert_model.pooler.dense.bias), id(bert_model.pooler.dense.weight)]
# Materialise as a list instead of a lazy `filter` object: a filter iterator can be
# consumed only once, so after one cell looped over it any later use (re-running
# that cell, or handing the parameters to an optimizer) would see nothing.
bert_params = [p for p in bert_model.parameters() if id(p) not in train_bert_params]

Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Freeze every parameter yielded by `bert_params` by turning off gradient tracking.
for frozen_param in bert_params:
    frozen_param.requires_grad = False