In [2]:
# import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, mean_squared_error, mean_absolute_error

from catboost import CatBoostRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizerFast, BertModel


# 导入其他文件
from extract_features import load_features
from models import BioNN, BioDeepNN, BioResNet
from Model_Training import PeptidesDataLoader, BertDataLoader

# constant
SAVE = True
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 处理数据
# import data and preprocess
data = pd.read_csv("./Data/processed_peptides10.csv")  # load data

# 得到氨基酸序列
peptides = data.iloc[:, 0].values.tolist()  # 肽链的列表（字符串）

# load extracted features
features_x = load_features().iloc[:, 1:].values

# 处理mmp y的数据
all_mmp_y = data.iloc[:, 1:].values

# import Bert-Base-Protein model
checkpoint = 'unikei/bert-base-proteins'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

# 定义验证模型效果的函数
def validate_dl(model_class: nn.Module, peptides: list, features: np.array, y: np.array) -> np.array:
    # 返回一个(5, 100, 2, data_size)的np.array，其中2维中0是pred，1是true
    validation = np.zeros((100, 2, y.shape[0], 18))
    # 用5折交叉验证来验证模型效果
    kf = KFold(n_splits=5, random_state=33, shuffle=True)
    rg_errors = np.zeros((5, y.shape[1], 3))
    cl_errors = np.zeros((5, y.shape[1], 4))  # 评价指标包括auc, f1, precision, recall
    tokens = tokenizer(peptides, return_tensors='pt')
    input_ids, attention_mask, token_type_ids = tokens["input_ids"], tokens["attention_mask"], tokens["token_type_ids"]
    for i, (train_id, test_id) in enumerate(kf.split(features)):
        train_input_ids, train_attention_mask, train_token_type_ids, train_features, train_y = input_ids[train_id], attention_mask[train_id], token_type_ids[train_id], torch.from_numpy(features[train_id]).float(), torch.from_numpy(y[train_id]).float()
        test_input_ids, test_attention_mask, test_token_type_ids, test_features, test_y = input_ids[test_id], attention_mask[test_id], token_type_ids[test_id], torch.from_numpy(features[test_id]).float(), torch.from_numpy(y[test_id]).float()

        train_dataloader = BertDataLoader(train_input_ids, train_attention_mask, train_token_type_ids, train_features, train_y, 512, shuffle=True)
        test_dataloader = BertDataLoader(test_input_ids, test_attention_mask, test_token_type_ids, test_features, test_y, 512, shuffle=False)

        # 初始化模型
        bert_model = BertModel.from_pretrained(checkpoint).to(device)
        bio_model = model_class(768 * 10 + features.shape[1]).to(device)
        # 设置optimizer和criterion
        train_bert_params = [id(bert_model.pooler.dense.bias), id(bert_model.pooler.dense.weight)]
        bert_params = filter(lambda p: id(p) not in train_bert_params, bert_model.parameters())
        for param in bert_params:
            param.requires_grad = False
        optimizer = optim.Adam(
            [
                # {"params": bert_params, "lr": 1e-6},
                {"params": bert_model.pooler.dense.bias, "lr": 1e-3},
                {"params": bert_model.pooler.dense.weight, "lr": 1e-3},
                {"params": bio_model.parameters(), "lr": 1e-3}, 
            ], lr=1e-3
        )

        criterion = nn.MSELoss()

        # 开始训练
        bert_model.train()
        bio_model.train()
        train_epochs = 100
        loss_track = []
        for epoch in tqdm(range(train_epochs), total=train_epochs):
            epoch_pointer = 0
            loss_track_epoch = []
            for epoch_input_ids, epoch_attention_mask, epoch_token_type_ids, features_epoch, labels_epoch in train_dataloader:
                optimizer.zero_grad()

                epoch_input_ids = epoch_input_ids.to(device)
                epoch_attention_mask = epoch_attention_mask.to(device)
                epoch_token_type_ids = epoch_token_type_ids.to(device)

                bert_output = bert_model(input_ids=epoch_input_ids, attention_mask=epoch_attention_mask, token_type_ids=epoch_token_type_ids).last_hidden_state.view(features_epoch.shape[0], -1)  # 将embed结果铺平
                bio_input = torch.cat([bert_output, features_epoch.to(device)], dim=1)

                bio_output = bio_model(bio_input)

                loss = criterion(bio_output.view(labels_epoch.size()), labels_epoch.to(device))  # 在label只有一个特征时需要调整tensor结构
                loss.backward()

                optimizer.step()

                loss_track_epoch.append(loss.detach().to("cpu").item())

            # 训练时验证
            bert_model.eval()
            bio_model.eval()
            pred = []
            with torch.no_grad():
                for epoch_input_ids, epoch_attention_mask, epoch_token_type_ids, features_epoch, labels_epoch in test_dataloader:
                    epoch_input_ids = epoch_input_ids.to(device)
                    epoch_attention_mask = epoch_attention_mask.to(device)
                    epoch_token_type_ids = epoch_token_type_ids.to(device)
                    bert_output = bert_model(input_ids=epoch_input_ids, attention_mask=epoch_attention_mask, token_type_ids=epoch_token_type_ids).last_hidden_state.view(features_epoch.shape[0], -1)
                    bio_input = torch.cat([bert_output, features_epoch.to(device).float()], dim=1)

                    bio_output = bio_model(bio_input)
                    pred.append(bio_output.to("cpu"))
            epoch_pred = torch.cat(pred, dim=0).detach().numpy()
            epoch_true = test_y.to("cpu").numpy()
            validation[epoch, 0, test_id, :] = epoch_pred
            validation[epoch, 1, test_id, :] = epoch_true
            epoch_pointer += epoch_pred.shape[0]

            bert_model.train()
            bio_model.train()

            avg_loss = np.average(loss_track_epoch)
            loss_track.append(avg_loss)
        
        print("Fold {} finished".format(i))
    return validation

def get_rg_error(validation: np.array) -> np.array:
    # 基于训练时保存的结果，计算mse、mae、rmse，会计算每个epoch的分数
    mse_score = np.zeros((100, 18))  # 返回的precision
    mae_score = np.zeros((100, 18))
    pred = validation[:, 0, :, :]
    true = validation[:, 1, :, :]
    for epoch_i in range(100):
        for mmp_i in range(18):
            mse_score[epoch_i, mmp_i] = mean_squared_error(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
            mae_score[epoch_i, mmp_i] = mean_absolute_error(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
    rg_error = np.stack([mse_score, mae_score, np.sqrt(mse_score)], axis=2)
    return rg_error

def get_cl_error(validate_dl: np.array, threshold: int=1.65) -> np.array:
    # 基于训练时保存的验证结果，计算auc、f1、precision、recall，会计算每个epoch的分数
    pred = validate_dl[:, 0, :, :] > threshold
    true = validate_dl[:, 1, :, :] > 1.65
    p_score = np.zeros((100, 18))
    r_score = np.zeros((100, 18))
    f_score = np.zeros((100, 18))
    auc_score = np.zeros((100, 18))
    for epoch_i in range(100):
        for mmp_i in range(18):
            p_score[epoch_i, mmp_i] = precision_score(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
            r_score[epoch_i, mmp_i] = recall_score(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
            f_score[epoch_i, mmp_i] = f1_score(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
            auc_score[epoch_i, mmp_i] = roc_auc_score(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
    cl_error = np.stack([auc_score, f_score, p_score, r_score], axis=2)
    return cl_error


bionn_validation = validate_dl(BioNN, peptides, features_x, all_mmp_y)
bionn_rg_error, bionn_cl_error = get_rg_error(bionn_validation)[-1, :, :], get_cl_error(bionn_validation, threshold=1.65)[-1, :, :]
np.save("./Result/BioNN_rg_error.npy", bionn_rg_error)
np.save("./Result/BioNN_cl_error.npy", bionn_cl_error)

# biodnn_validation = validate_dl(BioDeepNN, peptides, features_x, all_mmp_y)
# biodnn_rg_error, biodnn_cl_error = get_rg_error(biodnn_validation)[-1, :, :], get_cl_error(biodnn_validation, threshold=1.65)[-1, :, :]
# np.save("./Result/BioDNN_rg_error.npy", biodnn_rg_error)
# np.save("./Result/BioDNN_cl_error.npy", biodnn_cl_error)

# biores_validation = validate_dl(BioResNet, peptides, features_x, all_mmp_y)
# biores_rg_error, biores_cl_error = get_rg_error(biores_validation)[-1, :, :], get_cl_error(biores_validation, threshold=1.65)[-1, :, :]
# np.save("./Result/BioRes_rg_error.npy", biores_rg_error)
# np.save("./Result/BioRes_cl_error.npy", biores_cl_error)

# BioNN
# 100 0.71 (without knn)
# 300 0.70
# 100 0.72 (without cksaap)
# 0.78
# BioDeepNN
# 100 0.6872 (without knn)
# BioResNet
# 100 0.73 (without knn)

Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [02:09<00:00,  1.29s/it]


Fold 0 finished


Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [02:08<00:00,  1.28s/it]


Fold 1 finished


Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [02:09<00:00,  1.29s/it]


Fold 2 finished


Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [02:10<00:00,  1.30s/it]


Fold 3 finished


Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [02:09<00:00,  1.30s/it]


Fold 4 finished


NameError: name 'validation' is not defined

In [7]:

def get_rg_error(validation: np.array) -> np.array:
    # 基于训练时保存的结果，计算mse、mae、rmse，会计算每个epoch的分数
    mse_score = np.zeros((100, 18))  # 返回的precision
    mae_score = np.zeros((100, 18))
    pred = validation[:, 0, :, :]
    true = validation[:, 1, :, :]
    for epoch_i in range(100):
        for mmp_i in range(18):
            mse_score[epoch_i, mmp_i] = mean_squared_error(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
            mae_score[epoch_i, mmp_i] = mean_absolute_error(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
    rg_error = np.stack([mse_score, mae_score, np.sqrt(mse_score)], axis=2)
    return rg_error

def get_cl_error(validate_dl: np.array, threshold: int=1.65) -> np.array:
    # 基于训练时保存的验证结果，计算auc、f1、precision、recall，会计算每个epoch的分数
    pred = validate_dl[:, 0, :, :] > threshold
    true = validate_dl[:, 1, :, :] > 1.65
    p_score = np.zeros((100, 18))
    r_score = np.zeros((100, 18))
    f_score = np.zeros((100, 18))
    auc_score = np.zeros((100, 18))
    for epoch_i in range(100):
        for mmp_i in range(18):
            p_score[epoch_i, mmp_i] = precision_score(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
            r_score[epoch_i, mmp_i] = recall_score(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
            f_score[epoch_i, mmp_i] = f1_score(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
            auc_score[epoch_i, mmp_i] = roc_auc_score(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])
    cl_error = np.stack([auc_score, f_score, p_score, r_score], axis=2)
    return cl_error


bionn_validation = validate_dl(BioNN, peptides, features_x, all_mmp_y)
bionn_rg_error, bionn_cl_error = get_rg_error(bionn_validation)[-1, :, :], get_cl_error(bionn_validation, threshold=1.65)[-1, :, :]
np.save("./Result/BioNN_rg_error.npy", bionn_rg_error)
np.save("./Result/BioNN_cl_error.npy", bionn_cl_error)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [58]:
x = np.array([[1, 2, 3], [4, 5, 6]])
np.stack([x, x], axis=2).shape

(2, 3, 2)

In [31]:
pred = validation[:, 0, :, :] * regular_coefficient
true = validation[:, 1, :, :] * regular_coefficient
mse_score = np.zeros((100, 18))
for fold_i in range(5):
    for epoch_i in range(100):
        for mmp_i in range(18):
            mse_score[epoch_i, mmp_i] = mean_squared_error(true[epoch_i, :, mmp_i], pred[epoch_i, :, mmp_i])

In [34]:
np.sqrt(mse_score.mean(axis=1))

array([0.99460732, 0.86587827, 0.82924139, 0.81138938, 0.79800584,
       0.78899892, 0.78455034, 0.77435379, 0.77104878, 0.768068  ,
       0.76670798, 0.76806533, 0.75780688, 0.75782859, 0.76241522,
       0.75460084, 0.7577036 , 0.75289422, 0.75017795, 0.74978147,
       0.74771384, 0.74573197, 0.74379285, 0.74489999, 0.7440025 ,
       0.74186524, 0.74285699, 0.73943136, 0.73936087, 0.7398171 ,
       0.73906961, 0.74158903, 0.73782286, 0.73575725, 0.73368339,
       0.73169087, 0.73373427, 0.73161427, 0.73127676, 0.73362617,
       0.73230214, 0.73079013, 0.73006728, 0.72975984, 0.73303983,
       0.73013971, 0.72731659, 0.72895744, 0.72644359, 0.72927401,
       0.72869115, 0.72699915, 0.72604629, 0.72665787, 0.72500266,
       0.72718026, 0.7250505 , 0.7256498 , 0.72282475, 0.72251308,
       0.72427253, 0.72348541, 0.72288456, 0.72509951, 0.72156373,
       0.72166909, 0.72112668, 0.71975462, 0.72162224, 0.72147329,
       0.7203327 , 0.72192879, 0.71880376, 0.71884779, 0.72078

In [54]:
pred = validation[:, 0, :, :] * regular_coefficient
true = validation[:, 1, :, :] * regular_coefficient
p_score = np.zeros((100, 18))
for fold_i in range(5):
    for epoch_i in range(100):
        for mmp_i in range(18):
            p_score[epoch_i, mmp_i] = precision_score(true[epoch_i, :, mmp_i] > 1.65, pred[epoch_i, :, mmp_i] > 2.1)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [55]:
p_score.mean(axis=1)

array([0.08181999, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.05555556,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.05555556, 0.08333333,
       0.09259259, 0.08333333, 0.11111111, 0.09259259, 0.05026455,
       0.375     , 0.26984127, 0.15246914, 0.20108025, 0.20659722,
       0.10897436, 0.37525253, 0.21031746, 0.25326024, 0.30546537,
       0.47723906, 0.36869929, 0.38802702, 0.28738977, 0.39558485,
       0.44070406, 0.47093892, 0.3750278 , 0.44492436, 0.4474014 ,
       0.47394312, 0.23742399, 0.46243687, 0.41531987, 0.44926024,
       0.49799403, 0.51652845, 0.48098191, 0.51401711, 0.47348545,
       0.4560751 , 0.52434511, 0.472582  , 0.45884588, 0.43234502,
       0.50236552, 0.51550861, 0.44495552, 0.5379157 , 0.55707476,
       0.43824909, 0.46777438, 0.49792247, 0.55160964, 0.46067744,
       0.52349861, 0.47586786, 0.48693169, 0.51750813, 0.48778

In [45]:
pred = validation[:, 0, :, :] * regular_coefficient
true = validation[:, 1, :, :] * regular_coefficient
r_score = np.zeros((100, 18))
for fold_i in range(5):
    for epoch_i in range(100):
        for mmp_i in range(18):
            r_score[epoch_i, mmp_i] = recall_score(true[epoch_i, :, mmp_i] > 1.65, pred[epoch_i, :, mmp_i] > 0)

In [46]:
r_score.mean(axis=1)

array([0.84858923, 0.92980457, 0.92911087, 0.92862634, 0.93764545,
       0.93623093, 0.93798009, 0.93928777, 0.93887   , 0.94193135,
       0.94396916, 0.94880837, 0.94042285, 0.94452583, 0.94767451,
       0.94870071, 0.95246655, 0.94797951, 0.94738503, 0.94620479,
       0.95064324, 0.94948944, 0.94527525, 0.95244081, 0.95472779,
       0.9520659 , 0.95172684, 0.95243907, 0.95017054, 0.95386715,
       0.95129721, 0.95695314, 0.95332655, 0.95590298, 0.95139947,
       0.95066496, 0.95172771, 0.95187824, 0.9538792 , 0.95751715,
       0.95336732, 0.95343694, 0.95252089, 0.94756241, 0.95760783,
       0.95261432, 0.95156584, 0.95471629, 0.95186339, 0.95850364,
       0.95710779, 0.95975469, 0.95187739, 0.95138678, 0.95854705,
       0.95256138, 0.95645541, 0.95695384, 0.95415434, 0.95771835,
       0.9547325 , 0.95781269, 0.95291575, 0.95209605, 0.95520727,
       0.95715658, 0.95547646, 0.95753057, 0.95437408, 0.95566645,
       0.95157005, 0.95081512, 0.95274804, 0.9526299 , 0.95488