# Model Training

## Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
import pickle as pkl

import sklearn
from sklearn.model_selection import KFold, train_test_split

from catboost import CatBoostRegressor

import torch
import torch.nn as nn
from torch.nn.functional import one_hot 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizerFast, BertModel

from TorchCRF import CRF
from tqdm import tqdm
import blosum as bl

# Import project-local modules
from extract_features import load_features


# Constants
# NOTE(review): SAVE is not referenced in the visible cells — presumably toggles saving artifacts; confirm.
SAVE = True
# Device type strings are case-sensitive: torch.device("CPU") raises a RuntimeError,
# so the non-CUDA fallback must be spelled "cpu".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Import Data

In [2]:
# Read the peptide table from the Excel workbook.
data = pd.read_excel("./Data/peptides10.xlsx")
# Clean the sequence column: trim surrounding whitespace, then drop the first and last residue.
data.iloc[:, 0] = data.iloc[:, 0].map(lambda seq: seq.strip()[1:-1])
# Discard the trailing column.
data = data.iloc[:, :-1]

# Peptide sequences as a plain list of strings.
peptides = data.iloc[:, 0].tolist()

# Regression target for MMP3 (column index 3).
mmp3_y = data.iloc[:, 3].values

data.head()

  warn(msg)


Unnamed: 0.1,Unnamed: 0,MMP1,MMP2,MMP3,MMP7,MMP8,MMP9,MMP10,MMP11,MMP12,MMP13,MMP14,MMP15,MMP16,MMP17,MMP19,MMP20,MMP24,MMP25
0,ASGGMGNK,-0.59,-0.8,-1.0,-1.35,-1.39,-1.14,-0.97,-0.87,-1.37,-1.27,-0.26,-0.6,-0.84,-0.74,-1.55,-0.72,-1.14,0.07
1,KIYNYDCE,-1.08,-1.11,-0.37,-1.07,-0.81,-0.52,-0.58,-0.77,-0.41,-0.59,-0.93,-1.0,-1.4,0.57,0.28,0.45,-0.64,-0.19
2,KIYDYDCE,-0.93,-0.39,0.17,-0.42,-0.83,-0.02,-0.39,-0.58,-0.02,-0.16,-0.76,-0.57,-0.82,0.95,0.96,0.24,-0.66,1.15
3,KIYDLDCE,-1.28,-1.29,-0.66,-0.16,-0.41,-1.04,-0.55,-0.34,-0.23,-0.58,-0.41,-0.76,-1.7,0.33,-0.98,-0.44,-1.09,-0.17
4,ASGGLGNK,-1.28,-1.0,-1.42,-1.64,-0.16,-1.07,-0.93,-1.19,-1.71,-1.35,-0.46,-1.25,-1.12,-0.31,-0.29,-0.7,-0.95,0.3


In [3]:
# Load the pre-extracted features as a NumPy array, skipping the first column.
# NOTE(review): the first column is presumably an identifier / sequence column — confirm in extract_features.
features_x = load_features().iloc[:, 1:].values

## Test the CatBoost Model

In [None]:
# Coarse CatBoost tuning (the original note says "1h", presumably the run time)
def kf_test(lr: float, iter: int) -> float:
    """Return the mean 5-fold cross-validated MSE of a CatBoost regressor on MMP3.

    Reads the notebook-level ``features_x`` and ``mmp3_y`` arrays.

    Args:
        lr: Learning rate passed to ``CatBoostRegressor``.
        iter: Number of boosting iterations.

    Returns:
        The average of the per-fold mean squared errors.
    """
    fold_mse = []
    splitter = KFold(n_splits=5)
    for fit_idx, holdout_idx in splitter.split(features_x):
        # A fresh regressor per fold so no state carries over between folds.
        regressor = CatBoostRegressor(iterations=iter, depth=10, learning_rate=lr, loss_function="RMSE")
        regressor.fit(features_x[fit_idx], mmp3_y[fit_idx])

        holdout_pred = regressor.predict(features_x[holdout_idx])
        fold_mse.append(sklearn.metrics.mean_squared_error(mmp3_y[holdout_idx], holdout_pred))
    return np.average(fold_mse)

# Coarse grid over the number of boosting iterations and the learning rate.
iterations = [500, 1000, 1500]
lrs = [0.01, 0.1, 1]
# Maps (learning rate, iterations) -> average cross-validated MSE returned by kf_test.
# Filled incrementally so partial results survive if the long run is interrupted.
errors = dict()
for iteration in iterations:
    for lr in lrs:
        avg_error = kf_test(lr, iteration)
        errors[(lr, iteration)] = avg_error

# Results recorded from an earlier run of this grid, keyed by (learning rate, iterations).
"""
{(0.01, 500): 0.6562976223926196,
 (0.1, 500): 0.5540895155266845,
 (1, 500): 0.7918202606685946,
 (0.01, 1000): 0.6077085439090827,
 (0.1, 1000): 0.5511657178360376,
 (1, 1000): 0.8021398958193586,
 (0.01, 1500): 0.5810863866378174,
 (0.1, 1500): 0.5532692812010497,
 (1, 1500): 0.8025975024267054}
"""

In [4]:
# Test CatBoost with 5-fold cross-validation on MMP3.
# Author's note: the original features still work better, i.e. skcaap without knn.
kf = KFold(n_splits=5)
errors = []  # per-fold mean squared error
for train_id, test_id in kf.split(features_x):
    train_x, train_y = features_x[train_id], mmp3_y[train_id]
    test_x, test_y = features_x[test_id], mmp3_y[test_id]

    # A new model is fitted from scratch for every fold.
    model = CatBoostRegressor(iterations=1000, depth=10, learning_rate=0.1, loss_function="RMSE")

    model.fit(train_x, train_y)
    pred = model.predict(test_x)
    error = sklearn.metrics.mean_squared_error(test_y, pred)
    errors.append(error)
avg_error = np.average(errors)
# Reported value is the square root of the fold-averaged MSE (not the mean of per-fold RMSEs).
print(np.sqrt(avg_error))

0:	learn: 0.9681672	total: 179ms	remaining: 2m 58s
1:	learn: 0.9439231	total: 231ms	remaining: 1m 55s
2:	learn: 0.9221624	total: 289ms	remaining: 1m 35s
3:	learn: 0.9037642	total: 337ms	remaining: 1m 23s
4:	learn: 0.8883199	total: 382ms	remaining: 1m 15s
5:	learn: 0.8724323	total: 427ms	remaining: 1m 10s
6:	learn: 0.8608080	total: 475ms	remaining: 1m 7s
7:	learn: 0.8498380	total: 523ms	remaining: 1m 4s
8:	learn: 0.8416077	total: 570ms	remaining: 1m 2s
9:	learn: 0.8324022	total: 616ms	remaining: 1m 1s
10:	learn: 0.8252785	total: 664ms	remaining: 59.7s
11:	learn: 0.8185913	total: 716ms	remaining: 59s
12:	learn: 0.8120186	total: 761ms	remaining: 57.7s
13:	learn: 0.8056713	total: 807ms	remaining: 56.8s
14:	learn: 0.8000415	total: 854ms	remaining: 56.1s
15:	learn: 0.7959454	total: 906ms	remaining: 55.7s
16:	learn: 0.7907166	total: 955ms	remaining: 55.2s
17:	learn: 0.7867771	total: 1s	remaining: 54.9s
18:	learn: 0.7826771	total: 1.05s	remaining: 54.4s
19:	learn: 0.7794035	total: 1.1s	remaini

## Bert-NN Model

In [4]:
# Convert the extracted features to a float32 tensor.
# torch.as_tensor accepts both ndarrays and tensors, so re-running this cell after
# features_x has already been converted no longer fails (torch.from_numpy only takes ndarrays).
features_x = torch.as_tensor(features_x).float()

# Load the pretrained protein BERT checkpoint and its tokenizer.
checkpoint = 'unikei/bert-base-proteins'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
bert_model = BertModel.from_pretrained(checkpoint).to(device)


# Build the regression targets from every column after the first (sequence) column.
labels = data.iloc[:, 1:].values
# Largest absolute label value; also used later to map predictions back to the original scale.
regular_coefficient = np.max(np.abs(labels))
labels = labels / regular_coefficient  # normalize labels into [-1, 1]
labels = torch.from_numpy(labels).float().to(device)

Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Define the Dataset and DataLoader
class Peptides_Dataset(Dataset):
    """Index-aligned collection of (peptide, feature, label) samples."""

    def __init__(self, peptides, features, labels):
        # The three containers are stored as given and indexed in parallel.
        self.peptides, self.features, self.labels = peptides, features, labels

    def __len__(self):
        # The peptide container determines the dataset size.
        return len(self.peptides)

    def __getitem__(self, idx):
        # One sample: the peptide, its feature entry and its label entry at the same position.
        return self.peptides[idx], self.features[idx], self.labels[idx]


class Peptides_DataLoader(DataLoader):
    """DataLoader that wraps parallel peptide, feature and label collections in a Peptides_Dataset."""

    def __init__(self, peptides, features, labels, batch_size, shuffle=True):
        super().__init__(
            Peptides_Dataset(peptides, features, labels),
            batch_size=batch_size,
            shuffle=shuffle,
        )


# Split peptides, features and labels into training and test sets (20% held out, fixed random_state).
train_peptides, test_peptides, train_features, test_features, train_labels, test_labels = train_test_split(peptides, features_x, labels, test_size=0.2, random_state=33)
# Batch size is 512 for both loaders; only the training loader shuffles.
train_dataloader = Peptides_DataLoader(train_peptides, train_features, train_labels, 512, shuffle=True)
test_dataloader = Peptides_DataLoader(test_peptides, test_features, test_labels, 512, shuffle=False)

In [6]:
# Fully connected regression head of the Bert-Bio model
class Bio_NN(nn.Module):
    """Three-layer MLP: hidden_size -> 1024 -> 512 -> labels_num.

    The first two linear layers are each followed by tanh and dropout (p=0.5);
    the last linear layer produces the raw outputs.
    """

    def __init__(self, hidden_size: int = 768, labels_num: int = 18) -> None:
        super().__init__()
        # Submodules are registered in the order nn1, dropout1, nn2, dropout2, nn3.
        self.nn1 = nn.Linear(hidden_size, 1024)
        self.dropout1 = nn.Dropout(0.5)
        self.nn2 = nn.Linear(1024, 512)
        self.dropout2 = nn.Dropout(0.5)
        self.nn3 = nn.Linear(512, labels_num)

    def forward(self, input: torch.Tensor):
        hidden = input
        # Both hidden stages share the same shape: linear -> tanh -> dropout.
        for linear, dropout in ((self.nn1, self.dropout1), (self.nn2, self.dropout2)):
            hidden = dropout(F.tanh(linear(hidden)))
        return self.nn3(hidden)

# Create the fully connected model.
# Its input width is 768 * 10 plus the number of extracted feature columns; the training loop
# feeds it the flattened BERT output concatenated with those features.
# NOTE(review): 768 * 10 is presumably BERT hidden size x tokens per peptide — confirm.
bio_model = Bio_NN(768 * 10 + features_x.shape[1], labels_num=18).to(device)  # predict all MMPs

In [None]:
# Train the model: BERT and the fully connected head are optimized jointly.

# Set up the optimizers
# NOTE(review): bert_params / forward_params are not used below; the optimizers call .parameters() again.
bert_params = bert_model.parameters()
forward_params = bio_model.parameters()
# Separate optimizers so BERT gets a much smaller learning rate than the head.
bert_optim = optim.Adam(bert_model.parameters(), lr=1e-6)
forward_optim = optim.Adam(bio_model.parameters(), lr=1e-3)

# Set up the criterion
criterion = nn.MSELoss()

# Switch both models to train mode
bert_model.train()
bio_model.train()

# Start training
train_epochs = 100
pbar = tqdm(range(train_epochs), desc="Training: ", total=train_epochs)
pbar.set_postfix(loss=0)
loss_track = []  # average training loss of each epoch
for epoch in pbar:
    loss_track_epoch = []  # per-batch losses within the current epoch
    # Note: despite the "_epoch" suffix, these variables hold a single mini-batch.
    for peptides_epoch, features_epoch, labels_epoch in train_dataloader:
        bert_optim.zero_grad()
        forward_optim.zero_grad()

        # NOTE(review): no padding is requested, so this presumably relies on every peptide
        # tokenizing to the same length — confirm.
        tokens_epoch = tokenizer(peptides_epoch, return_tensors='pt').to(device)
        bert_output = bert_model(**tokens_epoch).last_hidden_state.view(len(peptides_epoch), -1)  # flatten the embeddings per sample
        # Concatenate the flattened embeddings with the extracted features along the feature axis.
        bio_input = torch.cat([bert_output, features_epoch.to(device)], dim=1)
        
        bio_output = bio_model(bio_input)

        loss = criterion(bio_output.view(labels_epoch.size()), labels_epoch)  # the reshape is needed when the label has a single feature
        loss.backward()
        
        bert_optim.step()
        forward_optim.step()

        loss_track_epoch.append(loss.detach().to("cpu").item())

    avg_loss = np.average(loss_track_epoch)
    loss_track.append(avg_loss)
    pbar.set_postfix(loss=avg_loss)


Training:  35%|████████████████████▎                                     | 35/100 [01:43<03:08,  2.91s/it, loss=0.0127]

In [None]:
# Evaluate the model on the test set
bert_model.eval()
bio_model.eval()

# Predict the regression outputs
test_pred = []  # one CPU tensor of predictions per batch
with torch.no_grad():
    # Note: despite the "_epoch" suffix, these variables hold a single mini-batch.
    for peptides_epoch, features_epoch, labels_epoch in test_dataloader:
        tokens_epoch = tokenizer(peptides_epoch, return_tensors='pt').to(device)
        bert_output = bert_model(**tokens_epoch).last_hidden_state.view(len(peptides_epoch), -1)  # flatten the embeddings per sample
        bio_input = torch.cat([bert_output, features_epoch.to(device)], dim=1)
        
        bio_output = bio_model(bio_input)
        test_pred.append(bio_output.to("cpu"))
# Multiply by regular_coefficient to undo the label normalization, so both arrays are in original units.
test_pred = torch.cat(test_pred, dim=0).detach().numpy() * regular_coefficient
test_truth = test_labels.to("cpu").numpy() * regular_coefficient

In [9]:
# Test RMSE over all label columns, computed on the de-normalized predictions and targets.
np.sqrt(sklearn.metrics.mean_squared_error(test_truth, test_pred))
# Earlier results noted by the author — presumably "<training epochs> <RMSE>"; TODO confirm
# 100 0.71
# 300 0.70

0.78056246

In [12]:
# Check MMP3 only: test RMSE on label column index 2.
np.sqrt(sklearn.metrics.mean_squared_error(test_truth[:, 2], test_pred[:, 2]))

# Previously noted result: 0.78

0.78004104