# Classifier Experiments

## Import Libraries

In [1]:
# import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, mean_squared_error, mean_absolute_error

from catboost import CatBoostRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizerFast, BertModel


# 导入其他文件
from extract_features import load_features, load_features_by_name
from models import BioNN, BioDeepNN, BioResNet, LSTMFilter, CNNFilter, LSTMEncoder
from Model_Training import PeptidesDataLoader, BertDataLoader

# constant
SAVE = True
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Load Data

In [2]:
# 处理数据
# import data and preprocess
data = pd.read_csv("./Data/processed_peptides10.csv")  # load data

# 得到氨基酸序列
peptides = data.iloc[:, 0].values.tolist()  # 肽链的列表（字符串）

# load extracted features
features_x = load_features_by_name(["binary", "aac", "knn"]).iloc[:, 1:].values

# 处理mmp y的数据
all_mmp_y = data.iloc[:, 1:].values
all_mmp_y = (all_mmp_y > 1.65).astype(float)

# import Bert-Base-Protein model
checkpoint = 'unikei/bert-base-proteins'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

In [3]:
# 定义验证模型效果的函数
def validate_dl(model_class: nn.Module, peptides: list, features: np.array, y: np.array) -> dict:
    # 用5折交叉验证来验证模型效果
    losses = np.zeros((5, 100))  # 记录loss，一共100个epoch
    training_valid_labels = [] # 记录训练过程中验证集的precision
    training_true_labels = [] # 记录训练过程中验证集的recall

    # 构建5折交叉验证
    kf = KFold(n_splits=5, random_state=33, shuffle=True)

    # 构建tokens
    tokens = tokenizer(peptides, return_tensors='pt')
    input_ids, attention_mask, token_type_ids = tokens["input_ids"], tokens["attention_mask"], tokens["token_type_ids"]

    for i, (train_id, test_id) in enumerate(kf.split(features)):
        fold_pred = []  # 记录每个fold的验证结果
        fold_true = []

        # 基于id划分训练集和验证集
        train_input_ids, train_attention_mask, train_token_type_ids, train_features, train_y = input_ids[train_id], attention_mask[train_id], token_type_ids[train_id], torch.from_numpy(features[train_id]).float(), torch.from_numpy(y[train_id]).float()
        test_input_ids, test_attention_mask, test_token_type_ids, test_features, test_y = input_ids[test_id], attention_mask[test_id], token_type_ids[test_id], torch.from_numpy(features[test_id]).float(), torch.from_numpy(y[test_id]).float()

        train_dataloader = BertDataLoader(train_input_ids, train_attention_mask, train_token_type_ids, train_features, train_y, 512, shuffle=True)
        test_dataloader = BertDataLoader(test_input_ids, test_attention_mask, test_token_type_ids, test_features, test_y, 512, shuffle=False)

        # 初始化模型
        bert_model = BertModel.from_pretrained(checkpoint).to(device)
        filter_model = LSTMFilter(768, 16).to(device)  # 输出是256维的
        bio_model = model_class(768 * 10 + features.shape[1]).to(device)
        # 设置optimizer和criterion
        train_bert_params = [id(bert_model.pooler.dense.bias), id(bert_model.pooler.dense.weight)]
        bert_params = filter(lambda p: id(p) not in train_bert_params, bert_model.parameters())
        optimizer = optim.Adam(
            [
                {"params": bert_params, "lr": 1e-6},
                {"params": bert_model.pooler.dense.bias, "lr": 1e-3},
                {"params": bert_model.pooler.dense.weight, "lr": 1e-3},
                {"params": filter_model.parameters(), "lr": 1e-3}, 
                {"params": bio_model.parameters(), "lr": 1e-3}, 
            ], lr=1e-3
        )

        criterion = nn.BCEWithLogitsLoss()  # 用二分类交叉熵损失函数

        # 开始训练
        bert_model.train()
        bio_model.train()
        train_epochs = 100
        loss_track = []
        for epoch in tqdm(range(train_epochs), total=train_epochs):
            loss_track_epoch = []
            for epoch_input_ids, epoch_attention_mask, epoch_token_type_ids, features_epoch, labels_epoch in train_dataloader:
                optimizer.zero_grad()

                epoch_input_ids = epoch_input_ids.to(device)
                epoch_attention_mask = epoch_attention_mask.to(device)
                epoch_token_type_ids = epoch_token_type_ids.to(device)

                bert_output = bert_model(input_ids=epoch_input_ids, attention_mask=epoch_attention_mask, token_type_ids=epoch_token_type_ids).last_hidden_state.view(features_epoch.shape[0], -1)
                bio_input = torch.cat([bert_output, features_epoch.to(device)], dim=1)

                bio_output = bio_model(bio_input)

                loss = criterion(bio_output.view(labels_epoch.size()), labels_epoch.to(device))  # 在label只有一个特征时需要调整tensor结构
                loss.backward()

                optimizer.step()

                loss_track_epoch.append(loss.detach().to("cpu").item())
                losses[i, epoch] = loss.detach().to("cpu").item()

            avg_loss = np.average(loss_track_epoch)
            loss_track.append(avg_loss)

            # 在模型内验证
            bert_model.eval()
            bio_model.eval()
            pred = []
            with torch.no_grad():
                for epoch_input_ids, epoch_attention_mask, epoch_token_type_ids, features_epoch, labels_epoch in test_dataloader:
                    epoch_input_ids = epoch_input_ids.to(device)
                    epoch_attention_mask = epoch_attention_mask.to(device)
                    epoch_token_type_ids = epoch_token_type_ids.to(device)
                    bert_output = bert_model(input_ids=epoch_input_ids, attention_mask=epoch_attention_mask, token_type_ids=epoch_token_type_ids).last_hidden_state.view(features_epoch.shape[0], -1)
                    bio_input = torch.cat([bert_output, features_epoch.to(device).float()], dim=1)

                    bio_output = bio_model(bio_input)
                    pred.append(bio_output.to("cpu"))
            epoch_pred = torch.cat(pred, dim=0).detach().numpy()
            epoch_true = test_y.to("cpu").numpy()
            fold_pred.append(epoch_pred)
            fold_true.append(epoch_true)

            bert_model.train()
            bio_model.train()
        training_valid_labels.append(fold_pred)
        training_true_labels.append(fold_true)
        print("Fold {} finished".format(i))
    validation = {
        "training_valid_labels": training_valid_labels, 
        "training_true_labels": training_true_labels, 
        "losses": losses, 
    }
    return validation

validation = validate_dl(BioNN, peptides, features_x, all_mmp_y)

Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [05:08<00:00,  3.08s/it]


Fold 0 finished


Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [05:02<00:00,  3.03s/it]


Fold 1 finished


Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [04:56<00:00,  2.97s/it]


Fold 2 finished


Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [05:03<00:00,  3.04s/it]


Fold 3 finished


Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [05:34<00:00,  3.35s/it]

Fold 4 finished





In [22]:
valid_y = validation["training_valid_labels"]
true_y = validation["training_true_labels"]

epochs_num = 100

precision_scores = np.zeros((epochs_num, 18))  # epoch数
recall_scores = np.zeros((epochs_num, 18))
for epoch_i in range(epochs_num):
    epoch_precision = np.zeros((5, 18))
    epoch_recall = np.zeros((5, 18))
    for fold_i in range(5):
        epoch_fold_valid = (valid_y[fold_i][epoch_i] > 0.5).astype(float)
        for mmp_i in range(18):
            epoch_precision[fold_i, mmp_i] = precision_score(true_y[fold_i][epoch_i][:, mmp_i], epoch_fold_valid[:, mmp_i])
            epoch_recall[fold_i, mmp_i] = recall_score(true_y[fold_i][epoch_i][:, mmp_i], epoch_fold_valid[:, mmp_i])
    epoch_precision = epoch_precision.mean(axis=0)
    epoch_recall = epoch_recall.mean(axis=0)
    precision_scores[epoch_i, :] = epoch_precision
    recall_scores[epoch_i, :] = epoch_recall

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [23]:
precision_scores.mean(axis=1)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.03015873, 0.03796296, 0.02593294, 0.07123061,
       0.1357646 , 0.15169459, 0.19802202, 0.2028204 , 0.21169868,
       0.26476017, 0.26655275, 0.32624844, 0.37682082, 0.36275137,
       0.43015463, 0.39791742, 0.35928357, 0.45571815, 0.43085358,
       0.42321177, 0.45353399, 0.49556075, 0.45676329, 0.46876559,
       0.52471529, 0.50187743, 0.53717105, 0.5087609 , 0.48919376,
       0.52890182, 0.54897892, 0.50260299, 0.51541754, 0.55776881,
       0.54636582, 0.52187713, 0.55838888, 0.53920283, 0.53150324,
       0.54924172, 0.54594882, 0.56974217, 0.56733414, 0.55692281,
       0.57549527, 0.58046383, 0.55167937, 0.56888946, 0.58424048,
       0.53634252, 0.60862073, 0.57663046, 0.58749472, 0.57597635,
       0.56195722, 0.56955015, 0.55865935, 0.56750676, 0.56676088,
       0.59649042, 0.60402131, 0.59679509, 0.55849336, 0.54894713,
       0.57797982, 0.58179083, 0.58230568, 0.58649799, 0.56237

In [24]:
recall_scores.mean(axis=1)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00068142, 0.00100447, 0.00176307, 0.0043418 ,
       0.01632357, 0.01265862, 0.01111933, 0.02146698, 0.01840436,
       0.02816216, 0.04785646, 0.04323433, 0.05433214, 0.05014992,
       0.04510072, 0.06419833, 0.06947416, 0.09475365, 0.07702594,
       0.08443891, 0.10021436, 0.10299282, 0.10295818, 0.11243778,
       0.12572226, 0.09999593, 0.11344736, 0.11948206, 0.12480455,
       0.12068415, 0.12907795, 0.13763098, 0.14220419, 0.14373737,
       0.13473531, 0.14372712, 0.1461445 , 0.16654155, 0.14193777,
       0.16545626, 0.15262054, 0.16670215, 0.15729587, 0.15865445,
       0.14414079, 0.14869807, 0.16991736, 0.17583031, 0.18790827,
       0.17833131, 0.16087707, 0.1653559 , 0.19057437, 0.17430706,
       0.17676212, 0.19021786, 0.17040661, 0.19960481, 0.19466848,
       0.17752005, 0.17076525, 0.1750743 , 0.18216241, 0.19186595,
       0.20710248, 0.19415222, 0.18287749, 0.18009387, 0.18980

In [15]:
f1 = 2 * precision_scores.mean(axis=1) * recall_scores.mean(axis=1) / (precision_scores.mean(axis=1) + recall_scores.mean(axis=1))
f1

  f1 = 2 * precision_scores.mean(axis=1) * recall_scores.mean(axis=1) / (precision_scores.mean(axis=1) + recall_scores.mean(axis=1))


array([       nan,        nan,        nan,        nan,        nan,
              nan, 0.00133272, 0.00195715, 0.00330167, 0.0081847 ,
       0.02914314, 0.02336728, 0.02105631, 0.03882467, 0.03386465,
       0.05090918, 0.08114438, 0.07635069, 0.09497085, 0.08811768,
       0.08164152, 0.11055946, 0.1164337 , 0.15688708, 0.13068808,
       0.14078792, 0.16415622, 0.17054179, 0.168039  , 0.18137184,
       0.20284312, 0.16676499, 0.18733142, 0.19351685, 0.19887223,
       0.19652538, 0.20901218, 0.21608895, 0.22290789, 0.22857168,
       0.21616399, 0.22538287, 0.23165819, 0.25448216, 0.22404452,
       0.25430457, 0.23855327, 0.25793463, 0.24630312, 0.24695665,
       0.23053971, 0.23674811, 0.25981244, 0.26863262, 0.28435872,
       0.26766522, 0.25448577, 0.25701079, 0.28779305, 0.26762352,
       0.26893231, 0.28518866, 0.261154  , 0.29533404, 0.28979833,
       0.27361131, 0.26625617, 0.2707284 , 0.27472006, 0.28434774,
       0.30493887, 0.29114503, 0.27834015, 0.27556956, 0.28382