# 导入模块

In [18]:
import datetime
import numpy as np
import pandas as pd
import torch
from torch import nn
import os
import logging

from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split

import optuna
from torch import optim
import math
from torch.nn import init, MultiheadAttention
from torch.nn import functional as F

os.chdir("D:/WorkPath/PycharmProjects/MutTm-pred")
from DeepLearning.Util import (PonDataset, PonMetrics, EarlyStopping, logger_init, embedding_dataset_creator_full_length,
                               embedding_model_getter)

# 全局变量

In [2]:
import os
project_path = "D:/WorkPath/PycharmProjects/MutTm-pred"
os.chdir(project_path)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# region train&test dataset
from Dataset.Process4Dataset.DatasetCeator4PonDT import Dataset4MutTm
dataset = Dataset4MutTm(package_path=r"D:\WorkPath\PycharmProjects\MutTm-pred\Dataset\Process4Dataset",
                        train_dataset_path=r"D:\WorkPath\PycharmProjects\MutTm-pred\Dataset\BasicData\ProThermDB\Common\excllent_ProThermDB_Training.csv",
                        test_dataset_path=r"D:\WorkPath\PycharmProjects\MutTm-pred\Dataset\BasicData\ProThermDB\Common\excllent_ProThermDB_Testing.csv",
                        training_version="ProThermDB_Common",
                        testing_version="ProThermDBTest_Common",
                        selected_columns=["UniProt_ID", "Mutation", "ΔTm"],
                        mult_mode="Average",
                        features=["neighbor"],
                        R_path=r"C:\Program Files\R\R-4.3.2",
                        context_length=200)
embedding_model_path = "DeepLearning/EmbeddingModels/ESM-2/esm2_t33_650M_UR50D"
embedding_model_name = "ESM-2-650M"
embedding_tokenizer, embedding_model = embedding_model_getter(embedding_model_path,
                                                              embedding_model_name,
                                                              device)
train_dataset, test_dataset = embedding_dataset_creator_full_length(df=dataset,
                                                                    embedding_model_name=embedding_model_name,
                                                                    model=embedding_model,
                                                                    tokenizer=embedding_tokenizer,
                                                                    device=device)
del embedding_model, embedding_tokenizer
# endregion

===正在从训练集版本为ProThermDB_Common、训练集版本为ProThermDBTest_Common的原始数据集中进行数据清洗和生物特征提取工作===
1.预处理训练集数据...
删除数据缺失行及非法行共计0行
丢弃pH/Tm的缺失值
前一数据集采用了后一数据集中的0条数据，现已删除
获取[序列]信息.....该数据集已经经过处理，直接使用缓存文件
删除了1031个非法长度的蛋白质，当前蛋白质长度被限制在(200, 5000)
2.预处理测试集数据...
删除数据缺失行及非法行共计0行
丢弃pH/Tm的缺失值
获取[序列]信息.....该数据集已经经过处理，直接使用缓存文件
删除了109个非法长度的蛋白质，当前蛋白质长度被限制在(200, 5000)
3.为训练集数据提取生物特征...
获取[neighbor特征].....该数据集已经经过处理，直接使用缓存文件
4.为测试集数据提取生物特征...
获取[neighbor特征].....该数据集已经经过处理，直接使用缓存文件
6.从全数据集中提取生物特征集、标签集和基本信息集...
7.数据清洗和生物特征提取工作完成==>


Some weights of EsmModel were not initialized from the model checkpoint at DeepLearning/EmbeddingModels/ESM-2/esm2_t33_650M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 4/4 [02:44<00:00, 41.05s/it]


# 网络模块

In [39]:
class ESM650M_ConvNet_Sub(nn.Module):

    def __init__(self,
                 trial):
        super(ESM650M_ConvNet_Sub, self).__init__()
        dropout_rate = trial.suggest_float("dropout", 0.2, 0.7)
        self.conv_layer = nn.Sequential(
            torch.nn.Conv1d(1, 8, 3, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(8),
            torch.nn.Conv1d(8, 32, 3, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(32),
        )
        self.fc_layer = torch.nn.Sequential(
            torch.nn.Linear(1280 * 32, 128),
            nn.Dropout(dropout_rate),
            torch.nn.BatchNorm1d(128),
            torch.nn.Linear(128, 1),
        )

    def forward(self, wild_embedding, mutant_embedding):
        output = torch.sub(wild_embedding, mutant_embedding)
        output = self.conv_layer(output)
        output = self.fc_layer(output.reshape(output.shape[0], -1))
        return output


class ESM650M_ConvNet_Comb(nn.Module):

    def __init__(self,
                 trial):
        super(ESM650M_ConvNet_Comb, self).__init__()
        dropout_rate = trial.suggest_float("dropout", 0.2, 0.7)
        self.conv_layer = nn.Sequential(
            torch.nn.Conv1d(1, 8, 3, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(8),
            torch.nn.Conv1d(8, 32, 3, padding=1),
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(32),
        )
        self.fc_layer = torch.nn.Sequential(
            torch.nn.Linear(2560 * 32, 1024),
            torch.nn.Linear(1024, 128),
            nn.Dropout(dropout_rate),
            torch.nn.BatchNorm1d(128),
            torch.nn.Linear(128, 1),
        )

    def forward(self, wild_embedding, mutant_embedding):
        output = torch.concat([wild_embedding, mutant_embedding], dim=2)
        output = self.conv_layer(output)
        output = self.fc_layer(output.reshape(output.shape[0], -1))
        return output


class ESM650M_AttentionNet_Sub(nn.Module):
    def __init__(self,
                 trial):
        super(ESM650M_AttentionNet_Sub, self).__init__()
        dropout_rate_attention = trial.suggest_float("dropout", 0.2, 0.7)
        dropout_rate_bn = trial.suggest_float("dropout", 0.2, 0.7)
        num_heads = trial.suggest_categorical("num_heads", [1, 2, 4, 8])
        bias = trial.suggest_categorical("bias", [True, False])
        self.attention_module = MultiheadAttention(embed_dim=1280,
                                                   num_heads=num_heads,
                                                   bias=bias,
                                                   dropout=dropout_rate_attention)
        self.middle_module = nn.Sequential(
            torch.nn.ReLU(),
            torch.nn.BatchNorm1d(1280),
            nn.Dropout(dropout_rate_bn)
        )

        self.fc_layer = nn.Sequential(
            nn.Linear(1280, 256, dtype=torch.float32), nn.ReLU(inplace=True),
            nn.Linear(256, 128, dtype=torch.float32), nn.ReLU(inplace=True),
            nn.Linear(128, 1, dtype=torch.float32)
        )

    def forward(self, wild_embedding, mutant_embedding):
        output = torch.sub(wild_embedding, mutant_embedding)
        output, attention = self.attention_module(output, output, output)
        output = self.middle_module(output.view(-1, output.shape[-1]))
        output = self.fc_layer(output.reshape(output.shape[0], -1))
        return output

# 定义optuna调优参数方法

In [41]:
def objective(trial, optuna_dataset):
    # region 训练参数
    model = ESM650M_AttentionNet_Sub(trial).to(device)
    params = {
        "optimizer_name": trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"]),
        "learning_rate": trial.suggest_float("lr", 1e-4, 1e-1, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 1e-5, 1e-1, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64]),
    }
    loss_fn_name = "MAE"
    optimizer = getattr(optim, params["optimizer_name"])(model.parameters(),
                                                         lr=params["learning_rate"],
                                                         weight_decay=params["weight_decay"])
    loss_fn = nn.L1Loss(reduction="mean") if loss_fn_name == "MAE" else nn.MSELoss()

    # 加载并分割训练/验证集
    train_index, valid_index = train_test_split(range(len(optuna_dataset)), test_size=0.1, random_state=42)
    train_fold, val_fold = optuna_dataset.subDataset(train_index), optuna_dataset.subDataset([valid_index])
    trainLoader = DataLoader(dataset=train_fold, batch_size=params["batch_size"], shuffle=True, drop_last=True)
    validateLoader = DataLoader(dataset=val_fold, batch_size=params["batch_size"], shuffle=True, drop_last=True)
    # endregion

    # region 模型训练
    # 设定模型训练参数(回调点、早停点参数等)
    model = model.to(device)
    loss_fn = loss_fn.to(device)

    min_loss = 10.0
    # 训练/验证
    for epoch in range(400):

        # region Training
        train_loss = 0.0
        train_steps = 0
        model.train()
        for wild_embedding, mutant_embedding, bio, label in trainLoader:
            # 载入GPU
            wild_embedding = wild_embedding.to(device)
            mutant_embedding = mutant_embedding.to(device)
            label = label.to(device)
            output = model(wild_embedding, mutant_embedding)  # 预测模型
            running_loss = loss_fn(output, label)  # 损失函数并运行计算梯度

            optimizer.zero_grad()  # 优化器梯度清零
            running_loss.requires_grad_(True)  # 允许梯度
            running_loss.backward()  # 反向传播
            optimizer.step()  # 重置参数
            train_loss += running_loss.item()
            train_steps += 1

        # endregion

        # region Validation
        valid_loss = 0.0
        valid_steps = 0
        model.eval()
        with torch.no_grad():
            for wild_embedding, mutant_embedding, bio, label in validateLoader:
                # 载入GPU
                wild_embedding = wild_embedding.to(device)
                mutant_embedding = mutant_embedding.to(device)
                label = label.to(device)
                output = model(wild_embedding, mutant_embedding)  # 预测模型
                running_loss = loss_fn(output, label)  # 损失函数并运行计算梯度
                valid_loss += running_loss.item()
                valid_steps += 1
        # endregion
        
        min_loss = min(min_loss, valid_loss / valid_steps)
        trial.report(valid_loss / valid_steps, epoch)
        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
                
    return min_loss

study = optuna.create_study(study_name="ESM650M-Attention-Subtration-4-16",
                            pruner=optuna.pruners.HyperbandPruner(),
                            direction="minimize",
                            storage="sqlite:///db.sqlite3",
                            load_if_exists=True)
study.optimize(lambda trial: objective(trial, train_dataset), n_trials=300)

[I 2024-04-16 22:31:48,158] A new study created in RDB with name: ESM650M-Attention-Subtration-4-16
[I 2024-04-16 22:32:15,420] Trial 0 finished with value: 4.573757886886597 and parameters: {'dropout': 0.507871140048874, 'num_heads': 1, 'bias': True, 'optimizer': 'SGD', 'lr': 0.017166355607103532, 'weight_decay': 0.0006680900918200253, 'batch_size': 64}. Best is trial 0 with value: 4.573757886886597.
[I 2024-04-16 22:32:43,316] Trial 1 finished with value: 4.328893423080444 and parameters: {'dropout': 0.528501308279987, 'num_heads': 1, 'bias': False, 'optimizer': 'RMSprop', 'lr': 0.0006064444680349237, 'weight_decay': 0.007402089667701735, 'batch_size': 64}. Best is trial 1 with value: 4.328893423080444.
[I 2024-04-16 22:33:41,100] Trial 2 finished with value: 4.556233584880829 and parameters: {'dropout': 0.5023329008481388, 'num_heads': 8, 'bias': True, 'optimizer': 'Adam', 'lr': 0.00792353798650842, 'weight_decay': 0.009694300903290449, 'batch_size': 32}. Best is trial 1 with value: