In [1]:
import pickle

import numpy as np
import pandas as pd
import tomli
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef, roc_auc_score

In [2]:
# Parse the project configuration; tomli requires the file to be opened in
# binary mode.
with open("src/config.toml", "rb") as config_file:
    config = tomli.load(config_file)

1. 数据读取与处理(Data loading and processing)

In [None]:
# ESP data loading


def _has_embedding(emb):
    """Return False only when ``emb`` is the empty string.

    The check is done per cell with an explicit ``isinstance`` test so that
    array-valued cells are never compared against a string (comparing a whole
    column with ``!= ""`` triggers NumPy elementwise-comparison warnings and
    can raise on recent NumPy versions).
    """
    return not (isinstance(emb, str) and emb == "")


def _load_esp_split(filename):
    """Read one encoded ESP split and drop rows without an ESM1b entry.

    Parameters
    ----------
    filename : str
        File name appended to ``config["espData"]["encoded_path"]``.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``ESM1b`` value is not the empty string (presumably a
        placeholder for a missing embedding -- confirm against the encoding
        step), with a fresh 0..n-1 index.
    """
    # NOTE(review): read_pickle executes arbitrary code from the file; only
    # load pickles produced by this project.
    df = pd.read_pickle(config["espData"]["encoded_path"] + filename)
    keep = df["ESM1b"].map(_has_embedding).astype(bool)
    return df.loc[keep].reset_index(drop=True)


df_test = _load_esp_split("df_test_with_ESM1b_ts.pkl")
df_train = _load_esp_split("df_train_with_ESM1b_ts.pkl")

  result = libops.scalar_compare(x.ravel(), y, op)
  result = libops.scalar_compare(x.ravel(), y, op)


In [None]:
# 特征提取函数
# Feature extraction function


def create_input_and_output_data(df):
    """Build feature vectors, labels and row ids from an encoded DataFrame.

    For every row the ``ECFP`` value is split into its elements, cast to
    ``int`` and concatenated with the row's ``ESM1b`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``ECFP``, ``ESM1b`` and ``Binding``.
        Assumes ``ECFP`` is an iterable of digit characters (e.g. ``"0101"``)
        and ``ESM1b`` a 1-D numeric array -- TODO confirm against the
        encoding step.

    Returns
    -------
    tuple
        ``(X, y, ids)`` where ``X`` is a tuple with one 1-D array per row,
        ``y`` is a tuple of the ``Binding`` values and ``ids`` is the list of
        index labels, all in row order.
    """
    ids = list(df.index)
    if not ids:
        # Nothing to encode; also covers frames that lack the columns.
        return (), (), ids

    # Iterate positionally instead of via df[col][label]: label lookup
    # returns a Series (not a cell) when index labels repeat, and growing a
    # tuple with "+" inside the loop copies it on every row (quadratic).
    X = tuple(
        np.concatenate([np.array(list(ecfp)).astype(int), emb])
        for ecfp, emb in zip(df["ECFP"], df["ESM1b"])
    )
    y = tuple(df["Binding"].to_numpy())

    return (X, y, ids)


# Turn the ESP train/test frames into feature vectors and labels; the row ids
# returned as the third element are not needed here.
train_X, train_y, _ = create_input_and_output_data(df=df_train)
test_X, test_y, _ = create_input_and_output_data(df=df_test)

# Column names for the model input: 1024 "ECFP_*" names followed by
# 1280 "ESM1b_*" names.
ecfp_names = ["ECFP_" + str(bit) for bit in range(1024)]
esm1b_names = ["ESM1b_" + str(dim) for dim in range(1280)]
feature_names = ecfp_names + esm1b_names

In [None]:
# Seq-P450 data loading: five pre-encoded partitions, one per fold.
p450plants = [
    pd.read_pickle(
        config["SeqP450Data"]["encoded_path"] + "p450plant" + str(part) + ".pkl"
    )
    for part in range(5)
]
# Keep the individual partition names available as well.
p450plant0, p450plant1, p450plant2, p450plant3, p450plant4 = p450plants

num_folds = 5

# Per-fold containers: features, labels, row ids and the source frames.
train_Xloop, train_yloop = [], []
test_Xloop, test_yloop = [], []
train_idsloop, test_idsloop = [], []
train_set_loop, test_set_loop = [], []

# Five-fold cross-validation: fold i holds out partition i for testing and
# trains on the ESP training data plus the remaining partitions.
for i in range(num_folds):
    held_out = p450plants[i]
    remaining = p450plants[:i] + p450plants[i + 1 :]

    # Stack the remaining partitions with a fresh 0..n-1 index.
    train_cache = pd.concat(remaining, ignore_index=True)

    # Encode both sides of the split.
    Mou_X_train, Mou_y_train, train_ids = create_input_and_output_data(df=train_cache)
    Mou_X_test, Mou_y_test, test_ids = create_input_and_output_data(df=held_out)

    # ESP training rows come first, then the Seq-P450 training rows.
    train_Xloop.append(np.concatenate([train_X, Mou_X_train]))
    train_yloop.append(np.concatenate([train_y, Mou_y_train]))
    test_Xloop.append(Mou_X_test)
    test_yloop.append(Mou_y_test)

    train_idsloop.append(train_ids)
    test_idsloop.append(test_ids)

    # Source frames in the same row order as the feature arrays above.
    train_set_loop.append(pd.concat([df_train] + remaining))
    test_set_loop.append(held_out)

2. 模型训练与保存(Model training and saving)

In [None]:
# Train one XGBoost model per fold using the original ESP authors' method and
# hyperparameters, report metrics on the ESP test set and on the held-out
# Seq-P450 partition, then save the model and the held-out predictions.

# The ESP test set is identical for every fold, so its DMatrix is built once.
esp_test_labels = np.array(test_y)
dtest = xgb.DMatrix(
    np.array(test_X), label=esp_test_labels, feature_names=feature_names
)

for i in range(num_folds):
    # Parameter settings
    param = {
        "learning_rate": 0.31553117247348733,
        "max_delta_step": 1.7726044219753656,
        "max_depth": 10,
        "min_child_weight": 1.3845040588450772,
        "num_rounds": 342.68325188584106,
        "reg_alpha": 0.531395259755843,
        "reg_lambda": 3.744980563764689,
        "weight": 0.26187490421514203,
    }

    # "num_rounds" and "weight" are consumed here and must not be passed on
    # to xgb.train, so they are popped out of the dict.
    num_round = param.pop("num_rounds")
    negative_weight = param.pop("weight")
    param["objective"] = "binary:logistic"
    param["eval_metric"] = ["error", "logloss"]

    # Rows with Binding == 0 get the tuned weight, all other rows weight 1.
    weightss = np.array(
        [
            negative_weight if binding == 0 else 1.0
            for binding in train_set_loop[i]["Binding"]
        ]
    )

    # Dataset import
    fold_labels = np.array(test_yloop[i])
    dtrain = xgb.DMatrix(
        np.array(train_Xloop[i]),
        weight=weightss,
        label=np.array(train_yloop[i]),
        feature_names=feature_names,
    )
    dtest_new = xgb.DMatrix(
        np.array(test_Xloop[i]),
        label=fold_labels,
        feature_names=feature_names,
    )
    # Evaluation sets reported during training
    evallist = [(dtest_new, "eval"), (dtrain, "train")]
    # Train the model for this fold
    bst = xgb.train(param, dtrain, int(num_round), evallist, verbose_eval=10)
    # -------------------------------------------------------------------------------------------
    # Metrics on the ESP test set. Predict once and reuse the probabilities.
    esp_test_prob = bst.predict(dtest)
    y_test_pred = np.round(esp_test_prob)
    acc_test = np.mean(y_test_pred == esp_test_labels)
    roc_auc = roc_auc_score(esp_test_labels, esp_test_prob)

    # Per-class accuracy: share of positives predicted 1 / negatives predicted 0.
    acc_1 = np.mean(y_test_pred[esp_test_labels == 1] == 1)
    acc_0 = np.mean(y_test_pred[esp_test_labels == 0] == 0)
    print("------------------------%d----------------------------------" % i)
    print(
        "Accuracy on test set: %s, ROC-AUC score for test set: %s" % (acc_test, roc_auc)
    )
    print("Accuracy on 1 set: %s, Accuracy on 0 set: %s" % (acc_1, acc_0))
    # -------------------------------------------------------------------------------------------
    # Metrics on the held-out Seq-P450 partition.
    fold_prob = bst.predict(dtest_new)
    y_test_new_pred = np.round(fold_prob)
    acc_test_new = np.mean(y_test_new_pred == fold_labels)
    # Fall back to 0 only when the metric is undefined for this fold (e.g.
    # roc_auc_score raises ValueError if a single class is present); any
    # other error, including KeyboardInterrupt, now propagates.
    try:
        roc_auc_new = roc_auc_score(fold_labels, fold_prob)
    except ValueError:
        roc_auc_new = 0
    try:
        mcc = matthews_corrcoef(fold_labels, y_test_new_pred)
    except ValueError:
        mcc = 0
    acc_1_new = np.mean(y_test_new_pred[fold_labels == 1] == 1)
    acc_0_new = np.mean(y_test_new_pred[fold_labels == 0] == 0)

    print(
        "Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"
        % (acc_test_new, roc_auc_new, mcc)
    )
    print("Accuracy on 1 set: %s, Accuracy on 0 set: %s" % (acc_1_new, acc_0_new))
    print(len(train_set_loop[i]))
    print(len(test_set_loop[i]))
    print("-------------------------------------------------------")
    print("\n\n")
    # -------------------------------------------------------------------------------------------
    # Model saving. The context manager guarantees the file is flushed and
    # closed even if pickling fails.
    model_file = config["SeqP450Data"]["model_path"] + "slice" + str(i) + "model.dat"
    with open(model_file, "wb") as model_fh:
        pickle.dump(bst, model_fh)

    # Persist the held-out probabilities and labels for later analysis.
    np.save(
        config["SeqP450Data"]["encoded_path"] + "train" + str(i) + "_y_test_pred.npy",
        fold_prob,
    )
    np.save(
        config["SeqP450Data"]["encoded_path"] + "train" + str(i) + "_y_test.npy",
        fold_labels,
    )

[0]	eval-error:0.55987	eval-logloss:0.70835	train-error:0.27139	train-logloss:0.62562
[10]	eval-error:0.50485	eval-logloss:0.71420	train-error:0.14055	train-logloss:0.41934
[20]	eval-error:0.37217	eval-logloss:0.63800	train-error:0.08870	train-logloss:0.34014
[30]	eval-error:0.32039	eval-logloss:0.58696	train-error:0.06423	train-logloss:0.29236
[40]	eval-error:0.27185	eval-logloss:0.55403	train-error:0.04689	train-logloss:0.25467
[50]	eval-error:0.22977	eval-logloss:0.53077	train-error:0.03688	train-logloss:0.22287
[60]	eval-error:0.23625	eval-logloss:0.50277	train-error:0.02961	train-logloss:0.19721
[70]	eval-error:0.20712	eval-logloss:0.47980	train-error:0.02328	train-logloss:0.17343
[80]	eval-error:0.22007	eval-logloss:0.47789	train-error:0.01934	train-logloss:0.15528
[90]	eval-error:0.21359	eval-logloss:0.46974	train-error:0.01556	train-logloss:0.13749
[100]	eval-error:0.22654	eval-logloss:0.46949	train-error:0.01266	train-logloss:0.12391
[110]	eval-error:0.22007	eval-logloss:0.466