In [1]:
import pickle

import numpy as np
import pandas as pd
import tomli
import xgboost as xgb
from sklearn.metrics import roc_auc_score

In [2]:
# Read the project settings (data and model locations) from the TOML config file.
with open("src/config.toml", "rb") as config_file:
    config = tomli.load(config_file)

1. 数据读取(Data loading)

In [None]:
# ESP data loading
def _load_rows_with_embedding(pickle_path):
    """Load a pickled dataframe and keep only the rows that have an embedding.

    A row is dropped when its "ESM1b" entry is exactly the empty string "".
    The check is done per entry with ``isinstance`` instead of the vectorised
    ``df["ESM1b"] != ""``: that comparison on this object column is what
    emitted the ``libops.scalar_compare`` warnings in the saved cell output
    (presumably because the non-empty entries are arrays -- verify).

    Args:
        pickle_path: Path of the pickled dataframe to read.

    Returns:
        A new dataframe without the empty-embedding rows, re-indexed from 0.
    """
    frame = pd.read_pickle(pickle_path)
    has_embedding = (
        frame["ESM1b"]
        .map(lambda emb: not (isinstance(emb, str) and emb == ""))
        .astype(bool)  # keeps the mask boolean even when the frame is empty
    )
    return frame.loc[has_embedding].reset_index(drop=True)


df_test = _load_rows_with_embedding(
    config["espData"]["encoded_path"] + "df_test_with_ESM1b_ts.pkl"
)

df_train = _load_rows_with_embedding(
    config["espData"]["encoded_path"] + "df_train_with_ESM1b_ts.pkl"
)

  result = libops.scalar_compare(x.ravel(), y, op)
  result = libops.scalar_compare(x.ravel(), y, op)


In [None]:
# Seq-P450 data loading: read the five pickled parts from the encoded-data
# directory, then stack them into a single frame with a fresh 0..n-1 index.
p450_dir = config["SeqP450Data"]["encoded_path"]
p450plant0, p450plant1, p450plant2, p450plant3, p450plant4 = (
    pd.read_pickle(p450_dir + part_file)
    for part_file in (
        "p450plant0.pkl",
        "p450plant1.pkl",
        "p450plant2.pkl",
        "p450plant3.pkl",
        "p450plant4.pkl",
    )
)

p450_parts = [p450plant0, p450plant1, p450plant2, p450plant3, p450plant4]
p450plant = pd.concat(p450_parts, ignore_index=True)

2. 特征提取(Feature extraction)

In [5]:
def create_input_and_output_data(df):
    """Build per-row model inputs and labels from an encoded dataframe.

    For every row, the "ECFP" entry is expanded element by element into an
    integer vector and concatenated with the row's "ESM1b" entry
    (fingerprint first, embedding second).

    Args:
        df: DataFrame with "ESM1b", "ECFP" and "Binding" columns.
            Assumes each "ECFP" entry is a sequence of integer-like
            characters such as "0101" -- TODO confirm against the encoder.

    Returns:
        A pair ``(X, y)`` of tuples with one element per row of ``df``, in row
        order: ``X`` holds the concatenated feature vectors and ``y`` the
        corresponding "Binding" values. Both are empty tuples for an empty
        frame.
    """
    features = []
    labels = []
    # Walk the three columns once, positionally. Appending to lists avoids
    # re-copying an ever-growing tuple on each row; the lists are converted to
    # tuples at the end so callers still receive the same types.
    rows = zip(
        df["ECFP"].to_numpy(),
        df["ESM1b"].to_numpy(),
        df["Binding"].to_numpy(),
    )
    for ecfp_raw, emb, binding in rows:
        ecfp = np.array(list(ecfp_raw)).astype(int)
        features.append(np.concatenate([ecfp, emb]))
        labels.append(binding)

    return (tuple(features), tuple(labels))


# Column labels for the model input: 1024 "ECFP_*" names followed by
# 1280 "ESM1b_*" names.
feature_names = []
feature_names.extend("ECFP_" + str(bit) for bit in range(1024))
feature_names.extend("ESM1b_" + str(dim) for dim in range(1280))

# Turn each dataframe into (features, labels).
train_X, train_y = create_input_and_output_data(df_train)
test_X, test_y = create_input_and_output_data(df_test)
test_new_X, test_new_y = create_input_and_output_data(p450plant)

# The Seq-P450 samples (named "test_new_*") are appended to the training set.
train_X = np.concatenate((train_X, test_new_X))
train_y = np.concatenate((train_y, test_new_y))

3. 模型训练(Model training)

In [6]:
# Training settings. "num_rounds" and "weight" are not passed to the booster:
# the first is the number of boosting rounds, the second is the sample weight
# given to rows whose Binding label is 0 (all other rows get weight 1.0).
param = {
    "learning_rate": 0.60553117247348733,
    "max_delta_step": 1.7726044219753656,
    "max_depth": 10,
    "min_child_weight": 1.3845040588450772,
    "num_rounds": 342.68325188584106,
    "reg_alpha": 0.531395259755843,
    "reg_lambda": 3.744980563764689,
    "weight": 0.26187490421514203,
}

# Take the two non-booster entries out of the dict before it reaches xgb.train.
num_round = param.pop("num_rounds")
negative_weight = param.pop("weight")
param["objective"] = "binary:logistic"
param["eval_metric"] = ["error", "logloss"]

# Per-sample weights, in the same order as the rows of train_X:
# the ESP training rows first, then the appended Seq-P450 rows.
weights1 = np.where(np.asarray(df_train["Binding"]) == 0, negative_weight, 1.0)
weights2 = np.where(np.asarray(test_new_y) == 0, negative_weight, 1.0)

weights = np.concatenate([weights1, weights2])

dtrain = xgb.DMatrix(
    np.array(train_X),
    weight=weights,
    label=np.array(train_y),
    feature_names=feature_names,
)
dtest = xgb.DMatrix(
    np.array(test_X), label=np.array(test_y), feature_names=feature_names
)

# Only the training set is monitored while boosting.
evallist = [(dtrain, "train")]

# num_round is stored as a float, so it is truncated to an int here.
bst = xgb.train(param, dtrain, int(num_round), evallist, verbose_eval=10)

# Predict once and reuse the probabilities for both metrics.
y_test_prob = bst.predict(dtest)
y_test_pred = np.round(y_test_prob)
acc_test = np.mean(y_test_pred == np.array(test_y))
roc_auc = roc_auc_score(np.array(test_y), y_test_prob)

print("Accuracy on test set: %s, ROC-AUC score for test set: %s" % (acc_test, roc_auc))

[0]	train-error:0.26911	train-logloss:0.57980
[10]	train-error:0.11887	train-logloss:0.36296
[20]	train-error:0.06895	train-logloss:0.26987
[30]	train-error:0.04057	train-logloss:0.20366
[40]	train-error:0.02649	train-logloss:0.16278
[50]	train-error:0.01834	train-logloss:0.13417
[60]	train-error:0.01345	train-logloss:0.11097
[70]	train-error:0.00986	train-logloss:0.09622
[80]	train-error:0.00780	train-logloss:0.08389
[90]	train-error:0.00606	train-logloss:0.07305
[100]	train-error:0.00470	train-logloss:0.06417
[110]	train-error:0.00373	train-logloss:0.05723
[120]	train-error:0.00294	train-logloss:0.05078
[130]	train-error:0.00254	train-logloss:0.04644
[140]	train-error:0.00233	train-logloss:0.04239
[150]	train-error:0.00202	train-logloss:0.03886
[160]	train-error:0.00186	train-logloss:0.03596
[170]	train-error:0.00159	train-logloss:0.03344
[180]	train-error:0.00153	train-logloss:0.03111
[190]	train-error:0.00143	train-logloss:0.02910
[200]	train-error:0.00136	train-logloss:0.02735
[21

In [7]:
# Persist the trained booster. The context manager closes (and flushes) the
# file handle even if pickling raises; previously the handle was never closed.
# NOTE(review): XGBoost documents Booster.save_model as the format for long-term
# storage; pickle is kept here so existing loaders of this file keep working.
with open(
    config["SeqP450Data"]["model_path"] + "p450normalmodel.dat", "wb"
) as model_file:
    pickle.dump(bst, model_file)