In [1]:
import numpy as np
import pandas as pd
import tomli
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef, roc_auc_score

In [2]:
# Load the project configuration (data/model directories used by later cells).
# tomli only accepts a binary file object, hence the "rb" mode.
with open("src/config.toml", "rb") as config_file:
    config = tomli.load(config_file)

1. 数据读取与处理(Data loading and processing)

In [None]:
# Data loading
# Read the five pickled parts p450plant0.pkl ... p450plant4.pkl from the
# configured directory. NOTE(review): unpickling executes arbitrary code, so
# these files must come from a trusted source.
p450plant0, p450plant1, p450plant2, p450plant3, p450plant4 = (
    pd.read_pickle(
        config["SeqP450Data"]["encoded_path"] + "p450plant" + str(part) + ".pkl"
    )
    for part in range(5)
)

# Stack the parts row-wise; ignore_index=True renumbers rows 0..n-1 so the
# combined frame has no duplicate index labels.
p450plant = pd.concat(
    (p450plant0, p450plant1, p450plant2, p450plant3, p450plant4),
    ignore_index=True,
)

In [None]:
# 特征读取
# Feature loading
def create_input_and_output_data(df):
    """Build per-row feature vectors and labels from an encoded data frame.

    For every index label in ``df`` the ``ECFP`` entry is split into its
    individual items, cast to ``int`` and concatenated with the ``ESM1b``
    entry (ECFP first, ESM1b second).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``"ESM1b"``, ``"ECFP"`` and ``"Binding"``.
        Each ``ECFP`` entry must be iterable into items castable to int
        (e.g. a bit string such as ``"0101"``).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a tuple of 1-D numpy arrays (one per row)
        and ``y`` is a tuple of the matching ``Binding`` values. Both are
        empty tuples when ``df`` has no rows.
    """
    # Accumulate in lists: growing a tuple with `X = X + (...)` copies the
    # whole tuple on every row, which is quadratic in the number of rows.
    features = []
    labels = []
    for ind in df.index:
        emb = df["ESM1b"][ind]
        ecfp = np.array(list(df["ECFP"][ind])).astype(int)
        features.append(np.concatenate([ecfp, emb]))
        labels.append(df["Binding"][ind])
    # Convert once at the end so callers still receive tuples.
    return (tuple(features), tuple(labels))


# Column labels for the model input: ECFP_0..ECFP_1023 followed by
# ESM1b_0..ESM1b_1279, matching the [ecfp, emb] concatenation order used when
# the feature vectors are built.
feature_names = [
    prefix + str(position)
    for prefix, width in (("ECFP_", 1024), ("ESM1b_", 1280))
    for position in range(width)
]

# Turn the combined frame into per-row feature vectors and binding labels.
data_X, data_y = create_input_and_output_data(p450plant)

2. 模型预测(Model prediction)

In [None]:
# Model prediction
# Load the pre-trained model object. NOTE(review): this is a pickle file, and
# unpickling executes arbitrary code -- only load model files you trust.
bst = pd.read_pickle(config["espData"]["model_path"] + "p450authormodel.dat")

# Wrap features and labels in a DMatrix; feature_names labels the columns
# (presumably they must match the names the model was trained with -- confirm).
dtest_new = xgb.DMatrix(
    data=np.array(data_X),
    label=np.array(data_y),
    feature_names=feature_names,
)

3. 模型评估(Model evaluation)

In [None]:
# Model evaluation
# Materialise labels and model scores once and reuse them for every metric,
# instead of re-running bst.predict and re-converting data_y for each one.
y_true_new = np.array(data_y)
y_test_new_prob = bst.predict(dtest_new)

# Round the raw scores to the nearest integer to obtain hard class labels.
y_test_new_pred = np.round(y_test_new_prob)
acc_test_new = np.mean(y_test_new_pred == y_true_new)
try:
    roc_auc_new = roc_auc_score(y_true_new, y_test_new_prob)
    mcc = matthews_corrcoef(y_true_new, y_test_new_pred)
except ValueError as err:
    # Only metric-input problems (e.g. a single class in the labels) fall
    # back to 0; a bare `except:` would also hide interrupts and real bugs.
    print("ROC-AUC/MCC could not be computed (%s); reporting 0 for both" % err)
    roc_auc_new = 0
    mcc = 0

print(
    "Accuracy on test set: %s, ROC-AUC score for test set: %s, MCC: %s"
    % (acc_test_new, roc_auc_new, mcc)
)

# Per-class accuracy: fraction of true positives / true negatives that the
# rounded score classifies correctly.
index_of_ones = np.where(y_true_new == 1)[0]
values_of_ones = y_test_new_prob[index_of_ones]
acc_1 = np.mean(np.round(values_of_ones) == 1)

index_of_zeros = np.where(y_true_new == 0)[0]
values_of_zeros = y_test_new_prob[index_of_zeros]
acc_0 = np.mean(np.round(values_of_zeros) == 0)

print("Accuracy on 1 set: %s, Accuracy on 0 set: %s" % (acc_1, acc_0))
print(len(data_y))

Accuracy on test set: 0.5983236621534493, ROC-AUC score for test set: 0.5700608704436023, MCC: 0.07629463605771931
Accuracy on 1 set: 0.3539651837524178, Accuracy on 0 set: 0.7205029013539652
1551


In [None]:
# 数据保存
# Data saving
# Persist the raw model scores for the evaluated set ...
np.save(
    file=config["espData"]["encoded_path"] + "notrain_y_test_pred.npy",
    arr=bst.predict(dtest_new),
)
# ... and the matching ground-truth labels, in the same row order.
np.save(
    file=config["espData"]["encoded_path"] + "notrain_y_test.npy",
    arr=np.array(data_y),
)