In [1]:
import numpy as np
import pandas as pd
import tomli
import xgboost as xgb

In [2]:
# Parse the project configuration; the file is opened in binary mode because
# that is what tomli.load() expects.
with open("src/config.toml", "rb") as config_file:
    config = tomli.load(config_file)

1. 数据读取(Data loading)

In [3]:
# Data loading
# Read the five encoded slices (slice0data.pkl ... slice4data.pkl). The
# configured "encoded_path" is joined to the file name by plain string
# concatenation, so it has to end with a path separator already.
slice0data, slice1data, slice2data, slice3data, slice4data = [
    pd.read_pickle(config["SeqP450Data"]["encoded_path"] + f"slice{k}data.pkl")
    for k in range(5)
]

In [4]:
# Model loading
# Unpickle one model per slice (slice0model.dat ... slice4model.dat) from the
# configured "model_path" prefix (joined by string concatenation, so it must
# end with a path separator).
# NOTE(review): presumably these are pickled xgboost Booster objects, since
# they are later called with .predict() on an xgb.DMatrix -- confirm.
# NOTE: unpickling can execute arbitrary code; only load trusted model files.
slice0model, slice1model, slice2model, slice3model, slice4model = [
    pd.read_pickle(config["SeqP450Data"]["model_path"] + f"slice{k}model.dat")
    for k in range(5)
]

2. 数据预测(Data prediction)

In [5]:
# Feature extraction method
def create_input_and_output_data(df):
    """Build per-row model inputs and labels from an encoded slice.

    For every row, the ``ECFP`` value is expanded element by element into an
    integer vector, and that vector is concatenated in front of the row's
    ``ESM1b`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``"ESM1b"``, ``"ECFP"`` and ``"Binding"``.
        ``ECFP`` is presumably a string of '0'/'1' characters and ``ESM1b`` a
        1-D numeric array -- TODO confirm against the encoding step.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is a tuple holding one concatenated feature vector
        (``numpy.ndarray``) per row, ``y`` is a tuple of the matching
        ``Binding`` values. Both follow the row order of ``df``.
    """
    # A frame without rows yields empty outputs without touching any column.
    if len(df.index) == 0:
        return ((), ())

    features = []
    labels = []
    # Walk the rows positionally rather than via label lookups (df[col][ind]):
    # a label lookup returns a Series instead of a scalar when index labels
    # repeat, which silently corrupts the feature vector.
    rows = zip(
        df["ESM1b"].to_numpy(),
        df["ECFP"].to_numpy(),
        df["Binding"].to_numpy(),
    )
    for emb, ecfp_raw, binding in rows:
        ecfp = np.array(list(ecfp_raw)).astype(int)
        # Append to lists and convert once at the end; growing a tuple with
        # `+` inside the loop copies it on every iteration (quadratic cost).
        features.append(np.concatenate([ecfp, emb]))
        labels.append(binding)

    return (tuple(features), tuple(labels))


# Perform feature extraction
# Column labels follow the order in which each feature vector is concatenated:
# 1024 "ECFP_*" names first, then 1280 "ESM1b_*" names.
feature_names = [f"ECFP_{i}" for i in range(1024)] + [
    f"ESM1b_{i}" for i in range(1280)
]
# Build the (X, y) pair for every slice.
slice0data_X, slice0data_y = create_input_and_output_data(slice0data)
slice1data_X, slice1data_y = create_input_and_output_data(slice1data)
slice2data_X, slice2data_y = create_input_and_output_data(slice2data)
slice3data_X, slice3data_y = create_input_and_output_data(slice3data)
slice4data_X, slice4data_y = create_input_and_output_data(slice4data)

In [6]:
# Data merging
# Group the per-slice objects into parallel lists; position k in every list
# belongs to slice k. The lists hold references to the original objects, not
# copies.
slicedatas = [
    slice0data,
    slice1data,
    slice2data,
    slice3data,
    slice4data,
]
slicedatas_x = [
    slice0data_X,
    slice1data_X,
    slice2data_X,
    slice3data_X,
    slice4data_X,
]
slicedatas_y = [
    slice0data_y,
    slice1data_y,
    slice2data_y,
    slice3data_y,
    slice4data_y,
]
slicemodels = [
    slice0model,
    slice1model,
    slice2model,
    slice3model,
    slice4model,
]

In [7]:
# Data prediction
# Score every slice with the model stored at the same list position and write
# the predictions into a "scores" column on that slice (modified in place).
for fold_idx, fold_df in enumerate(slicedatas):
    booster = slicemodels[fold_idx]
    fold_matrix = xgb.DMatrix(
        np.array(slicedatas_x[fold_idx]),
        label=np.array(slicedatas_y[fold_idx]),
        feature_names=feature_names,
    )
    fold_df["scores"] = booster.predict(fold_matrix)

3. 数据保存(Data saving)

In [8]:
# Prediction data integration
# Stack all slices into one frame with a fresh 0..n-1 index.
merged_data = pd.concat(slicedatas, ignore_index=True)
# Order rows by substrate (ascending) and score (descending), then number the
# rows inside each substrate group starting at 1.
sorted_data = merged_data.sort_values(
    by=["substrate", "scores"], ascending=[True, False]
).assign(ranking=lambda ordered: ordered.groupby("substrate").cumcount() + 1)
# Data storage
# Written next to the encoded slices; the configured prefix is joined by plain
# string concatenation.
sorted_data.to_pickle(
    config["SeqP450Data"]["encoded_path"] + "5foldsdata.pkl"
)