In [36]:
import numpy as np
import pandas as pd
import tomli
import xgboost as xgb

In [37]:
# Load the project configuration. The path is relative to the current working
# directory, and tomli.load() requires a file object opened in binary mode.
with open("src/config.toml", "rb") as f:
    config = tomli.load(f)

1. 数据汇总(Data aggregation)

In [38]:
# Dataset to process; switch by uncommenting exactly one of the assignments.
dataname = "Arabidopsis_thaliana"
# dataname = "Erigeron_breviscapus"
# dataname = "Glycine_max"
# dataname = "Zea_mays"

# Both pickles share the configured prefix. File names are appended by plain
# string concatenation, so no path separator is inserted here.
encoded_prefix = config["screeningData"]["encoded_path"]
data = pd.read_pickle(encoded_prefix + f"{dataname}_data.pkl")
delete = pd.read_pickle(encoded_prefix + f"{dataname}_deletedata.pkl")

# Stack the two tables row-wise and renumber the index from 0.
alldata = pd.concat([data, delete], ignore_index=True)

2. 模型预测(Model prediction)

In [39]:
def create_input_and_output_data(df):
    """Build per-row model inputs and labels from an encoded DataFrame.

    For every row, the "ECFP" value is expanded element by element into an
    integer vector and concatenated in front of that row's "ESM1b" vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "ESM1b", "ECFP" and "Binding". Each "ECFP"
        entry must be an iterable whose elements convert to ``int`` (for
        example a string of digits); each "ESM1b" entry must be a 1-D
        array-like accepted by ``np.concatenate``.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is a tuple with one 1-D numpy array per row (ECFP
        part first, then ESM1b); ``y`` is a tuple of the matching "Binding"
        values. Both follow the row order of ``df`` and are empty tuples
        when ``df`` has no rows.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Walk the column values positionally instead of looking rows up by index
    # label: a frame with duplicate labels would otherwise yield a Series
    # rather than a single cell value.
    esm_values = df["ESM1b"].to_numpy()
    ecfp_values = df["ECFP"].to_numpy()
    labels = df["Binding"].to_numpy()

    # Collect rows in a list and convert once at the end; growing a tuple with
    # ``X = X + (...)`` copies the whole tuple on every iteration.
    X = [
        np.concatenate([np.array(list(ecfp)).astype(int), emb])
        for ecfp, emb in zip(ecfp_values, esm_values)
    ]

    return (tuple(X), tuple(labels))


# Column labels for the model input: 1024 ECFP positions followed by 1280
# ESM1b positions, i.e. the same ECFP-then-ESM1b order in which
# create_input_and_output_data concatenates each row.
feature_names = [f"ECFP_{i}" for i in range(1024)] + [
    f"ESM1b_{i}" for i in range(1280)
]

In [40]:
# Load the pickled model object for the selected dataset. The file name is
# appended to the configured prefix by plain string concatenation.
# NOTE(review): read_pickle can execute code embedded in the file, so this
# should only ever point at model files produced by this project.
model_file = (
    config["screeningData"]["model_path"] + f"{dataname}_deletedatamodel.dat"
)
bst = pd.read_pickle(model_file)

In [41]:
# Turn every row of the combined table into a feature vector and a label.
data_X, data_y = create_input_and_output_data(df=alldata)

# Wrap features and labels in a DMatrix whose columns carry feature_names.
dwant = xgb.DMatrix(
    data=np.array(data_X),
    label=np.array(data_y),
    feature_names=feature_names,
)

# Score every row and keep the result as a new "scores" column on alldata.
y_test_pred = bst.predict(dwant)
alldata["scores"] = y_test_pred

# Persist the scored table under the same prefix the inputs were read from.
scores_file = config["screeningData"]["encoded_path"] + f"{dataname}_scores.pkl"
alldata.to_pickle(scores_file)

In [42]:
# Report, one value per line: total number of rows, number of distinct
# values in "enzyme", and number of distinct values in "substrate".
print(
    len(alldata),
    alldata["enzyme"].nunique(),
    alldata["substrate"].nunique(),
    sep="\n",
)

5435
235
23
