In [33]:
import pandas as pd
import tomli
import torch
from Bio import SeqIO
from rdkit import Chem
from rdkit.Chem import AllChem

In [34]:
# Load the project configuration. The file is opened in binary mode ("rb") and
# parsed with tomli.load; later cells read paths from
# config["screeningData"] and config["SeqP450Data"].
with open("src/config.toml", "rb") as f:
    config = tomli.load(f)

1. 数据读取(Data loading)

In [35]:
# Name of the dataset to screen. It is used as the file-name stem for the
# per-dataset files read and written below ("<queryname>.fasta",
# "<queryname>.pep", "<queryname>_deletedata.pkl", "<queryname>_data.pkl").
# To switch datasets, leave exactly one of these assignments uncommented.
# queryname = "Arabidopsis_thaliana"
# queryname = "Erigeron_breviscapus"
# queryname = "Glycine_max"
queryname = "Zea_mays"

In [None]:
# Read all substrates from the encoded data.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load files that were produced by this project.
deletedata = pd.read_pickle(
    config["screeningData"]["encoded_path"] + f"{queryname}_deletedata.pkl"
)
sublist = deletedata["substrate"].unique().tolist()

# Build every enzyme x substrate combination as three parallel lists.
enzyme_list = []
sequence_list = []
substrate_list = []

# Both sequence files are parsed as FASTA, ".fasta" first and ".pep" second,
# so an enzyme id present in both keeps the sequence from the ".fasta" file
# after de-duplication. Paths are built by plain string concatenation, so
# config["screeningData"]["enzyme_path"] must already end with a separator.
for extension in (".fasta", ".pep"):
    sequence_file = config["screeningData"]["enzyme_path"] + f"{queryname}{extension}"
    for record in SeqIO.parse(sequence_file, "fasta"):
        # One row per substrate for this enzyme record.
        enzyme_list.extend([record.id] * len(sublist))
        sequence_list.extend([str(record.seq)] * len(sublist))
        substrate_list.extend(sublist)

# Insert into DataFrame.
df = pd.DataFrame(
    {"enzyme": enzyme_list, "sequence": sequence_list, "substrate": substrate_list}
)

# Remove duplicates: keep the first row of every (enzyme, substrate) pair.
# A separate full-row de-duplication is unnecessary because fully identical
# rows also share the same (enzyme, substrate) key. The index is not reset,
# so the original row labels (possibly with gaps) are preserved.
df = df.drop_duplicates(subset=["enzyme", "substrate"], keep="first")

2. 数据特征提取与汇总(Data feature extraction and aggregation)

In [None]:
# Extract sequence features using ESM (needs to be executed only once).
# NOTE(review): the command below only embeds the sequences in
# "<queryname>.pep", while the data-loading cell also reads "<queryname>.fasta"
# and the feature cell loads one "<enzyme id>.pt" file per enzyme. Confirm that
# embeddings for the ".fasta" entries are produced elsewhere.

# ! python src/codes/extract.py esm1b_t33_650M_UR50S {config['screeningData']['enzyme_path']}{queryname}.pep {config['screeningData']['esm_path']} --repr_layers 33 --include mean

Transferred model to GPU
Read data/screeningData/enzyme/Zea_mays.pep with 229 sequences
Processing 1 of 32 batches (9 sequences)
Processing 2 of 32 batches (8 sequences)
Processing 3 of 32 batches (8 sequences)
Processing 4 of 32 batches (8 sequences)
Processing 5 of 32 batches (8 sequences)
Processing 6 of 32 batches (8 sequences)
Processing 7 of 32 batches (8 sequences)
Processing 8 of 32 batches (8 sequences)
Processing 9 of 32 batches (8 sequences)
Processing 10 of 32 batches (8 sequences)
Processing 11 of 32 batches (7 sequences)
Processing 12 of 32 batches (7 sequences)
Processing 13 of 32 batches (7 sequences)
Processing 14 of 32 batches (7 sequences)
Processing 15 of 32 batches (7 sequences)
Processing 16 of 32 batches (7 sequences)
Processing 17 of 32 batches (7 sequences)
Processing 18 of 32 batches (7 sequences)
Processing 19 of 32 batches (7 sequences)
Processing 20 of 32 batches (7 sequences)
Processing 21 of 32 batches (7 sequences)
Processing 22 of 32 batches (7 sequence

In [38]:
# Attach the per-row features to df:
#   ESM1b   - the layer-33 mean representation stored in "<enzyme id>.pt",
#             converted to a Python list
#   ECFP    - Morgan fingerprint (radius 2, 1024 bits) of the substrate's SDF
#             file, stored as a bit string
#   Binding - constant 0 (presumably a placeholder label for unscreened
#             pairs; TODO confirm)
#
# Each embedding file and each substrate file is read once per unique value
# instead of once per row, and whole columns are assigned at once. The former
# chained assignment (df["ESM1b"][ind] = ...) triggers SettingWithCopyWarning
# and does not update df when pandas Copy-on-Write is active.

# Load every enzyme's embedding tensor once.
esm_by_enzyme = {
    enzyme_id: torch.load(
        config["screeningData"]["esm_path"] + str(enzyme_id) + ".pt"
    )["mean_representations"][33]
    for enzyme_id in df["enzyme"].unique()
}

# Compute every substrate's fingerprint once.
ecfp_by_substrate = {}
for substrate_name in df["substrate"].unique():
    sdf_file_path = config["SeqP450Data"]["substrate_path"] + substrate_name + ".sdf"
    mol = Chem.MolFromMolFile(sdf_file_path)
    if mol is None:
        # RDKit returns None when it cannot parse the molecule; fail with a
        # message that names the offending file.
        raise ValueError(f"RDKit could not parse substrate file: {sdf_file_path}")
    ecfp_by_substrate[substrate_name] = AllChem.GetMorganFingerprintAsBitVect(
        mol, 2, nBits=1024
    ).ToBitString()

# .tolist() is called per row so every row owns an independent list object.
df["ESM1b"] = df["enzyme"].map(lambda enzyme_id: esm_by_enzyme[enzyme_id].tolist())
df["ECFP"] = df["substrate"].map(ecfp_by_substrate)
df["Binding"] = 0

In [None]:
# Data storage: write the feature table to "<queryname>_data.pkl" under the
# configured encoded-data path.
encoded_output_path = config["screeningData"]["encoded_path"] + f"{queryname}_data.pkl"
df.to_pickle(encoded_output_path)

In [None]:
# Data validation: print the number of distinct substrates, the number of
# distinct enzymes, their product, and the actual row count. The last two
# numbers are expected to match when every enzyme is paired with every
# substrate exactly once.
n_substrates = df["substrate"].nunique()
n_enzymes = df["enzyme"].nunique()
for value in (n_substrates, n_enzymes, n_substrates * n_enzymes, df.shape[0]):
    print(value)

13
232
3016
3016
