In [1]:
import itertools
import random

import numpy as np
import pandas as pd
import tomli
import torch
from Bio import SeqIO
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
# Load the project configuration (TOML must be opened in binary mode for tomli).
with open("src/config.toml", "rb") as config_file:
    config = tomli.load(config_file)

1. 数据读取(Data loading)

In [None]:
# Read the enzyme-substrate pairs. The file's own header row is consumed
# (header=0) and replaced by the two column names used throughout the notebook.
pair_file = config["SeqP450Data"]["pair_path"] + "P450_Substrate.txt"
pair_table = pd.read_csv(pair_file, sep="\t", header=0, names=["enzyme", "substrate"])

# Shuffle every row with a fixed seed so the later fold split is reproducible.
ourdata = pair_table.sample(frac=1, random_state=42)

# Load all FASTA records into a dictionary for lookup by enzyme name
# (Biopython keys the dictionary by record ID by default).
fasta_file = config["SeqP450Data"]["enzyme_path"] + "FunP450_All_06_new_Plant_q.fasta"
fasta_records = SeqIO.parse(fasta_file, "fasta")
sequences = SeqIO.to_dict(fasta_records)

In [None]:
# Attach the protein sequence of each enzyme to its row.
# An enzyme without a FASTA record gets an empty string (such rows are filtered
# out by the missing-value check in the next cell) and its name is printed so
# the gap is visible. Only the "not found" case is handled; any other error
# now surfaces instead of being swallowed by a bare `except`.
missing_enzymes = [name for name in ourdata["enzyme"] if name not in sequences]
for name in missing_enzymes:
    print(name)

# A list is assigned positionally, so the shuffled index of `ourdata` is irrelevant.
ourdata["sequence"] = [
    str(sequences[name].seq) if name in sequences else ""
    for name in ourdata["enzyme"]
]

In [None]:
# Missing-value detection.
# Rows whose sequence could not be found carry an empty string; rows with
# NaN/None in any column are unusable later on (file names are built from the
# enzyme and substrate columns), so both kinds are removed here.

has_null_values = ourdata.isna().any().any()
# Element-wise comparison instead of the deprecated DataFrame.applymap.
has_empty_strings = ourdata == ""
if has_empty_strings.any().any():
    print("DataFrame contains empty strings.")
    ourdata = ourdata[ourdata["sequence"] != ""]
if has_null_values:
    print("DataFrame contains empty values.")
    # The original repeated the empty-string filter here, which never removed
    # null rows; drop them explicitly.
    ourdata = ourdata.dropna()

In [6]:
# Show the resulting table (enzyme, substrate, sequence)
ourdata

Unnamed: 0,enzyme,substrate,sequence
304,CYP71AY5,GEI,MEVMQLSFSYPALFLFVFFLFMLVKQLRRPKNLPPGPNKLPIIGNL...
501,CYP85A2,CAT,MGIMMMILGLLVIIVCLCTALLRWNQMRYSKKGLPPGTMGWPIFGE...
441,CYP716A94,BAM,MQLFYVPLLSLFVLFVSLSFYFLFYKSKSGSTSGLPLPPGKTGWPV...
153,CYP80B2,NME,MEVLSIAIVSFSFLLFLFFILRDSRPKNLPPGPRPSPIVGNLLQLG...
503,CYP85A69,DOC,MAFFLVFLASFFGLCIFSTSLLRWNQVKYNNKNLPPGTMGWPLFGE...
...,...,...,...
71,CYP82E1,NCT,MYHLLSPIEAIVGLVTFAFLLYLLWTKKQSKILNPLPPKIPGGWPV...
106,CYP706V2,KEN,MMKVVLVDSAPAMAAVLILTLSIIWILQKMFNPRATKDVLPPGPRG...
270,CYP71D495,CAS,MLFFITVLFIFIALRIWKKSKANSTPNLPPGPNKLPLIGNVHNLVG...
435,CYP716A155,LUP,MEFFYASLLCLFVSLVFLSLHLLFYKTKTGSLPPGKTGWPVIGESL...


In [None]:
# 使用esm进行序列特征提取（仅需要执行一次即可）
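# Extract sequence features using ESM (needs to be executed only once).
# Uncomment the command below to write the ESM output files into esm_path;
# the feature cell that follows loads one "<enzyme>.pt" file per enzyme from there.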

# ! python src/codes/extract.py esm1b_t33_650M_UR50S {config['SeqP450Data']['enzyme_path']}FunP450_All_06_new_Plant_q.fasta {config['SeqP450Data']['esm_path']} --repr_layers 33 --include mean

Transferred model to GPU
Read data/SeqP450Data/enzyme/FunP450_All_06_new_Plant_q.fasta with 578 sequences
Processing 1 of 77 batches (8 sequences)
Processing 2 of 77 batches (8 sequences)
Processing 3 of 77 batches (8 sequences)
Processing 4 of 77 batches (8 sequences)
Processing 5 of 77 batches (8 sequences)
Processing 6 of 77 batches (8 sequences)
Processing 7 of 77 batches (8 sequences)
Processing 8 of 77 batches (8 sequences)
Processing 9 of 77 batches (8 sequences)
Processing 10 of 77 batches (8 sequences)
Processing 11 of 77 batches (8 sequences)
Processing 12 of 77 batches (8 sequences)
Processing 13 of 77 batches (8 sequences)
Processing 14 of 77 batches (8 sequences)
Processing 15 of 77 batches (8 sequences)
Processing 16 of 77 batches (8 sequences)
Processing 17 of 77 batches (8 sequences)
Processing 18 of 77 batches (8 sequences)
Processing 19 of 77 batches (8 sequences)
Processing 20 of 77 batches (8 sequences)
Processing 21 of 77 batches (8 sequences)
Processing 22 of 77 b

In [None]:
# Add the ESM-1b embedding and the ECFP fingerprint to every row.
#
# The columns are built as plain Python lists and assigned in one step.
# The previous chained assignment (`ourdata["ESM1b"][ind] = ...`) writes through
# an intermediate Series, which pandas does not guarantee to propagate to the
# frame (and which is a no-op under Copy-on-Write).
esm_dir = config["SeqP450Data"]["esm_path"]
substrate_dir = config["SeqP450Data"]["substrate_path"]

# Each enzyme / substrate file is read once even if it appears in many rows.
esm_by_enzyme = {}
ecfp_by_substrate = {}
esm_column = []
ecfp_column = []
for enzyme_id, substrate_id in zip(ourdata["enzyme"], ourdata["substrate"]):
    if enzyme_id not in esm_by_enzyme:
        # NOTE(review): torch.load unpickles the file; only load files produced
        # by the extraction step of this project.
        esms = torch.load(esm_dir + str(enzyme_id) + ".pt")
        # Mean representation of layer 33, stored as a plain list.
        esm_by_enzyme[enzyme_id] = esms["mean_representations"][33].tolist()
    if substrate_id not in ecfp_by_substrate:
        mol_path = substrate_dir + substrate_id + ".sdf"
        mol = Chem.MolFromMolFile(mol_path)
        if mol is None:
            # RDKit returns None instead of raising when a file cannot be parsed.
            raise ValueError(f"RDKit could not parse substrate file: {mol_path}")
        # Morgan fingerprint, radius 2, 1024 bits, serialised as a "0"/"1" string.
        ecfp_by_substrate[substrate_id] = AllChem.GetMorganFingerprintAsBitVect(
            mol, 2, nBits=1024
        ).ToBitString()
    esm_column.append(esm_by_enzyme[enzyme_id])
    ecfp_column.append(ecfp_by_substrate[substrate_id])

# dtype=object keeps each embedding as a single list-valued cell.
ourdata["ESM1b"] = pd.Series(esm_column, index=ourdata.index, dtype=object)
ourdata["ECFP"] = pd.Series(ecfp_column, index=ourdata.index, dtype=object)

# Every recorded enzyme-substrate pair is a positive sample.
ourdata["Binding"] = 1

In [None]:
# Sanity check after feature attachment: report (without modifying the frame)
# whether any cell is an empty string or a null value.
empty_string_mask = ourdata.applymap(lambda cell: cell == "")
has_empty_strings = empty_string_mask
has_null_values = ourdata.isna().values.any()

if has_empty_strings.values.any():
    print("DataFrame contains empty strings.")
if has_null_values:
    print("DataFrame contains empty values.")

In [None]:
# Summary statistics: number of distinct enzymes, substrates and fingerprints.
# (Fewer distinct fingerprints than substrates means some substrates share an
# identical ECFP bit string.)

unique_counts = {
    column: int(ourdata[column].nunique())
    for column in ("enzyme", "substrate", "ECFP")
}
enzyme_unique_count = unique_counts["enzyme"]
substrate_unique_count = unique_counts["substrate"]
ecfp_unique_count = unique_counts["ECFP"]

for label, count in (
    ("enzyme_unique_count", enzyme_unique_count),
    ("substrate_unique_count", substrate_unique_count),
    ("ecfp_unique_count", ecfp_unique_count),
):
    print(label, count)

enzyme_unique_count 517
substrate_unique_count 237
ecfp_unique_count 231


2. 数据切分(Data splitting)

In [None]:
# Five-fold cross-validation.
# For every fold this cell writes two pickles into encoded_path:
#   p450plant<k>.pkl  - the fold's positive pairs plus sampled negative pairs
#   slice<k>data.pkl  - every (enzyme, substrate) combination for the enzymes
#                       that appear for the first time in fold k
num_partitions = 5
partition_size = len(ourdata) // num_partitions
# Seed Python's RNG so the negative sampling below is reproducible.
random.seed(42)
all_substrate = ourdata["substrate"].unique()

# Enzymes already written to an earlier slice file. Created with object dtype
# so it holds enzyme names from the start; an empty float array triggers
# NumPy's element-wise comparison warning when compared against strings.
all_enzyme = np.array([], dtype=object)

# Lookup tables built once from the full positive set, replacing the repeated
# row-by-row scans of the whole frame.
known_substrates = {}
for enzyme_id, substrate_id in zip(ourdata["enzyme"], ourdata["substrate"]):
    known_substrates.setdefault(enzyme_id, set()).add(substrate_id)
esm_by_enzyme = dict(zip(ourdata["enzyme"], ourdata["ESM1b"]))
ecfp_by_substrate = dict(zip(ourdata["substrate"], ourdata["ECFP"]))


def _object_column(values, index):
    """Return an object-dtype Series so list-valued cells are stored as-is."""
    return pd.Series(list(values), index=index, dtype=object)


# Positive samples: contiguous folds of the shuffled frame; the last fold
# also takes the remainder rows.
ourdatas = []
for i in range(num_partitions):
    start = i * partition_size
    stop = None if i + 1 == num_partitions else (i + 1) * partition_size
    ourdatas.append(ourdata.iloc[start:stop])

ourdata_negs = []
for num, ourdata_i in enumerate(ourdatas):
    enzyme_unique = ourdata_i["enzyme"].unique()

    # Negative samples: for each enzyme of the fold pick (up to) two substrates
    # that are NOT recorded for that enzyme anywhere in the data set. Sampling
    # from all substrates, as before, could label a known positive pair as 0.
    rows_to_add = []
    for enzyme in enzyme_unique:
        recorded = known_substrates[enzyme]
        candidates = [s for s in all_substrate if s not in recorded]
        for substrate in random.sample(candidates, min(2, len(candidates))):
            rows_to_add.append({"enzyme": enzyme, "substrate": substrate})
    # "sequence" stays empty for negatives; it is not part of the saved columns.
    ourdata_neg_i = pd.DataFrame(
        rows_to_add, columns=["enzyme", "substrate", "sequence", "ESM1b"]
    )

    # Attach the ESM and ECFP features of the sampled pairs.
    ourdata_neg_i["ESM1b"] = _object_column(
        (esm_by_enzyme[e] for e in ourdata_neg_i["enzyme"]), ourdata_neg_i.index
    )
    ourdata_neg_i["ECFP"] = _object_column(
        (ecfp_by_substrate[s] for s in ourdata_neg_i["substrate"]),
        ourdata_neg_i.index,
    )
    # Mark the sampled pairs as non-binding.
    ourdata_neg_i["Binding"] = 0
    ourdata_negs.append(ourdata_neg_i)

    # Merge positive and negative samples and save the fold.
    merged_data = pd.concat([ourdata_i, ourdata_neg_i], ignore_index=True)
    selected_columns = ["enzyme", "substrate", "Binding", "ECFP", "ESM1b"]
    final_data = merged_data.loc[:, selected_columns]
    final_data.to_pickle(
        config["SeqP450Data"]["encoded_path"] + "p450plant" + str(num) + ".pkl"
    )

    # Enzymes seen for the first time in this fold (sorted, unique).
    slice_enzyme = np.setdiff1d(np.asarray(enzyme_unique, dtype=object), all_enzyme)
    # setdiff1d already removed known enzymes, so they can be appended directly.
    all_enzyme = np.concatenate([all_enzyme, slice_enzyme])

    # All possible (new enzyme, substrate) combinations.
    combinations = list(itertools.product(slice_enzyme, all_substrate))
    result_df = pd.DataFrame(combinations, columns=["enzyme", "substrate"])

    # NOTE(review): labels are taken from the current fold only, as before. A
    # positive pair of one of these enzymes that landed in another fold is
    # labelled 0 here - confirm this is intended.
    fold_pairs = set(zip(ourdata_i["enzyme"], ourdata_i["substrate"]))
    # Stored as float (1.0 / 0.0), matching the files written previously.
    result_df["Binding"] = pd.Series(
        [float(pair in fold_pairs) for pair in combinations],
        index=result_df.index,
        dtype=float,
    )

    # Supplement features.
    result_df["ESM1b"] = _object_column(
        (esm_by_enzyme[e] for e in result_df["enzyme"]), result_df.index
    )
    result_df["ECFP"] = _object_column(
        (ecfp_by_substrate[s] for s in result_df["substrate"]), result_df.index
    )

    # Save the all-combinations table of this slice.
    result_df.to_pickle(
        config["SeqP450Data"]["encoded_path"] + "slice" + str(num) + "data.pkl"
    )

  if enzyme_j not in all_enzyme:
