# 0.加载模块 Load module


In [1]:
# 导入所需的库
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import os
import re
import time
import json
import subprocess
import requests
from requests.adapters import HTTPAdapter, Retry

import pubchempy as pcp
from pubchempy import PubChemHTTPError
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem

import torch
from torch import nn
from torch.nn import functional as F
import transformers
from transformers import (
    AutoTokenizer,
    AutoModel,
)
from torch.utils.data import Dataset, DataLoader

from d2l import torch as d2l
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    balanced_accuracy_score,
)

import sys
import seaborn as sns
# 设置调色板
# Set palette
sns.set_palette("muted")
# 设置环境变量为false
# Set the environment variable to false
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 导入模型文件
# Import model files
sys.path.append('.//code')
from model import Net, NetESM

# 导入数据处理文件
# Import dataprocess files
from dataprocess import DataProcess, MyDatasetPredict

In [2]:
# Raw UniProt download for the keyword query "enzyme". The two derived paths
# insert "_tem" / "_SMILES" right after the date stamp in the file name.
DATE = "20240611"
path_source_enzyme = f"../data/data_reviewed_enzyme_{DATE}.tsv"
path_enzyme_tem = path_source_enzyme.replace(f'{DATE}', f'{DATE}_tem')
path_enzyme_SMILES = path_source_enzyme.replace(f'{DATE}', f'{DATE}_SMILES')

# Raw download files of the additional test data, one per EC sub-class query.
path_source_EC11 = f"../data/data_reviewed_EC11_{DATE}.tsv"
path_source_EC27 = f"../data/data_reviewed_EC27_{DATE}.tsv"
path_source_EC31 = f"../data/data_reviewed_EC31_{DATE}.tsv"
path_source_EC42 = f"../data/data_reviewed_EC42_{DATE}.tsv"
path_source_EC56 = f"../data/data_reviewed_EC56_{DATE}.tsv"
path_source_EC63 = f"../data/data_reviewed_EC63_{DATE}.tsv"
path_source_EC71 = f"../data/data_reviewed_EC71_{DATE}.tsv"


# Raw data file for entries that have a Catalytic Activity annotation. "01" is
# appended to DATE so that the training/validation/test files derived from it
# are distinguishable from the ones originating from path_source_enzyme.
# NOTE(review): this block is commented out, yet a later download cell reads
# path_source_CA. Re-enabling it as written would also rebind DATE for every
# path defined further down in this cell.
# DATE = "20240611"
# DATE += '01'
# path_source_CA = f"../data/data_reviewed_CA_{DATE}.tsv"
# path_CA_tem = path_source_CA.replace(f'{DATE}', f'{DATE}_tem')
# path_CA_SMILES = path_source_CA.replace(f'{DATE}', f'{DATE}_SMILES')

# Maximum number of substrates/products extracted from one catalytic reaction.
# The parsing helpers further down also use it as the default maximum number
# of reactions kept per enzyme.
NUMBER_REACTION = 10

# Sequence identity threshold used for sequence clustering, e.g. 0.8, 0.6, 0.4.
SEQ_IDENTITY = 0.80
# Sequence clustering files; the threshold is embedded as an integer percentage.
path_infile_fasta = f"../data/review_sequence_{DATE}.fasta"
path_outfile_fasta = f"../data/review_sequence_{DATE}_cdhit_{int(SEQ_IDENTITY*100)}"
CLSTR_FILE = path_outfile_fasta + ".clstr"
path_train_fasta = f"../data/review_sequence_train_{DATE}_{int(SEQ_IDENTITY*100)}.fasta"
path_test_fasta = f"../data/review_sequence_test_{DATE}_{int(SEQ_IDENTITY*100)}.fasta"

# Training, validation and test set files, plus test subsets whose names carry
# an identity range (80_100, 60_80, 40_60, 0_40).
path_train_data = f"../data/train_data_{DATE}_{int(SEQ_IDENTITY*100)}.tsv"
path_valid_data = f"../data/valid_data_{DATE}_{int(SEQ_IDENTITY*100)}.tsv"
path_test_data = f"../data/test_data_{DATE}_{int(SEQ_IDENTITY*100)}.tsv"
path_test_data_80_100 = f"../data/test_data_80_100_{DATE}_{int(SEQ_IDENTITY*100)}.tsv"
path_test_data_60_80 = f"../data/test_data_60_80_{DATE}_{int(SEQ_IDENTITY*100)}.tsv"
path_test_data_40_60 = f"../data/test_data_40_60_{DATE}_{int(SEQ_IDENTITY*100)}.tsv"
path_test_data_0_40 = f"../data/test_data_0_40_{DATE}_{int(SEQ_IDENTITY*100)}.tsv"

# NOTE(review): the date in this file name is hard-coded instead of using DATE.
# DataProcess comes from the project's dataprocess module (not visible here);
# presumably the JSON file caches compound-name -> SMILES lookups - confirm.
path_name_to_smiles_cache = '../data/name_to_smiles_cache_20240611.json'
dataProcess = DataProcess(path_name_to_smiles_cache = path_name_to_smiles_cache)

# 1.获取Uniprot数据 Get Uniprot data

In [4]:
# Captures the URL enclosed in `<...>` that is followed by `; rel="next"`;
# get_next_link applies it with .match() to the "Link" response header.
re_next_link = re.compile(r'<(.+)>; rel="next"')
# Retry policy: at most 50 retries in total, backoff factor 0.5, and a forced
# retry whenever the server answers 500, 502, 503 or 504.
retries = Retry(total=50, backoff_factor=0.5,
                status_forcelist=[500, 502, 503, 504])
# Shared session used by get_batch; the retry policy is mounted for
# "https://" URLs only.
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))


def get_next_link(headers):
    '''
    Return the URL of the next result page announced in the response headers.

    :param headers: response headers (any mapping supporting `in` and `[]`)
    :return: the URL captured by re_next_link from the "Link" header, or None
        when the header is absent or does not match
    '''
    if "Link" not in headers:
        return None
    found = re_next_link.match(headers["Link"])
    return found.group(1) if found else None

def get_batch(batch_url):
    '''
    Iterate over the result pages of a paginated query.

    Each page is fetched through the shared session; an HTTP error status
    raises via raise_for_status(). Iteration ends when get_next_link finds no
    follow-up URL in the page's headers.

    :param batch_url: URL of the first page
    :return: generator of (response, total) pairs, where total is the value of
        the "x-total-results" response header (a string)
    '''
    next_url = batch_url
    while next_url:
        page = session.get(next_url)
        page.raise_for_status()
        yield page, page.headers["x-total-results"]
        next_url = get_next_link(page.headers)
def get_data(path, url, progress = 0):
    '''
    Download every result page of a TSV query into a single file.

    The first line of each page is treated as the header row: it is written
    once (only while `progress` is still 0) and skipped on every other page.
    A running "<rows so far> / <total>" line is printed after each page.

    :param path: output file path; opened in "w" mode, so existing content is
        replaced
    :param url: URL of the first result page; following pages come from
        get_batch
    :param progress: number of data rows already counted; seeds the printed
        counter and suppresses the header row when non-zero
    '''
    # NOTE(review): a non-zero `progress` still truncates the file because of
    # the "w" mode - confirm whether resuming a download was ever intended.
    # The encoding is explicit so the output does not depend on the platform's
    # locale default (pandas reads the file back as UTF-8 by default).
    with open(path, "w", encoding="utf-8") as f:
        for batch, total in get_batch(url):
            lines = batch.text.splitlines()
            # An empty body has no header line; guard instead of IndexError.
            if lines and not progress:
                print(lines[0], file=f)
            rows = lines[1:]
            # One buffered write per page instead of one print call per row.
            f.writelines(f"{row}\n" for row in rows)
            progress += len(rows)
            print(f"{progress} / {total}")

In [3]:
url = "https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Csequence%2Ccc_catalytic_activity%2Corganism_name%2Cgene_names%2Clength%2Cmass%2Cec&format=tsv&query=%28enzyme%29+AND+%28reviewed%3Atrue%29&size=500"

source_file = path_source_enzyme

get_data(path=source_file,url=url)

500 / 64742
1000 / 64742
1500 / 64742
2000 / 64742
2500 / 64742
3000 / 64742
3500 / 64742
4000 / 64742
4500 / 64742
5000 / 64742
5500 / 64742
6000 / 64742
6500 / 64742
7000 / 64742
7500 / 64742
8000 / 64742
8500 / 64742
9000 / 64742
9500 / 64742
10000 / 64742
10500 / 64742
11000 / 64742
11500 / 64742
12000 / 64742
12500 / 64742
13000 / 64742
13500 / 64742
14000 / 64742
14500 / 64742
15000 / 64742
15500 / 64742
16000 / 64742
16500 / 64742
17000 / 64742
17500 / 64742
18000 / 64742
18500 / 64742
19000 / 64742
19500 / 64742
20000 / 64742
20500 / 64742
21000 / 64742
21500 / 64742
22000 / 64742
22500 / 64742
23000 / 64742
23500 / 64742
24000 / 64742
24500 / 64742
25000 / 64742
25500 / 64742
26000 / 64742
26500 / 64742
27000 / 64742
27500 / 64742
28000 / 64742
28500 / 64742
29000 / 64742
29500 / 64742
30000 / 64742
30500 / 64742
31000 / 64742
31500 / 64742
32000 / 64742
32500 / 64742
33000 / 64742
33500 / 64742
34000 / 64742
34500 / 64742
35000 / 64742
35500 / 64742
36000 / 64742
36500 / 6474

In [4]:
# UniProtKB search (TSV, 500 entries per page) for entries matching
# cc_catalytic_activity:* AND reviewed:true.
url = 'https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Csequence%2Ccc_catalytic_activity%2Corganism_name%2Cgene_names%2Clength%2Cmass%2Cec&format=tsv&query=%28%28cc_catalytic_activity%3A*%29%29+AND+%28reviewed%3Atrue%29&size=500'

# NOTE(review): in the visible part of this file `path_source_CA` is assigned
# only inside a commented-out block of the configuration cell; unless it is
# defined elsewhere, this line raises NameError.
source_file = path_source_CA
        
get_data(path=source_file,url=url)

500 / 255402
1000 / 255402
1500 / 255402
2000 / 255402
2500 / 255402
3000 / 255402
3500 / 255402
4000 / 255402
4500 / 255402
5000 / 255402
5500 / 255402
6000 / 255402
6500 / 255402
7000 / 255402
7500 / 255402
8000 / 255402
8500 / 255402
9000 / 255402
9500 / 255402
10000 / 255402
10500 / 255402
11000 / 255402
11500 / 255402
12000 / 255402
12500 / 255402
13000 / 255402
13500 / 255402
14000 / 255402
14500 / 255402
15000 / 255402
15500 / 255402
16000 / 255402
16500 / 255402
17000 / 255402
17500 / 255402
18000 / 255402
18500 / 255402
19000 / 255402
19500 / 255402
20000 / 255402
20500 / 255402
21000 / 255402
21500 / 255402
22000 / 255402
22500 / 255402
23000 / 255402
23500 / 255402
24000 / 255402
24500 / 255402
25000 / 255402
25500 / 255402
26000 / 255402
26500 / 255402
27000 / 255402
27500 / 255402
28000 / 255402
28500 / 255402
29000 / 255402
29500 / 255402
30000 / 255402
30500 / 255402
31000 / 255402
31500 / 255402
32000 / 255402
32500 / 255402
33000 / 255402
33500 / 255402
34000 / 255402


In [3]:
# UniProtKB search (TSV, 500 entries per page) for entries matching
# cc_catalytic_activity:* AND reviewed:false AND annotation_score:5.
url = 'https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Csequence%2Ccc_catalytic_activity%2Corganism_name%2Cgene_names%2Clength%2Cmass%2Cec&format=tsv&query=%28%28cc_catalytic_activity%3A*%29%29+AND+%28reviewed%3Afalse%29+AND+%28annotation_score%3A5%29&size=500'

# NOTE(review): `path_source_CA_unreviewed_5` is not assigned anywhere in the
# visible part of this file; unless it is defined elsewhere, this line raises
# NameError.
source_file = path_source_CA_unreviewed_5
get_data(path=source_file,url=url)

500 / 97517
1000 / 97517
1500 / 97517
2000 / 97517
2500 / 97517
3000 / 97517
3500 / 97517
4000 / 97517
4500 / 97517
5000 / 97517
5500 / 97517
6000 / 97517
6500 / 97517
7000 / 97517
7500 / 97517
8000 / 97517
8500 / 97517
9000 / 97517
9500 / 97517
10000 / 97517
10500 / 97517
11000 / 97517
11500 / 97517
12000 / 97517
12500 / 97517
13000 / 97517
13500 / 97517
14000 / 97517
14500 / 97517
15000 / 97517
15500 / 97517
16000 / 97517
16500 / 97517
17000 / 97517
17500 / 97517
18000 / 97517
18500 / 97517
19000 / 97517
19500 / 97517
20000 / 97517
20500 / 97517
21000 / 97517
21500 / 97517
22000 / 97517
22500 / 97517
23000 / 97517
23500 / 97517
24000 / 97517
24500 / 97517
25000 / 97517
25500 / 97517
26000 / 97517
26500 / 97517
27000 / 97517
27500 / 97517
28000 / 97517
28500 / 97517
29000 / 97517
29500 / 97517
30000 / 97517
30500 / 97517
31000 / 97517
31500 / 97517
32000 / 97517
32500 / 97517
33000 / 97517
33500 / 97517
34000 / 97517
34500 / 97517
35000 / 97517
35500 / 97517
36000 / 97517
36500 / 9751

In [6]:
url = 'https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Csequence%2Ccc_catalytic_activity%2Corganism_name%2Cgene_names%2Clength%2Cmass%2Cec&format=tsv&query=%28%28ec%3A1.1.*%29%29+AND+%28reviewed%3Atrue%29&size=500'

source_file = path_source_EC11
get_data(path=source_file,url=url)

500 / 19470
1000 / 19470
1500 / 19470
2000 / 19470
2500 / 19470
3000 / 19470
3500 / 19470
4000 / 19470
4500 / 19470
5000 / 19470
5500 / 19470
6000 / 19470
6500 / 19470
7000 / 19470
7500 / 19470
8000 / 19470
8500 / 19470
9000 / 19470
9500 / 19470
10000 / 19470
10500 / 19470
11000 / 19470
11500 / 19470
12000 / 19470
12500 / 19470
13000 / 19470
13500 / 19470
14000 / 19470
14500 / 19470
15000 / 19470
15500 / 19470
16000 / 19470
16500 / 19470
17000 / 19470
17500 / 19470
18000 / 19470
18500 / 19470
19000 / 19470
19470 / 19470


In [7]:
url = 'https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Csequence%2Ccc_catalytic_activity%2Corganism_name%2Cgene_names%2Clength%2Cmass%2Cec&format=tsv&query=%28%28ec%3A2.7.*%29%29+AND+%28reviewed%3Atrue%29&size=500'

source_file = path_source_EC27
get_data(path=source_file,url=url)

500 / 36893
1000 / 36893
1500 / 36893
2000 / 36893
2500 / 36893
3000 / 36893
3500 / 36893
4000 / 36893
4500 / 36893
5000 / 36893
5500 / 36893
6000 / 36893
6500 / 36893
7000 / 36893
7500 / 36893
8000 / 36893
8500 / 36893
9000 / 36893
9500 / 36893
10000 / 36893
10500 / 36893
11000 / 36893
11500 / 36893
12000 / 36893
12500 / 36893
13000 / 36893
13500 / 36893
14000 / 36893
14500 / 36893
15000 / 36893
15500 / 36893
16000 / 36893
16500 / 36893
17000 / 36893
17500 / 36893
18000 / 36893
18500 / 36893
19000 / 36893
19500 / 36893
20000 / 36893
20500 / 36893
21000 / 36893
21500 / 36893
22000 / 36893
22500 / 36893
23000 / 36893
23500 / 36893
24000 / 36893
24500 / 36893
25000 / 36893
25500 / 36893
26000 / 36893
26500 / 36893
27000 / 36893
27500 / 36893
28000 / 36893
28500 / 36893
29000 / 36893
29500 / 36893
30000 / 36893
30500 / 36893
31000 / 36893
31500 / 36893
32000 / 36893
32500 / 36893
33000 / 36893
33500 / 36893
34000 / 36893
34500 / 36893
35000 / 36893
35500 / 36893
36000 / 36893
36500 / 3689

In [8]:
url = 'https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Csequence%2Ccc_catalytic_activity%2Corganism_name%2Cgene_names%2Clength%2Cmass%2Cec&format=tsv&query=%28%28ec%3A3.1.*%29%29+AND+%28reviewed%3Atrue%29&size=500'

source_file = path_source_EC31
get_data(path=source_file,url=url)

500 / 21696
1000 / 21696
1500 / 21696
2000 / 21696
2500 / 21696
3000 / 21696
3500 / 21696
4000 / 21696
4500 / 21696
5000 / 21696
5500 / 21696
6000 / 21696
6500 / 21696
7000 / 21696
7500 / 21696
8000 / 21696
8500 / 21696
9000 / 21696
9500 / 21696
10000 / 21696
10500 / 21696
11000 / 21696
11500 / 21696
12000 / 21696
12500 / 21696
13000 / 21696
13500 / 21696
14000 / 21696
14500 / 21696
15000 / 21696
15500 / 21696
16000 / 21696
16500 / 21696
17000 / 21696
17500 / 21696
18000 / 21696
18500 / 21696
19000 / 21696
19500 / 21696
20000 / 21696
20500 / 21696
21000 / 21696
21500 / 21696
21696 / 21696


In [9]:
url = 'https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Csequence%2Ccc_catalytic_activity%2Corganism_name%2Cgene_names%2Clength%2Cmass%2Cec&format=tsv&query=%28%28ec%3A4.2.*%29%29+AND+%28reviewed%3Atrue%29&size=500'

source_file = path_source_EC42
get_data(path=source_file,url=url)

500 / 12026
1000 / 12026
1500 / 12026
2000 / 12026
2500 / 12026
3000 / 12026
3500 / 12026
4000 / 12026
4500 / 12026
5000 / 12026
5500 / 12026
6000 / 12026
6500 / 12026
7000 / 12026
7500 / 12026
8000 / 12026
8500 / 12026
9000 / 12026
9500 / 12026
10000 / 12026
10500 / 12026
11000 / 12026
11500 / 12026
12000 / 12026
12026 / 12026


In [10]:
url = 'https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Csequence%2Ccc_catalytic_activity%2Corganism_name%2Cgene_names%2Clength%2Cmass%2Cec&format=tsv&query=%28%28ec%3A5.6.*%29%29+AND+%28reviewed%3Atrue%29&size=500'

source_file = path_source_EC56
get_data(path=source_file,url=url)

500 / 2007
1000 / 2007
1500 / 2007
2000 / 2007
2007 / 2007


In [11]:
url = 'https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Csequence%2Ccc_catalytic_activity%2Corganism_name%2Cgene_names%2Clength%2Cmass%2Cec&format=tsv&query=%28%28ec%3A6.3.*%29%29+AND+%28reviewed%3Atrue%29&size=500'

source_file = path_source_EC63
get_data(path=source_file,url=url)

500 / 13548
1000 / 13548
1500 / 13548
2000 / 13548
2500 / 13548
3000 / 13548
3500 / 13548
4000 / 13548
4500 / 13548
5000 / 13548
5500 / 13548
6000 / 13548
6500 / 13548
7000 / 13548
7500 / 13548
8000 / 13548
8500 / 13548
9000 / 13548
9500 / 13548
10000 / 13548
10500 / 13548
11000 / 13548
11500 / 13548
12000 / 13548
12500 / 13548
13000 / 13548
13500 / 13548
13548 / 13548


In [12]:
url = 'https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Csequence%2Ccc_catalytic_activity%2Corganism_name%2Cgene_names%2Clength%2Cmass%2Cec&format=tsv&query=%28%28ec%3A7.1.*%29%29+AND+%28reviewed%3Atrue%29&size=500'

source_file = path_source_EC71
get_data(path=source_file,url=url)

500 / 9168
1000 / 9168
1500 / 9168
2000 / 9168
2500 / 9168
3000 / 9168
3500 / 9168
4000 / 9168
4500 / 9168
5000 / 9168
5500 / 9168
6000 / 9168
6500 / 9168
7000 / 9168
7500 / 9168
8000 / 9168
8500 / 9168
9000 / 9168
9168 / 9168


# 2.数据处理 Process data


解析化学反应中的底物和产物
Parse the substrates and products of each chemical reaction

In [3]:
# 加载原始数据
# Load raw data
source_file = path_source_enzyme # 关键词enzyme的原始数据
data01 = pd.read_csv(source_file, sep="\t")
data01

Unnamed: 0,Entry,Entry Name,Protein names,Sequence,Catalytic activity,Organism,Gene Names,Length,Mass,EC number
0,P12821,ACE_HUMAN,Angiotensin-converting enzyme (ACE) (EC 3.4.15...,MGAASGRRGPGLLLPLPLLLLLPPQPALALDPGLQPGNFSADEAGA...,CATALYTIC ACTIVITY: Reaction=Release of a C-te...,Homo sapiens (Human),ACE DCP DCP1,1306,149715,3.4.15.1
1,P23368,MAOM_HUMAN,"NAD-dependent malic enzyme, mitochondrial (NAD...",MLSRLRVVSTTCTLACRHLHIKEKGKPLMLNPRTNKGMAFTLQERQ...,CATALYTIC ACTIVITY: Reaction=(S)-malate + NAD(...,Homo sapiens (Human),ME2,584,65444,1.1.1.38
2,P49427,UB2R1_HUMAN,Ubiquitin-conjugating enzyme E2 R1 (EC 2.3.2.2...,MARPLVPSSQKALLLELKGLQEEPVEGFRVTLVDEGDLYNWEVAIF...,CATALYTIC ACTIVITY: Reaction=S-ubiquitinyl-[E1...,Homo sapiens (Human),CDC34 UBCH3 UBE2R1,236,26737,2.3.2.23; 2.3.2.24
3,P62256,UBE2H_HUMAN,Ubiquitin-conjugating enzyme E2 H (EC 2.3.2.23...,MSSPSPGKRRMDTDVVKLIESKHEVTILGGLNEFVVKFYGPQGTPY...,CATALYTIC ACTIVITY: Reaction=S-ubiquitinyl-[E1...,Homo sapiens (Human),UBE2H,183,20655,2.3.2.23; 2.3.2.24
4,P61077,UB2D3_HUMAN,Ubiquitin-conjugating enzyme E2 D3 (EC 2.3.2.2...,MALKRINKELSDLARDPPAQCSAGPVGDDMFHWQATIMGPNDSPYQ...,CATALYTIC ACTIVITY: Reaction=S-ubiquitinyl-[E1...,Homo sapiens (Human),UBE2D3 UBC5C UBCH5C,147,16687,2.3.2.23; 2.3.2.24
...,...,...,...,...,...,...,...,...,...,...
64737,P14182,LICB_HAEIN,Protein LicB,MRGYLFGILSAVFWALSGLLYNELPLSEYTALGKVISLLFLIDFCS...,,Haemophilus influenzae (strain ATCC 51907 / DS...,licB HI_1538,292,32449,
64738,P42459,Y250_CORGL,Uncharacterized protein Cgl0250/cg0304 (ORFX),MQSNLLAVLFALASALTIAWGTVVRHRIALRTPKDGSLRSSPLLNA...,,Corynebacterium glutamicum (strain ATCC 13032 ...,Cgl0250 cg0304,289,31382,
64739,P70742,DSRD_ARCFU,Protein DsrD,MADYTEEDKQKVLAQLSKKTWKIPELAKILKMDKKVVKKIVQDLIN...,,Archaeoglobus fulgidus (strain ATCC 49558 / DS...,dsrD AF_0425,77,8822,
64740,Q1HPL8,NDUBA_BOMMO,NADH dehydrogenase [ubiquinone] 1 beta subcomp...,MVQDGNPPDDNVFRAFCNALYNTVDAPVTWFRETVVEPNQKKYPWY...,,Bombyx mori (Silk moth),,159,18999,


In [4]:
def extract_reaction(text):
    '''
    Extract the reaction equations from a "Catalytic activity" annotation.

    Every "Reaction=" entry is captured up to the first "." or ";" that
    follows it. Only captures containing " = " are kept; they are stripped,
    de-duplicated and sorted (np.unique), then joined with ";".

    :param text: annotation text; any non-string value (e.g. NaN) is rejected
    :return: None for non-string input, otherwise the joined equations
        ("" when no equation was found)
    '''
    if not isinstance(text, str):
        return None

    pattern = r"Reaction=(.*?)[.;]|;"

    # Matches of the bare ";" alternative come back as "" and are dropped by
    # the " = " filter together with free-text reaction descriptions.
    equations = [
        candidate.strip()
        for candidate in re.findall(pattern, text)
        if " = " in candidate
    ]

    unique_equations = np.unique(np.array(equations))
    return ";".join(unique_equations)

In [5]:
# Extract the reaction equations from the "Catalytic activity" column into a
# new "Reaction" column of data01.
data01["Reaction"] = data01["Catalytic activity"].apply(extract_reaction)
# Turn empty / whitespace-only cells into NaN (extract_reaction returns ""
# when no equation was found) so that dropna below can remove them.
data01.replace(r"^\s*$", np.nan, regex=True, inplace=True)
# Drop the rows without a reaction and renumber the index.
data01.dropna(axis=0, subset=["Reaction"], inplace=True, ignore_index=True)
data01

Unnamed: 0,Entry,Entry Name,Protein names,Sequence,Catalytic activity,Organism,Gene Names,Length,Mass,EC number,Reaction
0,P12821,ACE_HUMAN,Angiotensin-converting enzyme (ACE) (EC 3.4.15...,MGAASGRRGPGLLLPLPLLLLLPPQPALALDPGLQPGNFSADEAGA...,CATALYTIC ACTIVITY: Reaction=Release of a C-te...,Homo sapiens (Human),ACE DCP DCP1,1306,149715,3.4.15.1,H2O + Leu-enkephalin = L-phenylalanyl-L-leucin...
1,P23368,MAOM_HUMAN,"NAD-dependent malic enzyme, mitochondrial (NAD...",MLSRLRVVSTTCTLACRHLHIKEKGKPLMLNPRTNKGMAFTLQERQ...,CATALYTIC ACTIVITY: Reaction=(S)-malate + NAD(...,Homo sapiens (Human),ME2,584,65444,1.1.1.38,(S)-malate + NAD(+) = CO2 + NADH + pyruvate;H(...
2,P49427,UB2R1_HUMAN,Ubiquitin-conjugating enzyme E2 R1 (EC 2.3.2.2...,MARPLVPSSQKALLLELKGLQEEPVEGFRVTLVDEGDLYNWEVAIF...,CATALYTIC ACTIVITY: Reaction=S-ubiquitinyl-[E1...,Homo sapiens (Human),CDC34 UBCH3 UBE2R1,236,26737,2.3.2.23; 2.3.2.24,S-ubiquitinyl-[E1 ubiquitin-activating enzyme]...
3,P62256,UBE2H_HUMAN,Ubiquitin-conjugating enzyme E2 H (EC 2.3.2.23...,MSSPSPGKRRMDTDVVKLIESKHEVTILGGLNEFVVKFYGPQGTPY...,CATALYTIC ACTIVITY: Reaction=S-ubiquitinyl-[E1...,Homo sapiens (Human),UBE2H,183,20655,2.3.2.23; 2.3.2.24,S-ubiquitinyl-[E1 ubiquitin-activating enzyme]...
4,P61077,UB2D3_HUMAN,Ubiquitin-conjugating enzyme E2 D3 (EC 2.3.2.2...,MALKRINKELSDLARDPPAQCSAGPVGDDMFHWQATIMGPNDSPYQ...,CATALYTIC ACTIVITY: Reaction=S-ubiquitinyl-[E1...,Homo sapiens (Human),UBE2D3 UBC5C UBCH5C,147,16687,2.3.2.23; 2.3.2.24,S-ubiquitinyl-[E1 ubiquitin-activating enzyme]...
...,...,...,...,...,...,...,...,...,...,...,...
46792,Q55034,STPA_SYNY3,Glucosylglycerol-phosphate phosphatase (GGP-P)...,MVLHQQRFSLDHGAFCQTLAQTENLLIVQDLDGVCMELVQDPLSRR...,CATALYTIC ACTIVITY: Reaction=2-O-(alpha-D-gluc...,Synechocystis sp. (strain PCC 6803 / Kazusa),stpA slr0746,422,46542,3.1.3.69,2-O-(alpha-D-glucopyranosyl)-sn-glycerol 3-pho...
46793,A0A411PQQ5,AGN2_PAEDI,Anthrone oxygenase AgnL2 (EC 1.10.3.-) (Agnest...,MSTTSAQATAVVTGSFLSGAMISLSLMAVPVLLDTTTEPTQLFFQW...,CATALYTIC ACTIVITY: Reaction=emodin anthrone +...,Paecilomyces divaricatus (Penicillium divarica...,AgnL2,164,17992,1.10.3.-,emodin anthrone + O2 = emodin + H(+) + H2O
46794,P0C988,RPB1_ASFWA,DNA-directed RNA polymerase RPB1 homolog (RPB1...,MEAGYAEIAAVQFNIAGDNDHKRQGVMEVTISNLFEGTLPAEGGIY...,CATALYTIC ACTIVITY: Reaction=a ribonucleoside ...,African swine fever virus (isolate Warthog/Nam...,War-109,1450,163822,2.7.7.6,a ribonucleoside 5'-triphosphate + RNA(n) = di...
46795,P0C987,RPB1_ASFP4,DNA-directed RNA polymerase RPB1 homolog (RPB1...,MEAGYAEIAAVQFNIAGDNDHKRQGVMEVTISNLFEGTLPAEGGIY...,CATALYTIC ACTIVITY: Reaction=a ribonucleoside ...,African swine fever virus (isolate Tick/South ...,Pret-111,1450,163880,2.7.7.6,a ribonucleoside 5'-triphosphate + RNA(n) = di...


In [6]:
# 从化学反应方程式Reaction中解析底物和产物
# Parse substrates and products from chemical Reaction
# Leading stoichiometric coefficient ("2 ", "n ", "(2n) ", "(n+1) "), article
# ("a ", "an ") or compartment tag ("(in) ", "(out) ") followed by whitespace.
_SUBSTANCE_PREFIX_RE = re.compile(
    r'^(?:(\d*n?)|\(\d*n?\)|(an?)|\(an?\)|\(\d*n\+\d+\)|\(in\)|\(out\))\s'
)
# Trailing compartment tag. The alternation is grouped so that "$" applies to
# both "(in)" and "(out)"; ungrouped, "(in)" would be removed anywhere in the
# name while "(out)" would only be removed at the end.
_SUBSTANCE_SUFFIX_RE = re.compile(r'(?:\(in\)|\(out\))$')


def _clean_substance(name):
    '''Strip the coefficient/article prefix and compartment suffix from one reaction participant.'''
    name = _SUBSTANCE_PREFIX_RE.sub('', name.strip()).strip()
    return _SUBSTANCE_SUFFIX_RE.sub('', name).strip()


def reaction_to_labels(
    reaction, max_reactions=NUMBER_REACTION, max_substances=NUMBER_REACTION
):
    '''
    Convert a ";"-joined reaction string into a one-row frame of label columns.

    Each equation is split on " = " into a substrate side and a product side,
    and each side on " + " into participants. Participants are cleaned with
    _clean_substance and stored under "reaction{i}_substrate{j}" /
    "reaction{i}_product{j}" (both 1-based). Slots without data hold "".

    :param reaction: equations joined by ";"; a non-string or empty value
        yields a row of empty labels
    :param max_reactions: number of equations kept (extra ones are ignored)
    :param max_substances: number of substrates and of products kept per equation
    :return: DataFrame with one row, indexed by `reaction`, whose columns are
        ordered reaction by reaction, substrates before products
    :raises ValueError: if a kept equation does not contain exactly one " = "
    '''
    # Only a non-empty string carries equations; NaN/None/"" produce none.
    # This replaces a bare `except:` around reaction.split(";").
    if isinstance(reaction, str) and reaction:
        equations = reaction.split(";")
    else:
        equations = []

    labels = {}
    for i in range(1, max_reactions + 1):
        if i <= len(equations):
            substrates, products = equations[i - 1].strip().split(" = ")
            sides = {
                "substrate": substrates.split(" + "),
                "product": products.split(" + "),
            }
        else:
            # No i-th equation: pad both sides with "".
            sides = {"substrate": [], "product": []}
        # The same key order is used for parsed and padded reactions, so the
        # column order of the result never depends on how many equations the
        # input happened to contain.
        for role, names in sides.items():
            for j in range(1, max_substances + 1):
                value = _clean_substance(names[j - 1]) if j <= len(names) else ""
                labels[f"reaction{i}_{role}{j}"] = value

    # One row, indexed by the original reaction string so that callers can
    # join it back onto the column it was derived from.
    return pd.DataFrame(labels, index=[reaction])


def chunk_to_labels(df, path, chunksize=1000):
    '''
    Append the reaction label columns to `df`, processing it in chunks.

    `df` is first written to `path` as TSV (a side effect: the file is created
    or overwritten) and then read back chunk by chunk. For every chunk the
    "Reaction" column is expanded with reaction_to_labels and the resulting
    columns are joined back onto the chunk via the reaction string.

    :param df: frame with a "Reaction" column
    :param path: location of the intermediate TSV file
    :param chunksize: number of rows read back per chunk
    :return: the re-read data with the label columns added and exact duplicate
        rows removed per chunk; an empty DataFrame when no chunk was read
    '''
    df.to_csv(path, sep="\t", index=False)

    labelled_chunks = []
    for chunk in pd.read_csv(path, sep="\t", chunksize=chunksize):
        # Build one label row per distinct reaction. With one row per input
        # row, the merge below multiplies the rows of repeated reactions and
        # needs drop_duplicates to collapse them again.
        unique_reactions = chunk["Reaction"].drop_duplicates()
        chunk_labels_df = pd.concat(
            [reaction_to_labels(reaction) for reaction in unique_reactions]
        )
        merged = pd.merge(
            left=chunk,
            right=chunk_labels_df,
            how="left",
            left_on="Reaction",
            right_index=True,
        )
        labelled_chunks.append(merged.drop_duplicates())

    if not labelled_chunks:
        return pd.DataFrame()
    # A single concatenation instead of growing a frame inside the loop,
    # which re-copies all previously collected rows on every iteration.
    return pd.concat(labelled_chunks)

In [7]:
path_tem = path_enzyme_tem
data01 = chunk_to_labels(df=data01, path=path_tem)
data01

Unnamed: 0,Entry,Entry Name,Protein names,Sequence,Catalytic activity,Organism,Gene Names,Length,Mass,EC number,...,reaction10_product1,reaction10_product2,reaction10_product3,reaction10_product4,reaction10_product5,reaction10_product6,reaction10_product7,reaction10_product8,reaction10_product9,reaction10_product10
0,P12821,ACE_HUMAN,Angiotensin-converting enzyme (ACE) (EC 3.4.15...,MGAASGRRGPGLLLPLPLLLLLPPQPALALDPGLQPGNFSADEAGA...,CATALYTIC ACTIVITY: Reaction=Release of a C-te...,Homo sapiens (Human),ACE DCP DCP1,1306,149715,3.4.15.1,...,L-lysyl-L-proline,N-acetyl-L-seryl-L-aspartate,,,,,,,,
1,P23368,MAOM_HUMAN,"NAD-dependent malic enzyme, mitochondrial (NAD...",MLSRLRVVSTTCTLACRHLHIKEKGKPLMLNPRTNKGMAFTLQERQ...,CATALYTIC ACTIVITY: Reaction=(S)-malate + NAD(...,Homo sapiens (Human),ME2,584,65444,1.1.1.38,...,,,,,,,,,,
2,P49427,UB2R1_HUMAN,Ubiquitin-conjugating enzyme E2 R1 (EC 2.3.2.2...,MARPLVPSSQKALLLELKGLQEEPVEGFRVTLVDEGDLYNWEVAIF...,CATALYTIC ACTIVITY: Reaction=S-ubiquitinyl-[E1...,Homo sapiens (Human),CDC34 UBCH3 UBE2R1,236,26737,2.3.2.23; 2.3.2.24,...,,,,,,,,,,
3,P62256,UBE2H_HUMAN,Ubiquitin-conjugating enzyme E2 H (EC 2.3.2.23...,MSSPSPGKRRMDTDVVKLIESKHEVTILGGLNEFVVKFYGPQGTPY...,CATALYTIC ACTIVITY: Reaction=S-ubiquitinyl-[E1...,Homo sapiens (Human),UBE2H,183,20655,2.3.2.23; 2.3.2.24,...,,,,,,,,,,
4,P61077,UB2D3_HUMAN,Ubiquitin-conjugating enzyme E2 D3 (EC 2.3.2.2...,MALKRINKELSDLARDPPAQCSAGPVGDDMFHWQATIMGPNDSPYQ...,CATALYTIC ACTIVITY: Reaction=S-ubiquitinyl-[E1...,Homo sapiens (Human),UBE2D3 UBC5C UBCH5C,147,16687,2.3.2.23; 2.3.2.24,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46792,Q55034,STPA_SYNY3,Glucosylglycerol-phosphate phosphatase (GGP-P)...,MVLHQQRFSLDHGAFCQTLAQTENLLIVQDLDGVCMELVQDPLSRR...,CATALYTIC ACTIVITY: Reaction=2-O-(alpha-D-gluc...,Synechocystis sp. (strain PCC 6803 / Kazusa),stpA slr0746,422,46542,3.1.3.69,...,,,,,,,,,,
46793,A0A411PQQ5,AGN2_PAEDI,Anthrone oxygenase AgnL2 (EC 1.10.3.-) (Agnest...,MSTTSAQATAVVTGSFLSGAMISLSLMAVPVLLDTTTEPTQLFFQW...,CATALYTIC ACTIVITY: Reaction=emodin anthrone +...,Paecilomyces divaricatus (Penicillium divarica...,AgnL2,164,17992,1.10.3.-,...,,,,,,,,,,
46794,P0C988,RPB1_ASFWA,DNA-directed RNA polymerase RPB1 homolog (RPB1...,MEAGYAEIAAVQFNIAGDNDHKRQGVMEVTISNLFEGTLPAEGGIY...,CATALYTIC ACTIVITY: Reaction=a ribonucleoside ...,African swine fever virus (isolate Warthog/Nam...,War-109,1450,163822,2.7.7.6,...,,,,,,,,,,
46795,P0C987,RPB1_ASFP4,DNA-directed RNA polymerase RPB1 homolog (RPB1...,MEAGYAEIAAVQFNIAGDNDHKRQGVMEVTISNLFEGTLPAEGGIY...,CATALYTIC ACTIVITY: Reaction=a ribonucleoside ...,African swine fever virus (isolate Tick/South ...,Pret-111,1450,163880,2.7.7.6,...,,,,,,,,,,


In [8]:
def concat_reaction(df, cols=['Sequence'], number_reaction=NUMBER_REACTION):
    '''
    Reshape the wide label table so that every reaction of an enzyme is a row.

    The columns of reaction 2..number_reaction are renamed to the
    reaction1_* names and stacked below the reaction-1 rows. Rows whose
    reaction columns are all empty are dropped, as are rows repeating the same
    "Sequence" and reaction columns (first occurrence kept).

    :param df: frame whose first 11 columns are carried along as metadata and
        which contains the reaction{i}_substrate{k} / reaction{i}_product{k}
        columns
    :param cols: metadata columns to return in front of the reaction columns
    :param number_reaction: number of reactions, and of substrate/product
        slots per reaction, present in `df`
    :return: frame with `cols` followed by the reaction1_* columns, missing
        values filled with ""
    '''
    def reaction_columns(index):
        # Substrate slots first, then product slots, for reaction `index`.
        return [
            f"reaction{index}_{role}{slot}"
            for role in ["substrate", "product"]
            for slot in range(1, number_reaction + 1)
        ]

    meta_columns = list(df.columns[:11])
    cols_reaction = reaction_columns(1)

    stacked = df.loc[:, meta_columns + cols_reaction]
    for reaction_index in range(2, number_reaction + 1):
        source_columns = reaction_columns(reaction_index)
        renamed = df.loc[:, meta_columns + source_columns].rename(
            columns=dict(zip(source_columns, cols_reaction))
        )
        # Blank cells count as missing, so rows without this reaction vanish.
        renamed = renamed.replace(r"^\s*$", np.nan, regex=True)
        renamed = renamed.dropna(how="all", subset=cols_reaction)
        stacked = pd.concat([stacked, renamed], axis=0)

    # Remove null and duplicate values: NaN is mapped to "" first so that
    # missing slots compare equal during de-duplication.
    stacked = stacked.replace(np.nan, "")
    stacked = stacked.drop_duplicates(
        keep="first",
        subset=["Sequence", *cols_reaction],
        ignore_index=True,
    )
    stacked = stacked.replace(r"^\s*$", np.nan, regex=True)
    stacked = stacked.dropna(
        subset=cols_reaction, axis=0, how="all", ignore_index=True
    )
    return stacked[[*cols, *cols_reaction]].replace(np.nan, "")

In [9]:
data02 = concat_reaction(data01)
data02

Unnamed: 0,Sequence,reaction1_substrate1,reaction1_substrate2,reaction1_substrate3,reaction1_substrate4,reaction1_substrate5,reaction1_substrate6,reaction1_substrate7,reaction1_substrate8,reaction1_substrate9,...,reaction1_product1,reaction1_product2,reaction1_product3,reaction1_product4,reaction1_product5,reaction1_product6,reaction1_product7,reaction1_product8,reaction1_product9,reaction1_product10
0,MGAASGRRGPGLLLPLPLLLLLPPQPALALDPGLQPGNFSADEAGA...,H2O,Leu-enkephalin,,,,,,,,...,L-phenylalanyl-L-leucine,L-tyrosylglycylglycine,,,,,,,,
1,MLSRLRVVSTTCTLACRHLHIKEKGKPLMLNPRTNKGMAFTLQERQ...,(S)-malate,NAD(+),,,,,,,,...,CO2,NADH,pyruvate,,,,,,,
2,MARPLVPSSQKALLLELKGLQEEPVEGFRVTLVDEGDLYNWEVAIF...,S-ubiquitinyl-[E1 ubiquitin-activating enzyme]...,[E2 ubiquitin-conjugating enzyme]-L-cysteine,,,,,,,,...,[E1 ubiquitin-activating enzyme]-L-cysteine,S-ubiquitinyl-[E2 ubiquitin-conjugating enzyme...,,,,,,,,
3,MSSPSPGKRRMDTDVVKLIESKHEVTILGGLNEFVVKFYGPQGTPY...,S-ubiquitinyl-[E1 ubiquitin-activating enzyme]...,[E2 ubiquitin-conjugating enzyme]-L-cysteine,,,,,,,,...,[E1 ubiquitin-activating enzyme]-L-cysteine,S-ubiquitinyl-[E2 ubiquitin-conjugating enzyme...,,,,,,,,
4,MALKRINKELSDLARDPPAQCSAGPVGDDMFHWQATIMGPNDSPYQ...,S-ubiquitinyl-[E1 ubiquitin-activating enzyme]...,[E2 ubiquitin-conjugating enzyme]-L-cysteine,,,,,,,,...,[E1 ubiquitin-activating enzyme]-L-cysteine,S-ubiquitinyl-[E2 ubiquitin-conjugating enzyme...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67527,MGLYRIRVSTGASLYAGSNNQVQLWLVGQHGEAALGTRLWPARGKE...,"(4Z,7Z,10Z,13Z,16Z,19Z)-docosahexaenoate",O2,,,,,,,,...,"(14S)-hydroperoxy-(4Z,7Z,10Z,12E,16Z,19Z)-doco...",,,,,,,,,
67528,MWIMFSVFGLPIPISATELLLASAVFCLVFWVVRTWRPRVPQGLKS...,"(5Z,8Z,11Z,14Z)-eicosatetraenoate",O2,reduced [NADPH--hemoprotein reductase],,,,,,,...,"(14S,15R)-epoxy-(5Z,8Z,11Z)-eicosatrienoate",H(+),H2O,oxidized [NADPH--hemoprotein reductase],,,,,,
67529,MSTSAMELLLTATIFCLVLWVVRIFRPQVPKGLKSPPGPWGWPLIG...,"(5Z,8Z,11Z,14Z)-eicosatetraenoate",O2,reduced [NADPH--hemoprotein reductase],,,,,,,...,"(14S,15R)-epoxy-(5Z,8Z,11Z)-eicosatrienoate",H(+),H2O,oxidized [NADPH--hemoprotein reductase],,,,,,
67530,MGLRRGPCPAALLPGGFLFLLLLADPALLAGRRPPVVLVPGDLGNQ...,"1,2-di-(9Z-octadecenoyl)-sn-glycero-3-phosphoc...",H2O,,,,,,,,...,(9Z)-octadecenoate,2-(9Z-octadecenoyl)-sn-glycero-3-phosphocholine,H(+),,,,,,,


In [10]:
def sample_sequence_substrate(df, number_reaction=NUMBER_REACTION):
    '''
    Merge the products of rows that share an enzyme sequence and substrate set.

    Rows are grouped by the concatenation of ``Sequence`` and the
    ``reaction1_substrate1..number_reaction`` columns. Within each group the
    last ``number_reaction`` columns of ``df`` (the product columns) are
    pooled, de-duplicated, sorted and written back left-aligned; every other
    column keeps the value found in the first row of the group.

    Args:
        df: DataFrame holding a ``Sequence`` column, the substrate columns
            ``reaction1_substrate1..number_reaction`` and, as its last
            ``number_reaction`` columns, the product columns. The grouped and
            the product cells must be strings ("" for "no molecule").
        number_reaction: Number of substrate columns and of product columns.

    Returns:
        A new DataFrame with one row per (sequence, substrates) group, a fresh
        0..n-1 index and whitespace-stripped string cells. ``df`` itself is
        not modified.
    '''
    product_cols = df.columns[-number_reaction:].tolist()
    substrate_cols = [
        f'reaction1_substrate{j}' for j in range(1, 1 + number_reaction)
    ]

    def _join_products(values):
        # Pool every product name of the group into one ";"-separated string.
        return ";".join(values).strip()

    agg_functions = {
        column: _join_products if column in product_cols else "first"
        for column in df.columns
    }

    # The grouping key is kept outside of ``df`` so that the caller's frame
    # does not silently gain a "group" column (which would shift the
    # "last number_reaction columns" window on a second call).
    # NOTE(review): the key parts are concatenated without a separator, as
    # before, so the group order is unchanged; in principle two different
    # (sequence, substrates) combinations could concatenate to the same key.
    group_key = df[['Sequence', *substrate_cols]].apply(
        lambda row: "".join(row), axis=1
    )
    merged = df.groupby(group_key.to_numpy()).agg(agg_functions)
    merged = merged.reset_index(drop=True)

    # Split the pooled products again: unique, sorted, empty names removed,
    # at most ``number_reaction`` kept, remaining slots padded with "".
    pooled_products = []
    for joined_cells in merged[product_cols].itertuples(index=False, name=None):
        names = {
            name.strip() for cell in joined_cells for name in cell.split(";")
        }
        names.discard("")
        kept = sorted(names)[:number_reaction]
        pooled_products.append(kept + [""] * (number_reaction - len(kept)))

    merged[product_cols] = pd.DataFrame(
        pooled_products, columns=product_cols, index=merged.index, dtype=object
    )

    # DataFrame.apply hands whole columns to the callable, so the strip has
    # to be mapped over the cells of each column to have any effect.
    return merged.apply(
        lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v)
    )

In [11]:
# Merge the products of rows that share a sequence and substrate set,
# then display the resulting frame.
data0201 = sample_sequence_substrate(data02, NUMBER_REACTION)
data0201

Unnamed: 0,Sequence,reaction1_substrate1,reaction1_substrate2,reaction1_substrate3,reaction1_substrate4,reaction1_substrate5,reaction1_substrate6,reaction1_substrate7,reaction1_substrate8,reaction1_substrate9,...,reaction1_product1,reaction1_product2,reaction1_product3,reaction1_product4,reaction1_product5,reaction1_product6,reaction1_product7,reaction1_product8,reaction1_product9,reaction1_product10
0,AAAATQAVPAPNQQPEVFYNQIFINNEWHDAVSKKTFPTVNPSTGE...,aldehyde,H2O,NAD(+),,,,,,,...,H(+),NADH,carboxylate,,,,,,,
1,AAAWMLNGCLQVMDSRTIPANRNADNVDPALQTATHLCFPTRPVRV...,(3R)-hydroxyacyl-[ACP],NADP(+),,,,,,,,...,3-oxoacyl-[ACP],H(+),NADPH,,,,,,,
2,AAAWMLNGCLQVMDSRTIPANRNADNVDPALQTATHLCFPTRPVRV...,acetyl-CoA,H(+),malonyl-CoA,NADPH,,,,,,...,CO2,CoA,H2O,NADP(+),long-chain fatty acyl-CoA,,,,,
3,AAAWMLNGCLQVMDSRTIPANRNADNVDPALQTATHLCFPTRPVRV...,fatty acyl-[ACP],H(+),malonyl-[ACP],,,,,,,...,3-oxoacyl-[ACP],CO2,holo-[ACP],,,,,,,
4,AADGYARARGVGACVVTFTVGGLSVLNAIAGAYSENLPLICIVGGP...,2-oxocarboxylate,H(+),,,,,,,,...,CO2,aldehyde,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66650,YIAGLLTGRPNSKAVGPSGVVLTAKQAFELANINSEFYELQPKEGL...,L-phenylalanine,,,,,,,,,...,(E)-cinnamate,NH4(+),,,,,,,,
66651,YLPAQQIDVQSSLLSDPSKVAGKTYDYIIAGGGLTGLTVAAKLTEN...,beta-D-glucose,O2,,,,,,,,...,"D-glucono-1,5-lactone",H2O2,,,,,,,,
66652,YQHLFWFFGHPEVYILILPGFGMISHIVAYYAGKKEPFGYMGMVWA...,Fe(II)-[cytochrome c],H(+),O2,,,,,,,...,Fe(III)-[cytochrome c],H(+),H2O,,,,,,,
66653,YQHLFWFFGHPEVYILILPGFGMISHIVAYYAGKKEPFGYMGMVWA...,Fe(II)-[cytochrome c],H(+),O2,,,,,,,...,Fe(III)-[cytochrome c],H(+),H2O,,,,,,,


In [13]:
# Convert the substrate and product names to SMILES strings
# (one input column per substrate/product slot), then display the result.
data03 = dataProcess.df_to_smiles(
    data0201, 
    cols=[f'reaction1_{i}{j}' for i in ['substrate', 'product'] for j in range(1,1+NUMBER_REACTION)],
)
data03

Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,AAAATQAVPAPNQQPEVFYNQIFINNEWHDAVSKKTFPTVNPSTGE...,aldehyde,H2O,NAD(+),,,,,,,...,[H+],C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,,,,,,,,
1,AAAWMLNGCLQVMDSRTIPANRNADNVDPALQTATHLCFPTRPVRV...,(3R)-hydroxyacyl-[ACP],NADP(+),,,,,,,,...,,[H+],C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,,,,,,,
2,AAAWMLNGCLQVMDSRTIPANRNADNVDPALQTATHLCFPTRPVRV...,acetyl-CoA,H(+),malonyl-CoA,NADPH,,,,,,...,C(=O)=O,CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...,O,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,,,,,,
3,AAAWMLNGCLQVMDSRTIPANRNADNVDPALQTATHLCFPTRPVRV...,fatty acyl-[ACP],H(+),malonyl-[ACP],,,,,,,...,,C(=O)=O,,,,,,,,
4,AADGYARARGVGACVVTFTVGGLSVLNAIAGAYSENLPLICIVGGP...,2-oxocarboxylate,H(+),,,,,,,,...,C(=O)=O,CC=O,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66650,YIAGLLTGRPNSKAVGPSGVVLTAKQAFELANINSEFYELQPKEGL...,L-phenylalanine,,,,,,,,,...,C1=CC=C(C=C1)C=CC(=O)O,[NH4+],,,,,,,,
66651,YLPAQQIDVQSSLLSDPSKVAGKTYDYIIAGGGLTGLTVAAKLTEN...,beta-D-glucose,O2,,,,,,,,...,C(C1C(C(C(C(=O)O1)O)O)O)O,OO,,,,,,,,
66652,YQHLFWFFGHPEVYILILPGFGMISHIVAYYAGKKEPFGYMGMVWA...,Fe(II)-[cytochrome c],H(+),O2,,,,,,,...,,[H+],O,,,,,,,
66653,YQHLFWFFGHPEVYILILPGFGMISHIVAYYAGKKEPFGYMGMVWA...,Fe(II)-[cytochrome c],H(+),O2,,,,,,,...,,[H+],O,,,,,,,


In [14]:
def remove_invalid_smiles(df, number_reaction=NUMBER_REACTION):
    '''
    Remove the samples whose molecules could not all be converted to SMILES.

    Whitespace-only cells are treated as missing. A row is kept only if
      * at least one of its last ``2 * number_reaction`` columns is filled
        (presumably the ``_y`` SMILES columns, as laid out by the caller), and
      * for every substrate/product slot the name column (``..._x``) and the
        SMILES column (``..._y``) are either both filled or both missing.

    Args:
        df: DataFrame with the columns ``reaction1_substrate{k}_x/_y`` and
            ``reaction1_product{k}_x/_y`` for k in 1..number_reaction.
        number_reaction: Number of substrate slots and of product slots.

    Returns:
        A new, re-indexed DataFrame in which blank cells are NaN. ``df``
        itself is not modified.
    '''
    # Work on a copy instead of replacing/dropping in place on the caller's
    # frame (and later on a filtered slice of it).
    cleaned = df.replace(r"^\s*$", np.nan, regex=True)
    cleaned = cleaned.dropna(
        subset=cleaned.columns[-2 * number_reaction:], how="all"
    )

    # A slot is consistent when name and SMILES are both present or both
    # absent; AND-ing all slot masks equals filtering slot after slot.
    keep = np.ones(len(cleaned), dtype=bool)
    for kind in ("reaction1_substrate", "reaction1_product"):
        for slot in range(1, 1 + number_reaction):
            has_name = cleaned[f"{kind}{slot}_x"].notna().to_numpy()
            has_smiles = cleaned[f"{kind}{slot}_y"].notna().to_numpy()
            keep &= has_name == has_smiles

    # No second "all missing" drop is needed: removing rows cannot turn a
    # remaining row into an all-missing one.
    return cleaned[keep].reset_index(drop=True)

In [15]:
# Drop the samples whose molecules could not all be converted to SMILES,
# then display the remaining rows.
data04 = remove_invalid_smiles(data03)
data04

Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,AADIFAKFKTSMEVK,coenzyme B,methyl-coenzyme M,,,,,,,,...,CC(C(C(=O)[O-])NC(=O)CCCCCCSSCCS(=O)(=O)[O-])O...,C,,,,,,,,
1,AAVPSGASTGIYEALELRAVEHINKTIAPALVSKLAMQEFMILPVG...,(2R)-2-phosphoglycerate,,,,,,,,,...,O,C=C(C(=O)O)OP(=O)(O)O,,,,,,,,
2,ADAADKPDDPNFFRMVEGFFDRGASIVEDKLVEDLRTKETPEQKKG...,H2O,L-glutamate,NAD(+),,,,,,,...,C(CC(=O)O)C(=O)C(=O)O,[H+],C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,[NH4+],,,,,,
3,ADAADKPDDPNFFRMVEGFFDRGASIVEDKLVEDLRTKETPEQKKG...,H2O,L-glutamate,NADP(+),,,,,,,...,C(CC(=O)O)C(=O)C(=O)O,[H+],C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,[NH4+],,,,,,
4,ADDINPKEECFFEDDYYEFE,H2O,L-isoleucine,O2,,,,,,,...,CCC(C)C(=O)C(=O)O,OO,[NH4+],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39610,XXXXXXXXXXXDGHCIPALGFGTYKPIEVPKSKAMEAANLAIGVGY...,morphine,NAD(+),,,,,,,,...,[H+],C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(=O)C=C4,,,,,,,
39611,XXXXXXXXXXXDGHCIPALGFGTYKPIEVPKSKAMEAANLAIGVGY...,morphine,NADP(+),,,,,,,,...,[H+],C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(=O)C=C4,,,,,,,
39612,YGPNELPAEEGKNAESAIEALKEYEPEMGKEIVPGDLVEISVGDKI...,ATP,Ca(2+),H2O,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[Ca+2],[H+],[O-]P(=O)([O-])[O-],,,,,,
39613,YIAGLLTGRPNSKAVGPSGVVLTAKQAFELANINSEFYELQPKEGL...,L-phenylalanine,,,,,,,,,...,C1=CC=C(C=C1)C=CC(=O)O,[NH4+],,,,,,,,


In [16]:
def df_sort_mw(df, dataProcess, number_reaction=NUMBER_REACTION):
    '''
    Reorder the substrates and the products of every row.

    For each row the substrate half and the product half of the last
    ``2 * number_reaction`` columns are collected separately, handed to
    ``dataProcess.sort_smiles_by_mw`` and written back left-aligned in the
    returned order (per the helper's name: by molecular weight, large to
    small). Slots the helper does not fill are left as "". Finally rows that
    are duplicates with respect to ``Sequence`` and all ``..._y`` columns are
    dropped and the index is renumbered.

    Args:
        df: DataFrame whose last ``2 * number_reaction`` columns are the
            substrate columns followed by the product columns.
        dataProcess: Object providing ``sort_smiles_by_mw(array)``.
        number_reaction: Number of substrate slots and of product slots.

    Returns:
        A sorted, de-duplicated copy of ``df``.
    '''
    width = 2 * number_reaction
    cols = [
        f'reaction1_{kind}{slot}_y'
        for kind in ['substrate', 'product']
        for slot in range(1, 1 + number_reaction)
    ]

    result = df.copy()
    result.iloc[:, -width:] = ""

    # Negative column offsets at which the substrate half and the product
    # half start; substrates are always processed before products.
    half_starts = (-width, -number_reaction)
    for row in range(len(df)):
        for start in half_starts:
            smiles = np.array([])
            for offset in range(number_reaction):
                smiles = np.append(smiles, df.iloc[row, start + offset])
            ordered = dataProcess.sort_smiles_by_mw(smiles)
            for offset in range(len(ordered)):
                result.iloc[row, start + offset] = ordered[offset]

    result.drop_duplicates(
        subset=['Sequence', *cols],
        inplace=True,
        ignore_index=True,
    )
    return result

In [17]:
# Sort the substrates/products of every sample and drop duplicate samples.
# The export of the result to path_enzyme_SMILES is currently disabled.
data05 = df_sort_mw(data04, dataProcess=dataProcess)
# data05.to_csv(path_enzyme_SMILES, sep="\t", index=False)
data05



Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,AADIFAKFKTSMEVK,coenzyme B,methyl-coenzyme M,,,,,,,,...,CC(C(C(=O)[O-])NC(=O)CCCCCCSSCCS(=O)(=O)[O-])O...,C,,,,,,,,
1,AAVPSGASTGIYEALELRAVEHINKTIAPALVSKLAMQEFMILPVG...,(2R)-2-phosphoglycerate,,,,,,,,,...,C=C(C(=O)O)OP(=O)(O)O,O,,,,,,,,
2,ADAADKPDDPNFFRMVEGFFDRGASIVEDKLVEDLRTKETPEQKKG...,H2O,L-glutamate,NAD(+),,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,C(CC(=O)O)C(=O)C(=O)O,[NH4+],[H+],,,,,,
3,ADAADKPDDPNFFRMVEGFFDRGASIVEDKLVEDLRTKETPEQKKG...,H2O,L-glutamate,NADP(+),,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,C(CC(=O)O)C(=O)C(=O)O,[NH4+],[H+],,,,,,
4,ADDINPKEECFFEDDYYEFE,H2O,L-isoleucine,O2,,,,,,,...,CCC(C)C(=O)C(=O)O,OO,[NH4+],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39341,XXXXXXXXXXXDGHCIPALGFGTYKPIEVPKSKAMEAANLAIGVGY...,morphine,NAD(+),,,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(=O)C=C4,[H+],,,,,,,
39342,XXXXXXXXXXXDGHCIPALGFGTYKPIEVPKSKAMEAANLAIGVGY...,morphine,NADP(+),,,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(=O)C=C4,[H+],,,,,,,
39343,YGPNELPAEEGKNAESAIEALKEYEPEMGKEIVPGDLVEISVGDKI...,ATP,Ca(2+),H2O,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[Ca+2],[H+],,,,,,
39344,YIAGLLTGRPNSKAVGPSGVVLTAKQAFELANINSEFYELQPKEGL...,L-phenylalanine,,,,,,,,,...,C1=CC=C(C=C1)C=CC(=O)O,[NH4+],,,,,,,,


In [18]:
def data_process(
    source_file,
    dataProcess,
    cols=None,
    number_reaction=NUMBER_REACTION,
):
    '''
    Run the whole preprocessing pipeline on one raw TSV file.

    Steps: load the file, extract the reaction equation from the
    ``Catalytic activity`` column, parse substrates and products, build one
    sample per enzyme reaction, merge products per (sequence, substrates),
    convert molecule names to SMILES, drop samples that could not be
    converted, sort substrates/products, and save the result.

    Args:
        source_file: Path of the tab-separated source file. The output paths
            are derived from it by replacing ``DATE`` with ``DATE_tem``
            (intermediate file) and ``DATE_SMILES`` (final file).
        dataProcess: Helper object providing ``df_to_smiles`` and
            ``sort_smiles_by_mw``.
        cols: Columns forwarded to ``concat_reaction``; defaults to
            ``['Sequence']``.
        number_reaction: Number of substrate slots and of product slots.

    Returns:
        The processed DataFrame, which is also written to the ``_SMILES``
        path as a tab-separated file.
    '''
    # Build the default per call instead of sharing one list between calls.
    if cols is None:
        cols = ['Sequence']

    path_tem = source_file.replace(f'{DATE}', f'{DATE}_tem')
    path_SMILES = source_file.replace(f'{DATE}', f'{DATE}_SMILES')
    # Load data
    data01 = pd.read_csv(source_file, sep="\t")

    # Extract the chemical equation from the "Catalytic activity" column
    # into a new "Reaction" column
    data01["Reaction"] = data01["Catalytic activity"].apply(extract_reaction)
    data01.replace(r"^\s*$", np.nan, regex=True, inplace=True)
    # Remove the rows without a reaction
    data01.dropna(axis=0, subset=["Reaction"], inplace=True, ignore_index=True)

    # Resolve substrates and products from the reaction equations
    data01 = chunk_to_labels(df=data01, path=path_tem)

    # One sample per catalytic reaction of each enzyme: the columns from
    # reaction2_substrate1 onwards are appended as additional rows under
    # reaction1_substrate1,...,reaction1_product10
    data02 = concat_reaction(data01, cols=cols)

    # Merge products per sequence and substrate set
    data02 = sample_sequence_substrate(data02, number_reaction)

    # Convert molecule names to SMILES strings
    data03 = dataProcess.df_to_smiles(
        data02, 
        cols=[f'reaction1_{i}{j}' for i in ['substrate', 'product'] for j in range(1,1+number_reaction)],
        )

    # Remove the samples that cannot be converted into SMILES; the slot
    # count is forwarded so a non-default number_reaction is honoured here too
    data04 = remove_invalid_smiles(data03, number_reaction)

    # Sort products/substrates by molecular weight from large to small
    data05 = df_sort_mw(data04, dataProcess, number_reaction)

    # Save the final file
    data05.to_csv(path_SMILES, sep="\t", index=False)

    return data05

In [20]:
# # 对具有Catalytic Activity的原始数据进行数据处理
# # Perform data processing on raw data with Catalytic Activity
# data_CA = data_process(
#     source_file=path_source_CA,
#     dataProcess=dataProcess,
#     number_reaction=NUMBER_REACTION,
# )

# data_CA



Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,AAAAAGRGRSFSPAAPAPSSVRLPGRQAPAPAAASALAVEADPAAD...,GTP,IMP,L-aspartate,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O)O...,C1=NC2=C(N1C3C(C(C(O3)COP(=O)(O)OP(=O)(O)O)O)O...,[O-]P(=O)([O-])[O-],[H+],,,,,,
1,AAAQRRQNDSSVFLAIMVAAAVESESSLTDGDAGALLLQDISEWDE...,H2O2,,,,,,,,,...,O=O,O,,,,,,,,
2,AADIFAKFKTSMEVK,coenzyme B,methyl-coenzyme M,,,,,,,,...,CC(C(C(=O)[O-])NC(=O)CCCCCCSSCCS(=O)(=O)[O-])O...,C,,,,,,,,
3,AAGGIGQALALLLKNRLPAGSDLALYDIAPVTPGVAADLSHIPTPV...,(S)-malate,NAD(+),,,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,C(C(=O)C(=O)O)C(=O)O,[H+],,,,,,,
4,AAGRGAWVRTWAPLAMAAKVDLSTSTDWKEAKSFLKGLSDKQREEH...,2''-O-acetyl-ADP-D-ribose,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,CC(=O)[O-],[H+],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151395,YYTPEYKTKDTDILAAFRMTPQPGVPAEEAGAAVAAESSTGTWTTV...,(2R)-3-phosphoglycerate,H(+),,,,,,,,...,C(C(C(C(=O)COP(=O)(O)O)O)O)OP(=O)(O)O,C(=O)=O,O,,,,,,,
151396,YYTPEYKTKDTDILAAFRMTPQPGVPAEEAGAAVAAESSTGTWTTV...,"D-ribulose 1,5-bisphosphate",O2,,,,,,,,...,C(C(C(=O)[O-])O)OP(=O)([O-])[O-],C(C(=O)[O-])OP(=O)([O-])[O-],[H+],,,,,,,
151397,YYTPKYETKDTDILAAFRMTPQPGVPPEEAGAAVAAESSTGTWTTV...,(2R)-3-phosphoglycerate,H(+),,,,,,,,...,C(C(C(C(=O)COP(=O)(O)O)O)O)OP(=O)(O)O,C(=O)=O,O,,,,,,,
151398,YYTPKYETKDTDILAAFRMTPQPGVPPEEAGAAVAAESSTGTWTTV...,"D-ribulose 1,5-bisphosphate",O2,,,,,,,,...,C(C(C(=O)[O-])O)OP(=O)([O-])[O-],C(C(=O)[O-])OP(=O)([O-])[O-],[H+],,,,,,,


In [21]:
# Run the full processing pipeline on the EC11 source file and display the result.
data_EC11 = data_process(
    source_file=path_source_EC11,
    dataProcess=dataProcess,
    number_reaction=NUMBER_REACTION,
)

data_EC11

Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,AAAQRRQNDSSVFLAIMVAAAVESESSLTDGDAGALLLQDISEWDE...,H2O2,,,,,,,,,...,O=O,O,,,,,,,,
1,AAGGIGQALALLLKNRLPAGSDLALYDIAPVTPGVAADLSHIPTPV...,(S)-malate,NAD(+),,,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,C(C(=O)C(=O)O)C(=O)O,[H+],,,,,,,
2,ADKKIALVGAGNIGGTLAHLIGLKXLL,(S)-malate,NAD(+),,,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,C(C(=O)C(=O)O)C(=O)O,[H+],,,,,,,
3,ADLYENPMGLMGFEFIELASPTPNTLEPIFEIMGFTKVATHRSKDV...,3-(4-hydroxyphenyl)pyruvate,O2,,,,,,,,...,C1=CC(=C(C=C1O)CC(=O)O)O,C(=O)=O,,,,,,,,
4,AEPNVTVTGAAGQIGYALLFRI,(S)-malate,NAD(+),,,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,C(C(=O)C(=O)O)C(=O)O,[H+],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15554,YIDAETMTLHHDKHHATYVANANAALEKHPEIGEDLEALLADVEQI...,H(+),superoxide,,,,,,,,...,OO,O=O,,,,,,,,
15555,YIDEETMHLHHDKHHQTYVNNVNAALEKHPEIGEDLESLLADVESI...,H(+),superoxide,,,,,,,,...,OO,O=O,,,,,,,,
15556,YIDKETMILHHDKHHATYLANANAALEKHPEIGEDLEFLLSDVTRI...,H(+),superoxide,,,,,,,,...,OO,O=O,,,,,,,,
15557,YKDQGLEILAFPCNQFG,glutathione,H2O2,,,,,,,,...,C(CC(=O)NC(CSSCC(C(=O)NCC(=O)O)NC(=O)CCC(C(=O)...,O,,,,,,,,


In [22]:
# Run the full processing pipeline on the EC27 source file and display the result.
data_EC27 = data_process(
    source_file=path_source_EC27,
    dataProcess=dataProcess,
    number_reaction=NUMBER_REACTION,
)

data_EC27

Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,ADEKXVVIGLAADSG,ATP,D-ribulose 5-phosphate,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,C(C(C(C(=O)COP(=O)(O)O)O)O)OP(=O)(O)O,[H+],,,,,,,
1,AGVDPLVPLRQAMFNYFQVPDRLGILTHLYRIAEGAQQGDPLSRSW...,ATP,N-acetyl-D-glucosamine,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,CC(=O)NC1C(C(C(OC1O)COP(=O)(O)O)O)O,[H+],,,,,,,
2,AIMRMSGEGPTFDANTECAIAYHHTKYQVEQGTAQLYAGDDMAQDT...,ATP,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
3,AKKSVGDLTKADLEGKRVFVRADLNVPLDKEQKXTD,(2R)-3-phosphoglycerate,ATP,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,C(C(C(=O)OP(=O)([O-])[O-])O)OP(=O)([O-])[O-],,,,,,,,
4,AKVLTLDLYKKLRDKSTPSGFTLDDIIQNEHLGYVLTCPSNLGTXL...,ATP,creatine,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,CN(CC(=O)O)C(=NP(=O)(O)O)N,[H+],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18523,VDFNVPLKEGVVKDPTRIAGSIPSIKKILETNPRGLVLMSHLGRPD...,(2R)-3-phosphoglycerate,ATP,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,C(C(C(=O)OP(=O)([O-])[O-])O)OP(=O)([O-])[O-],,,,,,,,
18524,VSDSQNSQDGLDPE,alpha-D-glucose 1-phosphate,ATP,H(+),,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)([O-])...,[O-]P(=O)([O-])OP(=O)([O-])[O-],,,,,,,,
18525,VTDSVIGEGCVIKNCKIHHSVVGLRSCISEGAIIEDTLLMGADYYA...,alpha-D-glucose 1-phosphate,ATP,H(+),,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)([O-])...,[O-]P(=O)([O-])OP(=O)([O-])[O-],,,,,,,,
18526,XEKXIVVGLAADSG,ATP,D-ribulose 5-phosphate,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,C(C(C(C(=O)COP(=O)(O)O)O)O)OP(=O)(O)O,[H+],,,,,,,


In [23]:
# Run the full processing pipeline on the EC31 source file and display the result.
data_EC31 = data_process(
    source_file=path_source_EC31,
    dataProcess=dataProcess,
    number_reaction=NUMBER_REACTION,
)

data_EC31

Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,AAGRGAWVRTWAPLAMAAKVDLSTSTDWKEAKSFLKGLSDKQREEH...,2''-O-acetyl-ADP-D-ribose,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,CC(=O)[O-],[H+],,,,,,,
1,AAGRGAWVRTWAPLAMAAKVDLSTSTDWKEAKSFLKGLSDKQREEH...,3''-O-acetyl-ADP-D-ribose,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,CC(=O)[O-],[H+],,,,,,,
2,AAGRGAWVRTWAPLAMAAKVDLSTSTDWKEAKSFLKGLSDKQREEH...,alpha-NAD(+),H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,C1=CC(=CN=C1)C(=O)N,[H+],,,,,,,
3,AEFGPSQPFKGAK,H2O,S-adenosyl-L-homocysteine,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)CO)O)O)N,C(CS)C(C(=O)O)N,,,,,,,,
4,AEGREDPELLVTVRGGRLRGLRLKAPGGPVSAFLGIPFEEPPVGPR...,acetylcholine,H2O,,,,,,,,...,C[N+](C)(C)CCO,CC(=O)[O-],[H+],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6444,TEFGPSQPFKGAK,H2O,S-adenosyl-L-homocysteine,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)CO)O)O)N,C(CS)C(C(=O)O)N,,,,,,,,
6445,VPSPRPQNATVMVWIFGGGFAYGTSSLNVYDGRYLAQAEGAIVVSM...,acetylcholine,H2O,,,,,,,,...,C[N+](C)(C)CCO,CC(=O)[O-],[H+],,,,,,,
6446,VQPPHSHGDNFYIWT,digallate,H2O,,,,,,,,...,C1=C(C=C(C(=C1O)[O-])O)C(=O)O,[H+],,,,,,,,
6447,WTLSLLLGAVVGNEVCYERLGCFSDDSPWAGIVERPLKILPWSPEK...,"1,2,3-tributanoylglycerol",H2O,,,,,,,,...,CCCC(=O)C(C(C(C(=O)CCC)O)O)O,CCCC(=O)[O-],[H+],,,,,,,


In [24]:
# Run the full processing pipeline on the EC42 source file and display the result.
data_EC42 = data_process(
    source_file=path_source_EC42,
    dataProcess=dataProcess,
    number_reaction=NUMBER_REACTION,
)

data_EC42

Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,AAVPSGASTGIYEALELRAVEHINKTIAPALVSKLAMQEFMILPVG...,(2R)-2-phosphoglycerate,,,,,,,,,...,C=C(C(=O)O)OP(=O)(O)O,O,,,,,,,,
1,AAVPSGASTGVYEALELRFRAPVEPY,(2R)-2-phosphoglycerate,,,,,,,,,...,C=C(C(=O)O)OP(=O)(O)O,O,,,,,,,,
2,AHAWGYGPTDGPDKWVSNFPIADGPRQSPIDILPGGASYDSGLKPL...,H(+),hydrogencarbonate,,,,,,,,...,C(=O)=O,O,,,,,,,,
3,AIIPKQFLKRIERTGFGQFLFYEWRFDEAGNINPEFEPNKPRYAGA...,"(2R,3S)-3-isopropylmalate",,,,,,,,,...,CC(C)C(CC(=O)O)(C(=O)O)O,,,,,,,,,
4,APRKFFVGGNWKMNGDKKSLGELIQTLNAAKVPFTGEIVCAPPEAY...,D-glyceraldehyde 3-phosphate,,,,,,,,,...,C(C(=O)COP(=O)(O)O)O,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9598,VAAAIACALFNLKCKIYMGYKDIKRQSPNVFRMKLMGAEVISVENG...,"(1S,2R)-1-C-(indol-3-yl)glycerol 3-phosphate",L-serine,,,,,,,,...,C1=CC=C2C(=C1)C(=CN2)CC(C(=O)O)N,C(C(C=O)O)OP(=O)(O)O,O,,,,,,,
9599,VAVSIACALFNLKCKIYMGYKDIKRQSPNVFRMKLMGAEVISVRNG...,"(1S,2R)-1-C-(indol-3-yl)glycerol 3-phosphate",L-serine,,,,,,,,...,C1=CC=C2C(=C1)C(=CN2)CC(C(=O)O)N,C(C(C=O)O)OP(=O)(O)O,O,,,,,,,
9600,VDRPFKEVEANEGYRLSIDLAEQTLTTPGGETFTFDITEHRKHCLL...,"(2R,3S)-3-isopropylmalate",,,,,,,,,...,CC(C)C(CC(=O)O)(C(=O)O)O,,,,,,,,,
9601,VTKAVENINAIIAPALKGMDPVKQAEIDQKMKDLDGTDNKGKLGAN...,(2R)-2-phosphoglycerate,,,,,,,,,...,C=C(C(=O)O)OP(=O)(O)O,O,,,,,,,,


In [19]:
# Run the full processing pipeline on the EC56 source file and display the result.
data_EC56 = data_process(
    source_file=path_source_EC56,
    dataProcess=dataProcess,
    number_reaction=NUMBER_REACTION,
)

data_EC56

Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,MAALPQNNLQKQLELFPAKGTSNKLSLQKTKSSVFTFKKKCSPNVS...,ATP,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
1,MAAVPLNNLQEQLQRHSARKLNNQPSLSKPKSLGFTFKKKTSEGDV...,ATP,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
2,MAAVPLRPCRASGRFLPLLRGGARSRGASPWAAGPGRVAQRRYKKD...,ATP,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
3,MAAVPQNNLQEQLERHSARTLNNKLSLSKPKFSGFTFKKKTSSDNN...,ATP,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
4,MADREYTLSQKQAINSSGHNILVSASAGSGKTSVLVERVIQKIING...,ATP,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,MVVTRGDKFAGSSLACKSMIGANKMSGSHLHEVNNSRSHFPQTNWL...,ATP,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
435,MVYLRYFKGLILSDAYAPGLKWSDELKAYSALAFKYRDVRKYFLEK...,ATP,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
436,MVYSDTKTSKKIKKYKINIMKKKLNIFQIPLKGIHLIEASAGTGKT...,ATP,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
437,MWLLLRRAYPLRILLPLRGEWVGRRGLPRSLAPGPPRRRYRKEALP...,ATP,H2O,,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,


In [26]:
# Run the full processing pipeline on the EC63 source file and display the result.
data_EC63 = data_process(
    source_file=path_source_EC63,
    dataProcess=dataProcess,
    number_reaction=NUMBER_REACTION,
)

data_EC63

Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,AAAAAGRGRSFSPAAPAPSSVRLPGRQAPAPAAASALAVEADPAAD...,GTP,IMP,L-aspartate,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O)O...,C1=NC2=C(N1C3C(C(C(O3)COP(=O)(O)OP(=O)(O)O)O)O...,[O-]P(=O)([O-])[O-],[H+],,,,,,
1,ALGDQLLSVFVDHTLVDEVA,ATP,H2O,L-glutamine,XMP,,,,,,...,C1=NC2=C(N1C3C(C(C(O3)COP(=O)(O)O)O)O)N=C(NC2=O)N,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O)O...,[O-]P(=O)([O-])OP(=O)([O-])[O-],C(CC(=O)O)C(C(=O)O)N,[H+],,,,,
2,DPRVRKQYIQEQGAPIVIKADGLAAGKGVTVAMTLEEAYKAVDSML...,5-phospho-beta-D-ribosylamine,ATP,glycine,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,C(C1C(C(C(O1)NC(=O)C[NH3+])O)O)OP(=O)([O-])[O-],[O-]P(=O)([O-])[O-],[H+],,,,,,
3,DPYINIDAGTFSPYEHGEVFVLDDGGEVDLDLGNYERFLDIRLTKD...,ATP,H2O,L-glutamine,UTP,,,,,,...,C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,C(CC(=O)O)C(C(=O)O)N,[O-]P(=O)([O-])[O-],[H+],,,,,
4,DVNWPLGWPVGGYPG,ATP,L-glutamate,NH4(+),,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,C(CC(=O)N)C(C(=O)O)N,[O-]P(=O)([O-])[O-],[H+],,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10613,TMVTKLEKDSLVSKLYNSDVALERHRHRYEFNNKYKKDLESVGLRF...,ATP,NH4(+),UTP,,,,,,,...,C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,
10614,TMVTKLEKDSLVSKLYNSDVALERHRHRYEFNNKYKKDLESVGLRF...,H2O,L-glutamine,,,,,,,,...,C(CC(=O)O)C(C(=O)O)N,[NH4+],,,,,,,,
10615,TTPQEVLSRIKDQGIKLIDLKFI,ATP,L-glutamate,NH4(+),,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,C(CC(=O)N)C(C(=O)O)N,[O-]P(=O)([O-])[O-],[H+],,,,,,
10616,WFDPLRITETTTGSVTLK,ATP,L-aspartate,L-citrulline,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O)O...,C(CC(C(=O)O)N)CN=C(N)NC(CC(=O)O)C(=O)O,[O-]P(=O)([O-])OP(=O)([O-])[O-],[H+],,,,,,


In [27]:
# Run the full processing pipeline on the EC71 source file and display the result.
data_EC71 = data_process(
    source_file=path_source_EC71,
    dataProcess=dataProcess,
    number_reaction=NUMBER_REACTION,
)

data_EC71

Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,AAIFFAGTPILESAMVYPLAICGACILTSIAGTFFVKLGTNNSIMG...,diphosphate,H(+),H2O,,,,,,,...,[O-]P(=O)([O-])[O-],[H+],,,,,,,,
1,AEVLMDFPQLTMTLPDGREESVMKLTTLVA,ATP,H(+),H2O,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
2,AEVLMDFPQLTMTLPDGREESVMKRTTLVA,ATP,H(+),H2O,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
3,AEVPMPLRQLTMTLPDGREESVMERTTLVA,ATP,H(+),H2O,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
4,AFENKLMKNYRFLGISTSSEGYITQIIGPVLDVAFPVGKMPNIFNS...,ATP,H(+),H2O,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[H+],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8191,WIIPFVPLPVPMLIGVGLLLFPIXTNKLRRMWAFPSILLLSIVMIF...,plastoquinone,H(+),NADPH,,,,,,,...,CC1=C(C=C(C(=C1C)O)CC=C(C)CCC=C(C)CCC=C(C)CCC=...,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,[H+],,,,,,,
8192,WVIPLLPLPVIMSMGFGLFFIPTATKNLRRIWAFPSVLFLSIAIVY...,plastoquinone,H(+),NADH,,,,,,,...,CC1=C(C=C(C(=C1C)O)CC=C(C)CCC=C(C)CCC=C(C)CCC=...,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)(O)OP(=O)(O...,[H+],,,,,,,
8193,WVIPLLPLPVIMSMGFGLFFIPTATKNLRRIWAFPSVLFLSIAIVY...,plastoquinone,H(+),NADPH,,,,,,,...,CC1=C(C=C(C(=C1C)O)CC=C(C)CCC=C(C)CCC=C(C)CCC=...,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,[H+],,,,,,,
8194,XSGKVLSEEEKAAANVYIKKME,ubiquinone,H(+),NADH,,,,,,,...,CC1=C(C(=C(C(=C1O)OC)OC)O)CC=C(C)CCC=C(C)CCC=C...,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)(O)OP(=O)(O...,[H+],,,,,,,


In [None]:
# Save the name-to-SMILES cache as a JSON file and print its number of entries
path_name_to_smiles_cache = f'data/name_to_smiles_cache_{DATE}.json'
dataProcess.save_json(dict=dataProcess.name_to_smiles_cache,path=path_name_to_smiles_cache)
print(len(dataProcess.name_to_smiles_cache))

# 3.拆分数据 Split data


In [3]:
# Read the dataset
PATH_SMILES = path_enzyme_SMILES # 39k data
# PATH_SMILES = path_CA_SMILES # 151k data
data01 = pd.read_csv(PATH_SMILES, sep="\t")
# Missing cells become empty strings
data01.replace(np.nan, "", inplace=True)
data01

Unnamed: 0,Sequence,reaction1_substrate1_x,reaction1_substrate2_x,reaction1_substrate3_x,reaction1_substrate4_x,reaction1_substrate5_x,reaction1_substrate6_x,reaction1_substrate7_x,reaction1_substrate8_x,reaction1_substrate9_x,...,reaction1_product1_y,reaction1_product2_y,reaction1_product3_y,reaction1_product4_y,reaction1_product5_y,reaction1_product6_y,reaction1_product7_y,reaction1_product8_y,reaction1_product9_y,reaction1_product10_y
0,AADIFAKFKTSMEVK,coenzyme B,methyl-coenzyme M,,,,,,,,...,CC(C(C(=O)[O-])NC(=O)CCCCCCSSCCS(=O)(=O)[O-])O...,C,,,,,,,,
1,AAVPSGASTGIYEALELRAVEHINKTIAPALVSKLAMQEFMILPVG...,(2R)-2-phosphoglycerate,,,,,,,,,...,C=C(C(=O)O)OP(=O)(O)O,O,,,,,,,,
2,ADAADKPDDPNFFRMVEGFFDRGASIVEDKLVEDLRTKETPEQKKG...,H2O,L-glutamate,NAD(+),,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,C(CC(=O)O)C(=O)C(=O)O,[NH4+],[H+],,,,,,
3,ADAADKPDDPNFFRMVEGFFDRGASIVEDKLVEDLRTKETPEQKKG...,H2O,L-glutamate,NADP(+),,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,C(CC(=O)O)C(=O)C(=O)O,[NH4+],[H+],,,,,,
4,ADDINPKEECFFEDDYYEFE,H2O,L-isoleucine,O2,,,,,,,...,CCC(C)C(=O)C(=O)O,OO,[NH4+],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39341,XXXXXXXXXXXDGHCIPALGFGTYKPIEVPKSKAMEAANLAIGVGY...,morphine,NAD(+),,,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(=O)C=C4,[H+],,,,,,,
39342,XXXXXXXXXXXDGHCIPALGFGTYKPIEVPKSKAMEAANLAIGVGY...,morphine,NADP(+),,,,,,,,...,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(=O)C=C4,[H+],,,,,,,
39343,YGPNELPAEEGKNAESAIEALKEYEPEMGKEIVPGDLVEISVGDKI...,ATP,Ca(2+),H2O,,,,,,,...,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,[O-]P(=O)([O-])[O-],[Ca+2],[H+],,,,,,
39344,YIAGLLTGRPNSKAVGPSGVVLTAKQAFELANINSEFYELQPKEGL...,L-phenylalanine,,,,,,,,,...,C1=CC=C(C=C1)C=CC(=O)O,[NH4+],,,,,,,,


In [4]:
def write_fasta(path, ds):
    """
    Write a Series to a FASTA file.

    Every ``(label, value)`` pair yielded by ``ds.items()`` becomes one
    record: a ``>label`` header line followed by the value on its own line.
    Any existing file at ``path`` is overwritten.
    """
    with open(path, "w") as handle:
        handle.writelines(
            f">{label}\n{sequence}\n" for label, sequence in ds.items()
        )


def run_cd_hit(infile, outfile, cutoff, memory=1000):
    """
    Run a specific cd-hit command.

    The command line is printed and then executed through the shell; the
    function returns only after the process has finished.

    Args:
        infile: Path of the input FASTA file (passed to ``-i``).
        outfile: Output path (passed to ``-o``).
        cutoff: Sequence identity threshold (passed to ``-c``); it also
            selects the word size passed to ``-n``.
        memory: Value passed to ``-M``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (for example when the cd-hit binary is missing).
    """
    # Get the right word size for the cutoff.
    if cutoff < 0.5:
        word = 2
    elif cutoff < 0.6:
        word = 3
    elif cutoff < 0.7:
        word = 4
    else:
        word = 5

    mycmd = f"/home/mingxuan/cd-hit-v4.8.1-2019-0228/cd-hit -i {infile} -o {outfile} -c {cutoff} -n {word} -T 8 -M {memory} -d 0"

    print(mycmd)
    # The program's stdout is never read, so discard it instead of piping it:
    # an undrained pipe combined with wait() can block forever once the pipe
    # buffer fills up. check=True makes a failed run raise instead of letting
    # callers continue with a missing or stale output file.
    subprocess.run(mycmd, shell=True, stdout=subprocess.DEVNULL, check=True)


def parse_cdhit_clstr(clstr_file):
    """
    Convert cd-hit clustering results into a canonical DataFrame.

    Lines starting with ``>Cluster`` open a new cluster whose id is the first
    integer on the line. Every other non-empty line is searched for the
    pattern ``<digits>aa, ><digits>``; the second number is recorded as a
    member ID of the current cluster. Lines that do not match (for example
    entries whose name is not purely numeric) are skipped.

    Args:
        clstr_file: Path of the ``.clstr`` file to read.

    Returns:
        A DataFrame with one row per member and the columns ``Cluster`` and
        ``ID``, in file order. The two columns are present even when no
        member was found.

    Raises:
        ValueError: If a member line appears before any ``>Cluster`` line.
    """
    member_pattern = re.compile(r"(\d+)aa, >(\d+)")
    rows = []
    cluster_id = None

    with open(clstr_file, "r") as file:
        for line in file:
            line = line.strip()
            # Process Cluster rows.
            if line.startswith(">Cluster"):
                cluster_id = int(re.search(r"\d+", line).group())
            # Process index rows.
            elif line:
                match = member_pattern.search(line)
                if match:
                    if cluster_id is None:
                        raise ValueError(
                            f"member line found before any '>Cluster' line in {clstr_file}"
                        )
                    # The second group is the sequence ID taken from the FASTA header.
                    rows.append((cluster_id, int(match.group(2))))

    # Naming the columns explicitly keeps them available for an empty result.
    return pd.DataFrame(rows, columns=["Cluster", "ID"])

def split_train_valid_test(
    data,
    df_train_ID=None,
    train_size=0.8,
    test_size=0.6,
    random_state=3,
    identity=SEQ_IDENTITY,
):
    """
    Get training data, validation data, and test data.

    When ``identity`` is below 1.0 and ``df_train_ID`` is given, the training
    set is made of the rows of ``data`` whose index labels are listed in
    ``df_train_ID["ID"]`` and every other row is held out. Otherwise the
    training set is a random ``train_size`` fraction of ``data``. In both
    cases the held-out rows are then split at random into validation and
    test sets. The shapes of the three sets and their share of ``data`` are
    printed.

    Args:
        data: DataFrame to split.
        df_train_ID: Optional DataFrame whose ``ID`` column lists the index
            labels of the training rows.
        train_size: Training fraction for the random split (unused when
            ``df_train_ID`` selects the training rows).
        test_size: Fraction of the held-out rows that goes to the test set;
            the rest becomes the validation set.
        random_state: Seed passed to ``train_test_split``.
        identity: Sequence identity threshold; values of 1.0 or more force
            the random split.

    Returns:
        Tuple ``(data, train_data, valid_data, test_data)``; ``data`` is the
        input object, returned unchanged.
    """
    if (identity < 1.0) and (df_train_ID is not None):
        # The IDs are index labels (write_fasta writes the Series index as the
        # FASTA header), so select by label; positional selection would only
        # agree with them for a default RangeIndex.
        train_data = data.loc[df_train_ID["ID"].tolist(), :]
        test_valid_data = data.drop(train_data.index)
    else:
        train_data, test_valid_data = train_test_split(
            data, train_size=train_size, random_state=random_state
        )
    valid_data, test_data = train_test_split(
        test_valid_data, test_size=test_size, random_state=random_state
    )

    print(
        train_data.shape,
        valid_data.shape,
        test_data.shape,
        np.array([len(train_data), len(valid_data),
                 len(test_data)]) / (int(len(data))),
    )
    return data, train_data, valid_data, test_data

def split_train_valid_test_cdhit(
    df, path_infile_fasta, path_outfile_fasta, identity, random_seed
):
    """
    Obtain the training, validation and test sets split after CD-HIT clustering.

    The ``Sequence`` column of ``df`` is written to ``path_infile_fasta`` and
    clustered with cd-hit at the given ``identity``. 80% of the cluster ids
    are then sampled at random as training clusters, and the rows belonging
    to them are passed to ``split_train_valid_test`` as the training IDs.

    Side effects: NaN values in ``df`` are replaced by empty strings in
    place, the FASTA and cd-hit output files are written, and the global
    ``random`` generator is re-seeded with ``random_seed``.

    Args:
        df: DataFrame with a ``Sequence`` column and the reaction columns
            selected below.
        path_infile_fasta: Path where the sequences are written as FASTA.
        path_outfile_fasta: cd-hit output path; the cluster file is read
            from this path with ``.clstr`` appended.
        identity: Sequence identity threshold handed to cd-hit.
        random_seed: Seed for sampling the training clusters.

    Returns:
        Tuple ``(df_new, train_data, valid_data, test_data)``. ``df_new`` is
        the first value returned by ``split_train_valid_test``; the three
        subsets are restricted to the sequence column plus the first three
        substrate and product ``_y`` columns.
    """
    # Columns kept in each of the three returned subsets.
    selected_columns = [
        "Sequence",
        "reaction1_substrate1_y",
        "reaction1_substrate2_y",
        "reaction1_substrate3_y",
        "reaction1_product1_y",
        "reaction1_product2_y",
        "reaction1_product3_y",
    ]

    # Get clustering files.
    clstr_file = path_outfile_fasta + ".clstr"

    # Replace missing values.
    df.replace(np.nan, "", inplace=True)

    # Write the sequences to a FASTA file, run cd-hit and parse its clusters.
    write_fasta(path_infile_fasta, df["Sequence"])
    run_cd_hit(path_infile_fasta, path_outfile_fasta, identity, 100000)
    df_seq_cluster = parse_cdhit_clstr(clstr_file=clstr_file)

    # Candidate cluster ids: every integer between the smallest and largest id.
    list_cluster = list(
        range(df_seq_cluster["Cluster"].min(),
              df_seq_cluster["Cluster"].max() + 1)
    )
    cluster_train_size = int(len(list_cluster) * 0.8)

    # Seed the generator, then randomly select the training cluster ids.
    random.seed(random_seed)
    cluster_train_set = random.sample(list_cluster, cluster_train_size)

    # Get training IDs. isin() hashes the selected ids once instead of
    # scanning the whole list for every row.
    df_train_ID = df_seq_cluster[
        df_seq_cluster["Cluster"].isin(cluster_train_set)
    ]

    # Get training data, validation data, and test data.
    df_new, train_data, valid_data, test_data = split_train_valid_test(
        df, df_train_ID, random_state=3, identity=identity
    )

    return (
        df_new,
        train_data[selected_columns],
        valid_data[selected_columns],
        test_data[selected_columns],
    )

In [5]:
# Split the training set, validation set and test set, random_seed=4, 4, 4, corresponding to SEQ_IDENTITY=0.8,0.6,0.4
data02, train_data, valid_data, test_data = split_train_valid_test_cdhit(
    df=data01,
    path_infile_fasta=path_infile_fasta,
    path_outfile_fasta=path_outfile_fasta,
    identity=SEQ_IDENTITY,
    random_seed=4,
)

/home/mingxuan/cd-hit-v4.8.1-2019-0228/cd-hit -i ../data/review_sequence_20240611.fasta -o ../data/review_sequence_20240611_cdhit_80 -c 0.8 -n 5 -T 8 -M 100000 -d 0
(31471, 41) (3150, 41) (4725, 41) [0.79985259 0.08005896 0.12008845]


In [6]:
def run_cd_hit_2d(infile01, infile02, outfile, cutoff, memory=1000):
    """
    Run a specific cd-hit-2d command.

    The command line is printed and then executed through the shell; the
    function returns only after the process has finished.

    Args:
        infile01: Path of the first FASTA file (passed to ``-i``).
        infile02: Path of the second FASTA file (passed to ``-i2``).
        outfile: Output path (passed to ``-o``).
        cutoff: Sequence identity threshold (passed to ``-c``); it also
            selects the word size passed to ``-n``.
        memory: Value passed to ``-M``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (for example when the cd-hit-2d binary is missing).
    """
    # Get the right word size for the cutoff.
    if cutoff < 0.5:
        word = 2
    elif cutoff < 0.6:
        word = 3
    elif cutoff < 0.7:
        word = 4
    else:
        word = 5

    # Build the cd-hit-2d command.
    mycmd = f"/home/mingxuan/cd-hit-v4.8.1-2019-0228/cd-hit-2d -i {infile01} -i2 {infile02} -o {outfile} -c {cutoff} -n {word} -T 0 -M {memory} -d 0"
    print(mycmd)
    # Run the cd-hit-2d command. Its stdout is never read, so discard it
    # instead of piping it: an undrained pipe combined with wait() can block
    # forever once the pipe buffer fills up. check=True makes a failed run
    # raise instead of letting callers parse a missing or stale cluster file.
    subprocess.run(mycmd, shell=True, stdout=subprocess.DEVNULL, check=True)

def train_test_cluster(list_cutoff, path_train_fasta, path_test_fasta, test_data):
    """
    Comparative clustering of two datasets at several identity cutoffs.

    For every cutoff, cd-hit-2d is run on the training and test FASTA files,
    the resulting ``.clstr`` file is parsed, and the rows of ``test_data``
    whose index appears among the parsed IDs are kept.

    Args:
        list_cutoff: Iterable of sequence identity cutoffs.
        path_train_fasta: FASTA file of the training sequences.
        path_test_fasta: FASTA file of the test sequences.
        test_data: DataFrame of test rows, indexed by the IDs used as FASTA
            headers.

    Returns:
        Dict mapping each cutoff to the matching subset of ``test_data``,
        in the iteration order of ``list_cutoff``.
    """
    # Clustering results of the test set, keyed by cutoff.
    dict_test_cutoff = {}
    for cutoff in list_cutoff:
        # Output path of the comparison for this cutoff. round() is used
        # because cutoff * 100 can land just below the intended integer
        # (0.29 * 100 == 28.999999999999996), which plain truncation would
        # turn into the wrong, possibly colliding, file suffix.
        path_train_cluster = (
            f"../data/review_sequence_train_{DATE}_cluster{round(cutoff * 100)}"
        )
        # Run the cd-hit-2d command for comparative clustering of training and test sets.
        run_cd_hit_2d(
            infile01=path_train_fasta,
            infile02=path_test_fasta,
            outfile=path_train_cluster,
            cutoff=cutoff,
            memory=120_000,
        )
        # Parse the clustering results.
        train_cluster = parse_cdhit_clstr(
            clstr_file=path_train_cluster + ".clstr")
        # Keep the test rows whose index is listed in the cluster file.
        # NOTE(review): the cluster file presumably lists training IDs as
        # well, so this relies on training and test index labels being
        # disjoint - confirm against the caller.
        dict_test_cutoff[cutoff] = test_data[
            test_data.index.isin(train_cluster["ID"])
        ]
    return dict_test_cutoff

In [7]:
# Compare the training set and test set for sequence consistency.
# Write train_data, test_data into fasta files.
write_fasta(path=path_train_fasta, ds=train_data['Sequence'])
write_fasta(path=path_test_fasta, ds=test_data['Sequence'])

# Compare training set and test set according to multiple consistency thresholds, and obtain
# the test set whose consistency with the training set is higher than each threshold.
list_cutoff = [0.8, 0.6, 0.4]

# Compare the test set with the training set.
dict_test_cutoff = train_test_cluster(
    list_cutoff, path_train_fasta, path_test_fasta, test_data
)
# Look each subset up by its cutoff rather than unpacking .values() positionally,
# so reordering or extending list_cutoff cannot silently mislabel the subsets.
test_data_80 = dict_test_cutoff[0.8]
test_data_60 = dict_test_cutoff[0.6]
test_data_40 = dict_test_cutoff[0.4]

# Split test set into subsets whose consistency with training set is 0-40%, 40-60%, 60-80%, and 80-100%
test_data_80_100 = test_data_80
test_data_60_80 = test_data_60[~test_data_60.index.isin(test_data_80.index)]
test_data_40_60 = test_data_40[~test_data_40.index.isin(test_data_60.index)]
test_data_0_40 = test_data[~test_data.index.isin(test_data_40.index)]

print(train_data.shape, valid_data.shape, test_data.shape, test_data_80_100.shape, test_data_60_80.shape, test_data_40_60.shape, test_data_0_40.shape)

/home/mingxuan/cd-hit-v4.8.1-2019-0228/cd-hit-2d -i ../data/review_sequence_train_20240611_80.fasta -i2 ../data/review_sequence_test_20240611_80.fasta -o ../data/review_sequence_train_20240611_cluster80 -c 0.8 -n 5 -T 0 -M 120000 -d 0
/home/mingxuan/cd-hit-v4.8.1-2019-0228/cd-hit-2d -i ../data/review_sequence_train_20240611_80.fasta -i2 ../data/review_sequence_test_20240611_80.fasta -o ../data/review_sequence_train_20240611_cluster60 -c 0.6 -n 4 -T 0 -M 120000 -d 0
/home/mingxuan/cd-hit-v4.8.1-2019-0228/cd-hit-2d -i ../data/review_sequence_train_20240611_80.fasta -i2 ../data/review_sequence_test_20240611_80.fasta -o ../data/review_sequence_train_20240611_cluster40 -c 0.4 -n 2 -T 0 -M 120000 -d 0
(31471, 7) (3150, 7) (4725, 7) (564, 7) (1808, 7) (1206, 7) (1147, 7)


In [9]:
# Save every split as a tab-separated file, without the index column.
for frame, destination in (
    (train_data, path_train_data),
    (valid_data, path_valid_data),
    (test_data, path_test_data),
    (test_data_0_40, path_test_data_0_40),
    (test_data_40_60, path_test_data_40_60),
    (test_data_60_80, path_test_data_60_80),
    (test_data_80_100, path_test_data_80_100),
):
    frame.to_csv(destination, sep="\t", index=False)