In [1]:
import pandas as pd
import numpy as np
import json
import pickle
from Bio.KEGG import REST
from Bio import SeqIO
import ast
import time
import DLKcat
import torch
from rdkit import Chem
from collections import defaultdict
import math
import re
import cobra
from tqdm import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


### Metabolites ignored as kcat substrates (DLKcatIgnoreMets.tsv)

In [2]:
# Metabolite IDs that are skipped when a reaction is expanded into one row per
# substrate (they are filtered out of `metabolite_list` in the row-building
# loop) but that still appear in the full reactant/product lists.
# Each line groups the IDs of one chemical species; the trailing comment names it.
miss_met_id =[
# --- small inorganic molecules and ions ---
's_0803', 's_0804', 's_0805', 's_0806', 's_0807', 's_0808', 's_0809', 's_0810', 's_2808', 's_2976', 's_2994', 's_3226', 's_3449', 's_3657',# H2O
's_0793', 's_0794', 's_0795', 's_0796', 's_0797', 's_0798', 's_0799', 's_0800', 's_0801', 's_0802', 's_2783', 's_3094', 's_3146', 's_3164',# H+
's_1275', 's_1276', 's_1277', 's_1278', 's_1279', 's_2817',# oxygen
's_0837', 's_0838', 's_0839', 's_0840',# hydrogen peroxide
's_0419', 's_0420', 's_0421',# ammonium
's_1322', 's_1323', 's_1324', 's_1325', 's_1326', 's_1329', 's_2966', 's_2977', 's_2995', 's_3228', 's_3536', 's_4293',# phosphate
's_0633', 's_0635', 's_0636', 's_0637', 's_0638', 's_2834', 's_2860', 's_3095', 's_4157',# diphosphate
's_1467', 's_1468', 's_4016',# sulphate
's_0841', 's_3906', 's_4263',# hydrogen sulfide
's_4010', 's_4113', 's_4159',# thiosulfate
's_1373', 's_1374', 's_3776',# potassium
's_1437', 's_1438', 's_3775',# sodium
's_3777', 's_3778', 's_3926', 's_4200',# chloride
's_4013', 's_4014', 's_4204',# Mg(2+)
's_3801', 's_3802', 's_4202',# Mn(2+)
's_3822', 's_3823', 's_4026', 's_4027', 's_4203',# Zn(2+)
's_3880', 's_3882', 's_4197', 's_4199',# Ca(2+)
's_4019', 's_4020', 's_4201', # Cu(2+)
's_0924', 's_0925', 's_0926',# iron(2+)
's_3855', 's_3936', 's_4031', # iron(3+)

# --- nucleotide and redox cofactors ---
's_0434', 's_0435', 's_0437', 's_0438', 's_0439', 's_2831', 's_2840', 's_2856', 's_3341', 's_3359', 's_3881', 's_4196',# ATP
's_0394', 's_0395', 's_0397', 's_0398', 's_0399', 's_3324', 's_3342', 's_3360', 's_3883', 's_3987', 's_4198',# ADP
's_0423', 's_0424', 's_0425', 's_0426', 's_2833', 's_2842', 's_2859', 's_4160',# AMP
's_1203', 's_1204', 's_1205', 's_1206', 's_2818', 's_3753',# NADH
's_1198', 's_1199', 's_1200', 's_1201', 's_1202', 's_2820',# NAD
's_1212', 's_1213', 's_1214', 's_1215', 's_2799', 's_2952',# NADPH
's_1207', 's_1208', 's_1210', 's_1211', 's_2800', 's_2953',# NADP(+)
's_0689', 's_0690',# FADH2
's_0687', 's_0688',# FAD
]

In [3]:
# ---- Inputs ----
# Genome-scale models in YAML format. `yeast8U` is the model whose reactions are
# expanded into rows; `yeast870` is the model that `s_*` SMILES are read from.
yeast870_path =  '../../Data/model/yeast-GEM.yml'
yeast8U_path = '../../Data/model/Yeast-MetaTwin.yml'

# Protein FASTA file; record headers are matched against model gene IDs.
yeast_genome_file = '../../Data/Saccharomyces_cerevisiae.fasta'

# `cut_off` and `num` are only used here to select the result directory and
# file names of an upstream step (presumably a score threshold and a top-N
# setting -- TODO confirm against the notebook that wrote these files).
cut_off = 0.3
num = 50
cut_off_path = f'../../Results/not_lipid/top{num}_{cut_off}_re/'
# Table with the columns new_met_smiles / sim_smile / ID / compartment; it is
# the SMILES source for the `sn_*` metabolites.
metabolites_info_to_GEM_path = cut_off_path + f'metabolites_info_to_GEM_top{num}_{cut_off}.csv'
# NOTE(review): the two paths below are not referenced by any visible cell.
rxndb_total_info_to_model_path = cut_off_path + f'rxndb_total_info_to_model_top{num}_{cut_off}.csv'
yeast8_reaction_in_rxndb_json = cut_off_path + f'yeast8_reaction_in_rxndb_top{num}_{cut_off}.json'


# ---- Output ----
# CSV with one row per (reaction direction, gene, substrate), written at the
# end of the notebook as input for kcat/Km prediction.
yeast8U_sequence_smiles_pre_path = '../../Results/kcat_km_predict/yeast8U_sequence_smiles_prepare.csv'

In [4]:
# Load the model whose reactions are expanded into prediction rows below.
yeast8U = cobra.io.load_yaml_model(yeast8U_path)

In [5]:
# Metabolite table used by get_smiles / get_smiles_lst to resolve `sn_*` IDs
# (column `ID`) to SMILES strings (column `new_met_smiles`).
met_info = pd.read_csv(metabolites_info_to_GEM_path)
met_info.head(2)

Unnamed: 0,new_met_smiles,sim_smile,ID,compartment
0,CC(N)C(O)=NC(CC(=O)O)C(=O)O,CC(N)C(=O)NC(CC(=O)O)C(=O)O,s_4120,c
1,CCCCCCCCCCCCCCCCCCCC=CC(=O)SCCNC(=O)CCNC(=O)C(...,CCCCCCCCCCCCCCCCCCCC=CC(=O)SCCNC(=O)CCNC(=O)C(...,s_2813,erm


In [6]:
def _genes_in_rule(gene_reaction_rule):
    """Return the distinct gene IDs of a GPR string in first-appearance order.

    Brackets and parentheses are removed, the rule is split on the ``and`` /
    ``or`` operators, and every token is whitespace-stripped so that a rule
    such as ``"( A and B ) or C"`` yields ``['A', 'B', 'C']``.
    ``dict.fromkeys`` de-duplicates while keeping a deterministic order; a
    ``set`` of strings iterates in a hash-seed dependent order, which made the
    generated ``rea_id`` suffixes differ between kernel sessions.
    """
    without_brackets = re.sub(r'[\[\]()]', '', gene_reaction_rule)
    tokens = (token.strip() for token in re.split(r'\s+(?:and|or)\s+', without_brackets))
    return list(dict.fromkeys(token for token in tokens if token))


def _collect_kcat_rows(model, ignored_met_ids):
    """Expand every gene-associated reaction into (direction, gene, substrate) rows.

    Parameters
    ----------
    model : object exposing ``reactions``; each reaction must provide ``id``,
        ``gene_reaction_rule``, ``lower_bound``, ``upper_bound`` and
        ``metabolites`` (mapping metabolite -> stoichiometric coefficient).
    ignored_met_ids : iterable of metabolite IDs that never get a row of their
        own (they are still listed in ``reactant_met_id`` / ``product_met_id``).

    Returns
    -------
    dict of equally long lists with the keys ``rea_id``, ``gene``, ``met_id``,
    ``reactant_met_id`` and ``product_met_id``.

    Direction handling, by flux bounds:
      * lb <= 0 < ub   -> forward rows, ``rea_id`` = ``<reaction>_<n>``
      * lb < 0 <= ub   -> reverse rows, ``rea_id`` = ``<reaction>_REV_<n>``
        (reactants and products swapped)
      * lb == ub == 0  -> blocked reaction, skipped silently
      * anything else  -> reported with ``bounds error:`` and skipped
    """
    ignored = set(ignored_met_ids)  # O(1) membership tests inside the loop
    rows = {'rea_id':[],
            'gene':[],
            'met_id':[],
            'reactant_met_id':[],
            'product_met_id':[]
            }

    def add_direction(reaction_id, tag, genes, reactants, products):
        # Per-substrate rows exclude the ignored metabolites; the full reactant
        # list (including them) is stored alongside every row.
        substrates = [met_id for met_id in reactants if met_id not in ignored]
        row_no = 0  # local counter: does not touch the notebook-level `num`
        for gene in genes:
            for met_id in substrates:
                row_no += 1
                rows['rea_id'].append(reaction_id + tag + str(row_no))
                rows['gene'].append(gene)
                rows['met_id'].append(met_id)
                rows['reactant_met_id'].append(reactants)
                rows['product_met_id'].append(products)

    for reaction in tqdm(model.reactions, total=len(model.reactions)):
        lb, ub = reaction.lower_bound, reaction.upper_bound
        forward = lb <= 0 < ub
        reverse = lb < 0 <= ub
        if not (forward or reverse):
            if not (lb == 0 and ub == 0):
                print('bounds error:',reaction.id)
            continue

        genes = _genes_in_rule(reaction.gene_reaction_rule)
        if not genes:
            continue

        # Negative coefficients are consumed, positive ones produced, when the
        # reaction runs in its written (forward) direction.
        consumed = [met.id for met, coeff in reaction.metabolites.items() if coeff < 0]
        produced = [met.id for met, coeff in reaction.metabolites.items() if coeff > 0]

        if forward:
            add_direction(reaction.id, '_', genes, consumed, produced)
        if reverse:
            add_direction(reaction.id, '_REV_', genes, produced, consumed)

    return rows


kcat_pre_pd = _collect_kcat_rows(yeast8U, miss_met_id)

  0%|          | 0/7459 [00:00<?, ?it/s]

 57%|█████▋    | 4248/7459 [00:00<00:00, 42438.25it/s]

bounds error: r_4046


100%|██████████| 7459/7459 [00:00<00:00, 13287.39it/s]


In [7]:
# Display-only preview: the result is not assigned, so `kcat_pre_pd` is still
# the dict of lists at this point.
pd.DataFrame(kcat_pre_pd)

Unnamed: 0,rea_id,gene,met_id,reactant_met_id,product_met_id
0,r_0001_1,YEL071W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]"
1,r_0001_2,YEL071W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]"
2,r_0001_3,YJR048W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]"
3,r_0001_4,YJR048W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]"
4,r_0001_5,YEL039C,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]"
...,...,...,...,...,...
263664,rxn1363_138,YNL267W,s_3958,"[s_3958, sn_22]","[s_0188, s_3998]"
263665,rxn1363_139,YDR051C,sn_22,"[s_3958, sn_22]","[s_0188, s_3998]"
263666,rxn1363_140,YDR051C,s_3958,"[s_3958, sn_22]","[s_0188, s_3998]"
263667,rxn1363_141,YAL038W,sn_22,"[s_3958, sn_22]","[s_0188, s_3998]"


In [8]:
# From here on `kcat_pre_pd` is a DataFrame (the name is reused for the dict
# that was filled by the loop above).
kcat_pre_pd = pd.DataFrame(kcat_pre_pd)
print(kcat_pre_pd.shape)
kcat_pre_pd.head(3)

(263669, 5)


Unnamed: 0,rea_id,gene,met_id,reactant_met_id,product_met_id
0,r_0001_1,YEL071W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]"
1,r_0001_2,YEL071W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]"
2,r_0001_3,YJR048W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]"


### 补充smiles

In [9]:
def get_smiles(metid,model,met_info):
    """Return the SMILES string for one metabolite ID, or ``None`` if unknown.

    Parameters
    ----------
    metid : str
        Metabolite ID. IDs starting with ``'s_'`` are looked up in ``model``;
        IDs starting with ``'sn_'`` are looked up in ``met_info``.
    model : object
        Must provide ``model.metabolites.get_by_id(metid)``. The returned
        metabolite is expected to carry a ``smiles`` attribute when a SMILES
        is known (presumably populated from the model file -- TODO confirm).
    met_info : pandas.DataFrame
        Needs the columns ``ID`` and ``new_met_smiles``.

    Returns
    -------
    str or None
        ``None`` when the ID has an unsupported prefix, is absent from the
        model (``get_by_id`` raised ``KeyError``) or from ``met_info``, or has
        no ``smiles`` attribute. Previously an ID missing from the model
        raised ``KeyError``.
    """
    if metid.startswith('s_'):
        try:
            metabolite = model.metabolites.get_by_id(metid)
        except KeyError:
            return None
        return getattr(metabolite, 'smiles', None)
    if metid.startswith('sn_'):
        # One boolean-mask pass instead of `.tolist()` plus a second scan.
        hits = met_info.loc[met_info['ID'] == metid, 'new_met_smiles']
        if not hits.empty:
            return hits.iloc[0]  # first match wins, as before
    return None

def get_smiles_lst(metid_lst,model,met_info):
    """Return the SMILES of every ID in ``metid_lst``, or ``[]`` if any is missing.

    The result has the same length and order as ``metid_lst`` when every ID
    resolves. As soon as one ID cannot be resolved the whole list is
    discarded and ``[]`` is returned, so callers can drop the row with a
    simple emptiness check.

    IDs with a prefix other than ``'s_'`` / ``'sn_'`` now count as missing.
    Previously they were skipped silently, which produced a shorter list that
    looked complete.
    """
    smiles_lst = []
    for metid in metid_lst:
        smiles = get_smiles(metid, model, met_info)
        if smiles is None:
            return []
        smiles_lst.append(smiles)
    return smiles_lst

In [10]:
# Registers `progress_apply` on pandas objects (also done in the import cell).
tqdm.pandas()
# SMILES for `s_*` metabolites are read from this model, not from yeast8U.
yeast870 = cobra.io.load_yaml_model(yeast870_path)
# SMILES of the single substrate each row stands for; None when not found.
kcat_pre_pd['SMILES'] = kcat_pre_pd['met_id'].progress_apply(lambda x: get_smiles(x, yeast870, met_info))
kcat_pre_pd

100%|██████████| 263669/263669 [00:33<00:00, 7795.31it/s] 


Unnamed: 0,rea_id,gene,met_id,reactant_met_id,product_met_id,SMILES
0,r_0001_1,YEL071W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O
1,r_0001_2,YEL071W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...
2,r_0001_3,YJR048W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O
3,r_0001_4,YJR048W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...
4,r_0001_5,YEL039C,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O
...,...,...,...,...,...,...
263664,rxn1363_138,YNL267W,s_3958,"[s_3958, sn_22]","[s_0188, s_3998]",C(C(C(=O)[O-])O)O
263665,rxn1363_139,YDR051C,sn_22,"[s_3958, sn_22]","[s_0188, s_3998]",O=P(O)(O)OCC1OC(O[C@]2(CO)OC(CO)C(O)C2O)C(O)C(...
263666,rxn1363_140,YDR051C,s_3958,"[s_3958, sn_22]","[s_0188, s_3998]",C(C(C(=O)[O-])O)O
263667,rxn1363_141,YAL038W,sn_22,"[s_3958, sn_22]","[s_0188, s_3998]",O=P(O)(O)OCC1OC(O[C@]2(CO)OC(CO)C(O)C2O)C(O)C(...


In [11]:
# SMILES of all reactants of the row's reaction direction; get_smiles_lst
# returns [] when any reactant has no SMILES, and such rows are removed later.
kcat_pre_pd['reactant_SMILES'] = kcat_pre_pd['reactant_met_id'].progress_apply(lambda x: get_smiles_lst(x, yeast870, met_info))
kcat_pre_pd

100%|██████████| 263669/263669 [00:54<00:00, 4795.69it/s] 


Unnamed: 0,rea_id,gene,met_id,reactant_met_id,product_met_id,SMILES,reactant_SMILES
0,r_0001_1,YEL071W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]..."
1,r_0001_2,YEL071W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]..."
2,r_0001_3,YJR048W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]..."
3,r_0001_4,YJR048W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]..."
4,r_0001_5,YEL039C,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]..."
...,...,...,...,...,...,...,...
263664,rxn1363_138,YNL267W,s_3958,"[s_3958, sn_22]","[s_0188, s_3998]",C(C(C(=O)[O-])O)O,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)..."
263665,rxn1363_139,YDR051C,sn_22,"[s_3958, sn_22]","[s_0188, s_3998]",O=P(O)(O)OCC1OC(O[C@]2(CO)OC(CO)C(O)C2O)C(O)C(...,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)..."
263666,rxn1363_140,YDR051C,s_3958,"[s_3958, sn_22]","[s_0188, s_3998]",C(C(C(=O)[O-])O)O,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)..."
263667,rxn1363_141,YAL038W,sn_22,"[s_3958, sn_22]","[s_0188, s_3998]",O=P(O)(O)OCC1OC(O[C@]2(CO)OC(CO)C(O)C2O)C(O)C(...,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)..."


In [12]:
# Same as for the reactants: [] marks a row with at least one product that has
# no SMILES.
kcat_pre_pd['product_SMILES'] = kcat_pre_pd['product_met_id'].progress_apply(lambda x: get_smiles_lst(x, yeast870, met_info))
kcat_pre_pd

100%|██████████| 263669/263669 [00:25<00:00, 10445.65it/s]


Unnamed: 0,rea_id,gene,met_id,reactant_met_id,product_met_id,SMILES,reactant_SMILES,product_SMILES
0,r_0001_1,YEL071W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...
1,r_0001_2,YEL071W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...
2,r_0001_3,YJR048W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...
3,r_0001_4,YJR048W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...
4,r_0001_5,YEL039C,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...
...,...,...,...,...,...,...,...,...
263664,rxn1363_138,YNL267W,s_3958,"[s_3958, sn_22]","[s_0188, s_3998]",C(C(C(=O)[O-])O)O,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)...","[C(C(C(=O)O)OP(=O)(O)O)O, C(C1C(C(C(C(O1)OC2(C..."
263665,rxn1363_139,YDR051C,sn_22,"[s_3958, sn_22]","[s_0188, s_3998]",O=P(O)(O)OCC1OC(O[C@]2(CO)OC(CO)C(O)C2O)C(O)C(...,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)...","[C(C(C(=O)O)OP(=O)(O)O)O, C(C1C(C(C(C(O1)OC2(C..."
263666,rxn1363_140,YDR051C,s_3958,"[s_3958, sn_22]","[s_0188, s_3998]",C(C(C(=O)[O-])O)O,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)...","[C(C(C(=O)O)OP(=O)(O)O)O, C(C1C(C(C(C(O1)OC2(C..."
263667,rxn1363_141,YAL038W,sn_22,"[s_3958, sn_22]","[s_0188, s_3998]",O=P(O)(O)OCC1OC(O[C@]2(CO)OC(CO)C(O)C2O)C(O)C(...,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)...","[C(C(C(=O)O)OP(=O)(O)O)O, C(C1C(C(C(C(O1)OC2(C..."


### 补充蛋白质序列

In [13]:
def fasta_to_df(fasta_name):
    """Read a FASTA file into a DataFrame with the columns ``ID`` and ``Sequence``.

    ``ID`` is the complete header line without the leading ``>`` (surrounding
    whitespace stripped). ``Sequence`` is the concatenation of the stripped,
    non-blank lines up to the next header. Text that appears before the first
    header is discarded.
    """
    ids = []
    chunks_per_record = []
    with open(fasta_name, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if raw_line.startswith('>'):
                # A header opens a new record with an (initially) empty sequence.
                ids.append(text[1:])
                chunks_per_record.append([])
            elif text != '' and chunks_per_record:
                chunks_per_record[-1].append(text)

    records = [(record_id, ''.join(chunks))
               for record_id, chunks in zip(ids, chunks_per_record)]
    return pd.DataFrame(records, columns=['ID', 'Sequence'])

# Protein sequences keyed by the FASTA header, which is matched against the
# gene IDs of the model when sequences are attached to the rows.
yeast_genome = fasta_to_df(yeast_genome_file)
yeast_genome.head(3)

Unnamed: 0,ID,Sequence
0,YDL204W,MNRNTTTNKNANLNNSRNANAPGEAGHQNKTGLIYWTNPSKSGASF...
1,YNL167C,MSSEERSRQPSTVSTFDLEPNPFEQSFASSKKALSLPGTISHPSLP...
2,YJL068C,MKVVKEFSVCGGRLIKLSHNSNSTKTSMNVNIYLPKHYYAQDFPRN...


In [14]:
def get_sequence(gene,yeast_genome):
    """Return the sequence of the first ``yeast_genome`` row whose ``ID`` equals ``gene``.

    ``yeast_genome`` needs the columns ``ID`` and ``Sequence``. ``None`` is
    returned when no row matches.
    """
    hits = yeast_genome.loc[yeast_genome['ID'] == gene, 'Sequence']
    if hits.empty:
        return None
    return hits.values[0]

# Quick manual check of the lookup with an ID that is present in the FASTA file.
print(get_sequence('YDL204W',yeast_genome))

MNRNTTTNKNANLNNSRNANAPGEAGHQNKTGLIYWTNPSKSGASFAATLVSLLILRNVNVISVLLKIGYMVLFTSFAVELSTKVLFDKGVVSRFGMQESPDLVGVLKPHIDRELDRLPALEDRIRKLVFAHRTRNNFTIGVSLYFLHGLFAIFSMNTVLIMTTIFLYTVPLIYDRKQARIDRAIDRMKDLVIHRFHKNYNKVVEKTEPYIDKIIPPQTDEGSYSTSISNENKSSTSQRNKSGLSSSEFDNMNDTSASKSGKDSYSTSQYNRAEYPVSQNENIGTLKSGKQEIPTEKDFNNRHENFSKPDVKTYDPRTVDIEEELAAHQRELEQNLKDGDYNLVGSKEIPDPITVPAPTRHTTKPAESQSIPIKNNETLHKTTHGLKQKLQHA


In [15]:
# Attach the protein sequence of each row's gene; genes without a FASTA record
# get None and are filtered out further down.
# NOTE(review): get_sequence scans `yeast_genome` once per row, which is why
# this cell is the slowest in the notebook; a precomputed ID -> Sequence
# mapping would avoid the repeated scans.
kcat_pre_pd['Sequence'] = kcat_pre_pd['gene'].progress_apply(lambda x:get_sequence(x,yeast_genome))
kcat_pre_pd

100%|██████████| 263669/263669 [03:02<00:00, 1446.28it/s]


Unnamed: 0,rea_id,gene,met_id,reactant_met_id,product_met_id,SMILES,reactant_SMILES,product_SMILES,Sequence
0,r_0001_1,YEL071W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...,MTAAHPVAQLTAEAYPKVKRNPNFKVLDSEDLAYFRSILSNDEILN...
1,r_0001_2,YEL071W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...,MTAAHPVAQLTAEAYPKVKRNPNFKVLDSEDLAYFRSILSNDEILN...
2,r_0001_3,YJR048W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...,MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHS...
3,r_0001_4,YJR048W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...,MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHS...
4,r_0001_5,YEL039C,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...,MAKESTGFKPGSAKKGATLFKTRCQQCHTIEEGGPNKVGPNLHGIF...
...,...,...,...,...,...,...,...,...,...
263664,rxn1363_138,YNL267W,s_3958,"[s_3958, sn_22]","[s_0188, s_3998]",C(C(C(=O)[O-])O)O,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)...","[C(C(C(=O)O)OP(=O)(O)O)O, C(C1C(C(C(C(O1)OC2(C...",MHKASSSKKSFDDTIELKKNEQLLKLINSSEFTLHNCVELLCKHSE...
263665,rxn1363_139,YDR051C,sn_22,"[s_3958, sn_22]","[s_0188, s_3998]",O=P(O)(O)OCC1OC(O[C@]2(CO)OC(CO)C(O)C2O)C(O)C(...,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)...","[C(C(C(=O)O)OP(=O)(O)O)O, C(C1C(C(C(C(O1)OC2(C...",MCEENVHVSEDVAGSHGSFTNARPRLIVLIRHGESESNKNKEVNGY...
263666,rxn1363_140,YDR051C,s_3958,"[s_3958, sn_22]","[s_0188, s_3998]",C(C(C(=O)[O-])O)O,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)...","[C(C(C(=O)O)OP(=O)(O)O)O, C(C1C(C(C(C(O1)OC2(C...",MCEENVHVSEDVAGSHGSFTNARPRLIVLIRHGESESNKNKEVNGY...
263667,rxn1363_141,YAL038W,sn_22,"[s_3958, sn_22]","[s_0188, s_3998]",O=P(O)(O)OCC1OC(O[C@]2(CO)OC(CO)C(O)C2O)C(O)C(...,"[C(C(C(=O)[O-])O)O, O=P(O)(O)OCC1OC(O[C@]2(CO)...","[C(C(C(=O)O)OP(=O)(O)O)O, C(C1C(C(C(C(O1)OC2(C...",MSRLERLTSLNVVAGSDLRRTSIIGTIGPKTNNPETLVALRKAGLN...


### 补充EC号

In [16]:
# uniprot_sce_file = '../Data/uniprot/uniprotkb_organism_id_559292_2023_11_08.tsv'
# uniprot_sce = pd.read_csv(uniprot_sce_file,sep='\t')
# uniprot_sce = uniprot_sce.fillna(' ')
# uniprot_sce['Gene Names'] = uniprot_sce['Gene Names'].apply(lambda x:x.split(' '))
# uniprot_sce.head(3)

In [17]:
# def get_uniprot_ec(gene,uniprot_sce):
#     ec_lst = []
#     tmp = uniprot_sce[uniprot_sce['Gene Names'].apply(lambda x: gene in x)]
#     if len(tmp)<1:
#         ec_lst.append('noec')
#     else:
#         for i in tmp['EC number'].to_list():
#             ec_lst += i.split('; ')
#     ec_lst = list(set(ec_lst))
#     ec_lst = [x for x in ec_lst if x != ' ']
#     return ec_lst

# gene = 'YPL060C-A'
# # gene = 'YFL026W'
# get_uniprot_ec(gene,uniprot_sce)

In [18]:
# DeepProZyme_path = '../Data/EC_predict/Sce_DeepECv2.txt'

# def get_gene2ec_dict_DeepProZyme(DeepProZyme_path):
#     DeepECv2_res = pd.read_csv(DeepProZyme_path,sep='\t')
#     DeepECv2_res = DeepECv2_res[DeepECv2_res['prediction']!='None']
#     DeepECv2_res['prediction'] = DeepECv2_res['prediction'].apply(lambda x:x.split(':')[1])
#     gene2ec_dict = {}
#     for index,row in DeepECv2_res.iterrows():
#         if row['sequence_ID'] not in gene2ec_dict:
#             gene2ec_dict[row['sequence_ID']] = []
#         gene2ec_dict[row['sequence_ID']].append(row['prediction'])
#     for key, values in gene2ec_dict.items():
#         gene2ec_dict[key] = list(set([".".join(value.split(".")[:3]) for value in values]))

#     return gene2ec_dict 


# def get_ec2gene_dict_DeepProZyme(gene2ec_dict):
#     DeepProZyme_gene_list = list(gene2ec_dict.keys())
#     DeepProZyme_ec2gene_dict = {}
#     for gene, ec_list in gene2ec_dict.items():
#         for ec in ec_list:
#             if ec not in DeepProZyme_ec2gene_dict:
#                 DeepProZyme_ec2gene_dict[ec] = []
#             DeepProZyme_ec2gene_dict[ec].append(gene)
            
#     return DeepProZyme_ec2gene_dict 

# DeepProZyme_gene2ec_dict = get_gene2ec_dict_DeepProZyme(DeepProZyme_path)
# print(len(DeepProZyme_gene2ec_dict))
# DeepProZyme_gene2ec_dict

In [19]:
# sce_gene_clean_ec = '../Data/EC_predict/Saccharomyces_cerevisiae_teacher_maxsep.csv'

# def get_gene2ec_dict_clean(sce_gene_clean_ec):
#     with open(sce_gene_clean_ec, 'r') as file:
#         csv_data = file.read()
#     rows = csv_data.split('\n')
#     gene2ec_dict = {}
#     for row in rows:
#         columns = row.split(',')
#         key = columns[0]
#         values = [v.split('/')[0].replace('EC:','') for v in columns[1:]]
#         if len(values)>0:
#             gene2ec_dict[key] = values
#     for key, values in gene2ec_dict.items():
#         gene2ec_dict[key] = list(set([".".join(value.split(".")[:3]) for value in values]))
#     return gene2ec_dict

# def get_ec2gene_dict_clean(gene2ec_dict):
#     clean_gene_list = list(gene2ec_dict.keys())
#     clean_ec2gene_dict = {}
#     for gene, ec_list in gene2ec_dict.items():
#         for ec in ec_list:
#             if ec not in clean_ec2gene_dict:
#                 clean_ec2gene_dict[ec] = []
#             clean_ec2gene_dict[ec].append(gene)
#     return clean_ec2gene_dict

In [20]:
# clean_gene2ec_dict = get_gene2ec_dict_clean(sce_gene_clean_ec)
# print(len(clean_gene2ec_dict))
# clean_gene2ec_dict

In [21]:
# for index,row in tqdm(kcat_pre_pd.iterrows(),total=len(kcat_pre_pd)):
#     if len(get_uniprot_ec(row['gene'],uniprot_sce))>0:
#         # print(get_uniprot_ec(row['gene'],uniprot_sce))
#         kcat_pre_pd.loc[index,'EC'] = str(get_uniprot_ec(row['gene'],uniprot_sce))
#     # elif row['gene'] in DeepProZyme_gene2ec_dict.keys():
#         # kcat_pre_pd.loc[index,'EC'] = DeepProZyme_gene2ec_dict[row['gene']]########################
#     elif row['gene'] in clean_gene2ec_dict.keys():
#         kcat_pre_pd.loc[index,'EC'] = str(clean_gene2ec_dict[row['gene']])
#     else:
#         pass

In [22]:
# kcat_pre_pd.head()

### 筛选不应该有的行

In [23]:
# Drop rows that cannot be used for prediction and print the row count before
# the first and after every filtering step (five numbers in total).
print(len(kcat_pre_pd))

# 1) rows lacking a substrate ID or a protein sequence
for required_col in ('met_id', 'Sequence'):
    kcat_pre_pd = kcat_pre_pd[kcat_pre_pd[required_col].notna()]
    print(len(kcat_pre_pd))

# 2) rows whose reactant / product SMILES list is empty, i.e. at least one
#    participant had no SMILES
for smiles_col in ('reactant_SMILES', 'product_SMILES'):
    kcat_pre_pd = kcat_pre_pd[kcat_pre_pd[smiles_col].map(len) > 0]
    print(len(kcat_pre_pd))

263669
263669
263661
234044
214244


### 保存文件用于kcat/km预测

In [24]:
# Write the filtered table without the index column. The list-valued columns
# are stored as their string representation.
kcat_pre_pd.to_csv(yeast8U_sequence_smiles_pre_path,index=None)

In [25]:
# Read the file back to check what was written. After the round trip the list
# columns (reactant_met_id, product_met_id, reactant_SMILES, product_SMILES)
# are plain strings such as "['s_0025', 's_0709']" and must be parsed (e.g.
# with ast.literal_eval) before they can be used as lists again.
kcat_pre_pd = pd.read_csv(yeast8U_sequence_smiles_pre_path)
kcat_pre_pd

Unnamed: 0,rea_id,gene,met_id,reactant_met_id,product_met_id,SMILES,reactant_SMILES,product_SMILES,Sequence
0,r_0001_1,YEL071W,s_0025,"['s_0025', 's_0709']","['s_0710', 's_1399']",CC(C(=O)O)O,"['CC(C(=O)O)O', 'CC1=C(C2=CC3=NC(=CC4=C(C(=C([...",['CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C...,MTAAHPVAQLTAEAYPKVKRNPNFKVLDSEDLAYFRSILSNDEILN...
1,r_0001_2,YEL071W,s_0709,"['s_0025', 's_0709']","['s_0710', 's_1399']",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"['CC(C(=O)O)O', 'CC1=C(C2=CC3=NC(=CC4=C(C(=C([...",['CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C...,MTAAHPVAQLTAEAYPKVKRNPNFKVLDSEDLAYFRSILSNDEILN...
2,r_0001_3,YJR048W,s_0025,"['s_0025', 's_0709']","['s_0710', 's_1399']",CC(C(=O)O)O,"['CC(C(=O)O)O', 'CC1=C(C2=CC3=NC(=CC4=C(C(=C([...",['CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C...,MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHS...
3,r_0001_4,YJR048W,s_0709,"['s_0025', 's_0709']","['s_0710', 's_1399']",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"['CC(C(=O)O)O', 'CC1=C(C2=CC3=NC(=CC4=C(C(=C([...",['CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C...,MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHS...
4,r_0001_5,YEL039C,s_0025,"['s_0025', 's_0709']","['s_0710', 's_1399']",CC(C(=O)O)O,"['CC(C(=O)O)O', 'CC1=C(C2=CC3=NC(=CC4=C(C(=C([...",['CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C...,MAKESTGFKPGSAKKGATLFKTRCQQCHTIEEGGPNKVGPNLHGIF...
...,...,...,...,...,...,...,...,...,...
214239,rxn1363_138,YNL267W,s_3958,"['s_3958', 'sn_22']","['s_0188', 's_3998']",C(C(C(=O)[O-])O)O,"['C(C(C(=O)[O-])O)O', 'O=P(O)(O)OCC1OC(O[C@]2(...","['C(C(C(=O)O)OP(=O)(O)O)O', 'C(C1C(C(C(C(O1)OC...",MHKASSSKKSFDDTIELKKNEQLLKLINSSEFTLHNCVELLCKHSE...
214240,rxn1363_139,YDR051C,sn_22,"['s_3958', 'sn_22']","['s_0188', 's_3998']",O=P(O)(O)OCC1OC(O[C@]2(CO)OC(CO)C(O)C2O)C(O)C(...,"['C(C(C(=O)[O-])O)O', 'O=P(O)(O)OCC1OC(O[C@]2(...","['C(C(C(=O)O)OP(=O)(O)O)O', 'C(C1C(C(C(C(O1)OC...",MCEENVHVSEDVAGSHGSFTNARPRLIVLIRHGESESNKNKEVNGY...
214241,rxn1363_140,YDR051C,s_3958,"['s_3958', 'sn_22']","['s_0188', 's_3998']",C(C(C(=O)[O-])O)O,"['C(C(C(=O)[O-])O)O', 'O=P(O)(O)OCC1OC(O[C@]2(...","['C(C(C(=O)O)OP(=O)(O)O)O', 'C(C1C(C(C(C(O1)OC...",MCEENVHVSEDVAGSHGSFTNARPRLIVLIRHGESESNKNKEVNGY...
214242,rxn1363_141,YAL038W,sn_22,"['s_3958', 'sn_22']","['s_0188', 's_3998']",O=P(O)(O)OCC1OC(O[C@]2(CO)OC(CO)C(O)C2O)C(O)C(...,"['C(C(C(=O)[O-])O)O', 'O=P(O)(O)OCC1OC(O[C@]2(...","['C(C(C(=O)O)OP(=O)(O)O)O', 'C(C1C(C(C(C(O1)OC...",MSRLERLTSLNVVAGSDLRRTSIIGTIGPKTNNPETLVALRKAGLN...
