In [1]:
import pandas as pd
import numpy as np
import json
import pickle
from Bio.KEGG import REST
from Bio import SeqIO
import ast
import time
import DLKcat
import torch
from rdkit import Chem
from collections import defaultdict
import math
import re
import cobra
from tqdm import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


### DLKcatIgnoreMets.tsv

In [2]:
miss_met_id =[
's_0803', 's_0804', 's_0805', 's_0806', 's_0807', 's_0808', 's_0809', 's_0810', 's_2808', 's_2976', 's_2994', 's_3226', 's_3449', 's_3657',##h20
's_0793', 's_0794', 's_0795', 's_0796', 's_0797', 's_0798', 's_0799', 's_0800', 's_0801', 's_0802', 's_2783', 's_3094', 's_3146', 's_3164',#H+
's_1275', 's_1276', 's_1277', 's_1278', 's_1279', 's_2817',#oxygen
's_0837', 's_0838', 's_0839', 's_0840',#hydrogen peroxide
's_0419', 's_0420', 's_0421',#ammonium
's_1322', 's_1323', 's_1324', 's_1325', 's_1326', 's_1329', 's_2966', 's_2977', 's_2995', 's_3228', 's_3536', 's_4293',#phosphate
's_0633', 's_0635', 's_0636', 's_0637', 's_0638', 's_2834', 's_2860', 's_3095', 's_4157',#diphosphate
's_1467', 's_1468', 's_4016',#sulphate
's_0841', 's_3906', 's_4263',#hydrogen sulfide
's_4010', 's_4113', 's_4159',#thiosulfate
's_1373', 's_1374', 's_3776',#potassium
's_1437', 's_1438', 's_3775',#sodium
's_3777', 's_3778', 's_3926', 's_4200',#chloride
's_4013', 's_4014', 's_4204',#Mg(2+)
's_3801', 's_3802', 's_4202',#Mn(2+)
's_3822', 's_3823', 's_4026', 's_4027', 's_4203',#Zn(2+)
's_3880', 's_3882', 's_4197', 's_4199',#Ca(2+)
's_4019', 's_4020', 's_4201', #Cu2(+)
's_0924', 's_0925', 's_0926',#iron(2+)
's_3855', 's_3936', 's_4031', #iron(3+)

's_0434', 's_0435', 's_0437', 's_0438', 's_0439', 's_2831', 's_2840', 's_2856', 's_3341', 's_3359', 's_3881', 's_4196',#ATP
's_0394', 's_0395', 's_0397', 's_0398', 's_0399', 's_3324', 's_3342', 's_3360', 's_3883', 's_3987', 's_4198',#ADP
's_0423', 's_0424', 's_0425', 's_0426', 's_2833', 's_2842', 's_2859', 's_4160',#AMP
's_1203', 's_1204', 's_1205', 's_1206', 's_2818', 's_3753',#NADH
's_1198', 's_1199', 's_1200', 's_1201', 's_1202', 's_2820',#NAD
's_1212', 's_1213', 's_1214', 's_1215', 's_2799', 's_2952',#NADPH
's_1207', 's_1208', 's_1210', 's_1211', 's_2800', 's_2953',#NADP(+)
's_0689', 's_0690',#FADH2
's_0687', 's_0688',#FAD
]

In [3]:
### input
yeast870_path = '../Data/model/yeast-GEM870.yml'
# yeast8U_path = '../Data/model/yeast8U_plus.yml'
# yeast8U_path = '../Data/model/yeast8U_plus_mod.yml'
yeast8U_path = '../Data/model/yeast8U_plus_lipids.yml'

# metabolites_info_to_GEM_path = '../Data/metabolites_info_to_GEM_plus.csv'
metabolites_info_to_GEM_path = '../Data/lipids/metabolites_info_to_GEM_yeast8.csv'
yeast_genome_file = '../Data/Saccharomyces_cerevisiae.fasta'

#output 
yeast8U_sequence_smiles_pre_path = '../Data/kcat_km_predict/yeast8U_sequence_smiles_prepare_lipids.csv'

In [4]:
yeast8U = cobra.io.load_yaml_model(yeast8U_path)

In [5]:
met_info = pd.read_csv(metabolites_info_to_GEM_path)
met_info.head(2)

Unnamed: 0,new_met_smiles,sim_smile,ID,compartment
0,CCCCCCC=CCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCCCCC)...,,sl_1,c
1,CCCCC=CCCCCCCCC(=O)OCC(COP(=O)(O)OCC(O)COP(=O)...,,sl_2,c


In [6]:
Kcat_pre_pd = {'id':[],
               'gene':[],
               'id_id':[],
               'reactant_id_id':[],
               'product_id_id':[]
               }
for reaction in tqdm(yeast8U.reactions,total=len(yeast8U.reactions)):
    if 'rxnl' in reaction.id:
        gene_list = reaction.gene_reaction_rule
        gene_list = gene_list.replace('[','').replace(']','').replace('(','').replace(')','')
        gene_list = re.split(' and | or ', gene_list)
        gene_list = [x for x in gene_list if x!='']

        if reaction.lower_bound == 0 and  reaction.upper_bound > 0:
            metabolite_list = [k.id for k, v in reaction.metabolites.items() if v < 0 and k.id not in miss_met_id] #not contain miss_met_id
            reactant_list = [k.id for k, v in reaction.metabolites.items() if v < 0] #contains miss_met_id/TurNuP
            product_list = [k.id for k, v in reaction.metabolites.items() if v > 0]
            num = 0
            if len(gene_list) > 0:
                for gene in set(gene_list):
                    for met in set(metabolite_list):
                        num += 1
                        Kcat_pre_pd['id'].append(reaction.id + '_' + str(num))
                        Kcat_pre_pd['gene'].append(gene)
                        Kcat_pre_pd['id_id'].append(met)
                        Kcat_pre_pd['reactant_id_id'].append(reactant_list)
                        Kcat_pre_pd['product_id_id'].append(product_list)

        elif reaction.lower_bound < 0 and  reaction.upper_bound == 0:
            metabolite_list = [k.id for k, v in reaction.metabolites.items() if v > 0 and k.id not in miss_met_id]
            reactant_list = [k.id for k, v in reaction.metabolites.items() if v > 0] #contains miss_met_id
            product_list = [k.id for k, v in reaction.metabolites.items() if v < 0]        
            num = 0
            if len(gene_list) > 0:
                for gene in set(gene_list):
                    for met in set(metabolite_list):
                        num += 1
                        Kcat_pre_pd['id'].append(reaction.id + '_REV_' + str(num))
                        Kcat_pre_pd['gene'].append(gene)
                        Kcat_pre_pd['id_id'].append(met)
                        Kcat_pre_pd['reactant_id_id'].append(reactant_list)
                        Kcat_pre_pd['product_id_id'].append(product_list)

        elif reaction.lower_bound == 0 and  reaction.upper_bound == 0:
            pass

        elif reaction.lower_bound < 0 and  reaction.upper_bound > 0:

            metabolite_list = [k.id for k, v in reaction.metabolites.items() if v < 0 and k.id not in miss_met_id]
            reactant_list = [k.id for k, v in reaction.metabolites.items() if v < 0] #contains miss_met_id/TurNuP
            product_list = [k.id for k, v in reaction.metabolites.items() if v > 0]
            num = 0
            if len(gene_list) > 0:
                for gene in set(gene_list):
                    for met in set(metabolite_list):
                        num += 1
                        Kcat_pre_pd['id'].append(reaction.id + '_' + str(num))
                        Kcat_pre_pd['gene'].append(gene)
                        Kcat_pre_pd['id_id'].append(met)
                        Kcat_pre_pd['reactant_id_id'].append(reactant_list)
                        Kcat_pre_pd['product_id_id'].append(product_list)   


            metabolite_list = [k.id for k, v in reaction.metabolites.items() if v > 0 and k.id not in miss_met_id]
            reactant_list = [k.id for k, v in reaction.metabolites.items() if v > 0] #contains miss_met_id/TurNuP
            product_list = [k.id for k, v in reaction.metabolites.items() if v < 0]
            num = 0
            if len(gene_list) > 0:
                for gene in set(gene_list):
                    for met in set(metabolite_list):
                        num += 1
                        Kcat_pre_pd['id'].append(reaction.id + '_REV_' + str(num))
                        Kcat_pre_pd['gene'].append(gene)
                        Kcat_pre_pd['id_id'].append(met)
                        Kcat_pre_pd['reactant_id_id'].append(reactant_list)
                        Kcat_pre_pd['product_id_id'].append(product_list)    
        else: 
            print('bounds error:',reaction.id)

  0%|          | 0/47780 [00:00<?, ?it/s]

100%|██████████| 47780/47780 [00:04<00:00, 9925.32it/s] 


In [None]:
# Kcat_pre_pd = {'id':[],
#                'gene':[],
#                'id_id':[],
#                }
# for reaction in tqdm(yeast8U.reactions):
#     ### gene_list
#     gene_list = reaction.gene_reaction_rule
#     gene_list = gene_list.replace('[','').replace(']','').replace('(','').replace(')','')
#     # gene_list = gene_list.split(' and '|' or ')
#     gene_list = re.split(' and | or ', gene_list)
#     gene_list = [x for x in gene_list if x!='']
#     # print(gene_list)
#     ### metabolite_list
#     if reaction.lower_bound == 0 and  reaction.upper_bound > 0:
#         metabolite_list = [k.id for k, v in reaction.metabolites.items() if v < 0 and k.id not in miss_met_id]
#         # print(metabolite_list)
#     elif reaction.lower_bound < 0 and  reaction.upper_bound > 0:
#         metabolite_list = [k.id for k, v in reaction.metabolites.items() if k.id not in miss_met_id]
#         # print(metabolite_list)   
#     elif reaction.lower_bound < 0 and  reaction.upper_bound == 0:
#         metabolite_list = [k.id for k, v in reaction.metabolites.items() if v > 0 and k.id not in miss_met_id]
#         # print(metabolite_list)
#     elif reaction.lower_bound == 0 and  reaction.upper_bound == 0:
#         metabolite_list = []
#     else: 
#         print('bounds error:',reaction.id) 
#     num = 0
#     if len(gene_list) > 0:
#         for gene in set(gene_list):
#             for met in set(metabolite_list):
#                 num += 1
#                 Kcat_pre_pd['id'].append(reaction.id + '_' + str(num))
#                 Kcat_pre_pd['gene'].append(gene)
#                 Kcat_pre_pd['id_id'].append(met)

In [7]:
Kcat_pre_pd = pd.DataFrame(Kcat_pre_pd)
print(Kcat_pre_pd.shape)
Kcat_pre_pd.head(3)

(2625496, 5)


Unnamed: 0,id,gene,id_id,reactant_id_id,product_id_id
0,rxnl538155_1,YPL227C,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]"
1,rxnl538155_2,YPL227C,s_0410,"[s_0410, sl_2715]","[s_1322, sl_3772]"
2,rxnl538155_3,YDR373W,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]"


### supplement smiles

In [8]:
def get_smiles(metid,model,met_info):
    # print(metid)
    smiles = None
    if metid.startswith('s_'):
        if hasattr(model.metabolites.get_by_id(metid), 'smiles'):
            smiles = model.metabolites.get_by_id(metid).smiles
    elif metid.startswith('sn_'):
        if metid in met_info['ID'].tolist():
            smiles =  met_info[met_info['ID'] == metid]['new_met_smiles'].to_list()[0]
    return smiles

def get_smiles_lst(metid_lst,model,met_info):
    # print(metid)
    smiles_lst = []
    for metid in metid_lst:
        if metid.startswith('s_'):
            if hasattr(model.metabolites.get_by_id(metid), 'smiles'):
                smiles_lst.append(model.metabolites.get_by_id(metid).smiles)
            else:
                smiles_lst.append('nosmiles')
        elif metid.startswith('sn_'):
            if metid in met_info['ID'].tolist():
                smiles_lst.append(met_info[met_info['ID'] == metid]['new_met_smiles'].to_list()[0])
            else:
                smiles_lst.append('nosmiles')
    if 'nosmiles' not in smiles_lst:
        return smiles_lst
    else:
        return []

In [9]:
tqdm.pandas()
yeast870 = cobra.io.load_yaml_model(yeast870_path)
# Kcat_pre_pd['sub'] = Kcat_pre_pd['id_id'].apply(lambda x:get_smiles(x,yeast870,met_info))
Kcat_pre_pd['sub'] = Kcat_pre_pd['id_id'].progress_apply(lambda x: get_smiles(x, yeast870, met_info))
Kcat_pre_pd

100%|██████████| 2625496/2625496 [00:02<00:00, 876911.64it/s]


Unnamed: 0,id,gene,id_id,reactant_id_id,product_id_id,sub
0,rxnl538155_1,YPL227C,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",
1,rxnl538155_2,YPL227C,s_0410,"[s_0410, sl_2715]","[s_1322, sl_3772]",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O
2,rxnl538155_3,YDR373W,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",
3,rxnl538155_4,YDR373W,s_0410,"[s_0410, sl_2715]","[s_1322, sl_3772]",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O
4,rxnl538155_5,YGL186C,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",
...,...,...,...,...,...,...
2625491,rxnl9753_254,YMR125W,s_0529,"[s_0176, s_0529]","[s_0722, sl_889]",CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...
2625492,rxnl9753_255,YCR048W,s_0176,"[s_0176, s_0529]","[s_0722, sl_889]",C(CC(=O)C(=O)O)CC(=O)O
2625493,rxnl9753_256,YCR048W,s_0529,"[s_0176, s_0529]","[s_0722, sl_889]",CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...
2625494,rxnl9753_257,YMR069W,s_0176,"[s_0176, s_0529]","[s_0722, sl_889]",C(CC(=O)C(=O)O)CC(=O)O


In [10]:
yeast870 = cobra.io.load_yaml_model(yeast870_path)
Kcat_pre_pd['reactant_smiles'] = Kcat_pre_pd['reactant_id_id'].progress_apply(lambda x: get_smiles_lst(x, yeast870, met_info))
Kcat_pre_pd

100%|██████████| 2625496/2625496 [00:08<00:00, 313247.99it/s]


Unnamed: 0,id,gene,id_id,reactant_id_id,product_id_id,sub,reactant_smiles
0,rxnl538155_1,YPL227C,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O]
1,rxnl538155_2,YPL227C,s_0410,"[s_0410, sl_2715]","[s_1322, sl_3772]",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O]
2,rxnl538155_3,YDR373W,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O]
3,rxnl538155_4,YDR373W,s_0410,"[s_0410, sl_2715]","[s_1322, sl_3772]",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O]
4,rxnl538155_5,YGL186C,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O]
...,...,...,...,...,...,...,...
2625491,rxnl9753_254,YMR125W,s_0529,"[s_0176, s_0529]","[s_0722, sl_889]",CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O..."
2625492,rxnl9753_255,YCR048W,s_0176,"[s_0176, s_0529]","[s_0722, sl_889]",C(CC(=O)C(=O)O)CC(=O)O,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O..."
2625493,rxnl9753_256,YCR048W,s_0529,"[s_0176, s_0529]","[s_0722, sl_889]",CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O..."
2625494,rxnl9753_257,YMR069W,s_0176,"[s_0176, s_0529]","[s_0722, sl_889]",C(CC(=O)C(=O)O)CC(=O)O,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O..."


In [11]:
Kcat_pre_pd['product_smiles'] = Kcat_pre_pd['product_id_id'].progress_apply(lambda x: get_smiles_lst(x, yeast870, met_info))
Kcat_pre_pd

100%|██████████| 2625496/2625496 [00:08<00:00, 307864.96it/s]


Unnamed: 0,id,gene,id_id,reactant_id_id,product_id_id,sub,reactant_smiles,product_smiles
0,rxnl538155_1,YPL227C,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O],[[O-]P(=O)([O-])[O-]]
1,rxnl538155_2,YPL227C,s_0410,"[s_0410, sl_2715]","[s_1322, sl_3772]",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O],[[O-]P(=O)([O-])[O-]]
2,rxnl538155_3,YDR373W,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O],[[O-]P(=O)([O-])[O-]]
3,rxnl538155_4,YDR373W,s_0410,"[s_0410, sl_2715]","[s_1322, sl_3772]",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O],[[O-]P(=O)([O-])[O-]]
4,rxnl538155_5,YGL186C,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O],[[O-]P(=O)([O-])[O-]]
...,...,...,...,...,...,...,...,...
2625491,rxnl9753_254,YMR125W,s_0529,"[s_0176, s_0529]","[s_0722, sl_889]",CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O...",[C(=O)[O-]]
2625492,rxnl9753_255,YCR048W,s_0176,"[s_0176, s_0529]","[s_0722, sl_889]",C(CC(=O)C(=O)O)CC(=O)O,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O...",[C(=O)[O-]]
2625493,rxnl9753_256,YCR048W,s_0529,"[s_0176, s_0529]","[s_0722, sl_889]",CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O...",[C(=O)[O-]]
2625494,rxnl9753_257,YMR069W,s_0176,"[s_0176, s_0529]","[s_0722, sl_889]",C(CC(=O)C(=O)O)CC(=O)O,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O...",[C(=O)[O-]]


### supplement protein sequence

In [12]:
def fasta_to_df(fasta_name):
    data = []
    with open(fasta_name, 'r') as fasta:
        header = None
        sequence = []
        for line in fasta:
            if line.startswith('>'):
                if header is not None:
                    data.append((header, ''.join(sequence)))
                header = line.strip()[1:]
                sequence = []
            elif line.strip() != '':
                sequence.append(line.strip())

    if header is not None:
        data.append((header, ''.join(sequence)))

    df = pd.DataFrame(data, columns=['ID', 'Sequence'])

    return df

yeast_genome = fasta_to_df(yeast_genome_file)
yeast_genome.head(3)

Unnamed: 0,ID,Sequence
0,YDL204W,MNRNTTTNKNANLNNSRNANAPGEAGHQNKTGLIYWTNPSKSGASF...
1,YNL167C,MSSEERSRQPSTVSTFDLEPNPFEQSFASSKKALSLPGTISHPSLP...
2,YJL068C,MKVVKEFSVCGGRLIKLSHNSNSTKTSMNVNIYLPKHYYAQDFPRN...


In [13]:
def get_sequence(gene,yeast_genome):
    if gene in yeast_genome.ID.tolist():
        return yeast_genome.loc[yeast_genome['ID'] == gene]['Sequence'].values[0]
    return None

print(get_sequence('YDL204W',yeast_genome))

MNRNTTTNKNANLNNSRNANAPGEAGHQNKTGLIYWTNPSKSGASFAATLVSLLILRNVNVISVLLKIGYMVLFTSFAVELSTKVLFDKGVVSRFGMQESPDLVGVLKPHIDRELDRLPALEDRIRKLVFAHRTRNNFTIGVSLYFLHGLFAIFSMNTVLIMTTIFLYTVPLIYDRKQARIDRAIDRMKDLVIHRFHKNYNKVVEKTEPYIDKIIPPQTDEGSYSTSISNENKSSTSQRNKSGLSSSEFDNMNDTSASKSGKDSYSTSQYNRAEYPVSQNENIGTLKSGKQEIPTEKDFNNRHENFSKPDVKTYDPRTVDIEEELAAHQRELEQNLKDGDYNLVGSKEIPDPITVPAPTRHTTKPAESQSIPIKNNETLHKTTHGLKQKLQHA


In [14]:
Kcat_pre_pd['Sequence'] = Kcat_pre_pd['gene'].progress_apply(lambda x:get_sequence(x,yeast_genome))
Kcat_pre_pd

100%|██████████| 2625496/2625496 [30:49<00:00, 1419.54it/s]


Unnamed: 0,id,gene,id_id,reactant_id_id,product_id_id,sub,reactant_smiles,product_smiles,Sequence
0,rxnl538155_1,YPL227C,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O],[[O-]P(=O)([O-])[O-]],MRALRFLIENRNTVFFTLLVALVLSLYLLVYLFSHTPRPPYPEELK...
1,rxnl538155_2,YPL227C,s_0410,"[s_0410, sl_2715]","[s_1322, sl_3772]",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O],[[O-]P(=O)([O-])[O-]],MRALRFLIENRNTVFFTLLVALVLSLYLLVYLFSHTPRPPYPEELK...
2,rxnl538155_3,YDR373W,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O],[[O-]P(=O)([O-])[O-]],MGAKTSKLSKDDLTCLKQSTYFDRREIQQWHKGFLRDCPSGQLARE...
3,rxnl538155_4,YDR373W,s_0410,"[s_0410, sl_2715]","[s_1322, sl_3772]",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O],[[O-]P(=O)([O-])[O-]],MGAKTSKLSKDDLTCLKQSTYFDRREIQQWHKGFLRDCPSGQLARE...
4,rxnl538155_5,YGL186C,sl_2715,"[s_0410, sl_2715]","[s_1322, sl_3772]",,[C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O],[[O-]P(=O)([O-])[O-]],MNRDNMDTTKRKEDHTKHTTDVIEFYEEGTAASSLNIATEKANSSP...
...,...,...,...,...,...,...,...,...,...
2625491,rxnl9753_254,YMR125W,s_0529,"[s_0176, s_0529]","[s_0722, sl_889]",CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O...",[C(=O)[O-]],MFNRKRRGDFDEDENYRDFRPRMPKRQRIPPVVQLCKEMMPDIRTI...
2625492,rxnl9753_255,YCR048W,s_0176,"[s_0176, s_0529]","[s_0722, sl_889]",C(CC(=O)C(=O)O)CC(=O)O,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O...",[C(=O)[O-]],MTETKDLLQDEEFLKIRRLNSAEANKRHSVTYDNVILPQESMEVSP...
2625493,rxnl9753_256,YCR048W,s_0529,"[s_0176, s_0529]","[s_0722, sl_889]",CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O...",[C(=O)[O-]],MTETKDLLQDEEFLKIRRLNSAEANKRHSVTYDNVILPQESMEVSP...
2625494,rxnl9753_257,YMR069W,s_0176,"[s_0176, s_0529]","[s_0722, sl_889]",C(CC(=O)C(=O)O)CC(=O)O,"[C(CC(=O)C(=O)O)CC(=O)O, CC(C)(COP(=O)(O)OP(=O...",[C(=O)[O-]],MRSSVYSENTYNCIRTSKEHLTERRRVAMAPMFQHFLNLCVEKFPE...


### supplement EC number


In [15]:
uniprot_sce_file = '../Data/uniprot/uniprotkb_organism_id_559292_2023_11_08.tsv'
uniprot_sce = pd.read_csv(uniprot_sce_file,sep='\t')
uniprot_sce = uniprot_sce.fillna(' ')
uniprot_sce['Gene Names'] = uniprot_sce['Gene Names'].apply(lambda x:x.split(' '))
uniprot_sce.head(3)

Unnamed: 0,Entry,Organism,Gene Names,Protein names,EC number
0,A0A0B7P3V8,Saccharomyces cerevisiae (strain ATCC 204508 /...,"[TY4B-P, YPLCTy4-1, POL, YPL060C-A]",Transposon Ty4-P Gag-Pol polyprotein (TY4A-TY4...,2.7.7.49; 2.7.7.7; 3.1.26.4; 3.4.23.-
1,D6VTK4,Saccharomyces cerevisiae (strain ATCC 204508 /...,"[STE2, YFL026W]",Pheromone alpha factor receptor,
2,O13297,Saccharomyces cerevisiae (strain ATCC 204508 /...,"[CET1, YPL228W, P1433]",mRNA-capping enzyme subunit beta (EC 3.6.1.74)...,3.6.1.74


In [16]:
def get_uniprot_ec(gene,uniprot_sce):
    ec_lst = []
    tmp = uniprot_sce[uniprot_sce['Gene Names'].apply(lambda x: gene in x)]
    if len(tmp)<1:
        ec_lst.append('noec')
    else:
        for i in tmp['EC number'].to_list():
            ec_lst += i.split('; ')
    ec_lst = list(set(ec_lst))
    ec_lst = [x for x in ec_lst if x != ' ']
    return ec_lst

gene = 'YPL060C-A'
# gene = 'YFL026W'
get_uniprot_ec(gene,uniprot_sce)

['3.1.26.4', '3.4.23.-', '2.7.7.7', '2.7.7.49']

In [17]:
DeepProZyme_path = '../Data/EC_predict/Sce_DeepECv2.txt'

def get_gene2ec_dict_DeepProZyme(DeepProZyme_path):
    DeepECv2_res = pd.read_csv(DeepProZyme_path,sep='\t')
    DeepECv2_res = DeepECv2_res[DeepECv2_res['prediction']!='None']
    DeepECv2_res['prediction'] = DeepECv2_res['prediction'].apply(lambda x:x.split(':')[1])
    gene2ec_dict = {}
    for index,row in DeepECv2_res.iterrows():
        if row['sequence_ID'] not in gene2ec_dict:
            gene2ec_dict[row['sequence_ID']] = []
        gene2ec_dict[row['sequence_ID']].append(row['prediction'])
    for key, values in gene2ec_dict.items():
        gene2ec_dict[key] = list(set([".".join(value.split(".")[:3]) for value in values]))

    return gene2ec_dict 


def get_ec2gene_dict_DeepProZyme(gene2ec_dict):
    DeepProZyme_gene_list = list(gene2ec_dict.keys())
    DeepProZyme_ec2gene_dict = {}
    for gene, ec_list in gene2ec_dict.items():
        for ec in ec_list:
            if ec not in DeepProZyme_ec2gene_dict:
                DeepProZyme_ec2gene_dict[ec] = []
            DeepProZyme_ec2gene_dict[ec].append(gene)
            
    return DeepProZyme_ec2gene_dict 

DeepProZyme_gene2ec_dict = get_gene2ec_dict_DeepProZyme(DeepProZyme_path)
print(len(DeepProZyme_gene2ec_dict))
DeepProZyme_gene2ec_dict

2015


{'YNL167C': ['2.7.11'],
 'YJL068C': ['3.1.2'],
 'YHR077C': ['3.1.21', '3.4.19', '2.7.1', '2.4.1', '3.6.5'],
 'YPL240C': ['2.7.9'],
 'YPL036W': ['7.1.2'],
 'YDL166C': ['2.7.4'],
 'YBR180W': ['1.8.1'],
 'YDR063W': ['2.7.7'],
 'YCR089W': ['3.4.19', '3.2.1'],
 'YDR260C': ['1.11.1'],
 'YDR177W': ['2.3.2'],
 'YDR497C': ['1.3.1'],
 'YEL054C': ['2.7.7'],
 'YOR065W': ['1.10.2'],
 'YLR097C': ['2.7.7'],
 'YKR053C': ['3.6.1'],
 'YPR161C': ['2.7.11'],
 'YOR161C': ['2.4.1'],
 'YIL002C': ['3.1.3'],
 'YPL042C': ['2.7.11'],
 'YHR113W': ['3.4.11'],
 'YJR045C': ['1.3.1'],
 'YDR490C': ['2.7.11'],
 'YLL008W': ['3.6.4'],
 'YHL013C': ['3.4.19'],
 'YPL007C': ['1.14.11'],
 'YDR447C': ['3.1.26'],
 'YEL060C': ['3.4.21'],
 'YPR157W': ['3.1.3'],
 'YBR022W': ['3.1.22'],
 'YJR041C': ['2.7.1'],
 'YDL031W': ['3.6.4'],
 'YOL136C': ['3.1.4'],
 'YJL216C': ['3.2.1'],
 'YBL037W': ['2.7.7'],
 'YFR028C': ['3.1.3'],
 'YCR038C': ['2.7.7'],
 'YGL009C': ['4.2.1'],
 'YPR066W': ['6.2.1'],
 'YDR138W': ['3.6.4'],
 'YMR323W': ['4.2.1

In [18]:
sce_gene_clean_ec = '../Data/EC_predict/Saccharomyces_cerevisiae_teacher_maxsep.csv'

def get_gene2ec_dict_clean(sce_gene_clean_ec):
    with open(sce_gene_clean_ec, 'r') as file:
        csv_data = file.read()
    rows = csv_data.split('\n')
    gene2ec_dict = {}
    for row in rows:
        columns = row.split(',')
        key = columns[0]
        values = [v.split('/')[0].replace('EC:','') for v in columns[1:]]
        if len(values)>0:
            gene2ec_dict[key] = values
    for key, values in gene2ec_dict.items():
        gene2ec_dict[key] = list(set([".".join(value.split(".")[:3]) for value in values]))
    return gene2ec_dict

def get_ec2gene_dict_clean(gene2ec_dict):
    clean_gene_list = list(gene2ec_dict.keys())
    clean_ec2gene_dict = {}
    for gene, ec_list in gene2ec_dict.items():
        for ec in ec_list:
            if ec not in clean_ec2gene_dict:
                clean_ec2gene_dict[ec] = []
            clean_ec2gene_dict[ec].append(gene)
    return clean_ec2gene_dict

In [19]:
clean_gene2ec_dict = get_gene2ec_dict_clean(sce_gene_clean_ec)
print(len(clean_gene2ec_dict))
clean_gene2ec_dict

5911


{'YDL204W': ['5.3.99'],
 'YNL167C': ['2.3.2'],
 'YJL068C': ['3.1.2'],
 'YBL005W': ['2.3.1'],
 'YFR013W': ['2.3.1'],
 'YLR037C': ['1.12.99'],
 'YHR077C': ['5.6.2'],
 'YGR294W': ['3.2.2'],
 'YGR233C': ['4.2.3'],
 'YOL162W': ['1.14.15'],
 'YLR001C': ['2.7.6'],
 'YJL052C-A': ['3.5.1'],
 'YPL240C': ['3.6.4'],
 'YER159C-A': ['3.4.21'],
 'YGR245C': ['2.1.1'],
 'YPL036W': ['7.1.2'],
 'YGL249W': ['2.3.1'],
 'YDL166C': ['2.7.4'],
 'YDR289C': ['3.4.21'],
 'YDR170C': ['2.3.2'],
 'YBR281C': ['3.4.13'],
 'YDR246W-A': ['3.4.22'],
 'YCR095C': ['3.1.3'],
 'YLR226W': ['2.3.1'],
 'YBR180W': ['3.4.16', '2.4.1', '3.1.1'],
 'YKL005C': ['2.1.1'],
 'YDR063W': ['3.1.3'],
 'YCR089W': ['2.4.1'],
 'YPL056C': ['3.2.1'],
 'YGR003W': ['5.6.2'],
 'YGL198W': ['2.4.1'],
 'YDR260C': ['1.11.1'],
 'YOR093C': ['4.2.3'],
 'YIL134C-A': ['3.4.24'],
 'YDR051C': ['3.1.3', '2.7.1'],
 'YDR177W': ['2.3.2'],
 'YDR522C': ['1.6.5'],
 'YJR007W': ['3.1.13'],
 'YDR497C': ['2.1.1'],
 'YBR163W': ['3.1.12'],
 'YLL032C': ['2.7.7'],
 'YEL054

In [32]:
for index,row in tqdm(Kcat_pre_pd.iterrows(),total=len(Kcat_pre_pd)):
    if len(get_uniprot_ec(row['gene'],uniprot_sce))>0:
        # print(get_uniprot_ec(row['gene'],uniprot_sce))
        Kcat_pre_pd.loc[index,'EC'] = str(get_uniprot_ec(row['gene'],uniprot_sce))
    # elif row['gene'] in DeepProZyme_gene2ec_dict.keys():
        # Kcat_pre_pd.loc[index,'EC'] = DeepProZyme_gene2ec_dict[row['gene']]########################
    elif row['gene'] in clean_gene2ec_dict.keys():
        Kcat_pre_pd.loc[index,'EC'] = str(clean_gene2ec_dict[row['gene']])
    else:
        pass

100%|██████████| 472418/472418 [2:01:42<00:00, 64.70it/s]  


In [36]:
Kcat_pre_pd.head()

Unnamed: 0,id,gene,id_id,reactant_id_id,product_id_id,sub,reactant_smiles,Sequence,EC,product_smiles
0,r_0001_1,YEL039C,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",MAKESTGFKPGSAKKGATLFKTRCQQCHTIEEGGPNKVGPNLHGIF...,['7.1.1'],[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...
1,r_0001_2,YEL039C,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",MAKESTGFKPGSAKKGATLFKTRCQQCHTIEEGGPNKVGPNLHGIF...,['7.1.1'],[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...
2,r_0001_3,YJR048W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHS...,['7.1.1'],[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...
3,r_0001_4,YJR048W,s_0025,"[s_0025, s_0709]","[s_0710, s_1399]",CC(C(=O)O)O,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",MTEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHS...,['7.1.1'],[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...
4,r_0001_5,YEL071W,s_0709,"[s_0025, s_0709]","[s_0710, s_1399]",CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(=...,"[CC(C(=O)O)O, CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]...",MTAAHPVAQLTAEAYPKVKRNPNFKVLDSEDLAYFRSILSNDEILN...,"['1.1.99.40', '1.1.2.4']",[CC1=C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(=C(C(...


### filter the rows that should not be there


In [20]:
print(len(Kcat_pre_pd))
Kcat_pre_pd = Kcat_pre_pd[~Kcat_pre_pd['sub'].isna()]
print(len(Kcat_pre_pd))
Kcat_pre_pd = Kcat_pre_pd[~Kcat_pre_pd['Sequence'].isna()]
print(len(Kcat_pre_pd))
Kcat_pre_pd = Kcat_pre_pd[Kcat_pre_pd['reactant_smiles'].apply(lambda x: len(x) != 0)]
print(len(Kcat_pre_pd))
Kcat_pre_pd = Kcat_pre_pd[Kcat_pre_pd['product_smiles'].apply(lambda x: len(x) != 0)]
print(len(Kcat_pre_pd))

2625496
387114
387114
357987
326775


In [21]:
Kcat_pre_pd.to_csv(yeast8U_sequence_smiles_pre_path,index=None)

In [22]:
Kcat_pre_pd = pd.read_csv(yeast8U_sequence_smiles_pre_path)
Kcat_pre_pd

Unnamed: 0,id,gene,id_id,reactant_id_id,product_id_id,sub,reactant_smiles,product_smiles,Sequence
0,rxnl538155_2,YPL227C,s_0410,"['s_0410', 'sl_2715']","['s_1322', 'sl_3772']",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,['C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O'],['[O-]P(=O)([O-])[O-]'],MRALRFLIENRNTVFFTLLVALVLSLYLLVYLFSHTPRPPYPEELK...
1,rxnl538155_4,YDR373W,s_0410,"['s_0410', 'sl_2715']","['s_1322', 'sl_3772']",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,['C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O'],['[O-]P(=O)([O-])[O-]'],MGAKTSKLSKDDLTCLKQSTYFDRREIQQWHKGFLRDCPSGQLARE...
2,rxnl538155_6,YGL186C,s_0410,"['s_0410', 'sl_2715']","['s_1322', 'sl_3772']",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,['C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O'],['[O-]P(=O)([O-])[O-]'],MNRDNMDTTKRKEDHTKHTTDVIEFYEEGTAASSLNIATEKANSSP...
3,rxnl538155_8,YLR350W,s_0410,"['s_0410', 'sl_2715']","['s_1322', 'sl_3772']",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,['C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O'],['[O-]P(=O)([O-])[O-]'],MIDRTKNESPAFEESPLTPNVSNLKPFPSQSNKISTPVTDHRRRRS...
4,rxnl538155_10,YGR227W,s_0410,"['s_0410', 'sl_2715']","['s_1322', 'sl_3772']",C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O,['C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O'],['[O-]P(=O)([O-])[O-]'],MDAKKNTGEANNDVLEEEAAIQLIAPGIARNLTQEVITGIFCNVVI...
...,...,...,...,...,...,...,...,...,...
326770,rxnl9753_254,YMR125W,s_0529,"['s_0176', 's_0529']","['s_0722', 'sl_889']",CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...,"['C(CC(=O)C(=O)O)CC(=O)O', 'CC(C)(COP(=O)(O)OP...",['C(=O)[O-]'],MFNRKRRGDFDEDENYRDFRPRMPKRQRIPPVVQLCKEMMPDIRTI...
326771,rxnl9753_255,YCR048W,s_0176,"['s_0176', 's_0529']","['s_0722', 'sl_889']",C(CC(=O)C(=O)O)CC(=O)O,"['C(CC(=O)C(=O)O)CC(=O)O', 'CC(C)(COP(=O)(O)OP...",['C(=O)[O-]'],MTETKDLLQDEEFLKIRRLNSAEANKRHSVTYDNVILPQESMEVSP...
326772,rxnl9753_256,YCR048W,s_0529,"['s_0176', 's_0529']","['s_0722', 'sl_889']",CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=...,"['C(CC(=O)C(=O)O)CC(=O)O', 'CC(C)(COP(=O)(O)OP...",['C(=O)[O-]'],MTETKDLLQDEEFLKIRRLNSAEANKRHSVTYDNVILPQESMEVSP...
326773,rxnl9753_257,YMR069W,s_0176,"['s_0176', 's_0529']","['s_0722', 'sl_889']",C(CC(=O)C(=O)O)CC(=O)O,"['C(CC(=O)C(=O)O)CC(=O)O', 'CC(C)(COP(=O)(O)OP...",['C(=O)[O-]'],MRSSVYSENTYNCIRTSKEHLTERRRVAMAPMFQHFLNLCVEKFPE...
