In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import re
# sys.path.append('../Code/')
from common import *
from tqdm import tqdm
import multiprocessing
from functools import partial
tqdm.pandas()

In [16]:
## input
YMDB_total_smiles_path = '../../Data/ymdb/ymdb_final_result.xlsx'
# yeast870_path = '../Data/model/yeast-GEM870.mat'
yeast_model_path = '../../Data/model/yeast-GEM-final.csv'
mnxreac_smile_atom_mapping_rules_file_path = '../../Data/rules/MNXreaction_smiles_atommap_rules_filter.csv'
path_enzyme_path = '../../Data/pathway_enzyme.list'
mnxmeta_smile_file_path = '../../Data/database/mnx_chem_prop.tsv'
ymdb_output_final = '../../Data/ymdb/ymdb_final_result.xlsx'
## output
mnxmeta_smile_file_carbon_path = '../../Data/database/mnxmeta_smile_carbon.csv'
lipid_target_smiles_path = '../../Results/lipid/target_smiles_lipid.pickle'
not_lipid_target_smiles_path = '../../Results/not_lipid/target_smiles_not_lipid.pickle'
not_lipid_target_smiles_complex_path = '../../Results/not_lipid/target_smiles_not_lipid_complex.pickle'
lipid_yeast_met_file = '../../Results/lipid/yeast_met_lipid.pickle'
not_lipid_yeast_met_file = '../../Results/not_lipid/yeast_met_not_lipid.pickle'
lipid_mnxreac_smile_atom_mapping_rules_file_path = '../../Data/rules/MNXreaction_smiles_atommap_rules_lipid.csv'
total_met_inchikey0_file = '../../Data/total_met_inchikey0.pickle'
lipid_smiles2metnetx_path = '../../Results/lipid/lipid_smiles2metnetx.pickle'
not_lipid_smiles2metnetx_path = '../../Results/not_lipid/not_lipid_smiles2metnetx.pickle'
mnxmeta_smile_inchikey_dict_path = '../../Data/database/mnxmeta_smile_inchikey_dict.json'


In [17]:
def get_lipid_rules(path_enzyme_path,mnxreac_smile_atom_mapping_rules_file_path,lipid_mnxreac_smile_atom_mapping_rules_file_path):
    with open(path_enzyme_path, 'r') as f:
        lines = f.readlines()
    #kegg lipid metabolism map id
    lipid = '00061 MFatty acid biosynthesis00062 MFatty acid elongation00071 M NFatty acid degradation00073Cutin, suberine and wax biosynthesis00100 M NSteroid biosynthesis00120 M NPrimary bile acid biosynthesis00121Secondary bile acid biosynthesis00140 M NSteroid hormone biosynthesis00561 MGlycerolipid metabolism00564 MGlycerophospholipid metabolism00565 MEther lipid metabolism00600 M NSphingolipid metabolism00590Arachidonic acid metabolism00591Linoleic acid metabolism00592 Malpha-Linolenic acid metabolism01040 M NBiosynthesis of unsaturated fatty acids'
    maps = re.findall(r'\d{5}', lipid)
    dict_map_ec = {}
    #get ec number for each map
    for m in maps:
        dict_map_ec[m] = []
        for line in lines:
            if m in line:
                dict_map_ec[m].append(line.split()[1])
        dict_map_ec[m] = list(set(dict_map_ec[m]))
    ec_lipid = []
    for k, v in dict_map_ec.items():
        ec_lipid.extend(v)
    ec_lipid = list(set(ec_lipid))
    ec_lipid = [i.replace('ec:','') for i in ec_lipid ]
    #get 3 level ec number
    ec_3 = []
    for ec in ec_lipid:
        ec = ec.split('.')
        if len(ec) == 4:
            ec = ec[:3]
            ec = '.'.join(ec)
            ec_3.append(ec)
        else:
            ec_3.append('.'.join(ec))
    ec_3 = list(set(ec_3))
    rules = pd.read_csv(mnxreac_smile_atom_mapping_rules_file_path)
    rules['lipid_rules'] = 0
    #check if the rule is lipid
    for i in tqdm(rules.index,total=rules.shape[0]):
        if pd.notna(rules.loc[i, 'classifs']):
            classifs = rules.loc[i, 'classifs'].split(';')
            for j in classifs:
                if j[:5] in ec_3:
                    rules.loc[i, 'lipid_rules'] = 1
                    break
            rules.loc[i, 'classifs'] = ';'.join(classifs)
    #save lipid rules
    rules_lipid = rules[rules['lipid_rules'] == 1]
    rules_lipid.to_csv(lipid_mnxreac_smile_atom_mapping_rules_file_path, index=False)

In [18]:
def get_total_inchikey0(model_path, YMDB_total_smiles_path, output_file):
    model = pd.read_csv(model_path)
    YMDB_Data = pd.read_excel(YMDB_total_smiles_path)
    YMDB_Data = YMDB_Data.dropna(subset=['standard_smiles'])
    model = model.dropna(subset=['standard_smiles'])
    model['inchikey0'] = model['standard_smiles'].apply(smiles2inchikey0)
    inchikey0_model = model['inchikey0'].values
    inchikey0= list(inchikey0_model) + list(YMDB_Data['inchikey0'].values)
    inchikey0 = list(set(inchikey0))
    print(len(inchikey0))
    dump_file(inchikey0, output_file)

In [19]:
def get_mnxmeta_smiles_carbon(mnxmeta_smile_file_path,mnxmeta_smile_file_carbon_path):
    mnxmeta_smile = pd.read_csv(mnxmeta_smile_file_path,sep='\t')
    print(mnxmeta_smile.shape)
    mnxmeta_smile = mnxmeta_smile.loc[1:,:]
    mnxmeta_smile = mnxmeta_smile.dropna(subset=['SMILES'])
    mnxmeta_smile['has_carbon'] = mnxmeta_smile['SMILES'].progress_apply(smiles_has_carbon)
    mnxmeta_smile_filtered = mnxmeta_smile[mnxmeta_smile['has_carbon'] == 1]
    mnxmeta_smile_filtered['inchikey0'] = mnxmeta_smile_filtered['SMILES'].apply(smiles2inchikey0)
    mnxmeta_smile_filtered = mnxmeta_smile_filtered.groupby('inchikey0')['SMILES'].agg(list).reset_index()
    mnxmeta_smile_filtered.to_csv(mnxmeta_smile_file_carbon_path, index=False)
    print(mnxmeta_smile_filtered.shape)
    return mnxmeta_smile_filtered

In [20]:
def compare_smiles_mnxmeta(smiles, compare_total_smiles):
    matched_smiles = []
    for compare_smiles in compare_total_smiles:
        if compare_smiles_inchikey(smiles, compare_smiles):
            matched_smiles.append(compare_smiles)
    return smiles, matched_smiles

In [21]:
def process_complex_smiles(not_lipid_target_smiles,not_lipid_target_smiles_complex_path):
    not_lipid_target_smiles_complex = [x for x in not_lipid_target_smiles if '.' in x]
    not_lipid_target_smiles_complex_with_carbon = []
    for smiles in tqdm(not_lipid_target_smiles_complex):
        not_lipid_target_smiles_complex_with_carbon.extend(smiles.split('.'))
    not_lipid_target_smiles_complex_with_carbon = [x for x in not_lipid_target_smiles_complex_with_carbon if smiles_has_carbon(x)]
    not_lipid_target_smiles = [x for x in not_lipid_target_smiles if x not in not_lipid_target_smiles_complex]
    not_lipid_target_smiles = not_lipid_target_smiles + not_lipid_target_smiles_complex_with_carbon
    dump_file(not_lipid_target_smiles,not_lipid_target_smiles_complex_path)
    return not_lipid_target_smiles

### yeast8 metabolites, ymdb metabolites

In [22]:
YMDB_Data = pd.read_excel(YMDB_total_smiles_path)
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00074',['in_model']] = 1 #cyanide
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00108',['in_model']] = 1 #fructose 1,6-bisphosphate
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00121',['in_model']] = 1 #sulfide
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00253',['in_model']] = 1 #d-ribose
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00657',['in_model']] = 1 #D-fructose
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00789',['in_model']] = 1 #Galactose
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00797',['in_model']] = 1 #udp-galactose
YMDB_Data.loc[YMDB_Data['ID']=='YMDB01490',['in_model']] = 1 #Ammonium phosphate
YMDB_Data.loc[YMDB_Data['ID']=='YMDB01491',['in_model']] = 1 #Ammonium chloride
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00032',['in_model']] = 1 #purine
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00020',['in_model']] = 1 #Sodium
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00207',['in_model']] = 1 #Copper(2+)
YMDB_Data.loc[YMDB_Data['ID']=='YMDB01517',['in_model']] = 1 #chloride
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00358',['in_model']] = 1 #Calcium(2+)
YMDB_Data.loc[YMDB_Data['ID']=='YMDB16117',['in_model']] = 1 #Potassium
YMDB_Data.loc[YMDB_Data['ID']=='YMDB01523',['in_model']] = 1 #Phosphorus
YMDB_Data['has_carbon'] = YMDB_Data['SMILES'].progress_apply(smiles_has_carbon)
YMDB_Data.loc[YMDB_Data['has_carbon']==0,'in_model'] = 1
YMDB_Data.to_excel(YMDB_total_smiles_path,index=False)
not_lipid_target_smiles = YMDB_Data[(YMDB_Data['super_class']!='Lipids and lipid-like molecules')&(YMDB_Data['in_model']==0)]['SMILES'].to_list()
not_lipid_target_smiles = get_target_smiles(not_lipid_target_smiles,not_lipid_target_smiles_path)

lipid_target_smiles = YMDB_Data[(YMDB_Data['super_class']=='Lipids and lipid-like molecules')&(YMDB_Data['in_model']==0)]['SMILES'].to_list()
lipid_target_smiles = get_target_smiles(lipid_target_smiles,lipid_target_smiles_path)

100%|██████████| 16042/16042 [00:05<00:00, 2711.29it/s]


target_smiles: 572
target_smiles: 14310


In [9]:
not_lipid_target_smiles = process_complex_smiles(not_lipid_target_smiles,not_lipid_target_smiles_complex_path)


100%|██████████| 6/6 [00:00<00:00, 108942.96it/s]


In [10]:
#combine model smiles with target smiles
# get_total_smiles(ymdb_output_final,yeast_model_path, not_lipid_target_smiles, not_lipid_yeast_met_file)
get_total_smiles(yeast_model_path, not_lipid_target_smiles, not_lipid_yeast_met_file)

total_smiles 1520


In [23]:
get_total_smiles(yeast_model_path, lipid_target_smiles, lipid_yeast_met_file)


total_smiles 15255


In [43]:
#get lipid rules
get_lipid_rules(path_enzyme_path,mnxreac_smile_atom_mapping_rules_file_path,lipid_mnxreac_smile_atom_mapping_rules_file_path)

100%|██████████| 20210/20210 [00:03<00:00, 5309.52it/s]


In [12]:
# get total inchikey0
get_total_inchikey0(yeast_model_path, YMDB_total_smiles_path, total_met_inchikey0_file)

15862


In [47]:
mnxmeta_smile_filtered = get_mnxmeta_smiles_carbon(mnxmeta_smile_file_path,mnxmeta_smile_file_carbon_path)

(1292154, 9)


100%|██████████| 1248094/1248094 [05:14<00:00, 3967.69it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mnxmeta_smile_filtered['inchikey0'] = mnxmeta_smile_filtered['SMILES'].apply(smiles2inchikey0)


(906669, 2)


In [48]:
mnxmeta_smile_inchikey_dict = dict(zip(mnxmeta_smile_filtered['inchikey0'], mnxmeta_smile_filtered['SMILES']))
with open(mnxmeta_smile_inchikey_dict_path, 'w') as f:
    json.dump(mnxmeta_smile_inchikey_dict, f,indent=4)

In [24]:
with open(mnxmeta_smile_inchikey_dict_path, 'r') as f:
    mnxmeta_smile_inchikey_dict = json.load(f)

In [14]:
not_lipid_yeast_total_smiles = process_yeast_smiles(not_lipid_yeast_met_file)
print(len(not_lipid_yeast_total_smiles))
not_lipid_total_inchikey0 = [smiles2inchikey0(x) for x in not_lipid_yeast_total_smiles]

total_smiles 1520
total_smiles 1515
total_smiles 1490
total_smiles 1463
1463


In [25]:
lipid_yeast_total_smiles = process_yeast_smiles(lipid_yeast_met_file)
print(len(lipid_yeast_total_smiles))
lipid_total_inchikey0 = [smiles2inchikey0(x) for x in lipid_yeast_total_smiles]

total_smiles 15255
total_smiles 15247
total_smiles 15222
total_smiles 15085
15085


In [16]:
get_smiles2metnetx(list(mnxmeta_smile_inchikey_dict.keys()), not_lipid_total_inchikey0,not_lipid_smiles2metnetx_path,num_processes=60)

100%|██████████| 906669/906669 [00:03<00:00, 233105.93it/s]


In [26]:
get_smiles2metnetx(list(mnxmeta_smile_inchikey_dict.keys()), lipid_total_inchikey0,lipid_smiles2metnetx_path,num_processes=40)


100%|██████████| 906669/906669 [00:17<00:00, 52970.21it/s]


In [18]:
smiles_metnetx_reverse(not_lipid_smiles2metnetx_path,not_lipid_total_inchikey0,mnxmeta_smile_inchikey_dict)

100%|██████████| 906669/906669 [00:00<00:00, 4776531.89it/s]
100%|██████████| 1463/1463 [00:39<00:00, 36.73it/s]
100%|██████████| 1457/1457 [00:00<00:00, 168810.28it/s]


In [27]:
smiles_metnetx_reverse(lipid_smiles2metnetx_path,lipid_total_inchikey0,mnxmeta_smile_inchikey_dict)


100%|██████████| 906669/906669 [00:00<00:00, 5509350.07it/s]
100%|██████████| 15085/15085 [06:48<00:00, 36.88it/s]
100%|██████████| 15073/15073 [00:00<00:00, 473453.69it/s]


In [20]:
merge_metnetx_smiles(not_lipid_yeast_total_smiles,not_lipid_smiles2metnetx_path,not_lipid_yeast_met_file)

1463


100%|██████████| 1457/1457 [00:00<00:00, 1383854.38it/s]

3680





In [28]:
merge_metnetx_smiles(lipid_yeast_total_smiles,lipid_smiles2metnetx_path,lipid_yeast_met_file)


15085


100%|██████████| 15073/15073 [00:00<00:00, 3303241.77it/s]

19742



