In [96]:
import pandas as pd
from sm_precursor_predictor.data_integration.generate_kegg_networks import KeggNetworkGenerator
from sm_precursor_predictor.data_integration.kegg_api import KeggApi
import networkx as nx 
import re


path = "/home/joao/SMPrecursorPredictor/examples/precursors_map_curated_3.csv"
data = pd.read_csv(path)

#1
def get_precursors_in_pathway(map_id, data):
    """
    Get all compounds in a pathway based on the map ID and the precursors.
    """
    compounds = set()
    pathway_data = data[data["pathway"] == map_id]
    for _, row in pathway_data.iterrows():
        precursors = row["precursors"].split(";")
        compounds.update(precursors)
    return compounds


#2
def find_path_from_source_to_target(graph, source_compound, target_compound):
    """
    Find a path from the source compound to the target compound in the graph.
    """
    paths = nx.shortest_path(graph, source_compound, target_compound)
    
    return paths


#3
def get_allcompounds_of_pathway(graph):
    
    pattern = r'^C\d{5}$'  
    allcomp = [node for node in graph if re.search(pattern, node)]
    return allcomp


#4
def check_path_from_compound_to_precursor(graph, map_id, data):
    """
    Check if there is a path from a compound to a precursor in the graph.
    """
    compounds = get_allcompounds_of_pathway(graph)
    precursors = get_precursors_in_pathway(map_id, data)

    for compound in compounds:
        for precursor in precursors:
            try:
                path = find_path_from_source_to_target(graph, compound, precursor)
                if path:
                    return True
            except nx.NodeNotFound:
                pass

    return False

In [97]:
import pickle

#abrir ficheiro pkl

pickle_file = "graphs.pkl"

with open(pickle_file, "rb") as f:
    graphs = pickle.load(f)

print(graphs)

{'map00902': <networkx.classes.digraph.DiGraph object at 0x7f1374c6b070>, 'map00909': <networkx.classes.digraph.DiGraph object at 0x7f1376eb5670>, 'map00904': <networkx.classes.digraph.DiGraph object at 0x7f13748e8790>, 'map00906': <networkx.classes.digraph.DiGraph object at 0x7f1374948340>, 'map00905': <networkx.classes.digraph.DiGraph object at 0x7f1374948130>, 'map00981': <networkx.classes.digraph.DiGraph object at 0x7f13749480d0>, 'map00908': <networkx.classes.digraph.DiGraph object at 0x7f13749482b0>, 'map01059': <networkx.classes.digraph.DiGraph object at 0x7f1374948100>, 'map01053': <networkx.classes.digraph.DiGraph object at 0x7f1374c2c6a0>, 'map00940': <networkx.classes.digraph.DiGraph object at 0x7f13749b7220>, 'map00941': <networkx.classes.digraph.DiGraph object at 0x7f1374c21a00>, 'map00944': <networkx.classes.digraph.DiGraph object at 0x7f1374c21eb0>, 'map00942': <networkx.classes.digraph.DiGraph object at 0x7f1374c21ac0>, 'map00943': <networkx.classes.digraph.DiGraph obje

In [108]:
map_ids = data.iloc[:, 0].unique().tolist()

In [235]:
import re
from sm_precursor_predictor.data_integration.kegg_precursor_finder import KEGGPrecursorFinder

def get_compounds_for_precursors(graphs, map_ids, data):
    """
    Get a dictionary mapping precursors to their associated compounds for all map IDs.
    """
    precursor_compound_dict = {}


    for map_id in map_ids:

        kegg_precursors_finder = KEGGPrecursorFinder(map_id, data, create_graph=False)
        precursors = get_precursors_in_pathway(map_id, data)

        if len(precursors) == 1:
                graph = graphs.get(map_id)
                kegg_precursors_finder.graph = graph
                compounds = kegg_precursors_finder.get_all_compounds_of_pathway()
                precursor_compound_dict[list(precursors)[0]] = compounds[1:]

                
        else:
            for precursor in precursors:
                precursor_compound_dict.setdefault(precursor, [])

            graph = graphs.get(map_id) 

            if graph is not None:
                compounds = get_allcompounds_of_pathway(graph)

                for precursor in precursors:
                    for compound in compounds:
                        try:
                            if compound in graph and precursor in graph:
                                path = find_path_from_source_to_target(graph, compound, precursor)
                                if path:
                                    if precursor not in precursor_compound_dict:
                                        precursor_compound_dict[precursor] = []
                                    precursor_compound_dict[precursor] = list(set(precursor_compound_dict[precursor]) | set([compound]))
                        except (nx.NodeNotFound, nx.NetworkXNoPath):
                            pass

    for precursor, compounds in precursor_compound_dict.items():
        if len(compounds) > 1 and compounds[0] == precursor:
            compounds.pop(1)

                                
    return precursor_compound_dict

get_compounds_for_precursors_dic = get_compounds_for_precursors(graphs, map_ids, data)
print(get_compounds_for_precursors_dic)

value = 'C06074'

def search_value_in_dict_of_lists(dictionary, value):
    for lst in dictionary.values():
        if value in lst:
            return True
    return False

result = search_value_in_dict_of_lists(get_compounds_for_precursors_dic, value)

if result:
    print(f"The value {value} was found in the dictionary.")
else:
    print(f"The value {value} was not found in the dictionary.")


value_to_remove = 'C00080'


for key, value_list in get_compounds_for_precursors_dic.items():
    if value_to_remove in value_list:
        value_list.remove(value_to_remove)

print(f"The value {value_to_remove} has been removed from the dictionary of lists.")





{'C00341': ['C02344', 'C02462', 'C03190', 'C06074', 'C00521', 'C00400', 'C00843', 'C00080', 'C00964', 'C01123', 'C02452', 'C00553', 'C01765', 'C00808', 'C03092', 'C00848', 'C04718', 'C01767', 'C02485', 'C01512', 'C01433', 'C01852', 'C06071', 'C06066', 'C06070', 'C20790', 'C06308', 'C06307', 'C11636', 'C01957', 'C11672', 'C11673', 'C09769', 'C11409', 'C11383', 'C06099', 'C11951', 'C09893', 'C11952', 'C11393', 'C09844', 'C11389', 'C11388', 'C01500', 'C17621', 'C17622', 'C09782', 'C18025', 'C18027', 'C04433', 'C20221', 'C11382', 'C06304', 'C06305', 'C20789', 'C20943', 'C20944', 'C21203', 'C09804', 'C22014', 'C22225'], 'C00448': ['C19676', 'C17954', 'C19833', 'C20187', 'C01352', 'C19973', 'C16286', 'C01841', 'C16142', 'C16829', 'C20191', 'C08615', 'C19829', 'C19832', 'C09672', 'C19742', 'C17953', 'C06083', 'C19819', 'C03428', 'C20163', 'C17955', 'C17277', 'C00751', 'C20200', 'C16144', 'C03461', 'C19708', 'C20192', 'C09704', 'C06310', 'C19736', 'C09666', 'C19740', 'C19820', 'C01054', 'C1677

In [230]:

import re
from sm_precursor_predictor.data_integration.kegg_precursor_finder import KEGGPrecursorFinder

def get_precursors_for_compound(graphs, map_ids, data):
    """
    Get a dictionary mapping compounds to their associated precursor for all map IDs.
    """
    compound_prec_dict = {}

    for map_id in map_ids:
        
        kegg_precursors_finder = KEGGPrecursorFinder(map_id, data, create_graph=False)
        compounds = get_allcompounds_of_pathway(graphs[map_id])
        precursors = get_precursors_in_pathway(map_id, data)

        if len(precursors) == 1:
            graph = graphs.get(map_id)
            kegg_precursors_finder.graph = graph

            for compound in compounds:

                compounds = kegg_precursors_finder.get_all_compounds_of_pathway()
                compound_prec_dict[compound] = [list(precursors)[0]]
                compound_prec_dict = {compound: precursors for compound, precursors in compound_prec_dict.items() if compound not in precursors}

        
        else:
            for compound in compounds:
                if compound not in precursors:
                    compound_prec_dict.setdefault(compound, [])

                    for precursor in precursors:
                        try:
                            if precursor in graphs[map_id] and compound in graphs[map_id]:
                                path = find_path_from_source_to_target(graphs[map_id], precursor, compound)
                                if path:
                                    compound_prec_dict[compound] = list(set(compound_prec_dict[compound]) | set([precursor]))
                        except (nx.NodeNotFound, nx.NetworkXNoPath):
                            pass
                                

    return compound_prec_dict


compound_prec_dict = get_precursors_for_compound(graphs, map_ids, data)
print(compound_prec_dict)


value = 'C06074'


if value in compound_prec_dict:
    associated_list = compound_prec_dict[value]
    print(f"The list associated with key '{value}' is: {associated_list}")
else:
    print(f"The key '{value}' is not present in the dictionary.")

key_to_remove = 'C00080'

if key_to_remove in compound_prec_dict:
    
    del compound_prec_dict[key_to_remove]
    print(f"The key '{key_to_remove}' and its associated list has been removed.")
else:
    print(f"The key '{key_to_remove}' is not present in the dictionary.")


{'C02344': ['C00341'], 'C02462': ['C00341'], 'C03190': ['C00341'], 'C06074': ['C00341'], 'C00521': ['C00341'], 'C00400': ['C00341'], 'C00843': ['C00341'], 'C00080': ['C00073', 'C00078', 'C00079', 'C00082', 'C00123'], 'C00964': ['C00341'], 'C01123': ['C00341'], 'C02452': ['C00341'], 'C00553': ['C00341'], 'C01765': ['C00341'], 'C00808': ['C00341'], 'C03092': ['C00341'], 'C00848': ['C00341'], 'C04718': ['C00341'], 'C01767': ['C00341'], 'C02485': ['C00341'], 'C01512': ['C00341'], 'C01433': ['C00341'], 'C01852': ['C00341'], 'C06071': ['C00341'], 'C06066': ['C00341'], 'C06070': ['C00341'], 'C20790': ['C00341'], 'C06308': ['C00341'], 'C06307': ['C00341'], 'C11636': ['C00341'], 'C01957': ['C00341'], 'C11672': ['C00341'], 'C11673': ['C00341'], 'C09769': ['C00341'], 'C11409': ['C00341'], 'C11383': ['C00341'], 'C06099': ['C00341'], 'C11951': ['C00341'], 'C09893': ['C00341'], 'C11952': ['C00341'], 'C11393': ['C00341'], 'C09844': ['C00341'], 'C11389': ['C00341'], 'C11388': ['C00341'], 'C01500': ['C

In [234]:
def get_all_maps_for_compound(graphs, map_ids, data):
    """
    Get a dictionary mapping compounds to their associated maps for all map IDs.
    """
    compound_map_dict = {}

    for map_id in map_ids:
        graph = graphs.get(map_id)  

        if graph is not None:
            compounds = get_allcompounds_of_pathway(graph)

            precursors = get_precursors_in_pathway(map_id, data)

            for compound in compounds:
                if compound not in precursors:
                    if compound not in compound_map_dict:
                        compound_map_dict[compound] = []
                    if map_id not in compound_map_dict[compound]:
                        compound_map_dict[compound].append(map_id)

    return compound_map_dict

compound_map_dict = get_all_maps_for_compound(graphs, map_ids, data)
print(compound_map_dict)


value = 'C06074'

if value in compound_map_dict:
     associated_list = compound_map_dict[value]
     print(f"The list associated with key '{value}' is: {associated_list}")
else:
     print(f"The key '{value}' is not present in the dictionary.")

key_to_remove = 'C00080'

if key_to_remove in compound_map_dict:
    
    del compound_map_dict[key_to_remove]
    print(f"The key '{key_to_remove}' and its associated list has been removed.")
else:
    print(f"The key '{key_to_remove}' is not present in the dictionary.")

{'C02344': ['map00902'], 'C02462': ['map00902'], 'C03190': ['map00902'], 'C06074': ['map00902'], 'C00521': ['map00902'], 'C00400': ['map00902'], 'C00843': ['map00902'], 'C00080': ['map00902', 'map00909', 'map00904', 'map00906', 'map00905', 'map00908', 'map00940', 'map00941', 'map00943', 'map00965', 'map00966'], 'C00964': ['map00902'], 'C01123': ['map00902'], 'C02452': ['map00902'], 'C00553': ['map00902'], 'C01765': ['map00902'], 'C00808': ['map00902'], 'C03092': ['map00902'], 'C00848': ['map00902'], 'C04718': ['map00902'], 'C01767': ['map00902'], 'C02485': ['map00902'], 'C01512': ['map00902'], 'C01433': ['map00902'], 'C01852': ['map00902'], 'C06071': ['map00902'], 'C06066': ['map00902'], 'C06070': ['map00902'], 'C20790': ['map00902'], 'C06308': ['map00902'], 'C06307': ['map00902'], 'C11636': ['map00902'], 'C01957': ['map00902'], 'C11672': ['map00902'], 'C11673': ['map00902'], 'C09769': ['map00902'], 'C11409': ['map00902'], 'C11383': ['map00902'], 'C06099': ['map00902'], 'C11951': ['map

In [186]:
precursor_ids = set(precursor for precursors in compound_prec_dict.values() for precursor in precursors)
additional_precursor_ids = {'C00148', 'C00108', 'C00047', 'C00062', 'C00041', 'C00049', 'C01852', 'C00129', 'C00135', 'C00187'}
precursor_ids.update(additional_precursor_ids)
print(precursor_ids)

{'C00341', 'C00078', 'C01789', 'C00049', 'C00183', 'C03506', 'C00187', 'C00079', 'C00047', 'C01852', 'C00407', 'C00129', 'C00235', 'C00062', 'C00353', 'C00148', 'C00073', 'C00108', 'C00123', 'C00135', 'C00448', 'C00082', 'C00041'}


In [236]:
import pandas as pd
from rdkit import Chem


def generate_sdf(compound_prec_dict, compound_map_dict, graphs, data):
    writer = Chem.SDWriter('output07.sdf')

    precursor_ids = set(precursor for precursors in compound_prec_dict.values() for precursor in precursors)
    additional_precursor_ids = {'C00148', 'C00108', 'C00047', 'C00062', 'C00041', 'C00049', 'C01852', 'C00129', 'C00135', 'C00187'}
    precursor_ids.update(additional_precursor_ids)

    for compound, precursors in compound_prec_dict.items():
        compound_structure = None
        
        for map_id, graph in graphs.items():
            if compound in graph.nodes:
                compound_structure = graph.nodes[compound]["mol"]
                break

        if compound_structure:
            mol = Chem.MolFromMolBlock(compound_structure)

            mol.SetProp("Compound_ID",compound)

            for precursor_id in precursor_ids:
                flag = "1" if precursor_id in precursors else "0"
                precursor_prop_name = precursor_id
                mol.SetProp(precursor_prop_name, flag)

            mol.SetProp("Map_IDs", ";".join(compound_map_dict.get(compound, [])))


            writer.write(mol)

            if all(precursor_id == '0' for precursor_id in precursors):
                print(f"Compound {compound} has zero precursors")

        else:
            print(f"Failed to retrieve structure for compound {compound}")

    writer.close()

    sdf_supplier = Chem.SDMolSupplier('output06.sdf')
    for mol in sdf_supplier:
        compound_id = mol.GetProp("Compound_ID")
        if compound_id == 'C06074':
            return True

    return False

generate_sdf(compound_prec_dict, compound_map_dict, graphs, data)


True

In [239]:
from rdkit import Chem


def check_precursors_in_sdf(sdf_file, compound_to_check):
    precursor_info = {}

    sdf_supplier = Chem.SDMolSupplier(sdf_file)
    for mol in sdf_supplier:
        compound_id = mol.GetProp("Compound_ID")
        if compound_id == compound_to_check:
            for prop_name in mol.GetPropNames():
                if prop_name != "Compound_ID" and prop_name != "Map_IDs":
                    precursor_info[prop_name] = mol.GetProp(prop_name)
            break

    return precursor_info


sdf_file = "output07.sdf" 
compound_to_check = "C00080"

#C00080

precursor_info = check_precursors_in_sdf(sdf_file, compound_to_check)

if precursor_info:
    print(f"Precursors associated with compound '{compound_to_check}':")
    for precursor_id, flag in precursor_info.items():
        print(f" - Precursor ID: {precursor_id}, Flag: {flag}")
else:
    print(f"Compound '{compound_to_check}' not found or has no precursors in the SDF file.")


Compound 'C00080' not found or has no precursors in the SDF file.


In [218]:
def has_zero_precursors(compound_prec_dict):

    """
    Checks if any compound as 0 precursors on the compound_prec_dic.
    """

    for compound in compound_prec_dict:
        precursor_ids = compound_prec_dict[compound]
        if all(precursor_id == '0' for precursor_id in precursor_ids):
            return True
    return False

if has_zero_precursors(compound_prec_dict):
    print("There are compounds with zero precursors.")
else:
    print("No compounds have zero precursors.")




No compounds have zero precursors.
