In [1]:
import pandas as pd
import requests
import json

In [2]:
def dfs(current_node_id, branch_visited, timeout=30.0):
    """
    Depth-First Search (DFS) function to retrieve pathway hierarchies from MetaCyc.

    For the given node, the direct children are fetched from the BioCyc
    ``ajax-direct-subs`` endpoint. Children reporting ``numInstances == 0`` are
    treated as leaves and their full branch is recorded; every other child is
    explored recursively.

    Parameters:
        current_node_id (str): The ID of the current node (pathway) being visited.
        branch_visited (list): List of "<id>: <label>" strings visited so far in the
            current branch. The list is never mutated; each child gets a new list.
        timeout (float | tuple | None): Timeout in seconds handed to ``requests.get``
            for every request made during the traversal. Defaults to 30 seconds so a
            stalled connection cannot block the crawl forever.

    Returns:
        None (The results are stored in the global variable 'recorded_pathways').

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    global recorded_pathways

    # Make a request to get the direct children-pathways of the current node from the MetaCyc website.
    # The ID is passed through `params` so that it is URL-encoded properly.
    response = requests.get(
        "https://biocyc.org/META/ajax-direct-subs",
        params={"object": current_node_id},
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()

    # Process the response (JSON) to retrieve child-pathway information.
    for pathway in response.json():
        next_node_id = pathway["id"]          # ID of the child pathway to explore.
        next_node_label = pathway["label"]    # Label (name) of the child pathway.

        # Extend a copy of the current branch with the new child pathway.
        branch_updated = branch_visited + [f"{next_node_id}: {next_node_label}"]

        # If the child pathway is at the lowest hierarchy (leaf pathway), add it to the recorded pathways.
        if pathway["numInstances"] == 0:
            recorded_pathways.append(branch_updated)
        else:
            # Recursively explore the children of the child pathway with the same timeout.
            dfs(current_node_id=next_node_id, branch_visited=branch_updated, timeout=timeout)

    return


In [10]:
###
### WARNING makes many calls to an external resource
###
# Walk the pathway pages on the biocyc website depth-first, starting at the
# root "Pathways" node; dfs() appends one root-to-leaf branch per leaf pathway
# to the module-level list defined here.
recorded_pathways = []
dfs("Pathways", ["Pathways: Pathways"])

# Depth of the deepest branch, not counting the root entry at position 0.
max_pathway_hierarchy = max(len(branch) - 1 for branch in recorded_pathways)

# Build one single-key dict per branch: the key is the leaf entry and the value
# is the branch without its root, right-padded with None so that every branch
# has exactly `max_pathway_hierarchy` levels.
padded_recorded_pathways = []
for branch in recorded_pathways:
    levels = branch[1:]
    missing_levels = max_pathway_hierarchy - len(levels)
    padded_recorded_pathways.append({branch[-1]: levels + [None] * missing_levels})

In [14]:
# Create a DataFrame with the padded hierarchical annotations.
# Each entry of `padded_recorded_pathways` is a single-key dict
# {leaf label: [level entries ...]}. The labels and rows are collected first and
# the frame is built in one call: growing a DataFrame with pd.concat inside a
# loop copies all previous rows on every iteration (quadratic time).
feature_labels = []
hierarchy_rows = []
for pathway in padded_recorded_pathways:
    for leaf_label, levels in pathway.items():
        feature_labels.append(leaf_label)
        hierarchy_rows.append(levels)

# The index is named 'feature' for a more descriptive name; dtype=object keeps
# the None padding as-is instead of letting pandas infer another dtype.
pathway_annotated = pd.DataFrame(
    hierarchy_rows,
    index=pd.Index(feature_labels, name='feature', dtype=object),
    dtype=object,
)

# Create annotated column names 'level_1', 'level_2', etc. based on the hierarchy depth.
annotated_columns = [f"level_{i+1}" for i in range(pathway_annotated.shape[1])]
pathway_annotated.columns = annotated_columns

In [15]:
# Show the assembled hierarchy table (index 'feature', columns level_1..level_N) as the cell output.
pathway_annotated

Unnamed: 0_level_0,level_1,level_2,level_3,level_4,level_5,level_6,level_7,level_8
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
PWY-7723: bacterial bioluminescence,Bioluminescence: Bioluminescence,PWY-7723: bacterial bioluminescence,,,,,,
PWY-7914: coral bioluminescence,Bioluminescence: Bioluminescence,PWY-7914: coral bioluminescence,,,,,,
PWY-7912: dinoflagellate bioluminescence,Bioluminescence: Bioluminescence,PWY-7912: dinoflagellate bioluminescence,,,,,,
PWY-7913: firefly bioluminescence,Bioluminescence: Bioluminescence,PWY-7913: firefly bioluminescence,,,,,,
PWY-7937: fungal bioluminescence,Bioluminescence: Bioluminescence,PWY-7937: fungal bioluminescence,,,,,,
...,...,...,...,...,...,...,...,...
PWY66-432: blood coagulation - tissue factor pathway,PWY66-432: blood coagulation - tissue factor p...,,,,,,,
PWY-8468: monocistronic tRNA processing I,PWY-8468: monocistronic tRNA processing I,,,,,,,
PWY-8470: monocistronic tRNA processing II,PWY-8470: monocistronic tRNA processing II,,,,,,,
PWY-8471: monocistronic tRNA processing III,PWY-8471: monocistronic tRNA processing III,,,,,,,


In [12]:

#pathway_annotated.insert(1, 'Pathway', pathway_annotated.index)

#pathway_annotated = pathway_annotated.reset_index().rename(columns={'index': 'Pathway'})


In [17]:
# Write the hierarchy table, including its index column, to a tab-separated file
# at a path relative to the current working directory.
pathway_annotated.to_csv('metacyc_pathway_hierarchy.tsv', sep='\t', index=True)