In [None]:
from bs4 import BeautifulSoup
from tqdm import tqdm

def extract_terms_and_ids_from_go(input_filename):
    """
    Extract terms and IDs from the GO OWL ontology, focusing on
    cellular components, molecular functions, and biological processes.

    An ``owl:Class`` element contributes an entry only when it has all of:
    an ``rdf:about`` attribute, an ``rdfs:label`` child, and an
    ``oboInOwl:hasOBONamespace`` child whose text is one of the three GO
    namespaces. Classes missing any of these are skipped.

    Args:
        input_filename (str): Path to the OWL ontology file.

    Returns:
        dict: Dictionary where keys are lower-cased terms (the stripped
            ``rdfs:label`` text) and values are IDs (the last
            ``/``-separated segment of ``rdf:about``). When two classes
            share a label after lower-casing, the later class wins.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    # NOTE(review): assumes the OWL file is UTF-8 encoded - confirm for
    # any non-standard export.
    with open(input_filename, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'xml')

    # Only classes from these OBO namespaces are kept
    expected_namespaces = {"cellular_component", "molecular_function", "biological_process"}
    term_to_id = {}

    # Find all owl:Class elements
    all_classes = soup.find_all("owl:Class")

    for owl_class in tqdm(all_classes, desc="Extracting GO Terms", total=len(all_classes)):
        # The ID is the last path segment of the rdf:about URI
        term_id = owl_class.get('rdf:about')
        if term_id:
            term_id = term_id.split("/")[-1]

        # Recompute the label for every class. Leaving the previous value in
        # place would file an unlabeled class under its predecessor's term.
        label = owl_class.find("rdfs:label")
        term_name = label.text.strip() if label is not None else None

        namespace = owl_class.find("oboInOwl:hasOBONamespace")
        namespace_name = namespace.text.strip() if namespace is not None else None

        if term_name and term_id and namespace_name in expected_namespaces:
            term_to_id[term_name.lower()] = term_id

    return term_to_id

# # Example usage
# input_file = "/path/to/go.owl"
# terms_and_ids = extract_terms_and_ids_from_go(input_file)
# print(terms_and_ids)


In [None]:
# Location of the GO ontology file to index.
input_file = "/home/stirunag/work/github/source_data/knowledge_base/GO/go.owl"
# Build the term -> GO ID lookup that the following cells query.
terms2_dict = extract_terms_and_ids_from_go(input_filename=input_file)

In [None]:
# Preview a handful of entries rather than rendering the whole mapping,
# which would flood the notebook with one line per extracted term.
dict(list(terms2_dict.items())[:10])

In [None]:
# Keys are the lower-cased labels, so terms must be looked up in lower case.
terms2_dict['cell fate specification']

In [None]:
# Find the term for a known identifier (CUI) by inverting the term -> ID
# mapping. If several terms share an ID, the last one in the mapping wins.
cui = "GO_0000759"
id_to_term = dict(zip(terms2_dict.values(), terms2_dict.keys()))
term = id_to_term.get(cui, "Unknown CUI")
print(f"The term associated with '{cui}' is: {term}")

In [1]:
from bs4 import BeautifulSoup
from tqdm import tqdm

def extract_terms_and_ids_from_chebi(input_filename):
    """
    Extract terms and IDs from the provided ChEBI OWL ontology into a dictionary.

    Every ``owl:Class`` element that has both an ``rdf:about`` attribute and
    a non-empty ``rdfs:label`` contributes one entry; other classes are
    skipped.

    Args:
        input_filename (str): Path to the OWL ontology file.

    Returns:
        dict: Dictionary where keys are lower-cased terms (the stripped
            ``rdfs:label`` text) and values are IDs (the last
            ``/``-separated segment of ``rdf:about``). When two classes
            share a label after lower-casing, the later class wins.
    """
    # Decode explicitly as UTF-8 so labels with non-ASCII characters are read
    # the same way regardless of the platform's default text encoding.
    # NOTE(review): assumes the OWL file is UTF-8 encoded - confirm for
    # any non-standard export.
    with open(input_filename, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'xml')

    term_to_id = {}

    # Find all owl:Class elements
    all_classes = soup.find_all("owl:Class")

    for owl_class in tqdm(all_classes, desc="Extracting ChEBI Terms", total=len(all_classes)):
        about = owl_class.get('rdf:about')
        label = owl_class.find("rdfs:label")

        # Both an identifier and a label are required to form an entry
        if not about or label is None:
            continue

        # The ID is the last path segment of the rdf:about URI
        chebi_id = about.split("/")[-1]
        term_name = label.text.strip()

        # Either can still be empty (blank label, URI ending in "/")
        if term_name and chebi_id:
            term_to_id[term_name.lower()] = chebi_id

    return term_to_id


# Example usage
input_file = "/home/stirunag/work/github/source_data/knowledge_base/chebi/chebi.owl"
terms_and_ids = extract_terms_and_ids_from_chebi(input_file)
# Show only a small sample; rendering the whole mapping would emit one
# line per ChEBI term into the notebook output.
dict(list(terms_and_ids.items())[:10])


Extracting ChEBI Terms: 100%|████████████████████████████████████████████████████| 183247/183247 [00:10<00:00, 17587.54it/s]


{'(+)-atherospermoline': 'CHEBI_10',
 '(-)-medicarpin': 'CHEBI_100',
 'vismione d': 'CHEBI_10000',
 '(2s,3s,4r)-3-[4-(3-cyclopentylprop-1-ynyl)phenyl]-4-(hydroxymethyl)-1-(2-methoxy-1-oxoethyl)-2-azetidinecarbonitrile': 'CHEBI_100000',
 'n-[(2r,3s,6r)-2-(hydroxymethyl)-6-[2-[[oxo-[4-(trifluoromethyl)anilino]methyl]amino]ethyl]-3-oxanyl]-3-pyridinecarboxamide': 'CHEBI_100001',
 '3-chloro-n-[(5s,6s,9s)-5-methoxy-3,6,9-trimethyl-2-oxo-11-oxa-3,8-diazabicyclo[10.4.0]hexadeca-1(12),13,15-trien-14-yl]benzenesulfonamide': 'CHEBI_100002',
 '(4r,7s,8r)-8-methoxy-4,7,10-trimethyl-11-oxo-14-(1-oxobutylamino)-n-propyl-2-oxa-5,10-diazabicyclo[10.4.0]hexadeca-1(12),13,15-triene-5-carboxamide': 'CHEBI_100003',
 '1-(2,5-difluorophenyl)-3-[(5s,6s,9s)-5-methoxy-3,6,9-trimethyl-2-oxo-8-[oxo(2-pyrazinyl)methyl]-11-oxa-3,8-diazabicyclo[10.4.0]hexadeca-1(12),13,15-trien-14-yl]urea': 'CHEBI_100004',
 'n-[(1s,3s,4as,9ar)-1-(hydroxymethyl)-3-[2-oxo-2-(1-piperidinyl)ethyl]-3,4,4a,9a-tetrahydro-1h-pyrano[3,4-b]b