SAX is faster than DOM for this task: about 1.7 seconds versus 12.8 seconds in the runs shown below.

DOM

In [13]:
import xml.etree.ElementTree as ET

# Parse the whole file into an ElementTree and keep its root element in `dom`.
tree = ET.parse('D:\\浙大海宁学习大一\\IBI1\\IBI1_2024-25\\IBI1_2024-25\\Practical14\\go_obo.xml')
dom = tree.getroot()
# Walk the root's direct children and stop at the first <term> element.
for elem in dom:
    if elem.tag == "term":
        # `tag` is a plain string attribute, not a method, so it must not be called.
        print(elem.tag)
        # Stop only after a <term> was found; breaking unconditionally would
        # inspect just the first child of the root.
        break

In [14]:
import xml.dom.minidom
from datetime import datetime
from collections import defaultdict

# Per-ontology record of the term with the most <is_a> elements seen so far.
# Looking up an ontology that has no record yet yields an empty term and a count of 0.
max_info_dom = defaultdict(lambda: {'term': '', 'count': 0})
start_time_dom = datetime.now()  # the timed region includes building the DOM tree

# Load the complete document into memory, then collect every <term> element.
dom_tree = xml.dom.minidom.parse("D:\\浙大海宁学习大一\\IBI1\\IBI1_2024-25\\IBI1_2024-25\\Practical14\\go_obo.xml")  # Replace with actual file path
terms = dom_tree.getElementsByTagName('term')

for term in terms:
    # Text of the term's first <namespace> element.
    namespace = term.getElementsByTagName('namespace')[0].firstChild.data.strip()

    # Only terms from the three target ontologies take part in the comparison.
    if namespace in ['molecular_function', 'biological_process', 'cellular_component']:
        # Text of the term's first <name> element.
        term_name = term.getElementsByTagName('name')[0].firstChild.data.strip()

        # Number of <is_a> elements found anywhere below this term.
        is_a_count = len(term.getElementsByTagName('is_a'))

        # Strict comparison: on a tie the earlier term keeps its place.
        if max_info_dom[namespace]['count'] < is_a_count:
            max_info_dom[namespace] = {'term': term_name, 'count': is_a_count}

# Seconds elapsed since the start of the timed region.
dom_elapsed = (datetime.now() - start_time_dom).total_seconds()

# Report the winner for each ontology, then the total time.
print("=== DOM Parser Results ===")
for ns in ['molecular_function', 'biological_process', 'cellular_component']:
    data = max_info_dom[ns]
    print(
        f"Ontology: {ns.capitalize()}",
        f"  Term with most 'is_a': {data['term']}",
        f"  Number of 'is_a': {data['count']}\n",
        sep="\n",
    )
print(f"DOM parsing time: {dom_elapsed:.4f} seconds\n")

=== DOM Parser Results ===
Ontology: Molecular_function
  Term with most 'is_a': biotin binding
  Number of 'is_a': 8

Ontology: Biological_process
  Term with most 'is_a': biotin catabolic process
  Number of 'is_a': 10

Ontology: Cellular_component
  Term with most 'is_a': high-affinity iron permease complex
  Number of 'is_a': 5

DOM parsing time: 12.8001 seconds



SAX

In [16]:
import xml.sax
from datetime import datetime
from collections import defaultdict

class GOHandler(xml.sax.ContentHandler):
    """SAX handler that finds, per ontology, the term with the most <is_a> elements.

    While a <term> element is being read, its <name> text, <namespace> text and
    the number of <is_a> start tags are collected in ``current_term``. When the
    term closes, it replaces the stored record for its namespace if the
    namespace is one of ``TARGET_ONTOLOGIES`` and its count is strictly larger
    than the stored count.

    Attributes:
        max_info: defaultdict mapping a namespace to
            ``{'term_name': str, 'is_a_count': int}``; a namespace with no
            record yet yields an empty name and a count of 0.
        current_term: dict with keys 'namespace', 'name' and 'is_a' describing
            the term currently being read.
    """

    # Namespaces whose terms are compared; all other terms are ignored.
    TARGET_ONTOLOGIES = ('molecular_function', 'biological_process', 'cellular_component')

    def __init__(self):
        super().__init__()
        self.reset_state()
        self.max_info = defaultdict(lambda: {'term_name': '', 'is_a_count': 0})

    def reset_state(self):
        """Clear all per-term state; called on construction and at every <term> start."""
        self.current_term = {
            'namespace': '',
            'name': '',
            'is_a': 0
        }
        # Neither text element is open until its start tag has been seen, so
        # unrelated character data is not buffered.
        self.in_name = False
        self.in_namespace = False
        self.temp_data = ""

    def startElement(self, tag, attrs):
        """Start a new term, open a text buffer for name/namespace, or count an <is_a>."""
        if tag == 'term':
            self.reset_state()
        elif tag == 'name':
            self.in_name = True
            self.temp_data = ""
        elif tag == 'namespace':
            self.in_namespace = True
            self.temp_data = ""
        elif tag == 'is_a':  # Count all <is_a> tags regardless of nesting
            self.current_term['is_a'] += 1

    def characters(self, content):
        """Buffer character data while inside a <name> or <namespace> element.

        The parser may deliver the text of one element in several chunks, so
        the chunks are appended unchanged; stripping each chunk separately
        would drop whitespace that falls on a chunk boundary. Surrounding
        whitespace is removed once, when the element ends.
        """
        if self.in_name or self.in_namespace:
            self.temp_data += content

    def endElement(self, tag):
        """Store finished name/namespace text, or evaluate the term when it closes."""
        if tag == 'name':
            self.current_term['name'] = self.temp_data.strip()
            self.in_name = False
        elif tag == 'namespace':
            self.current_term['namespace'] = self.temp_data.strip()
            self.in_namespace = False
        elif tag == 'term':
            ns = self.current_term['namespace']
            if ns in self.TARGET_ONTOLOGIES:
                # Strict comparison: on a tie the earlier term keeps its place.
                if self.current_term['is_a'] > self.max_info[ns]['is_a_count']:
                    self.max_info[ns] = {
                        'term_name': self.current_term['name'],
                        'is_a_count': self.current_term['is_a']
                    }

# The timed region covers handler/parser construction as well as the parse itself.
start_time = datetime.now()

# Create a SAX parser and attach the handler that collects the per-ontology results.
handler = GOHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Stream the file through the handler.
parser.parse("D:\\浙大海宁学习大一\\IBI1\\IBI1_2024-25\\IBI1_2024-25\\Practical14\\go_obo.xml")  # Replace with actual file path

# Seconds elapsed since the start of the timed region.
sax_elapsed = (datetime.now() - start_time).total_seconds()

# Report the winner for each ontology, then the total time.
print("=== SAX Parser Results ===")
for ns in ['molecular_function', 'biological_process', 'cellular_component']:
    data = handler.max_info[ns]
    print(
        f"Ontology: {ns.capitalize()}",
        f"  Term with most is_a: {data['term_name']}",
        f"  is_a count: {data['is_a_count']}\n",
        sep="\n",
    )
print(f"SAX parsing time: {sax_elapsed:.4f} seconds\n")

# Uncomment the lines below to compare speeds after running both the DOM and SAX cells (dom_elapsed comes from the DOM cell)
# if dom_elapsed < sax_elapsed:
#     print("# DOM was faster")
# else:
#     print("# SAX was faster")

=== SAX Parser Results ===
Ontology: Molecular_function
  Term with most is_a: biotin binding
  is_a count: 8

Ontology: Biological_process
  Term with most is_a: biotin catabolic process
  is_a count: 10

Ontology: Cellular_component
  Term with most is_a: high-affinity iron permease complex
  is_a count: 5

SAX parsing time: 1.7086 seconds

