In [11]:
from bs4 import BeautifulSoup
import os
import requests
import re

In [3]:
# Reset first so that a failed request cannot leave a stale value from an
# earlier kernel run in `latex_equation` for the cells below to pick up.
latex_equation = None

# Bounded wait: `requests` applies no timeout by default, so a service that
# accepts the connection but never answers would block this cell forever.
response = requests.get("http://127.0.0.1:5000/get_latex_eq", timeout=10)
if response.status_code == 200:
    # The body is read as JSON; `.get` yields None when the key is absent.
    latex_equation = response.json().get("latex_equation")
    print("Fetched stored MathML:", latex_equation)
else:
    print("Error fetching stored MathML")

Fetched stored MathML: S\left(x\right)=\int_0^x\sin\left(t^2\right)dt


In [4]:
# Sanity check: show the runtime type of the fetched value before it is passed to the converter.
print(type(latex_equation))

<class 'str'>


In [5]:
import subprocess

def convert_latex_to_mathml(latex_equation):
    """
    Convert a LaTeX math expression to Content MathML via the `latexmlmath` CLI.

    Args:
        latex_equation (str): LaTeX source of the equation.

    Returns:
        str | None: Whatever `latexmlmath` wrote to stdout, or None when the
        input is missing/blank, the executable cannot be found, or the command
        exits with a non-zero status. A diagnostic is printed in each failure
        case.
    """
    # Reject missing input up front instead of handing None / "" to the
    # subprocess (None would raise TypeError while building the argv).
    if not isinstance(latex_equation, str) or not latex_equation.strip():
        print("Error running LateXML: no LaTeX input was provided")
        return None

    try:
        process = subprocess.run(
            # "--contentmathml=-" sends the Content MathML to stdout.
            ["latexmlmath", "--contentmathml=-", latex_equation],
            text=True,
            capture_output=True,
            check=True,  # non-zero exit status -> CalledProcessError
        )
    except FileNotFoundError:
        # Raised when the executable is not installed / not on PATH.
        print("Error running LateXML: 'latexmlmath' executable not found on PATH")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error running LateXML: {e.stderr}")
        return None

    return process.stdout  # The MathML text exactly as emitted by the tool

# Convert the fetched equation; the converter returns None if the command failed.
mathml_result = convert_latex_to_mathml(latex_equation)
print(mathml_result)  # Directly prints the Content MathML

<math xmlns="http://www.w3.org/1998/Math/MathML" alttext="S\left(x\right)=\int_{0}^{x}\sin\left(t^{2}\right)dt" display="block">
  <apply>
    <eq/>
    <apply>
      <times/>
      <ci>𝑆</ci>
      <ci>𝑥</ci>
    </apply>
    <apply>
      <apply>
        <csymbol cd="ambiguous">superscript</csymbol>
        <apply>
          <csymbol cd="ambiguous">subscript</csymbol>
          <int/>
          <cn type="integer">0</cn>
        </apply>
        <ci>𝑥</ci>
      </apply>
      <apply>
        <times/>
        <apply>
          <sin/>
          <apply>
            <csymbol cd="ambiguous">superscript</csymbol>
            <ci>𝑡</ci>
            <cn type="integer">2</cn>
          </apply>
        </apply>
        <apply>
          <csymbol cd="latexml">differential-d</csymbol>
          <ci>𝑡</ci>
        </apply>
      </apply>
    </apply>
  </apply>
</math>



In [18]:
from bs4 import BeautifulSoup

def expand_self_closing_tags(soup):
    """
    Force empty elements to serialize as explicit open/close pairs.

    Every element with no children, other than <ci> and <cn>, is assigned an
    empty string so that it is written out as `<tag></tag>` rather than
    `<tag/>`. The element passed in is examined as well as its descendants, so
    calling this on a lone empty tag (e.g. `<eq/>`) expands that tag too.

    Args:
        soup: A BeautifulSoup document or Tag. It is modified in place.

    Returns:
        None
    """
    # `find_all()` yields descendants only, so the starting element has to be
    # added explicitly or an empty root would be left untouched.
    for tag in [soup, *soup.find_all()]:
        if not tag.contents and tag.name not in ("ci", "cn"):  # <ci>/<cn> are left as they are
            tag.string = ""  # Blank text child forces the open/close form

def extract_core_mathml(mathml_str):
    """
    Extract the first <eq> element and the first <apply> sibling that follows it.

    The input is parsed as XML. The serialized <eq> element and, when present,
    its next <apply> sibling are joined with a newline, with empty elements
    written as explicit open/close pairs.

    Args:
        mathml_str (str): The Content MathML input as a string.

    Returns:
        str | None: The extracted markup, or None when the input is None/blank
        or contains no <eq> element.
    """
    # Nothing to parse: covers the None produced by a failed conversion step
    # as well as empty / whitespace-only strings.
    if not mathml_str or not mathml_str.strip():
        return None

    soup = BeautifulSoup(mathml_str, "xml")

    eq_tag = soup.find("eq")
    if eq_tag is None:
        return None  # No <eq/> in the document

    # Locate the sibling before any mutation of the tree.
    first_apply = eq_tag.find_next_sibling("apply")

    expand_self_closing_tags(eq_tag)
    extracted_part = str(eq_tag)
    # Rewrite any `<name attrs/>` still present in the serialized text into
    # `<name attrs></name>` so the <eq> element itself ends up in open/close form.
    extracted_part = re.sub(r"<(\w+)([^>/]*)/>", r"<\1\2></\1>", extracted_part)

    if first_apply:
        expand_self_closing_tags(first_apply)
        extracted_part += "\n" + str(first_apply)

    return extracted_part.strip()

# Reduce the converter output to the <eq> element plus the first <apply> after it.
filtered_mathml = extract_core_mathml(mathml_result)
print(filtered_mathml)

<eq></eq>
<apply>
<times></times>
<ci>𝑆</ci>
<ci>𝑥</ci>
</apply>


In [13]:
# Corpus sub-folders wpmath0000002 to wpmath0000016: "wpmath" followed by the
# index zero-padded to seven digits.
folder_names = [f"wpmath{index:07d}" for index in range(2, 17)]
# Root of the corpus on disk; change this to wherever the directory lives locally.
base_path = "/Users/atharvajain/Downloads/NTCIR-12_MathIR_Wikipedia_Corpus/MathTagArticles"
print(folder_names)

['wpmath0000002', 'wpmath0000003', 'wpmath0000004', 'wpmath0000005', 'wpmath0000006', 'wpmath0000007', 'wpmath0000008', 'wpmath0000009', 'wpmath0000010', 'wpmath0000011', 'wpmath0000012', 'wpmath0000013', 'wpmath0000014', 'wpmath0000015', 'wpmath0000016']


In [14]:
import os
from bs4 import BeautifulSoup

def corpus_find_articles(folder_names, base_path, content_mathml):
    """
    Search for HTML articles containing the given Content MathML expression.

    Each `.html` file in every listed folder is parsed, and each of its
    `<apply>` elements is compared, in prettified form, with the prettified
    query. A file is reported once, on its first matching element.

    Args:
        folder_names (list): A list of folder names containing HTML articles.
        base_path (str): The base path where the folders are located.
        content_mathml (str): The Content MathML representation of the input equation.

    Returns:
        list: File names of the articles in which the expression was found,
        in folder order and alphabetical order within each folder.
    """
    matching_articles = []

    # Push the query through the same parser + prettify() round trip applied
    # to the article elements so both sides are compared in the same text form.
    formatted_content_mathml = BeautifulSoup(content_mathml, 'html.parser').prettify()

    for folder_name in folder_names:
        folder_path = os.path.join(base_path, folder_name)

        # A missing folder should not abort the scan of the remaining ones.
        if not os.path.isdir(folder_path):
            print(f"Skipping missing folder {folder_path}")
            continue

        # Sorted so that the result order is reproducible between runs.
        for article_name in sorted(os.listdir(folder_path)):
            # Skip non-HTML files
            if not article_name.endswith('.html'):
                continue

            article_path = os.path.join(folder_path, article_name)

            try:
                with open(article_path, 'r', encoding='utf-8') as html_file:
                    soup = BeautifulSoup(html_file, 'html.parser')

                # `any` stops at the first matching element, so an article is
                # recorded at most once.
                if any(
                    math_element.prettify() == formatted_content_mathml
                    for math_element in soup.find_all('apply')
                ):
                    matching_articles.append(article_name)

            except Exception as e:
                # Best effort: report the unreadable file and keep scanning.
                print(f"Error processing file {article_path}: {e}")

    return matching_articles

# Call the function with content_mathml
# NOTE(review): `content_mathml` is not assigned in any cell visible here (the
# extraction cell stores its result in `filtered_mathml`) - confirm which
# variable is intended; on a fresh kernel this line may raise NameError.
matching_articles = corpus_find_articles(folder_names, base_path, content_mathml)

# Print results
print("Articles containing the input Content MathML:")
print(matching_articles)

KeyboardInterrupt: 