In [11]:
from bs4 import BeautifulSoup
import os
import requests
import re

In [12]:
response = requests.get("http://127.0.0.1:5000/get_latex_eq")
if response.status_code == 200:
    latex_equation = response.json().get("latex_equation")
    print("Fetched stored MathML:", latex_equation)
else:
    print("Error fetching stored MathML")

Fetched stored MathML: S\left(x\right)=\int_0^x\sin\left(t^2\right)dt


In [13]:
print(type(latex_equation))

<class 'str'>


In [14]:
import subprocess

def convert_latex_to_mathml(latex_equation):
    try:
        process = subprocess.run(
            ["latexmlmath", "--contentmathml=-", latex_equation],  # Output directly to terminal
            text=True,
            capture_output=True,
            check=True
        )
        return process.stdout  # Return the MathML directly

    except subprocess.CalledProcessError as e:
        print(f"Error running LateXML: {e.stderr}")
        return None

mathml_result = convert_latex_to_mathml(latex_equation)
print(mathml_result)  # Directly prints the Content MathML

<math xmlns="http://www.w3.org/1998/Math/MathML" alttext="S\left(x\right)=\int_{0}^{x}\sin\left(t^{2}\right)dt" display="block">
  <apply>
    <eq/>
    <apply>
      <times/>
      <ci>𝑆</ci>
      <ci>𝑥</ci>
    </apply>
    <apply>
      <apply>
        <csymbol cd="ambiguous">superscript</csymbol>
        <apply>
          <csymbol cd="ambiguous">subscript</csymbol>
          <int/>
          <cn type="integer">0</cn>
        </apply>
        <ci>𝑥</ci>
      </apply>
      <apply>
        <times/>
        <apply>
          <sin/>
          <apply>
            <csymbol cd="ambiguous">superscript</csymbol>
            <ci>𝑡</ci>
            <cn type="integer">2</cn>
          </apply>
        </apply>
        <apply>
          <csymbol cd="latexml">differential-d</csymbol>
          <ci>𝑡</ci>
        </apply>
      </apply>
    </apply>
  </apply>
</math>



In [15]:
from bs4 import BeautifulSoup

def expand_self_closing_tags(soup):
    """
    Ensures all self-closing tags in the given BeautifulSoup object
    are explicitly converted into open/close tags.
    """
    for tag in soup.find_all():
        if not tag.contents and tag.name not in ["ci", "cn"]:  # Keep <ci> & <cn> self-closing
            tag.string = ""  # Add empty content to force open/close format

def convert_plain_text(soup):
    """
    Converts the content of all <ci> tags to plain text by normalizing their content.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object representing MathML.

    Returns:
        None: Modifies the soup object in place.
    """
    for ci_tag in soup.find_all("ci"):
        if ci_tag.string:
            # Normalize the text content (remove special formatting if any)
            plain_text = ci_tag.string.strip()
            ci_tag.string = plain_text  # Assign plain text back to the tag

def extract_core_mathml(mathml_str):
    """
    Extracts and retains only the <eq/> and the first <apply> from the Content MathML,
    ensuring all self-closing tags are expanded to full open/close tags.

    Args:
        mathml_str (str): The Content MathML input as a string.

    Returns:
        str: The modified Content MathML containing only the equation core.
    """
    # Parse the MathML string
    soup = BeautifulSoup(mathml_str, "html.parser")
    # Find the <eq/> tag
    eq_tag = soup.find("eq")
    
    if eq_tag:
        # Find the first <apply> tag after <eq/>
        first_apply = eq_tag.find_next_sibling("apply")

        # Ensure all self-closing tags are expanded
        expand_self_closing_tags(eq_tag)
        extracted_part = str(eq_tag)
        extracted_part = re.sub(r"<(\w+)([^>/]*)/>", r"<\1\2></\1>", extracted_part)  # Fix <eq/>

        if first_apply:
            expand_self_closing_tags(first_apply)
            extracted_part += "\n" + str(first_apply)

        return extracted_part.strip()  # Clean and return result

    return None  # Return None if <eq/> is not found

filtered_mathml = extract_core_mathml(mathml_result)
print(filtered_mathml)

<eq></eq>
<apply>
<times></times>
<ci>𝑆</ci>
<ci>𝑥</ci>
</apply>


In [16]:
import unicodedata
def normalize_ci_content(mathml_str):
    """
    Converts the content of all <ci> tags to plain text (non-italicized ASCII).

    Args:
        mathml_str (str): The Content MathML input as a string.

    Returns:
        str: The modified Content MathML with <ci> tag content normalized.
    """
    # Parse the MathML string
    soup = BeautifulSoup(mathml_str, "html.parser")

    # Process all <ci> tags
    for ci_tag in soup.find_all("ci"):
        if ci_tag.string:
            # Normalize Unicode to remove italics (convert to standard ASCII if possible)
            plain_text = unicodedata.normalize("NFKD", ci_tag.string)
            ci_tag.string = plain_text  # Replace the original content

    # Return the modified MathML as a string
    return str(soup)

filtered_mathml = normalize_ci_content(filtered_mathml)
print(filtered_mathml)

<eq></eq>
<apply>
<times></times>
<ci>S</ci>
<ci>x</ci>
</apply>


In [17]:
folder_names = ["wpmath00000"+ "0" + str(f_name) if f_name < 10 else "wpmath00000" + str(f_name) for f_name in range(2, 17)]
base_path = "/Users/atharvajain/Downloads/NTCIR-12_MathIR_Wikipedia_Corpus/MathTagArticles" # Change Base path based on the location of the directory
print(folder_names)

['wpmath0000002', 'wpmath0000003', 'wpmath0000004', 'wpmath0000005', 'wpmath0000006', 'wpmath0000007', 'wpmath0000008', 'wpmath0000009', 'wpmath0000010', 'wpmath0000011', 'wpmath0000012', 'wpmath0000013', 'wpmath0000014', 'wpmath0000015', 'wpmath0000016']


In [18]:
import os
from bs4 import BeautifulSoup

def corpus_find_articles(folder_names, base_path, content_mathml):
    """
    Search for HTML articles containing the given Content MathML expression.

    Args:
        folder_names (list): A list of folder names containing HTML articles.
        base_path (str): The base path where the folders are located.
        content_mathml (str): The Content MathML representation of the input equation.

    Returns:
        list: A list of article names where the given MathML expression is found.
    """

    # List to store articles that contain the Content MathML expression
    matching_articles = []

    # Parse the input Content MathML for comparison
    content_soup = BeautifulSoup(content_mathml, "html.parser")
    
    # Extract the core part of the input equation (everything under <apply>)
    eq_tag = content_soup.find("eq")
    first_apply = eq_tag.find_next_sibling("apply") if eq_tag else None

    if not first_apply:
        print("No <apply> structure found in input equation.")
        return matching_articles

    # Convert to string for comparison
    formatted_content_mathml = str(first_apply)

    # Process each folder
    for folder_name in folder_names:
        folder_path = os.path.join(base_path, folder_name)
        
        # Iterate over all files in the folder
        for article_name in os.listdir(folder_path):
            article_path = os.path.join(folder_path, article_name)
            
            # Skip non-HTML files
            if not article_path.endswith('.html'):
                continue
            
            try:
                # Parse the HTML file with BeautifulSoup
                with open(article_path, 'r', encoding='utf-8') as html_file:
                    soup = BeautifulSoup(html_file, 'html.parser')

                # Find all <eq> tags in the article
                for eq_tag in soup.find_all("eq"):
                    # Check if the next sibling is <apply>
                    apply_tag = eq_tag.find_next_sibling("apply")
                    
                    if apply_tag:
                        # Convert the found <apply> structure to string
                        article_mathml = str(apply_tag)

                        # Compare against the input Content MathML equation
                        if formatted_content_mathml in article_mathml:
                            matching_articles.append(article_name)
                            break  # Stop searching once a match is found
            
            except Exception as e:
                print(f"Error processing file {article_path}: {e}")
    
    return matching_articles

# Call the function
matching_articles = corpus_find_articles(folder_names, base_path, filtered_mathml)

# Print results
print("Articles containing the input Content MathML:")
print(matching_articles)


Articles containing the input Content MathML:
['Combinatorial_species.html', 'Fresnel_integral.html', 'Fredholm_operator.html', 'Classical_orthogonal_polynomials.html', 'Tensor_algebra.html', 'Independent_component_analysis.html', 'Exponential_sum.html', "Ruffini's_rule.html", 'Clenshaw_algorithm.html', 'Unbounded_operator.html', 'Lanczos_resampling.html', 'Banach_limit.html', 'Hidden_Field_Equations.html', 'Implementation_of_mathematics_in_set_theory.html', 'Force_of_mortality.html', 'Primitive_recursive_arithmetic.html', 'Cross-entropy_method.html', 'Sears–Haack_body.html', 'General_set_theory.html', 'Diffusionless_transformation.html', 'Wilson–Cowan_model.html', 'Differentiation_of_integrals.html', 'Group_Hopf_algebra.html', 'Uncertainty_theory.html', 'Method_of_steepest_descent.html', "De_Moivre's_law.html", 'Forney_algorithm.html', 'Two-body_Dirac_equations.html', 'Rebound_attack.html', 'G-measure.html']


In [10]:
import pickle

with open("matching_articles.pkl", "wb") as f:
    pickle.dump(matching_articles, f)

In [40]:
from bs4 import BeautifulSoup

def print_full_html(file_path):
    """
    Reads and prints the entire HTML content of the given file using BeautifulSoup.
    
    Args:
        file_path (str): The path to the HTML file.
    """
    try:
        # Open and read the file
        with open(file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # Parse with BeautifulSoup
        soup = BeautifulSoup(html_content, "html.parser")

        # Print the full formatted HTML
        return soup

    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

# Define file path
base_path = "/Users/atharvajain/Downloads/NTCIR-12_MathIR_Wikipedia_Corpus/MathTagArticles/wpmath0000002/Fresnel_integral.html"


In [52]:
def check_mathml_in_article(file_path, filtered_mathml):
    """
    Checks if the given filtered MathML exists in the HTML of an article.

    Args:
        file_path (str): The path to the HTML file.
        filtered_mathml (str): The Content MathML structure to search for.

    Returns:
        bool: True if found, False otherwise.
    """
    # Get the article's HTML as a BeautifulSoup object
    soup = print_full_html(file_path)
    if soup is None:
        return False  # Return False if there was an error reading the file

    # Parse the filtered MathML to match BeautifulSoup's formatting
    filtered_soup = BeautifulSoup(filtered_mathml, "html.parser")
    # print(filtered_soup)

    # Search for the MathML structure in the article
    found = soup.find_all("apply")  # Searching for all <apply> elements
    # print(found[4])
    # Convert found[4] and filtered_mathml to strings
    found_mathml_str = str(found[4]).strip()
    filtered_mathml_str = str(BeautifulSoup(filtered_mathml, "html.parser")).strip()
    print(found_mathml_str)

# Check if filtered_mathml_str exists inside found_mathml_str
    if filtered_mathml_str in found_mathml_str:
        print("✅ filtered_mathml is a substring inside found[4]!")
    else:
        print("❌ filtered_mathml is NOT found inside found[4].")

check_mathml_in_article(base_path, normalized_mathml)

<apply>
<csymbol cd="ambiguous">formulae-sequence</csymbol>
<apply>
<eq></eq>
<apply>
<times></times>
<ci>S</ci>
<ci>x</ci>
</apply>
<apply>
<apply>
<csymbol cd="ambiguous">superscript</csymbol>
<apply>
<csymbol cd="ambiguous">subscript</csymbol>
<int></int>
<cn type="integer">0</cn>
</apply>
<ci>x</ci>
</apply>
<apply>
<times></times>
<apply>
<sin></sin>
<apply>
<csymbol cd="ambiguous">superscript</csymbol>
<ci>t</ci>
<cn type="integer">2</cn>
</apply>
</apply>
<ci>normal-d</ci>
<ci>t</ci>
</apply>
</apply>
</apply>
<apply>
<eq></eq>
<apply>
<times></times>
<ci>C</ci>
<ci>x</ci>
</apply>
<apply>
<apply>
<csymbol cd="ambiguous">superscript</csymbol>
<apply>
<csymbol cd="ambiguous">subscript</csymbol>
<int></int>
<cn type="integer">0</cn>
</apply>
<ci>x</ci>
</apply>
<apply>
<times></times>
<apply>
<cos></cos>
<apply>
<csymbol cd="ambiguous">superscript</csymbol>
<ci>t</ci>
<cn type="integer">2</cn>
</apply>
</apply>
<ci>normal-d</ci>
<ci>t</ci>
</apply>
</apply>
</apply>
</apply>
✅ fil