In [9]:
import pandas as pd
import re
from collections import Counter
import numpy as np

# Load the generated equations
df = pd.read_csv("equations.csv")

# Fail fast if a column that the later cells index by name is absent,
# instead of surfacing as a KeyError further down the notebook.
REQUIRED_COLUMNS = {"domain", "topic", "difficulty", "latex", "wolfram"}
missing_columns = REQUIRED_COLUMNS - set(df.columns)
assert not missing_columns, f"equations.csv is missing columns: {sorted(missing_columns)}"

# Emit the row count so the recorded cell output is reproduced on a fresh run.
print(f"Loaded {len(df)} equations")

Loaded 180 equations


# Utilities

In [3]:
# A TeX group "{...}"; tolerates up to two further levels of nested braces,
# enough for bounds such as {\frac{\pi}{2}} or {\frac{\sqrt{2}}{2}}.
_LATEX_GROUP = r'\{(?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*\}'

# One sub/superscript argument as TeX reads it: a braced group, a control
# word (\pi, \infty, ...), or exactly one other character.
_LATEX_SCRIPT_ARG = r'(?:' + _LATEX_GROUP + r'|\\[a-zA-Z]+|[^\s\\{}_^])'

# \int (optionally followed by \limits) carrying both bounds, in either order.
_LATEX_INT_BOUNDS = re.compile(
    r'\\int\s*(?:\\limits\s*)?(?:'
    r'_\s*' + _LATEX_SCRIPT_ARG + r'\s*\^\s*' + _LATEX_SCRIPT_ARG
    + r'|\^\s*' + _LATEX_SCRIPT_ARG + r'\s*_\s*' + _LATEX_SCRIPT_ARG
    + r')'
)


def normalize_latex(latex: str) -> str:
    """
    Normalize a LaTeX expression so that equivalent spellings compare equal.

    Steps, in order:
    - Remove ``$`` / ``$$`` math delimiters
    - Make definite integrals indefinite (drop ``_lower^upper`` after ``\\int``)
    - Remove control spaces (``\\ ``), all whitespace, and ``\\, \\; \\: \\!``
    - Remove ``\\left`` / ``\\right`` sizing commands
    - Map ``\\dfrac`` / ``\\tfrac`` to ``\\frac``
    - Remove ``\\displaystyle`` / ``\\textstyle``
    - Remove instruction prefixes (``Compute:``, ``Evaluate:``, ...)
    - Lowercase

    Args:
        latex: Raw LaTeX string; may be ``None``, empty, or NaN.

    Returns:
        The normalized string, or ``""`` when the input is falsy or NaN.
    """
    if not latex or pd.isna(latex):
        return ""

    s = str(latex)

    # Remove display/inline math delimiters
    s = s.replace("$$", "").replace("$", "")

    # Strip integral bounds BEFORE whitespace is collapsed: whitespace is what
    # terminates a control-word bound (e.g. "\int_0^\pi x"), and an unbraced
    # bound is a single token, so the integrand must never be consumed.
    s = _LATEX_INT_BOUNDS.sub(r'\\int', s)

    # Remove control spaces ("\ " = backslash + whitespace) while the whitespace
    # is still present; otherwise "\ dx" would collapse to a bogus "\dx".
    # The lookbehind leaves the line break "\\" followed by a space intact.
    s = re.sub(r'(?<!\\)\\\s', '', s)

    # Remove all whitespace (spaces, newlines, tabs)
    s = re.sub(r'\s+', '', s)

    # Remove \, \; \: \! (LaTeX spacing commands)
    s = re.sub(r'\\[,;:!]', '', s)

    # Remove \left and \right (delimiter sizing). They are always followed by a
    # delimiter, never a letter, so the lookahead protects \leftarrow,
    # \rightarrow, \leftrightarrow and similar commands.
    s = re.sub(r'\\(?:left|right)(?![a-zA-Z])', '', s)

    # Standardize fraction notation: \dfrac → \frac, \tfrac → \frac
    s = s.replace(r'\dfrac', r'\frac').replace(r'\tfrac', r'\frac')

    # Remove \displaystyle, \textstyle
    s = s.replace(r'\displaystyle', '').replace(r'\textstyle', '')

    # Remove instruction prefixes (Compute:, Evaluate:, etc.); matching is
    # case-sensitive, so this runs before lowercasing.
    for prefix in ['Compute:', 'Evaluate:', 'Calculate:', 'Find:', 'Solve:', 'Simplify:']:
        s = s.replace(prefix, '')

    # Lowercase for case-insensitive comparison
    s = s.lower()

    return s

In [4]:
def normalize_wolfram(wolfram: str) -> str:
    """
    Normalize a Wolfram query so that equivalent spellings compare equal.

    Steps, in order:
    - Lowercase
    - Remove ``from <lower> to <upper>`` bounds (definite → indefinite)
    - Remove all whitespace
    - Collapse ``{x, a, b}`` range lists to the bare variable ``x``

    Args:
        wolfram: Raw query string; may be ``None``, empty, or NaN.

    Returns:
        The normalized string, or ``""`` when the input is falsy or NaN.
    """
    if not wolfram or pd.isna(wolfram):
        return ""

    s = str(wolfram).lower()

    # Remove "from A to B" while the whitespace still delimits each bound.
    # This keeps bounds such as "pi/2", "sqrt(2)" or "x=0" in one piece and
    # prevents text after the upper bound from being swallowed.
    s = re.sub(r'\bfrom\s+\S+\s+to\s+\S+', '', s)

    # Remove all whitespace
    s = re.sub(r'\s+', '', s)

    # Fallback for queries that arrived without spaces (e.g. "...from0to1")
    s = re.sub(r'from[\d\w.+-]+to[\d\w.+-]+', '', s)

    # Collapse a range list {x,a,b} to x.
    # NOTE(review): any brace list whose first element is a single letter and
    # that has at least three elements matches here, which would include a
    # symbolic matrix row such as {a,1,0} — confirm this is acceptable.
    s = re.sub(r'\{([a-z]),([^,}]+),([^}]+)\}', r'\1', s)

    return s

# Evaluation

In [10]:
# Attach the normalized form of each representation to every row
df['latex_normalized'] = df['latex'].apply(normalize_latex)
df['wolfram_normalized'] = df['wolfram'].apply(normalize_wolfram)

# Collect one summary record per (domain, topic, difficulty) group.
# Note: nunique() counts an empty normalized string as a distinct value.
results = []

for group_key, group in df.groupby(["domain", "topic", "difficulty"]):
    domain, topic, difficulty = group_key
    total = len(group)

    # Number of distinct normalized strings per representation
    unique_latex = group['latex_normalized'].nunique()
    unique_wolfram = group['wolfram_normalized'].nunique()

    # Share of distinct strings in the group; 0 for an empty group
    if total > 0:
        latex_ratio = unique_latex / total
        wolfram_ratio = unique_wolfram / total
    else:
        latex_ratio = 0
        wolfram_ratio = 0

    record = dict(
        domain=domain,
        topic=topic,
        difficulty=difficulty,
        total=total,
        unique_latex=unique_latex,
        unique_wolfram=unique_wolfram,
        latex_ratio=latex_ratio,
        wolfram_ratio=wolfram_ratio,
    )
    results.append(record)

summary_df = pd.DataFrame(results)

In [11]:
# Groups to compare; the 1-based position in this list is the group's label
# in the overlap matrix.
GROUP_ORDER = [
    ("calculus", "Integration by Parts", "easy"),
    ("calculus", "Integration by Parts", "medium"),
    ("calculus", "Integration by Parts", "hard"),
    ("calculus", "Integration with Substitution", "hard"),
    ("calculus", "Volume Calculation", "hard"),
    ("linear_algebra", "Eigenvectors and Eigenvalues", "medium"),
]

# Label -> set of non-empty normalized LaTeX strings belonging to that group
group_sets = {}
for idx, group_key in enumerate(GROUP_ORDER, start=1):
    domain, topic, difficulty = group_key
    in_group = (
        (df['domain'] == domain)
        & (df['topic'] == topic)
        & (df['difficulty'] == difficulty)
    )
    group_sets[idx] = set(df.loc[in_group, 'latex_normalized']).difference({""})

# Pairwise overlap counts; the diagonal is a "---" placeholder
n = len(GROUP_ORDER)
labels = range(1, n + 1)
overlap_matrix = pd.DataFrame(
    [
        [
            "---" if row == col else len(group_sets[row] & group_sets[col])
            for col in labels
        ]
        for row in labels
    ],
    index=labels,
    columns=labels,
)

print(overlap_matrix)

     1    2    3    4    5    6
1  ---    1    0    0    0    0
2    1  ---    0    0    0    0
3    0    0  ---    0    0    0
4    0    0    0  ---    0    0
5    0    0    0    0  ---    0
6    0    0    0    0    0  ---


In [15]:
# Show every row of the overlap matrix; the row count follows GROUP_ORDER
# rather than a hard-coded 6, so the view stays complete if groups are added.
overlap_matrix.head(len(GROUP_ORDER))

Unnamed: 0,1,2,3,4,5,6
1,---,1,0,0,0,0
2,1,---,0,0,0,0
3,0,0,---,0,0,0
4,0,0,0,---,0,0
5,0,0,0,0,---,0
6,0,0,0,0,0,---
