In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import nest_asyncio


# Put the parent of the current working directory first on the import path
# (presumably so the `math_rag` package imported below resolves — confirm layout).
sys.path.insert(0, os.path.abspath('..'))
# Patch asyncio so an already-running event loop can be re-entered; the notebook
# relies on top-level `await` in later cells.
nest_asyncio.apply()

In [2]:
import logging


# Configure root logging for the notebook: INFO and above, each record rendered
# as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s'
)

In [3]:
from math_rag.infrastructure.containers import InfrastructureContainer


# Build the container and initialise its resources before resolving anything
# from it; every object below is obtained from this single container.
infrastructure_container = InfrastructureContainer()
infrastructure_container.init_resources()

# Seeders are resolved first and run immediately, before the repositories are used.
math_article_seeder = infrastructure_container.math_article_seeder()
math_expression_seeder = infrastructure_container.math_expression_seeder()
math_expression_prediction_seeder = (
    infrastructure_container.math_expression_prediction_seeder()
)
# NOTE(review): this `seed()` is called without `await`, unlike the two seeders
# below — confirm it is synchronous, otherwise the coroutine is never run.
math_article_seeder.seed()
await math_expression_seeder.seed()
await math_expression_prediction_seeder.seed()

# Repositories used by the later cells.
math_article_repository = infrastructure_container.math_article_repository()
math_expression_repository = infrastructure_container.math_expression_repository()
math_expression_prediction_repository = (
    infrastructure_container.math_expression_prediction_repository()
)
google_file_repository = infrastructure_container.google_file_repository()

# Assistants and services used by the later cells.
katex_correction_assistant = infrastructure_container.katex_correction_assistant()
katex_validator_service = infrastructure_container.katex_validator_service()
latex_parser_service = infrastructure_container.latex_parser_service()
latex_reader_service = infrastructure_container.latex_reader_service()
latex_visitor_service = infrastructure_container.latex_visitor_service()
arxiv_searcher_service = infrastructure_container.arxiv_searcher_service()

2025-03-05 11:37:29,481 - INFO - file_cache is only supported with oauth2client<4.0.0


In [None]:
from IPython.display import Math, display
from pylatexenc.latexwalker import LatexMathNode

### Download

In [9]:
from math_rag.infrastructure.services.arxiv import MathCategory
from math_rag.infrastructure.utils import GzipExtractorUtil


results = [
    result
    for category in MathCategory
    for result in arxiv_searcher_service.search(category, 4)
]
files: dict[str, bytes] = {}

for result in results:
    # NOTE: we dont need pdfs at the moment
    # pdf = await arxiv_searcher_service.get_pdf(result.entry_id)
    src = await arxiv_searcher_service.get_src(result.entry_id)

    if src is None:
        continue

    src_name, src_bytes = src

    if src_name.endswith('.tar.gz'):
        extracted_files = GzipExtractorUtil.extract_tar_gz(src_bytes)
        files.update(extracted_files)

    elif src_name.endswith('.gz'):
        extracted_bytes = GzipExtractorUtil.extract_gz(src_bytes)
        extracted_files[src_name] = extracted_bytes

    else:
        raise ValueError(f'Unexpected file extension {src_name}')

2025-03-05 11:43:30,318 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Amath.AC&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100


2025-03-05 11:43:31,467 - INFO - Got first page: 100 of 13499 total results
2025-03-05 11:43:31,468 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Amath.AG&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
2025-03-05 11:43:32,567 - INFO - Got first page: 100 of 54794 total results
2025-03-05 11:43:32,567 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Amath.AP&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
2025-03-05 11:43:33,607 - INFO - Got first page: 100 of 66105 total results
2025-03-05 11:43:33,610 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Amath.AT&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
2025-03-05 11:43:34,760 - INFO - Got first page: 100 of 16361 total results
2025-03-05 11:43:34,761 - INFO - Requesting page (first: True, try: 0

### Load

In [None]:
# NOTE: deprecated
# from zipfile import ZipFile
# folder_name, name = 'articles', 'articles_v1.zip'

# file_id = google_file_repository.get_file_id(name, folder_name)
# assert file_id is not None

# file_bytes = google_file_repository.get_file_by_id(file_id)

# with ZipFile(file_bytes, 'r') as zip_file:
#     files = {
#         name: zip_file.read(name)
#         for name in zip_file.namelist()
#         if not name.endswith('/')
#     }

In [None]:
from math_rag.core.models import MathArticle


# Wrap every downloaded (name, content) pair in a MathArticle and persist the batch.
math_articles = [
    MathArticle(name=article_name, bytes=article_bytes)
    for article_name, article_bytes in files.items()
]
math_article_repository.insert_math_articles(math_articles)

In [None]:
# Print the name of every stored article, one per line, as a quick sanity check.
for name in math_article_repository.list_math_article_names():
    print(name)

articles_v1/ac/2502.13273v1/2502.13273v1.pdf
articles_v1/ac/2502.13273v1/AlgClos14.tex
articles_v1/ac/2502.13276v1/2502.13276v1.pdf
articles_v1/ac/2502.13276v1/Main.tex
articles_v1/ac/2502.13806v1/2502.13806v1.pdf
articles_v1/ac/2502.13806v1/arXiv.bbl
articles_v1/ac/2502.13806v1/arXiv.tex
articles_v1/ac/2502.13903v1/2502.13903v1.pdf
articles_v1/ac/2502.13903v1/dpsl2_20250219.tex
articles_v1/ag/2502.13800v1/2502.13800v1.pdf
articles_v1/ag/2502.13800v1/neron.tex
articles_v1/ag/2502.13815v1/2502.13815v1.pdf
articles_v1/ag/2502.13815v1/Z3-v3.tex
articles_v1/ag/2502.13882v1/2502.13882v1.pdf
articles_v1/ag/2502.13882v1/main.tex
articles_v1/ag/2502.13892v1/2502.13892v1.pdf
articles_v1/ag/2502.13892v1/_commands.tex
articles_v1/ag/2502.13892v1/_preamble.tex
articles_v1/ag/2502.13892v1/abstract.tex
articles_v1/ag/2502.13892v1/appl.tex
articles_v1/ag/2502.13892v1/cl.tex
articles_v1/ag/2502.13892v1/faces.tex
articles_v1/ag/2502.13892v1/filt.tex
articles_v1/ag/2502.13892v1/intro.tex
articles_v1/ag/

### Parse

In [None]:
# Only LaTeX sources are parsed; every other stored file is ignored.
file_names = [
    x for x in math_article_repository.list_math_article_names() if x.endswith('.tex')
]

# Maps each .tex file name to the math nodes collected from it, in the order
# the visitor reports them.
math_nodes_by_file_name: dict[str, list[LatexMathNode]] = {}

for name in file_names:
    math_article = math_article_repository.get_math_article_by_name(name)
    latex = latex_reader_service.read(math_article.bytes)
    nodes = latex_parser_service.parse(latex)

    # Bind the callback to this file's list directly. The previous lambda looked
    # up `name` when called, so it depended on the loop variable not having
    # moved on by the time the visitor invoked it.
    collected_math_nodes: list[LatexMathNode] = []
    math_nodes_by_file_name[name] = collected_math_nodes
    callbacks = {LatexMathNode: collected_math_nodes.append}

    latex_visitor_service.visit(nodes, callbacks)

In [None]:
import re

from math_rag.core.enums import MathCategory
from math_rag.core.models import MathExpression


math_expressions: list[MathExpression] = []

for file_name, math_nodes in math_nodes_by_file_name.items():
    # category_slug = re.search(r'articles_v\d+/([^/]+)/', file_name).group(1)

    for math_node in math_nodes:
        # category = MathCategory.from_str(category_slug)
        latex: str = math_node.latex_verbatim()
        katex = latex.strip('$')
        validation_result = await katex_validator_service.validate(katex)

        if not validation_result.valid:
            katex = await katex_correction_assistant.correct(katex)

        math_expression = MathExpression(
            latex=latex,
            katex=katex,
            position=math_node.pos,
            is_inline=math_node.displaytype == 'inline',
            # math_category=category,
        )
        math_expressions.append(math_expression)

In [16]:
# Persist all expressions built above in a single repository call.
await math_expression_repository.insert_math_expressions(math_expressions)

### Display

In [10]:
import re


# Control sequences that are rewritten before display, mapped to replacements.
# Keys are complete tokens: a backslash followed by a full run of letters, a
# backslash followed by one non-letter character, or the literal `\mathbbm{1}`.
_LATEX_TOKEN_FIXES: dict[str, str] = {
    '\\[': '$$',
    '\\]': '$$',
    '\\EE': '\\mathbb{E}',
    '\\II': '\\mathbb{I}',
    '\\Var': '\\mathrm{Var}',
    '\\HH': '\\mathbb{H}',
    '\\AND': '\\wedge',
    '\\OR': '\\vee',
    '\\mathbbm{1}': '\\mathbf{1}',
    '\\Maj': '\\mathrm{Maj}',
    '\\sgn': '\\operatorname{sgn}',
    '\\Tribus': '\\mathrm{Tribus}',
    '\\linebreak': '\\text{ }',
    '\\Prob': '\\mathbb{P}',
    '\\WW': '\\mathcal{W}',
}

# Scans left to right, consuming one control sequence at a time. Because `\\`
# is consumed as its own token, the `[` in `\\[2pt]` is never mistaken for `\[`.
_LATEX_TOKEN_PATTERN = re.compile(r'\\mathbbm\{1\}|\\(?:[A-Za-z]+|.)', re.DOTALL)


def fix_latex(latex_str: str) -> str:
    """Rewrite a fixed set of LaTeX control sequences for display.

    `\\[` and `\\]` become `$$`, and the macros listed in
    `_LATEX_TOKEN_FIXES` are replaced by their mapped equivalents. Only whole
    control sequences are replaced, so a longer macro that merely starts with
    a listed name (e.g. `\\Variance`) is left untouched.

    Args:
        latex_str: LaTeX source of a math expression.

    Returns:
        The source with the listed control sequences replaced; all other text
        is returned unchanged.
    """
    return _LATEX_TOKEN_PATTERN.sub(
        # Unknown control sequences map to themselves.
        lambda match: _LATEX_TOKEN_FIXES.get(match.group(0), match.group(0)),
        latex_str,
    )

In [None]:
# Render up to the first 100 math nodes with IPython's `Math`, after passing
# their verbatim LaTeX through `fix_latex`.
# NOTE(review): `math_nodes` is not defined in this cell; it is whatever the
# loop over `math_nodes_by_file_name.items()` last bound, i.e. only the nodes
# of the final file processed there.
for i, latex_math_node in enumerate(math_nodes[:100]):
    latex = latex_math_node.latex_verbatim()
    latex_fixed = fix_latex(latex)
    math_display_object = Math(latex_fixed)

    # print(i)
    # print(math_display_object._repr_latex_())

    display(math_display_object)

### Classify

In [None]:
from decouple import config

from math_rag.infrastructure.inference import LLM


# Read the OpenAI endpoint and credentials through `decouple.config` so they
# are not hardcoded in the notebook.
OPENAI_BASE_URL = config('OPENAI_BASE_URL')
OPENAI_API_KEY = config('OPENAI_API_KEY')

In [14]:
# Project LLM wrapper; later cells reach the underlying client via `llm.client`.
llm = LLM(model='gpt-4o', base_url=OPENAI_BASE_URL, api_key=OPENAI_API_KEY)

In [5]:
# Fixed sample expression used to try out the classification prompt.
# math_expr = math_nodes[13].latex_verbatim()  # 13
math_expr = '$\\mu(x)=\\frac{1}{2^n}$'

In [None]:
# Closed-set variant of the classification prompt: the model must answer with
# one of the four listed classes for the sample `math_expr`.
prompt = f"""
You are a mathematical expression classifier.
Given a mathematical expression, classify it in one of 4 given classes:
- constant
- variable
- formula
- other

Return a class only!

Mathematical expression:
{math_expr}

Class:
"""

In [7]:
from openai import NOT_GIVEN
from openai.types.chat import ChatCompletion


def get_prompt(math_expr: str) -> str:
    """Build the open-ended classification prompt for one expression.

    The prompt asks for a single-word class describing the STRUCTURE of the
    expression and ends with ``Class:`` followed by a newline.

    Args:
        math_expr: The expression text to embed in the prompt.

    Returns:
        The prompt text, starting and ending with a newline.
    """
    prompt_lines = (
        '',
        'You are a mathematical expression classifier.',
        'Given a mathematical expression, classify it in a single class regarding STRUCTURE of the expression.',
        'Class must be a single word.',
        '',
        'Return a class only!',
        '',
        'Mathematical expression:',
        f'{math_expr}',
        '',
        'Class:',
        '',
    )

    return '\n'.join(prompt_lines)


# Notebook-level switch read by `get_completion` on every call: when truthy the
# request asks for a JSON-object response format.
use_json = False


async def get_completion(prompt: str) -> ChatCompletion:
    """Request one chat completion for ``prompt`` with token log-probabilities.

    Sends ``prompt`` as a single user message to ``gpt-4o`` through
    ``llm.client`` at temperature 0.0, asking for log-probabilities and the
    top 5 alternatives per token.

    Args:
        prompt: Full prompt text sent as the user message.

    Returns:
        The completion object returned by the client.
    """
    response_format = NOT_GIVEN
    if use_json:
        response_format = {'type': 'json_object'}

    request_messages = [{'role': 'user', 'content': prompt}]

    return await llm.client.chat.completions.create(
        model='gpt-4o',
        messages=request_messages,
        response_format=response_format,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )

In [8]:
# Fetch the expressions that the classification loop iterates over.
# NOTE(review): the meaning of the argument 32 is not visible here (a limit? a
# category id?) — confirm against the repository and name it as a constant.
math_expressions_by_category = (
    await math_expression_repository.get_math_expressions_by_category(32)
)

In [None]:
from math_rag.core.models import MathExpressionPrediction


completions: list[ChatCompletion] = []
predictions: list[MathExpressionPrediction] = []

from time import sleep


for math_expression in math_expressions_by_category:
    prompt = get_prompt(math_expression.latex)
    completion = await get_completion(prompt)

    prediction = MathExpressionPrediction(
        math_expression_id=math_expression.id,
        value=completion.choices[0].message.content,
    )
    completions.append(completion)
    predictions.append(prediction)

    # await math_expression_prediction_repository.insert_math_expression_predictions([prediction])
    sleep(1)

In [None]:
# Persist every prediction collected by the classification loop in one call.
await math_expression_prediction_repository.insert_math_expression_predictions(
    predictions
)

In [15]:
import json


# Dump the raw completions, converted with `to_dict()`, to a JSON file.
# NOTE(review): `open` does not create directories, so `../tmp` must already exist.
with open('../tmp/completions.json', 'w') as json_file:
    json.dump([x.to_dict() for x in completions], json_file, indent=4)

In [None]:
from uuid import UUID


class UUIDEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``UUID`` values as their string form."""

    def default(self, obj):
        """Return ``str(obj)`` for UUIDs and defer everything else to the base class."""
        if not isinstance(obj, UUID):
            # The base implementation raises TypeError for unsupported types.
            return super().default(obj)

        return str(obj)


# Dump the predictions (via `model_dump()`) to a JSON file; `UUIDEncoder`
# handles UUID values, which the default encoder cannot serialize.
# NOTE(review): `open` does not create directories, so `../tmp` must already exist.
with open('../tmp/predictions.json', 'w') as json_file:
    json.dump(
        [x.model_dump() for x in predictions], json_file, indent=4, cls=UUIDEncoder
    )

In [None]:
# One-off completion for manual inspection.
# NOTE(review): `prompt` is whatever was last assigned in the kernel — either
# the standalone prompt cell or the last iteration of the classification loop.
completion = await get_completion(prompt)

# Last expression of the cell: shows the text of the first choice.
completion.choices[0].message.content

'Function'

In [57]:
import numpy as np


# For each generated token, print its top alternatives with their
# probabilities (exp of the log-probability), then the chosen token, its
# log-probability and its probability.
for x in completion.choices[0].logprobs.content:
    for y in x.top_logprobs:
        print(f'"{y.token}": {np.exp(y.logprob)}')

    print('------')
    print(x.token)
    print(x.logprob)
    print(np.exp(x.logprob))

"formula": 0.9999996871837189
" formula": 9.931194312156244e-08
"Formula": 7.734421907141565e-08
"_formula": 9.237449661970594e-09
"form": 2.061153622438558e-09
------
formula
-3.1281633e-07
0.9999996871837189


In [None]:
# TODO
# - description for each class
# - how to determine classes?
# - do names need to take a single token?

### Analyze

In [5]:
from enum import Enum


class MathExpressionCategory(str, Enum):
    """Candidate category labels for math expressions.

    Members also subclass ``str``, so each compares equal to its lowercase
    string value.
    """

    EQUALITY = 'equality'
    INEQUALITY = 'inequality'
    CONSTANT = 'constant'
    VARIABLE = 'variable'
    OTHER = 'other'

In [6]:
# Load stored predictions for analysis; this rebinds `predictions` from the
# classification section.
# NOTE(review): 1024 is presumably a maximum number of records — confirm.
predictions = (
    await math_expression_prediction_repository.get_math_expression_predictions(1024)
)

In [11]:
# Look up the expression behind each prediction, one awaited repository call
# per prediction, keeping the same order as `predictions`.
expressions = [
    await math_expression_repository.get_math_expression_by_id(
        prediction.math_expression_id
    )
    for prediction in predictions
]

In [None]:
# Show each prediction next to its expression: predicted value, raw LaTeX,
# rendered math, and the KaTeX validation outcome.
for expression, prediction in zip(expressions, predictions):
    try:
        # Validation runs on the raw LaTeX with surrounding `$` removed, not on
        # a stored `katex` field.
        validation_result = await katex_validator_service.validate(
            expression.latex.strip('$')
        )
        math_display_object = Math(expression.latex)

        print(prediction.value)
        print(expression.latex)
        display(math_display_object)
        print(validation_result.valid, validation_result.error)
        print('-----')

    # Best-effort review loop: any failure is reported and the pair is skipped.
    except Exception as e:
        print(f'skipping {expression.id}')
        print(e)

### Correct

In [37]:
# Hand-picked failing example: `\w` is an undefined control sequence, and
# `error` is the matching KaTeX parser message.
incorrect_katex = r'd\omega = \theta \w \omega'
error = r'KaTeX parse error: Undefined control sequence: \w at position 18: …omega = \theta \̲w̲ ̲\omega'

In [38]:
# Ask the correction assistant to repair the expression given the parser
# error, then show the result both as text and rendered.
corrected_katex = await katex_correction_assistant.correct(incorrect_katex, error)
print(corrected_katex)
display(Math(corrected_katex))

2025-03-03 19:50:53,083 - INFO - HTTP Request: POST http://localhost:3000/validate "HTTP/1.1 200 OK"
2025-03-03 19:50:54,043 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-03 19:50:54,053 - INFO - HTTP Request: POST http://localhost:3000/validate "HTTP/1.1 200 OK"


d\omega = \theta \omega


<IPython.core.display.Math object>

### New

In [None]:
# Load every stored expression as raw documents by querying the repository's
# underlying `collection` directly rather than a repository method.
all_expressions = await math_expression_repository.collection.find().to_list()

In [66]:
# Tallies: expressions mentioning tikz, and non-tikz expressions longer than
# 2000 characters (the latter are also printed for inspection).
tikz_count = 0
over_2k_count = 0

for expression in all_expressions:
    latex = expression['latex']
    latex_length = len(latex)

    if 'tikz' in latex:
        # tikz content is only counted, never printed.
        tikz_count += 1
    elif latex_length > 2000:  # optionally also: latex_length < 3000
        over_2k_count += 1
        print(latex_length)
        print(latex)
        print('----- -----')

5062
$ is regular;
        {\it equivariant completion} may be a better name.
        Their existence is stated as
        conjecture   GCNM   on \cite[p.318]{MR715605}, and attributed to Mumford. We prove  it  in Section~\ref{eq.comp.sec} in characteristic 0. 
        
      

\begin{cor} \label{main.min.ner.thm.cor.2}
  If $S$ is   over a field of characteristic 0, then
   \begin{enumerate}
   \item there is a minimal model $A^*_S$ on which $\ner(A_K)$ acts regularly, and 
   \item $\ner(A_K)\to S$ has an equivariant completion.
      \end{enumerate}
\end{cor}


The papers 
\cite{mitsui2024relativecompactificationssemiabelianneron, nakamura2024relativecompactificationsemiabelianneron}
construct an equivariant completion of the N\'eron model 
for  the semiabelian reduction cases, but  do not check that its singularities are terminal. Using  minimal model  theory, we get the following.

        \begin{cor} \label{main.min.ner.thm.cor.3}
          Assume that all  fibers of  $\ner(A_K)\

In [67]:
# Display both tallies from the scan above as a tuple.
tikz_count, over_2k_count

(129, 20)