In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import nest_asyncio


# Put the parent directory (resolved to an absolute path) at the front of the
# import path so the `math_rag` imports in later cells can resolve from there.
sys.path.insert(0, os.path.abspath('..'))
# NOTE(review): presumably needed so the top-level `await` / async client calls
# further down can run inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

In [None]:
import logging


# Root logger: INFO and above, formatted as "<timestamp> - <level> - <message>".
# The Drive helpers below report "not found" conditions through `logging.info`.
logging.basicConfig(
    level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s'
)

In [None]:
from decouple import config

from math_rag.repositories.embeddings import QdrantEmbeddingRepository
from math_rag.repositories.files import MinioFileRepository
from math_rag.repositories.graphs import Neo4jGraphRepository
from math_rag.repositories.objects import MongoObjectRepository


# Connection settings, resolved by name through `decouple.config`.
# DEPLOYMENT is passed to the Mongo repository together with MONGO_HOST below.
DEPLOYMENT = config('DEPLOYMENT')

# One group of settings per backing store (MinIO, MongoDB, Neo4j, Qdrant).
MINIO_ENDPOINT = config('MINIO_ENDPOINT')
MINIO_ACCESS_KEY = config('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = config('MINIO_SECRET_KEY')
MONGO_HOST = config('MONGO_HOST')
NEO4J_URI = config('NEO4J_URI')
NEO4J_USERNAME = config('NEO4J_USERNAME')
NEO4J_PASSWORD = config('NEO4J_PASSWORD')
QDRANT_URL = config('QDRANT_URL')

In [None]:
# Repository handles used by later cells; only `file_repo` is used in the
# visible part of this notebook (bucket creation and file upload).
file_repo = MinioFileRepository(MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY)
embedding_repo = QdrantEmbeddingRepository(QDRANT_URL)
graph_repo = Neo4jGraphRepository(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)
object_repo = MongoObjectRepository(MONGO_HOST, DEPLOYMENT)

In [4]:
import gzip
import shutil
import tarfile

from pathlib import Path

from pylatexenc.latexwalker import (
    LatexCharsNode,
    LatexCommentNode,
    LatexEnvironmentNode,
    LatexGroupNode,
    LatexMacroNode,
    LatexMathNode,
    LatexNode,
    LatexSpecialsNode,
    LatexWalker,
)


# Root folder of the downloaded articles, relative to the notebook's working
# directory; the extract/clean helpers and the `*.tex` scan all start here.
ARTICLES_PATH = '../tmp/articles'

In [None]:
from io import BytesIO
from pathlib import Path

from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import Resource, build
from googleapiclient.http import MediaIoBaseDownload

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials


# OAuth scopes requested for the Drive client built in `authenticate`.
# NOTE(review): this notebook only lists and downloads files; a narrower
# read-only scope may be sufficient — confirm before changing, since a cached
# token.json is tied to the scopes it was issued for.
SCOPES = ['https://www.googleapis.com/auth/drive']


def authenticate() -> Resource:
    """Build a Google Drive v3 client, reusing cached OAuth credentials when possible.

    Credentials are loaded from ``../google/token.json`` if that file exists.
    If they are missing or not valid they are either refreshed (when expired
    and a refresh token is present) or obtained through the installed-app
    flow configured by ``../google/credentials.json``; in both cases the
    resulting credentials are written back to the token file.

    Returns:
        The Drive service object created by ``build('drive', 'v3', ...)``.
    """
    token_path = Path('../google/token.json')

    credentials: Credentials | None = (
        Credentials.from_authorized_user_file(str(token_path), SCOPES)
        if token_path.exists()
        else None
    )

    # Fast path: the cached credentials are still usable, nothing to persist.
    if credentials and credentials.valid:
        return build('drive', 'v3', credentials=credentials)

    if credentials and credentials.expired and credentials.refresh_token:
        credentials.refresh(Request())
    else:
        # Interactive consent; port=0 is passed straight to run_local_server.
        credentials = InstalledAppFlow.from_client_secrets_file(
            '../google/credentials.json', SCOPES
        ).run_local_server(port=0)

    # Persist the refreshed / newly granted credentials for the next run.
    token_path.write_text(credentials.to_json())

    return build('drive', 'v3', credentials=credentials)


def get_file_id(service: Resource, file_name: str, dir_name: str) -> str | None:
    dir_query = f"name='{dir_name}' and mimeType='application/vnd.google-apps.folder'"
    dir_fields = 'files(id, name)'
    dir_results: dict = (
        service.files()
        .list(
            q=dir_query,
            fields=dir_fields,
        )
        .execute()
    )
    dirs = dir_results.get('files', [])

    if not dirs:
        logging.info(f'Directory {dir_name} not found.')
        return None

    dir_id = dirs[0]['id']

    file_query = f"name='{file_name}' and '{dir_id}' in parents"
    file_fields = 'files(id, name)'
    file_results = service.files().list(q=file_query, fields=file_fields).execute()
    files = file_results.get('files', [])

    if not files:
        logging.info(f'File {file_name} not found in directory {dir_name}.')
        return None

    return files[0]['id']


def download_file(service: Resource, file_id: str) -> BytesIO:
    """Download a Drive file's content into an in-memory buffer.

    Args:
        service: Drive service object (as returned by ``authenticate``).
        file_id: Id of the file to fetch via ``files().get_media``.

    Returns:
        A ``BytesIO`` holding the downloaded bytes, rewound to position 0 so
        callers can ``read()`` it directly.
    """
    request = service.files().get_media(fileId=file_id)
    file_bytes = BytesIO()

    downloader = MediaIoBaseDownload(file_bytes, request)
    done = False

    # next_chunk() returns (status, done); keep pulling chunks until done.
    while not done:
        _, done = downloader.next_chunk()

    # The downloader leaves the cursor at the end of the written data; without
    # rewinding, a plain read() on the returned buffer would yield no bytes.
    file_bytes.seek(0)

    return file_bytes

In [5]:
# Drive client shared by the lookup/download cells below. When no usable cached
# token exists this call starts the interactive OAuth flow.
service = authenticate()

/home/lukap/math-rag/notebooks
Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=740655027153-0682f6qfp13qk1oh917pno162crrseaq.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A52179%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=nz6VROXkZvZlsdbfluZ2OWnCz8Mbmb&access_type=offline


gio: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=740655027153-0682f6qfp13qk1oh917pno162crrseaq.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A52179%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=nz6VROXkZvZlsdbfluZ2OWnCz8Mbmb&access_type=offline: Operation not supported


In [None]:
dir_name = 'articles'
file_name = 'articles_v1.zip'

file_id = get_file_id(service, file_name, dir_name)

# Fail fast when the archive is missing: otherwise `file_bytes` would be left
# undefined (fresh kernel) or stale (re-run), and the next cell would either
# crash with an unrelated NameError or silently process old data.
if not file_id:
    raise FileNotFoundError(
        f'File {file_name} was not found in Drive directory {dir_name}.'
    )

file_bytes = download_file(service, file_id)

In [None]:
from zipfile import ZipFile


# Read every archive member into its own in-memory buffer, keyed by member name.
with ZipFile(file_bytes, 'r') as zip_file:
    file_dict = {name: BytesIO(zip_file.read(name)) for name in zip_file.namelist()}

if not file_dict:
    raise ValueError('The downloaded archive contains no members.')

# Take the first member. dict views are not subscriptable in Python 3, so
# `file_dict.items()[0]` raises TypeError; step the iterator instead.
# NOTE: this rebinds `file_name` / `file_bytes` from the archive to its first
# member, so re-running this cell without re-running the download cell fails.
file_name, file_bytes = next(iter(file_dict.items()))

In [None]:
# Store the extracted member in the file repository under the 'articles' bucket.
# NOTE(review): `create_bucket` is called unconditionally — confirm it tolerates
# an already-existing bucket, otherwise this cell is not re-runnable.
bucket_name = 'articles'
file_repo.create_bucket(bucket_name)
file_repo.insert_file(bucket_name, file_name, file_bytes)

### Extract

In [None]:
def get_gzip_original_filename(file_path):
    """Return the original file name stored in a gzip header, if there is one.

    Header layout (RFC 1952): a fixed 10-byte part — magic (2), compression
    method (1), flags (1), mtime (4), extra flags (1), OS (1) — followed by
    optional fields in a fixed order, of which FEXTRA comes before FNAME.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The stored name decoded as UTF-8 (falling back to Latin-1), or ``None``
        when the file is not gzip, the header is truncated, or no name is stored.
    """
    FEXTRA = 0x04
    FNAME = 0x08

    with open(file_path, 'rb') as f:
        header = f.read(10)
        # A short read means a truncated header; indexing it would raise.
        if len(header) < 10 or header[:2] != b'\x1f\x8b':
            return None

        flags = header[3]

        if flags & FEXTRA:
            # The extra field sits between the fixed header and the name:
            # a 2-byte little-endian length followed by that many bytes.
            # Skipping it is required, or the name would be read from it.
            xlen_bytes = f.read(2)
            if len(xlen_bytes) < 2:
                return None
            xlen = int.from_bytes(xlen_bytes, 'little')
            if len(f.read(xlen)) < xlen:
                return None

        if not flags & FNAME:
            return None

        # The name is a zero-terminated byte string.
        name_bytes = bytearray()
        while True:
            b = f.read(1)
            if not b or b == b'\x00':
                break
            name_bytes.extend(b)

        try:
            return name_bytes.decode('utf-8')
        except UnicodeDecodeError:
            return name_bytes.decode('latin1')


def extract_gz(file_path, dest_folder):
    """Decompress a single-file ``.gz`` archive into ``dest_folder``.

    The output file is named after the original name stored in the gzip header
    when there is one, otherwise after the archive's own name with its last
    extension removed.

    Args:
        file_path: Path of the ``.gz`` file.
        dest_folder: Existing directory that receives the decompressed file.
    """
    orig_name = get_gzip_original_filename(file_path)

    # The header name comes from the archive and is therefore untrusted: keep
    # only its final path component so it cannot point outside dest_folder
    # (e.g. '../../x' or an absolute path).
    safe_name = os.path.basename(orig_name.replace('\\', '/')) if orig_name else ''

    if safe_name in ('', '.', '..'):
        safe_name = os.path.splitext(os.path.basename(file_path))[0]

    dest_path = os.path.join(dest_folder, safe_name)

    with gzip.open(file_path, 'rb') as f_in, open(dest_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)


def extract_tar_gz(file_path, dest_folder):
    """Extract a gzip-compressed tar archive into ``dest_folder``.

    The archives are downloaded content, so extraction uses tarfile's 'data'
    filter where the running Python provides it. That filter rejects members
    that would land outside ``dest_folder`` and other unsafe entries, and it
    also silences the "extractall without filter" deprecation warning.

    Args:
        file_path: Path of the ``.tar.gz`` file.
        dest_folder: Directory that receives the extracted members.
    """
    with tarfile.open(file_path, 'r:gz') as tar:
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=dest_folder, filter='data')
        else:
            # Older interpreters without extraction filters: previous behaviour.
            tar.extractall(path=dest_folder)


def process_subdir(subdir_path):
    """Group each PDF in ``subdir_path`` with its source archive and unpack it.

    For every ``<id>.pdf`` that has a sibling ``arXiv-<id>.tar.gz`` (preferred)
    or ``arXiv-<id>.gz``, a folder ``<id>`` is created, the PDF and the archive
    are moved into it, and the archive is extracted there. PDFs without a
    matching archive are left where they are.

    Args:
        subdir_path: Directory whose direct entries are inspected.
    """
    entries = os.listdir(subdir_path)

    for pdf_name in {entry for entry in entries if entry.endswith('.pdf')}:
        article_id = pdf_name[:-4]  # strip the '.pdf' suffix

        # Tarball first, plain gzip second — the first one present wins.
        candidates = (f'arXiv-{article_id}.tar.gz', f'arXiv-{article_id}.gz')
        archive_name = next((name for name in candidates if name in entries), None)

        if not archive_name:
            continue

        article_dir = os.path.join(subdir_path, article_id)
        os.makedirs(article_dir, exist_ok=True)

        for name in (pdf_name, archive_name):
            shutil.move(os.path.join(subdir_path, name), article_dir)

        moved_archive = os.path.join(article_dir, archive_name)

        if archive_name.endswith('.tar.gz'):
            extract_tar_gz(moved_archive, article_dir)
        else:
            extract_gz(moved_archive, article_dir)


def extract_all():
    """Run ``process_subdir`` on every directory directly under ARTICLES_PATH."""
    entry_paths = (
        os.path.join(ARTICLES_PATH, entry) for entry in os.listdir(ARTICLES_PATH)
    )

    # Plain files at the top level are skipped; only directories are processed.
    for subdir_path in filter(os.path.isdir, entry_paths):
        process_subdir(subdir_path)


# Unpack everything currently under ARTICLES_PATH.
extract_all()

  tar.extractall(path=dest_folder)


In [9]:
def clean():
    """Delete every file ending in ``.gz`` anywhere below ARTICLES_PATH.

    This covers both ``*.gz`` and ``*.tar.gz`` archives, and also any ``.gz``
    file that was itself part of an extracted source tree.
    """
    for folder, _subdirs, file_names in os.walk(ARTICLES_PATH):
        for file_name in file_names:
            if not file_name.endswith('.gz'):
                continue

            os.remove(os.path.join(folder, file_name))


# Remove the archives now that their contents have been extracted.
clean()

### Categorize

In [None]:
from enum import Enum


class ArXivMathCategory(str, Enum):
    """String-valued enumeration of mathematics subject areas.

    Each member pairs a two-letter code (the member name) with a snake_case
    subject name (the value). Because the class also derives from ``str``,
    members compare equal to their string values.

    NOTE(review): the codes appear to mirror arXiv's ``math.XX`` category
    identifiers — confirm against the arXiv taxonomy before relying on it.
    """

    AC = 'commutative_algebra'
    AG = 'algebraic_geometry'
    AP = 'analysis_of_pdes'
    AT = 'algebraic_topology'
    CA = 'classical_analysis_and_odes'
    CO = 'combinatorics'
    CT = 'category_theory'
    CV = 'complex_variables'
    DG = 'differential_geometry'
    DS = 'dynamical_systems'
    FA = 'functional_analysis'
    GM = 'general_mathematics'
    GN = 'general_topology'
    GR = 'group_theory'
    GT = 'geometric_topology'
    HO = 'history_and_overview'
    IT = 'information_theory'
    KT = 'k_theory_and_homology'
    LO = 'logic'
    MG = 'metric_geometry'
    MP = 'mathematical_physics'
    NA = 'numerical_analysis'
    NT = 'number_theory'
    OA = 'operator_algebras'
    OC = 'optimization_and_control'
    PR = 'probability'
    QA = 'quantum_algebra'
    RA = 'rings_and_algebras'
    RT = 'representation_theory'
    SG = 'symplectic_geometry'
    SP = 'spectral_theory'
    ST = 'statistics_theory'

### Parse

In [5]:
def read_tex_file(file_path: Path) -> tuple[str, str]:
    """Read a text file, trying progressively more permissive encodings.

    UTF-8 is tried first, then cp1252, and Latin-1 is the final fallback.
    The order matters: Latin-1 maps every byte to a character and therefore
    never fails, so anything listed after it would never be attempted.
    cp1252 leaves a few byte values undefined, so it can still fail and hand
    over to Latin-1.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(text, encoding)`` tuple, where ``encoding`` is the name of the
        encoding that decoded the file.
    """
    for encoding in ('utf-8', 'cp1252'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read(), encoding

        except UnicodeDecodeError:
            continue

    # Last resort: cannot raise UnicodeDecodeError, so the function always
    # returns a tuple as annotated.
    with open(file_path, 'r', encoding='latin1') as file:
        return file.read(), 'latin1'

In [None]:
from pylatexenc.macrospec import ParsedMacroArgs


def visit_nodes_old(file_path: Path):
    """Walk the LaTeX node tree of one file without acting on any node.

    Earlier draft of the traversal that ``visit_nodes`` performs with
    callbacks. The file is parsed with ``LatexWalker`` and traversed depth
    first using a stack of iterators; only environment and group nodes are
    descended into, every other node kind is a no-op.

    Args:
        file_path: Path of the ``.tex`` file to parse.
    """
    source, _encoding = read_tex_file(file_path)
    top_level_nodes, _, _ = LatexWalker(source).get_latex_nodes()

    # Node kinds that are recognised but deliberately ignored.
    ignored_types = (
        LatexCharsNode,
        LatexMathNode,
        LatexCommentNode,
        LatexSpecialsNode,
        LatexMacroNode,
    )
    # Node kinds whose children are traversed.
    container_types = (LatexEnvironmentNode, LatexGroupNode)

    pending = [iter(top_level_nodes)]

    while pending:
        current = next(pending[-1], None)

        # None marks an exhausted iterator: return to the parent level.
        if current is None:
            pending.pop()
            continue

        if isinstance(current, ignored_types):
            # TODO: macro handling was sketched but never enabled. The idea for
            # `\input{name}`: take the single group argument, read its single
            # chars node as a file name relative to the article directory,
            # default the suffix to '.tex', parse that file and push its nodes
            # onto the stack. `\include` was left as a placeholder.
            continue

        if isinstance(current, container_types):
            pending.append(iter(current.nodelist))

In [6]:
from typing import Callable, Iterator


def visit_nodes(
    file_path: Path, callbacks: dict[type[LatexNode], Callable[[LatexNode], None]]
):
    """Parse a LaTeX file and call a callback for each node of a registered type.

    The node tree is walked depth first, parents before children. A callback
    is selected by the node's exact class (``type(node)``), so a callback
    registered for a base class is not called for its subclasses. Only
    environment and group nodes are descended into.

    NOTE(review): macro arguments and the contents of math nodes are not
    traversed, so nodes nested there are never reported — confirm this is
    intended.

    Args:
        file_path: Path of the ``.tex`` file to parse.
        callbacks: Maps a node class to the function called with each node of
            exactly that class.
    """
    source, _ = read_tex_file(file_path)
    top_level_nodes, _, _ = LatexWalker(source).get_latex_nodes()

    container_types = (LatexEnvironmentNode, LatexGroupNode)
    pending: list[Iterator[LatexNode]] = [iter(top_level_nodes)]

    while pending:
        current = next(pending[-1], None)

        # None marks an exhausted iterator: return to the parent level.
        if current is None:
            pending.pop()
            continue

        current_type = type(current)

        if current_type in callbacks:
            callbacks[current_type](current)

        if isinstance(current, container_types):
            pending.append(iter(current.nodelist))

In [7]:
# NOTE — open questions:
# - An article may contain several .tex files: which one is the root? Possibly
#   the one that contains the document environment node.
# - Relative \input{} paths need the article's root directory to resolve,
#   e.g. .../2502.13810v1.

# Every math node found in any .tex file under ARTICLES_PATH, in visit order.
math_nodes: list[LatexMathNode] = []


def append_math_node(node):
    """Callback for ``visit_nodes``: record one math node."""
    math_nodes.append(node)


callbacks = {LatexMathNode: append_math_node}

for file_path in Path(ARTICLES_PATH).rglob('*.tex'):
    visit_nodes(file_path, callbacks)

len(math_nodes)

127133

In [None]:
from uuid import UUID, uuid4

from pydantic import BaseModel, Field


class MathExpressionRecord(BaseModel):
    """Pydantic model describing one math expression.

    NOTE(review): nothing in the visible notebook constructs this model yet;
    the field meanings below are inferred from their names — confirm.
    """

    # Unique identifier; a fresh uuid4 is generated when none is supplied.
    id: UUID = Field(default_factory=uuid4)
    # LaTeX source of the expression.
    latex: str
    # Presumably the expression's position within its source — TODO confirm unit.
    position: int
    # Presumably True for inline math and False for display math — TODO confirm.
    is_inline: bool

In [140]:
def fix_latex(latex_str: str) -> str:
    """Rewrite LaTeX so that the notebook's math renderer can display it.

    ``\\[`` / ``\\]`` display delimiters become ``$$``, and a fixed set of
    article-specific macros is replaced by standard equivalents.

    The text is processed in one pass over LaTeX tokens rather than by
    substring replacement, which avoids two kinds of corruption:

    * a line break with spacing such as ``\\\\[2pt]`` is kept as is, instead
      of its ``\\[`` tail being mistaken for a display-math delimiter;
    * a macro is replaced only when its whole name matches, so ``\\Var`` is
      rewritten while a longer macro such as ``\\Varphi`` is left alone.

    Args:
        latex_str: LaTeX source of a math expression.

    Returns:
        The rewritten LaTeX string.
    """
    import re

    # Macro name (without the backslash) -> replacement text.
    macro_replacements = {
        'EE': '\\mathbb{E}',
        'II': '\\mathbb{I}',
        'Var': '\\mathrm{Var}',
        'HH': '\\mathbb{H}',
        'AND': '\\wedge',
        'OR': '\\vee',
        'Maj': '\\mathrm{Maj}',
        'sgn': '\\operatorname{sgn}',
        'Tribus': '\\mathrm{Tribus}',
        'linebreak': '\\text{ }',
        'Prob': '\\mathbb{P}',
        'WW': '\\mathcal{W}',
    }

    def substitute(match):
        token = match.group(0)

        # An escaped backslash (line break) is consumed as one unit so the
        # character after it is never read as the start of a command.
        if token == '\\\\':
            return token

        if token in ('\\[', '\\]'):
            return '$$'

        if token == '\\mathbbm{1}':
            return '\\mathbf{1}'

        # Any other control word: replace when listed, otherwise keep.
        return macro_replacements.get(match.group(1), token)

    # Alternatives are tried in order: `\\`, `\[` or `\]`, `\mathbbm{1}`,
    # then a generic control word (backslash followed by letters).
    token_pattern = r'\\\\|\\[\[\]]|\\mathbbm\{1\}|\\([A-Za-z]+)'

    return re.sub(token_pattern, substitute, latex_str)

In [None]:
from IPython.display import Math, display


# Render the first 100 collected expressions as a visual check that
# `fix_latex` produces LaTeX the notebook can display.
for i, latex_math_node in enumerate(math_nodes[:100]):
    latex = latex_math_node.latex_verbatim()
    latex_fixed = fix_latex(latex)
    math_display_object = Math(latex_fixed)

    # Debug aid: uncomment to print the index and raw LaTeX of an expression
    # that fails to render.
    # print(i)
    # print(math_display_object._repr_latex_())

    display(math_display_object)

### Classify

In [2]:
from decouple import config

from math_rag.inference.llms import LLM


# Endpoint and credentials for the LLM client, resolved through `decouple.config`
# so the key is never written into the notebook.
OPENAI_BASE_URL = config('OPENAI_BASE_URL')
OPENAI_API_KEY = config('OPENAI_API_KEY')

In [3]:
# LLM wrapper used by the classification experiment below, which goes through
# `llm.client` and `llm.model`.
llm = LLM(model='gpt-4o-mini', base_url=OPENAI_BASE_URL, api_key=OPENAI_API_KEY)

In [55]:
# Sample expression for the classification experiment. Index 13 is an arbitrary
# pick, and which expression it selects depends on the order in which the
# .tex files were visited.
math_expr = math_nodes[13].latex_verbatim()  # sample index
math_expr

'$\\mu(x)=\\frac{1}{2^n}$'

In [56]:
from openai import NOT_GIVEN


# Single-turn classification prompt: the model is asked to answer with exactly
# one of four class names for the sample expression interpolated below.
prompt = f"""
You are a mathematical expression classifier.
Given a mathematical expression, classify it in one of 4 given classes:
- constant
- variable
- formula
- other

Return a class only!

Mathematical expression:
{math_expr}

Class:
"""

# JSON mode is off for this experiment; with use_json False the
# response_format argument is passed as NOT_GIVEN.
use_json = False
# logprobs / top_logprobs=5 request the log-probability of each generated token
# plus the five most likely candidates at that position; temperature is 0.0.
# Top-level `await` relies on the notebook's running event loop.
completion = await llm.client.chat.completions.create(
    model=llm.model,
    messages=[{'role': 'user', 'content': prompt}],
    response_format={'type': 'json_object'} if use_json else NOT_GIVEN,
    logprobs=True,
    temperature=0.0,
    top_logprobs=5,
)
print(completion)

ChatCompletion(id='chatcmpl-B3qibSoK3Z0LqPtZQjEYKhXO1AjaF', choices=[Choice(finish_reason='stop', index=0, logprobs=ChoiceLogprobs(content=[ChatCompletionTokenLogprob(token='formula', bytes=[102, 111, 114, 109, 117, 108, 97], logprob=-3.1281633e-07, top_logprobs=[TopLogprob(token='formula', bytes=[102, 111, 114, 109, 117, 108, 97], logprob=-3.1281633e-07), TopLogprob(token=' formula', bytes=[32, 102, 111, 114, 109, 117, 108, 97], logprob=-16.125), TopLogprob(token='Formula', bytes=[70, 111, 114, 109, 117, 108, 97], logprob=-16.375), TopLogprob(token='_formula', bytes=[95, 102, 111, 114, 109, 117, 108, 97], logprob=-18.5), TopLogprob(token='form', bytes=[102, 111, 114, 109], logprob=-20.0)])], refusal=None), message=ChatCompletionMessage(content='formula', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1740257333, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier='default', system_fingerprint='fp_7fcd609668', usage=Comple

In [58]:
completion.choices[0].message.content

'formula'

In [57]:
import numpy as np


# For every generated token (x): list its top candidate tokens (y) with their
# probabilities, then the chosen token with its log-probability and probability.
# np.exp converts a log-probability back into a probability.
for x in completion.choices[0].logprobs.content:
    for y in x.top_logprobs:
        print(f'"{y.token}": {np.exp(y.logprob)}')

    print('------')
    print(x.token)
    print(x.logprob)
    print(np.exp(x.logprob))

"formula": 0.9999996871837189
" formula": 9.931194312156244e-08
"Formula": 7.734421907141565e-08
"_formula": 9.237449661970594e-09
"form": 2.061153622438558e-09
------
formula
-3.1281633e-07
0.9999996871837189


In [None]:
# TODO
# - description for each class
# - how to determine classes?
# - do names need to take a single token?

In [None]:
class MathExpressionCategory(str, Enum):
    CONSTANT = 'constant'