In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import nest_asyncio


# Put the parent of the current working directory first on the import path so
# that modules located one level above the notebook can be imported.
sys.path.insert(0, os.path.abspath('..'))
# NOTE(review): presumably applied so asyncio code can run inside the
# notebook's already-running event loop - confirm against nest_asyncio docs.
nest_asyncio.apply()

In [None]:
import gzip
import shutil
import tarfile

from pylatexenc.latexwalker import (
    LatexCharsNode,
    LatexCommentNode,
    LatexEnvironmentNode,
    LatexGroupNode,
    LatexMacroNode,
    LatexMathNode,
    LatexNode,
    LatexSpecialsNode,
    LatexWalker,
)


# Root folder of the article files, relative to the notebook's working
# directory. The extraction cell processes each of its sub-directories and the
# parsing cell searches it recursively for ``*.tex`` files.
ARTICLES_PATH = '../tmp/articles'

### Extract

In [None]:
def get_gzip_original_filename(file_path):
    """Return the original file name recorded in a gzip header, if any.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The stored name as ``str`` when the FNAME flag is set (possibly an
        empty string if the stored name is empty), otherwise ``None``.
        ``None`` is also returned when the file does not start with the gzip
        magic bytes or when its header is truncated.
    """
    with open(file_path, 'rb') as f:
        # Fixed 10-byte header: magic(2) CM(1) FLG(1) MTIME(4) XFL(1) OS(1).
        header = f.read(10)
        if len(header) < 10 or header[:2] != b'\x1f\x8b':
            return None
        flag = header[3]

        # FEXTRA: a 2-byte little-endian length followed by that many bytes.
        # It sits *before* the file name, so it has to be skipped first.
        if flag & 0x04:
            xlen_bytes = f.read(2)
            if len(xlen_bytes) < 2:
                return None
            f.read(int.from_bytes(xlen_bytes, 'little'))

        # FNAME: zero-terminated original file name.
        if not flag & 0x08:
            return None

        name_bytes = bytearray()
        while True:
            b = f.read(1)
            # Stop at the terminator, or at EOF if the header is cut short.
            if not b or b == b'\x00':
                break
            name_bytes.extend(b)

        # Try UTF-8 first; latin1 accepts any byte sequence, so it is the
        # guaranteed fallback.
        try:
            return name_bytes.decode('utf-8')
        except UnicodeDecodeError:
            return name_bytes.decode('latin1')


def extract_gz(file_path, dest_folder):
    """Decompress a single-file ``.gz`` archive into ``dest_folder``.

    The output file is named after the original name stored in the gzip
    header. When the header carries no usable name, the archive's own base
    name with its last extension removed is used instead.

    Args:
        file_path: Path of the ``.gz`` file to decompress.
        dest_folder: Existing directory that receives the decompressed file.
    """
    fallback_name = os.path.splitext(os.path.basename(file_path))[0]

    stored_name = get_gzip_original_filename(file_path) or ''
    # The stored name comes from the archive itself and may contain directory
    # parts ("../x", "/etc/x", "C:\\x"). Keep only the final component so the
    # output can never land outside ``dest_folder``.
    orig_name = os.path.basename(stored_name.replace('\\', '/'))
    if orig_name in ('', '.', '..'):
        orig_name = fallback_name

    dest_path = os.path.join(dest_folder, orig_name)
    with gzip.open(file_path, 'rb') as f_in, open(dest_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)


def extract_tar_gz(file_path, dest_folder):
    """Unpack a gzip-compressed tar archive into ``dest_folder``.

    When the running Python supports tar extraction filters, the ``'data'``
    filter is used so that members with absolute paths, ``..`` components or
    links pointing outside ``dest_folder`` are rejected instead of being
    written. On interpreters without filter support the archive is extracted
    as before.

    Args:
        file_path: Path of the ``.tar.gz`` archive.
        dest_folder: Directory that receives the archive contents.

    Raises:
        tarfile.TarError: If the archive is unreadable or, with filtering
            active, contains a member the filter refuses to extract.
    """
    with tarfile.open(file_path, 'r:gz') as tar:
        # ``tarfile.data_filter`` only exists on versions that accept the
        # ``filter`` argument, so it doubles as a feature check.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=dest_folder, filter='data')
        else:
            tar.extractall(path=dest_folder)


def process_subdir(subdir_path):
    """Group each PDF in ``subdir_path`` with its source archive and unpack it.

    For every ``<name>.pdf`` that has a sibling ``arXiv-<name>.tar.gz`` or
    ``arXiv-<name>.gz``, a folder ``<name>`` is created inside
    ``subdir_path``, the PDF and the archive are moved into it, and the
    archive is extracted there. PDFs without a matching archive are left
    untouched.

    Args:
        subdir_path: Directory whose direct entries are inspected.
    """
    entries = os.listdir(subdir_path)

    for pdf_name in {entry for entry in entries if entry.endswith('.pdf')}:
        stem = pdf_name[:-4]

        # The tarball wins when both archive flavours are present.
        candidates = (f'arXiv-{stem}.tar.gz', f'arXiv-{stem}.gz')
        archive_name = next((c for c in candidates if c in entries), None)
        if not archive_name:
            continue

        target_dir = os.path.join(subdir_path, stem)
        os.makedirs(target_dir, exist_ok=True)
        for item in (pdf_name, archive_name):
            shutil.move(os.path.join(subdir_path, item), target_dir)

        moved_archive = os.path.join(target_dir, archive_name)
        if archive_name.endswith('.tar.gz'):
            extract_tar_gz(moved_archive, target_dir)
        else:
            extract_gz(moved_archive, target_dir)


def extract_all():
    """Run ``process_subdir`` on every directory directly under ``ARTICLES_PATH``.

    Entries of ``ARTICLES_PATH`` that are not directories are skipped.
    """
    candidate_paths = (
        os.path.join(ARTICLES_PATH, entry) for entry in os.listdir(ARTICLES_PATH)
    )
    for candidate in candidate_paths:
        if not os.path.isdir(candidate):
            continue
        process_subdir(candidate)


# Unpack the archives in every sub-directory of ARTICLES_PATH. This moves
# files on disk, so it changes the folder layout when run.
extract_all()

  tar.extractall(path=dest_folder)


In [9]:
def clean(root=None):
    """Delete every file whose name ends in ``.gz`` anywhere under ``root``.

    This also matches ``.tar.gz`` files. The deletion is permanent.

    Args:
        root: Directory to walk recursively. Defaults to ``ARTICLES_PATH``
            when omitted, which is what a bare ``clean()`` call uses.
    """
    if root is None:
        root = ARTICLES_PATH

    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith('.gz'):
                os.remove(os.path.join(dirpath, filename))


# Remove all ``.gz`` files under ARTICLES_PATH. This is destructive: the
# deleted archives cannot be extracted again afterwards.
clean()

### Categorize

In [None]:
from enum import Enum


class Category(Enum):
    """Two-letter category codes mapped to snake_case category names.

    Each member name is an upper-case two-letter code and each value is the
    spelled-out name in lower snake_case.

    NOTE(review): the codes look like the arXiv ``math.XX`` subject classes,
    and the lower-cased code (``ct``) appears as a folder name under
    ``ARTICLES_PATH`` elsewhere in this notebook - confirm before relying on
    that mapping.
    """

    AC = 'commutative_algebra'
    AG = 'algebraic_geometry'
    AP = 'analysis_of_pdes'
    AT = 'algebraic_topology'
    CA = 'classical_analysis_and_odes'
    CO = 'combinatorics'
    CT = 'category_theory'
    CV = 'complex_variables'
    DG = 'differential_geometry'
    DS = 'dynamical_systems'
    FA = 'functional_analysis'
    GM = 'general_mathematics'
    GN = 'general_topology'
    GR = 'group_theory'
    GT = 'geometric_topology'
    HO = 'history_and_overview'
    IT = 'information_theory'
    KT = 'k_theory_and_homology'
    LO = 'logic'
    MG = 'metric_geometry'
    MP = 'mathematical_physics'
    NA = 'numerical_analysis'
    NT = 'number_theory'
    OA = 'operator_algebras'
    OC = 'optimization_and_control'
    PR = 'probability'
    QA = 'quantum_algebra'
    RA = 'rings_and_algebras'
    RT = 'representation_theory'
    SG = 'symplectic_geometry'
    SP = 'spectral_theory'
    ST = 'statistics_theory'

### Merge

In [164]:
from pylatexenc.macrospec import ParsedMacroArgs


def find_file_imports(root: LatexNode):
    """Find the files referenced by ``input`` and ``include`` macros.

    The tree below ``root`` is walked depth-first, descending into
    environment and group nodes only (macro arguments are not searched).
    For every ``input`` / ``include`` macro the text of its first argument
    is printed, as before, and additionally collected.

    Macros whose argument is missing, empty, or of an unexpected node type
    are skipped instead of raising.

    Args:
        root: Node at which the search starts.

    Returns:
        The referenced paths as a list of strings, in document order.
    """
    imported_paths = []
    stack = [iter([root])]
    stack_end = object()

    while stack:
        node = next(stack[-1], stack_end)

        if node is stack_end:
            stack.pop()
            continue

        if isinstance(node, LatexMacroNode):
            if node.macroname not in ('input', 'include'):
                continue

            # ``nodeargd`` or its argument list can be absent/empty when the
            # parser has no argument spec for the macro.
            arg_nodes = getattr(node.nodeargd, 'argnlist', None) or []
            first_arg = arg_nodes[0] if arg_nodes else None

            if isinstance(first_arg, LatexGroupNode):
                # Braced form: stitch the path together from the group's
                # children so a path split across several nodes stays whole.
                target = ''.join(
                    child.chars
                    if isinstance(child, LatexCharsNode)
                    else child.latex_verbatim()
                    for child in first_arg.nodelist
                    if child is not None and not isinstance(child, LatexCommentNode)
                )
            elif isinstance(first_arg, LatexCharsNode):
                # Unbraced form: the argument is a bare chars node.
                target = first_arg.chars
            else:
                continue

            target = target.strip()
            if target:
                print(target)
                imported_paths.append(target)

        elif isinstance(node, (LatexEnvironmentNode, LatexGroupNode)):
            stack.append(iter(node.nodelist))

    return imported_paths

In [165]:
# NOTE merge: list the files that one article's main.tex pulls in, as a first
# step towards merging them into a single source.
# The article path below is hard-coded to a single example.
path = f'{ARTICLES_PATH}/ct/2502.13810v1/main.tex'

with open(path, 'r', encoding='utf-8') as file:
    latex = file.read()

# Parse the whole file into a list of top-level nodes.
walker = LatexWalker(latex)
nodes, pos, len_ = walker.get_latex_nodes()

doc_env_node = None

# Pick the first top-level ``document`` environment; anything before it is
# not searched.
for node in nodes:
    if isinstance(node, LatexEnvironmentNode) and node.environmentname == 'document':
        doc_env_node = node
        break

# Nothing is printed when the file has no ``document`` environment.
if doc_env_node is not None:
    find_file_imports(doc_env_node)

sections/introduction
sections/background
sections/error
sections/universal
sections/conclusion


### Parse

In [None]:
def traverse(root: LatexNode):
    """Walk the node tree below ``root`` depth-first without acting on nodes.

    This is a bare traversal skeleton: chars, math, comment, specials and
    macro nodes are visited and ignored, while environment and group nodes
    have their ``nodelist`` visited next. Nothing is returned.

    Args:
        root: Node at which the walk starts.
    """
    leaf_kinds = (
        LatexCharsNode,
        LatexMathNode,
        LatexCommentNode,
        LatexSpecialsNode,
        LatexMacroNode,
    )
    container_kinds = (LatexEnvironmentNode, LatexGroupNode)

    # Stack of iterators; the top one is the child list currently in progress.
    pending = [iter((root,))]

    while pending:
        try:
            current = next(pending[-1])
        except StopIteration:
            # Finished this child list; resume the parent's.
            pending.pop()
            continue

        if isinstance(current, leaf_kinds):
            continue

        if isinstance(current, container_kinds):
            pending.append(iter(current.nodelist))

In [49]:
def get_latex_math_nodes(root: LatexNode):
    """Collect every ``LatexMathNode`` reachable from ``root``.

    The search is depth-first and descends into environment and group nodes
    only. A math node is recorded but not descended into.

    Args:
        root: Node at which the search starts.

    Returns:
        The math nodes found, as a list in traversal order.
    """
    found = []
    # Stack of child-list iterators; each one resumes where it left off.
    iterators = [iter((root,))]

    while iterators:
        for current in iterators[-1]:
            if isinstance(current, LatexMathNode):
                found.append(current)
            elif isinstance(current, (LatexEnvironmentNode, LatexGroupNode)):
                # Dive into the container first; the outer loop picks up the
                # new top-of-stack iterator.
                iterators.append(iter(current.nodelist))
                break
        else:
            # Current child list exhausted: go back to the parent's.
            iterators.pop()

    return found

In [None]:
from pathlib import Path


def find_tex_files(dir: str) -> list[LatexMathNode]:
    """Collect the math nodes of every ``*.tex`` file found under ``dir``.

    Each file is parsed, its first top-level ``document`` environment is
    located, and the math nodes inside that environment are gathered. Files
    without a ``document`` environment contribute nothing.

    Args:
        dir: Directory that is searched recursively for ``*.tex`` files.

    Returns:
        All math nodes found, concatenated across files.
    """
    latex_math_nodes = []

    for file_path in Path(dir).rglob('*.tex'):
        # The glob also matches directories that happen to end in ``.tex``.
        if not file_path.is_file():
            continue

        latex = None
        # Stop at the first encoding that decodes cleanly. latin1 accepts any
        # byte sequence, so it goes last as the catch-all; placed earlier it
        # would make cp1252 unreachable.
        for encoding in ('utf-8', 'cp1252', 'latin1'):
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    latex = file.read()
            except UnicodeDecodeError:
                continue
            break

        # Never reuse the previous file's text if nothing could be decoded.
        if latex is None:
            continue

        walker = LatexWalker(latex)
        nodes, _, _ = walker.get_latex_nodes()

        doc_env_node = next(
            (
                node
                for node in nodes
                if isinstance(node, LatexEnvironmentNode)
                and node.environmentname == 'document'
            ),
            None,
        )

        if doc_env_node is not None:
            latex_math_nodes += get_latex_math_nodes(doc_env_node)

    return latex_math_nodes


# Parse every ``*.tex`` file under ARTICLES_PATH and keep the collected math
# nodes in a notebook-level variable used by the cells below.
latex_math_nodes = find_tex_files(ARTICLES_PATH)

In [None]:
# Number of math nodes collected across all parsed files.
len(latex_math_nodes)

107825

In [140]:
def fix_latex(latex_str: str):
    r"""Rewrite a LaTeX math snippet into a more renderer-friendly form.

    Two kinds of substitution are applied:

    * the display-math delimiters ``\[`` and ``\]`` become ``$$``;
    * a fixed set of custom macros (``\EE``, ``\Var``, ``\Prob``, ...) and
      ``\mathbbm{1}`` are replaced by standard equivalents.

    A macro is only replaced when it is the complete control word, so
    ``\Var`` is rewritten but ``\Variance`` is left alone. A backslash that
    is itself escaped is not treated as the start of a delimiter or macro,
    so a line break with spacing such as ``\\[2pt]`` is left intact.

    Args:
        latex_str: LaTeX source of one math expression.

    Returns:
        The rewritten string.
    """
    import re

    macro_replacements = {
        'EE': '\\mathbb{E}',
        'II': '\\mathbb{I}',
        'Var': '\\mathrm{Var}',
        'HH': '\\mathbb{H}',
        'AND': '\\wedge',
        'OR': '\\vee',
        'Maj': '\\mathrm{Maj}',
        'sgn': '\\operatorname{sgn}',
        'Tribus': '\\mathrm{Tribus}',
        'linebreak': '\\text{ }',
        'Prob': '\\mathbb{P}',
        'WW': '\\mathcal{W}',
    }

    # Any run of backslash pairs that comes first is kept as group 1, so only
    # a backslash that is not itself escaped can start a delimiter or macro.
    unescaped = r'(?<!\\)((?:\\\\)*)\\'

    fixed = re.sub(
        unescaped + r'[\[\]]',
        lambda match: match.group(1) + '$$',
        latex_str,
    )

    fixed = fixed.replace('\\mathbbm{1}', '\\mathbf{1}')

    # The lookahead rejects longer control words that merely start with one
    # of the macro names.
    macro_names = '|'.join(re.escape(name) for name in macro_replacements)
    fixed = re.sub(
        unescaped + '(' + macro_names + r')(?![A-Za-z])',
        lambda match: match.group(1) + macro_replacements[match.group(2)],
        fixed,
    )

    return fixed

In [None]:
from IPython.display import Math, display


# Render the first 100 collected math nodes as a visual spot check.
for i, latex_math_node in enumerate(latex_math_nodes[:100]):
    # Exact source text of the node, then patched by fix_latex before display.
    latex = latex_math_node.latex_verbatim()
    latex_fixed = fix_latex(latex)
    math_display_object = Math(latex_fixed)

    # Debug aid: uncomment to see the index and the exact string handed to the
    # renderer when a formula fails to display.
    # print(i)
    # print(math_display_object._repr_latex_())

    display(math_display_object)

In [None]:
# Load one sample LaTeX file and parse it. The path is relative to the
# notebook's working directory.
# Note: this rebinds the notebook-level names ``path``, ``latex``, ``walker``,
# ``nodes``, ``pos`` and ``len_`` that earlier cells also assign.
path = 'assets/input.tex'

with open(path, 'r', encoding='utf-8') as file:
    latex = file.read()

walker = LatexWalker(latex)

# Top-level nodes of the file; ``pos`` and ``len_`` are not used below.
nodes, pos, len_ = walker.get_latex_nodes()

In [None]:
# Find the first top-level ``document`` environment among ``nodes`` (set by
# the previous cell) and parse it.
doc_env_node = None

for node in nodes:
    if isinstance(node, LatexEnvironmentNode) and node.environmentname == 'document':
        doc_env_node = node
        break

# NOTE(review): ``parse_node`` is not defined in any cell visible here; unless
# it comes from elsewhere, this raises NameError on a fresh kernel - confirm.
# ``extracted_text`` is only assigned when a ``document`` environment exists.
if doc_env_node is not None:
    extracted_text = parse_node(doc_env_node)