In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import nest_asyncio


sys.path.insert(0, os.path.abspath('..'))
nest_asyncio.apply()

In [None]:
import gzip
import shutil
import tarfile


ARTICLES_PATH = '../tmp/articles'

### Extract

In [None]:
def get_gzip_original_filename(file_path):
    with open(file_path, 'rb') as f:
        if f.read(2) != b'\x1f\x8b':
            return None
        f.read(1)
        flag = f.read(1)[0]
        f.read(4)
        f.read(1)
        f.read(1)
        orig_name = None
        if flag & 0x08:
            name_bytes = bytearray()
            while True:
                b = f.read(1)
                if not b or b == b'\x00':
                    break
                name_bytes.extend(b)
            try:
                orig_name = name_bytes.decode('utf-8')
            except UnicodeDecodeError:
                orig_name = name_bytes.decode('latin1')
        return orig_name


def extract_gz(file_path, dest_folder):
    orig_name = get_gzip_original_filename(file_path)
    if not orig_name:
        orig_name = os.path.splitext(os.path.basename(file_path))[0]
    dest_path = os.path.join(dest_folder, orig_name)
    with gzip.open(file_path, 'rb') as f_in, open(dest_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)


def extract_tar_gz(file_path, dest_folder):
    with tarfile.open(file_path, 'r:gz') as tar:
        tar.extractall(path=dest_folder)


def process_subdir(subdir_path):
    files = os.listdir(subdir_path)
    pdf_files = {f for f in files if f.endswith('.pdf')}
    for pdf in pdf_files:
        base_name = pdf[:-4]
        gz_name = f'arXiv-{base_name}.gz'
        tar_gz_name = f'arXiv-{base_name}.tar.gz'
        gz_file = None
        if tar_gz_name in files:
            gz_file = tar_gz_name
        elif gz_name in files:
            gz_file = gz_name
        if gz_file:
            new_dir = os.path.join(subdir_path, base_name)
            os.makedirs(new_dir, exist_ok=True)
            shutil.move(os.path.join(subdir_path, pdf), new_dir)
            shutil.move(os.path.join(subdir_path, gz_file), new_dir)
            new_gz_path = os.path.join(new_dir, gz_file)
            if gz_file.endswith('.tar.gz'):
                extract_tar_gz(new_gz_path, new_dir)
            else:
                extract_gz(new_gz_path, new_dir)


def extract_all():
    for subdir in os.listdir(ARTICLES_PATH):
        subdir_path = os.path.join(ARTICLES_PATH, subdir)
        if os.path.isdir(subdir_path):
            process_subdir(subdir_path)


extract_all()

  tar.extractall(path=dest_folder)


In [9]:
def clean():
    for root, dirs, files in os.walk(ARTICLES_PATH):
        for file in files:
            if file.endswith('.gz'):
                os.remove(os.path.join(root, file))


clean()

### Categorize

In [None]:
from enum import Enum


class Category(Enum):
    AC = 'commutative_algebra'
    AG = 'algebraic_geometry'
    AP = 'analysis_of_pdes'
    AT = 'algebraic_topology'
    CA = 'classical_analysis_and_odes'
    CO = 'combinatorics'
    CT = 'category_theory'
    CV = 'complex_variables'
    DG = 'differential_geometry'
    DS = 'dynamical_systems'
    FA = 'functional_analysis'
    GM = 'general_mathematics'
    GN = 'general_topology'
    GR = 'group_theory'
    GT = 'geometric_topology'
    HO = 'history_and_overview'
    IT = 'information_theory'
    KT = 'k_theory_and_homology'
    LO = 'logic'
    MG = 'metric_geometry'
    MP = 'mathematical_physics'
    NA = 'numerical_analysis'
    NT = 'number_theory'
    OA = 'operator_algebras'
    OC = 'optimization_and_control'
    PR = 'probability'
    QA = 'quantum_algebra'
    RA = 'rings_and_algebras'
    RT = 'representation_theory'
    SG = 'symplectic_geometry'
    SP = 'spectral_theory'
    ST = 'statistics_theory'

### Parse

In [None]:
from pylatexenc.latexwalker import (
    LatexCharsNode,
    LatexCommentNode,
    LatexEnvironmentNode,
    LatexGroupNode,
    LatexMacroNode,
    LatexMathNode,
    LatexNode,
    LatexSpecialsNode,
    LatexWalker,
)

In [None]:
path = 'assets/input.tex'

with open(path, 'r', encoding='utf-8') as file:
    latex_str = file.read()

walker = LatexWalker(latex_str)

(nodes, pos, len_) = walker.get_latex_nodes()

In [None]:
def parse_node(root: LatexNode) -> tuple[str, dict[int, LatexMathNode]]:
    text = str()
    formulas: dict[int, LatexMathNode] = {}
    images: dict[int, LatexMacroNode | LatexEnvironmentNode] = {}

    stack = [iter([root])]
    stack_end = object()

    while stack:
        node = next(stack[-1], stack_end)

        if node is stack_end:
            stack.pop()
            continue

        if isinstance(node, LatexCharsNode):
            text += node.chars

        elif isinstance(node, LatexMathNode):
            i = len(formulas)
            formulas[i] = node
            text += f'[formula-{i}]'

        elif isinstance(node, LatexMacroNode):
            if node.macroname == 'includegraphics':
                i = len(images)
                images[i] = node
                text += f'[image-{i}]'

            else:
                pass

        elif isinstance(node, LatexEnvironmentNode):
            if node.environmentname == 'document':
                stack.append(iter(node.nodelist))

            elif node.environmentname in ['figure']:
                i = len(images)
                images[i] = node
                text += f'[image-{i}]'

            else:
                stack.append(iter(node.nodelist))

        elif isinstance(node, LatexGroupNode):
            stack.append(iter(node.nodelist))

        elif isinstance(node, (LatexCommentNode, LatexSpecialsNode)):
            pass

        else:
            stack.append(iter(getattr(node, 'nodelist', [])))

    return text, formulas


doc_env_node = next(
    (
        node
        for node in nodes
        if isinstance(node, LatexEnvironmentNode) and node.environmentname == 'document'
    ),
    None,
)

if doc_env_node is not None:
    extracted_text = parse_node(doc_env_node)