In [1]:
from typing import TYPE_CHECKING


# Evaluated only by static type checkers: typing.TYPE_CHECKING is False at runtime,
# so nothing inside this block is imported or executed when the cell runs.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    # Declares the container names for the type checker; the runtime objects are
    # presumably injected by the notebook hook loaded in the next cell — TODO confirm.
    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [2]:
# NOTE(review): RESET is not read by any cell shown in this notebook; presumably it is
# consumed by hooks.notebook_hook when the extension loads — confirm what True resets.
RESET = False
%load_ext hooks.notebook_hook

2025-06-22 12:08:19,592 - INFO - PyTorch version 2.6.0 available.


In [3]:
from pathlib import Path


# Path of the lecture's .tex source, as passed to the Google Drive repository lookup.
lecture_path = Path('ml/lectures/L07-LogisticRegression2/2024_08_10_2174b40686820b4cb591g.tex')

google_drive_repository = infrastructure_container.google_drive_repository()

file_id = google_drive_repository.get_file_id(lecture_path)

if not file_id:
    # fail with an actionable message instead of a bare ValueError
    raise ValueError(f'No Google Drive file id found for {lecture_path}')

file_content = google_drive_repository.get_file_by_id(file_id)

2025-06-22 12:08:20,062 - INFO - file_cache is only supported with oauth2client<4.0.0


In [4]:
# Text form of the downloaded file; assumes file_content is a BytesIO-like buffer whose
# getvalue() returns bytes (bytes.decode() defaults to UTF-8) — TODO confirm.
# NOTE(review): no later cell shown here reads `latex`; they use file_content directly.
latex = file_content.getvalue().decode()

In [5]:
from math_rag.core.models import MathArticle


# Wrap the raw downloaded bytes in a MathArticle; both the dataset id and the index id
# are left as None here.
math_article = MathArticle(
    math_expression_dataset_id=None, index_id=None, name='article', bytes=file_content.getvalue()
)

In [None]:
import re


MATH_PLACEHOLDER_PATTERN = r'\[math_placeholder(?: \| \d+)?\]'


def parse_placeholders(text: str) -> tuple[list[int], list[int]]:
    """
    Locate every math placeholder in the text.

    Returns two parallel lists, in order of appearance: the start offset of
    each placeholder and the number of characters it spans.
    """
    positions: list[int] = []
    lengths: list[int] = []

    for match in re.finditer(MATH_PLACEHOLDER_PATTERN, text):
        start, end = match.span()
        positions.append(start)
        lengths.append(end - start)

    return positions, lengths


def find_contiguous_block_ranges(
    positions: list[int], lengths: list[int], max_window_size: int
) -> list[tuple[int, int]]:
    """
    Group neighbouring placeholders into blocks.

    Two consecutive placeholders stay in the same block as long as the text
    between the end of the first and the start of the second is at most
    max_window_size characters long. Each block is reported as a half-open
    (start_index, end_index) range into the positions/lengths lists.
    """
    if not positions:
        return []

    total = len(positions)
    ranges: list[tuple[int, int]] = []
    block_start = 0

    for index in range(1, total):
        previous_end = positions[index - 1] + lengths[index - 1]

        # a gap wider than the window closes the current block
        if positions[index] - previous_end > max_window_size:
            ranges.append((block_start, index))
            block_start = index

    # the block that is still open always runs to the end of the sequence
    ranges.append((block_start, total))

    return ranges


def chunk_block(
    block_text: str,
    relative_positions: list[int],
    relative_lengths: list[int],
    max_window_size: int,
) -> list[str]:
    """
    Apply the sliding-window chunking logic to a single text block
    with entities at relative positions and lengths.

    The first chunk runs from the start of the block to the end of the last
    entity that still ends within max_window_size characters. Every remaining
    entity then yields one more chunk: it ends at that entity and starts at the
    earliest entity for which the span is at most max_window_size characters.
    An entity that is wider than the window on its own is still emitted, in a
    chunk that reaches back no further than the entity itself.

    Args:
        block_text: The text of the block.
        relative_positions: Start offsets of the entities within block_text, ascending.
        relative_lengths: Character lengths of the entities, parallel to relative_positions.
        max_window_size: Maximum chunk size in characters.

    Returns:
        The chunks in order; an empty list when the block has no entities.
    """
    n = len(relative_positions)

    # nothing to chunk
    if n == 0:
        return []

    chunks: list[str] = []
    i = 0
    # Entity 0 always belongs to the first window, even when it alone is wider than the
    # window. Starting the scan at 0 would leave j == 0 in that case, and the j - 1
    # lookups below would wrap around to the LAST entity and emit the whole block.
    j = 1

    # build the first window
    while j < n and relative_positions[j] + relative_lengths[j] <= max_window_size:
        j += 1

    first_chunk_end = relative_positions[j - 1] + relative_lengths[j - 1]
    chunks.append(block_text[:first_chunk_end])

    # slide the window over remaining entities
    for k in range(j, n):
        chunk_end = relative_positions[k] + relative_lengths[k]

        # drop oldest entities until the new one fits (or only the new one is left)
        while i < k and chunk_end - relative_positions[i] > max_window_size:
            i += 1

        chunks.append(block_text[relative_positions[i] : chunk_end])

    return chunks


def chunk(text: str, max_window_size: int = 200) -> list[str]:
    """
    Split the text into sliding-window chunks of [math_placeholder | int] tokens,
    each window spanning at most max_window_size characters.

    Placeholders are first grouped into blocks of nearby placeholders, and each
    block is chunked on its own. A block spans from the start of its first
    placeholder to the end of its last one, so text outside of those spans does
    not appear in any chunk.
    """
    positions, lengths = parse_placeholders(text)
    result: list[str] = []

    for first, stop in find_contiguous_block_ranges(positions, lengths, max_window_size):
        block_positions = positions[first:stop]
        block_lengths = lengths[first:stop]

        # the block starts at its first placeholder and ends with its last one
        origin = block_positions[0]
        block_end = block_positions[-1] + block_lengths[-1]

        # chunk_block works on offsets relative to the start of the block
        result += chunk_block(
            text[origin:block_end],
            [position - origin for position in block_positions],
            block_lengths,
            max_window_size,
        )

    return result

In [7]:
# Resolve the article parser service from the infrastructure container.
math_article_parser_service = infrastructure_container.math_article_parser_service()

In [8]:
# parse_for_index returns three values. Judging by how they are used in later cells,
# `template` is the article text with math replaced by [math_placeholder | i] tokens and
# `math_nodes_` holds the extracted nodes (their `.latex` is read later) — TODO confirm
# against the service.
# NOTE(review): `positions` is not read by any later cell shown here.
math_nodes_, positions, template = math_article_parser_service.parse_for_index(math_article)

In [None]:
# Inspect the full template; this prints the entire text, so the output can be long.
print(template)

In [None]:
# Window size in characters: chunk() measures spans on string offsets of the template.
max_window_size = 1000

chunks = chunk(template, max_window_size)
# the last expression is shown as the cell output: the number of chunks produced
len(chunks)

163

In [None]:
# Dump every chunk, each followed by a dashed rule that is surrounded by blank lines.
for x in chunks:
    print(x, '-------', sep='\n\n', end='\n\n')

In [None]:
MATH_PLACEHOLDER_TEMPLATE = '[math_placeholder | {index}]'


def inject_latex(chunk: str, placeholder_index_to_latex: dict[int, str]) -> str:
    """
    Replace each [math_placeholder | i] in the chunk with [<latex> | i],
    preserving the index.

    The chunk is scanned once, left to right. Only the placeholders that occur
    in the chunk are looked up, and LaTeX that has just been injected is never
    scanned again, so LaTeX that happens to contain placeholder-like text is
    left exactly as given.

    Args:
        chunk: Text containing indexed math placeholders.
        placeholder_index_to_latex: LaTeX source for each placeholder index.

    Returns:
        The chunk with every known placeholder replaced. Placeholders whose
        index is not in the mapping, and bare [math_placeholder] tokens, are
        left unchanged.
    """

    def substitute(match: re.Match[str]) -> str:
        placeholder = match.group(0)
        index = int(match.group(1))

        # only the canonical spelling produced by the template counts as a placeholder
        # (e.g. a zero-padded index is left alone)
        if (
            index not in placeholder_index_to_latex
            or MATH_PLACEHOLDER_TEMPLATE.format(index=index) != placeholder
        ):
            return placeholder

        return f'[{placeholder_index_to_latex[index]} | {index}]'

    # a callable replacement is inserted verbatim, so backslashes in the LaTeX
    # are not interpreted as regex escapes
    return re.sub(r'\[math_placeholder \| (\d+)\]', substitute, chunk)

In [34]:
# placeholder index -> LaTeX source, numbered in the order the parser returned the nodes
placeholder_index_to_latex = dict(enumerate(math_node.latex for math_node in math_nodes_))
formatted_chunks = [
    inject_latex(chunk_text, placeholder_index_to_latex) for chunk_text in chunks
]

In [37]:
# Spot-check a single formatted chunk (the third one).
print(formatted_chunks[2])

[$$
\mathbf{x} \leftarrow \mathbf{x}-\eta \nabla f(\mathrm{x})
$$ | 2]

If we introduce an index for the iterations, then we can write this as an equation:

[$$
\mathbf{x}_{t+1}=\mathbf{x}_{t}-\eta \nabla f\left(\mathbf{x}_{t}\right)
$$ | 3]

The idea with Newton's method is to take the point [$\mathbf{x}_{t}$ | 4] (the current minimum) and compute at it the quadratic approximation of the function [$f(\mathrm{x})$ | 5], and then move to the minimizer of this quadratic approximation (which is known analytically). If [$f$ | 6] is a function of one variable, this would look like this:


[image_placeholder | 1]


The black curve is the function [$f(x)$ | 7] that we minimize. We start from point [$x_{0}$ | 8]. At this point we do a quadratic approximation of the function [$f$ | 9], thus obtaining a parabola that is tangential to the function [$f$ | 10] at the point [$x_{0}$ | 11]. The search then moves to a point that minimizes the quadratic approximation of [$f$ | 12]


In [None]:
# TODO make a "preview"

## Mathpix

In [None]:
# Resolve the Mathpix client from the infrastructure container.
mathpix_client = infrastructure_container.mathpix_client()

In [None]:
# Send a hosted sample image to Mathpix; the call's return value is shown as the cell output.
mathpix_client.convert_image(url='https://iili.io/FKvvD0J.png')

In [None]:
from pathlib import Path


# Sample PDF handed to the Mathpix PDF conversion; the path is defined once and reused.
pdf_path = Path('../.tmp/test_formulas.pdf')
mathpix_client.convert_pdf(file_path=pdf_path)

In [None]:
# zip_path is the archive handed to ZipExtractorUtil.extract; target_path is its second
# argument (presumably the extraction destination — TODO confirm).
# NOTE(review): target_path ends in `.zip` although it looks like a destination; confirm
# whether ZipExtractorUtil.extract expects a directory here.
zip_path = Path('../.tmp/mathpix/downloads/data.zip')
target_path = Path('../.tmp/mathpix/data.zip')

from math_rag.shared.utils import ZipExtractorUtil


ZipExtractorUtil.extract(zip_path, target_path)

# TODO find .tex file in extracted dir, convert to katex, save to minio