In [None]:
from typing import TYPE_CHECKING


# Typing-only declarations: this branch is skipped at runtime, so it only tells static
# type checkers / the IDE what type the two container names have.
# NOTE(review): the containers are never assigned in the cells visible here — presumably
# they are injected into the kernel namespace by hooks.notebook_hook; confirm.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [None]:
# Flag assigned before the hook extension is loaded.
# NOTE(review): presumably read by hooks.notebook_hook to decide whether to reset state — confirm.
RESET = False
# Load the project's IPython extension.
# NOTE(review): presumably this is what provides the containers used by later cells — confirm.
%load_ext hooks.notebook_hook

In [None]:
from pathlib import Path


# Resolve the lecture's .tex source on Google Drive and download its content.
google_drive_repository = infrastructure_container.google_drive_repository()

# Named once so the lookup and the error message refer to the same path.
lecture_tex_path = Path(
    'ml/lectures/L07-LogisticRegression2/2024_08_10_2174b40686820b4cb591g.tex'
)

file_id = google_drive_repository.get_file_id(lecture_tex_path)

# Fail with an actionable message instead of an empty ValueError.
if not file_id:
    raise ValueError(f'Could not resolve a Google Drive file id for {lecture_tex_path}')

# NOTE(review): the next cell calls .getvalue() on this, so it is presumably
# an in-memory buffer — confirm.
file_content = google_drive_repository.get_file_by_id(file_id)

In [None]:
# Turn the downloaded buffer into text.
# NOTE(review): assumes file_content is BytesIO-like, in which case decode() uses UTF-8 — confirm.
latex = file_content.getvalue().decode()
# Bare expression so the notebook displays the decoded source as the cell output.
latex

In [None]:
# Resolve the LaTeX parser and node-walker services from the infrastructure container.
# NOTE(review): neither service is used in the cells visible here.
latex_parser_service = infrastructure_container.latex_parser_service()
latex_node_walker_service = infrastructure_container.latex_node_walker_service()

In [None]:
def chunk(
    text: str, positions: list[int], lengths: list[int], max_window_size: int = 200
) -> list[str]:
    """Cut ``text`` into overlapping windows that each end on an entity boundary.

    Entity ``i`` occupies ``text[positions[i] : positions[i] + lengths[i]]``.
    The first chunk greedily covers as many leading entities as fit in one
    window; after that, one chunk is emitted per remaining entity, ending at
    that entity and reaching back over as many preceding entities as still fit.

    The first window starts at the beginning of ``text`` when the first entity
    fits that way; otherwise it starts at the first entity itself. Every other
    window starts at an entity position.

    Args:
        text: The full source text.
        positions: Start offset of each entity in ``text``. Assumed to be in
            ascending order and non-overlapping; this is not validated.
        lengths: Length of each entity, parallel to ``positions``.
        max_window_size: Maximum number of characters per chunk.

    Returns:
        The chunks in order; an empty list when there are no entities.

    Raises:
        ValueError: If ``positions`` and ``lengths`` differ in size, if any
            entity is longer than ``max_window_size``, or if the gap between
            two consecutive entities visited by the sliding phase exceeds
            ``max_window_size``.
    """
    if len(positions) != len(lengths):
        raise ValueError(
            f'Got {len(positions)} positions but {len(lengths)} lengths; '
            f'they must describe the same entities'
        )

    n = len(positions)

    # no entities means there is nothing to anchor a window on
    if n == 0:
        return []

    # an entity that is longer than the window can never be chunked,
    # regardless of where in the text it sits
    for position, length in zip(positions, lengths):
        if length > max_window_size:
            raise ValueError(
                f'Entity at position {position} '
                f'has length {length}, which exceeds '
                f'the maximum window size of {max_window_size} characters'
            )

    # Text before the first entity is only included while the first entity
    # still fits in a window that starts at offset 0.
    leading_text_fits = positions[0] + lengths[0] <= max_window_size

    def window_start_for(index: int) -> int:
        """Return the text offset at which a window anchored on entity ``index`` begins."""
        if index == 0 and leading_text_fits:
            return 0

        return positions[index]

    chunks: list[str] = []

    # 1) build the first window
    # It always contains entity 0 (start at 1), so ``end_index - 1`` below can
    # never wrap around to the last entity.
    first_start = window_start_for(0)
    end_index = 1

    # greedily expand until adding the next entity would overflow
    while (
        end_index < n
        and positions[end_index] + lengths[end_index] - first_start <= max_window_size
    ):
        end_index += 1

    first_end = positions[end_index - 1] + lengths[end_index - 1]
    chunks.append(text[first_start:first_end])

    # 2) slide the window over each subsequent entity
    start_index = 0

    for current_index in range(end_index, n):
        new_start = positions[current_index]
        last_end = positions[current_index - 1] + lengths[current_index - 1]

        # gap check
        gap = new_start - last_end

        if gap > max_window_size:
            raise ValueError(
                f'Gap of {gap} characters between entity ending at {last_end} '
                f'and next entity at {new_start} exceeds '
                f'the maximum window size of {max_window_size} characters'
            )

        candidate_end = new_start + lengths[current_index]

        # slide off old entities until the new one fits
        while (
            start_index < current_index
            and candidate_end - window_start_for(start_index) > max_window_size
        ):
            start_index += 1

        chunks.append(text[window_start_for(start_index) : candidate_end])

    return chunks

In [None]:
# Resolve the Mathpix client from the infrastructure container; the conversion cells below use it.
mathpix_client = infrastructure_container.mathpix_client()

In [None]:
# Convert a hosted image through the Mathpix client; the return value is shown as the cell output.
# NOTE(review): presumably performs a network request against the Mathpix API — confirm.
mathpix_client.convert_image(url='https://iili.io/FKvvD0J.png')

In [None]:
from pathlib import Path


# Input PDF for the conversion; defined once and passed on, rather than rebuilding the same Path.
pdf_path = Path('../.tmp/test_formulas.pdf')
# The return value of the conversion is shown as the cell output.
mathpix_client.convert_pdf(file_path=pdf_path)

In [None]:
# Cell-local import, placed ahead of the statements that rely on it.
from math_rag.shared.utils import ZipExtractorUtil


# Source archive and the location handed to the extractor as its target.
# NOTE(review): target_path ends in '.zip' although it is passed as the
# extraction target — confirm this is intended.
zip_path = Path('../.tmp/mathpix/downloads/data.zip')
target_path = Path('../.tmp/mathpix/data.zip')

ZipExtractorUtil.extract(zip_path, target_path)

# TODO find .tex file in extracted dir, convert to katex, save to minio