In [1]:
from typing import TYPE_CHECKING


# Static-typing aid only: this branch never executes at runtime, so the two
# container names below are declared for type checkers / IDE completion but are
# NOT assigned here.
# NOTE(review): presumably the real objects are injected into the kernel
# namespace by `hooks.notebook_hook` (loaded in the next cell) — confirm.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [2]:
# NOTE(review): RESET is defined before the extension is loaded, so it is
# presumably read by `hooks.notebook_hook` during loading — confirm what it resets.
RESET = False
# Load the project's notebook extension.
%load_ext hooks.notebook_hook

2025-07-14 04:30:09,539 - INFO - datasets - config.py:54 - PyTorch version 2.6.0 available.


In [3]:
from logging import getLogger
from pathlib import Path

import aiofiles
import magic

from dependency_injector.wiring import Provide, inject
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile

from math_rag.application.base.clients import BaseLatexConverterClient
from math_rag.application.base.repositories.documents import BaseMathProblemRepository
from math_rag.application.containers import ApplicationContainer
from math_rag.core.models import MathProblem

In [10]:
# MIME types an upload may have; checked by the validation step.
CONTENT_TYPES = [
    'image/png',
    'application/pdf',
]

# Directory uploads are written to (relative to the kernel's working directory).
UPLOADS_DIR_PATH = Path('../.tmp/uploads')

# Number of bytes copied per read when streaming an upload to disk (1 MiB).
CHUNK_SIZE = 1024**2

In [5]:
import io

from pathlib import Path

from starlette.datastructures import UploadFile


class TestUploadFile(UploadFile):
    """Stand-in for an HTTP upload, built from a local binary file handle.

    Lets notebook code exercise functions that expect an ``UploadFile`` without
    going through a real request. On top of the base class it accepts an
    explicit ``content_type``, which is stored in the headers and also kept on
    the instance.

    Args:
        file: Open binary file object backing the upload.
        filename: Name reported for the upload.
        size: Size reported for the upload, in bytes.
        headers: Extra headers; keys are treated case-insensitively.
        content_type: Declared MIME type; when truthy it overrides any
            ``content-type`` entry supplied in ``headers``.
    """

    def __init__(
        self,
        file: io.BufferedReader,
        *,
        filename: str | None = None,
        size: int | None = None,
        headers: dict[str, str] | None = None,
        content_type: str | None = None,
    ):
        # Imported locally so this cell does not rely on another cell's imports.
        from starlette.datastructures import Headers

        # Lower-case the keys first so a caller-supplied 'Content-Type' cannot
        # coexist with (and shadow) the explicit `content_type` argument.
        normalized = {key.lower(): value for key, value in (headers or {}).items()}
        if content_type:
            normalized['content-type'] = content_type

        # The base class expects a `Headers` instance rather than a plain dict;
        # `Headers` provides case-insensitive lookup.
        super().__init__(
            file=file,
            filename=filename,
            size=size,
            headers=Headers(normalized),
        )
        self._content_type = content_type

    @property
    def content_type(self) -> str | None:
        """Explicit content type if one was given, else the header-derived value."""
        return self._content_type or super().content_type

In [27]:
# Pick the sample input. Despite its name, `pdf_path` currently points at a PNG;
# swap the two assignments below to exercise the PDF flow instead.
# pdf_path = Path('../.tmp') / '07short.pdf'
pdf_path = Path('../.tmp') / 'image.png'

# The handle opened here is not closed in this cell; `write_file` closes the
# upload once it has finished copying it.
# NOTE(review): `content_type` is hard-coded and must be changed together with
# the path above.
file = TestUploadFile(file=pdf_path.open('rb'), filename=pdf_path.name, content_type='image/png')

In [28]:
# Resolve the collaborators from the dependency-injection containers.
mathpix_client = application_container.latex_converter_client()
repository = infrastructure_container.math_problem_repository()

# Backward-compatible alias for the original, misspelled variable name; remove
# once no cell refers to it any more.
respository = repository

In [29]:
async def validate_file(file: UploadFile):
    """Reject an upload unless its declared and its sniffed MIME type are allowed.

    Two checks run in order, each against ``CONTENT_TYPES``:

    1. the content type declared on the upload (``file.content_type``);
    2. the MIME type ``magic.from_buffer`` reports for the first 1024 bytes.

    The upload's read position is moved back to 0 after the sample is taken.

    Raises:
        HTTPException: status 400 when either check fails.
    """
    declared_type = file.content_type
    if declared_type not in CONTENT_TYPES:
        raise HTTPException(status_code=400, detail=f'Invalid file type: {declared_type}')

    # Sample the leading bytes, then rewind so later readers start from byte 0.
    leading_bytes = await file.read(1024)
    await file.seek(0)

    sniffed_type = magic.from_buffer(leading_bytes, mime=True)
    if sniffed_type in CONTENT_TYPES:
        return

    raise HTTPException(status_code=400, detail=f'Invalid file type: {sniffed_type}')


async def write_file(file: UploadFile, dir: Path) -> Path:
    """Stream an upload to ``dir`` and return the path it was written to.

    The destination name is the final path component of ``file.filename``;
    any directory part supplied by the client is discarded so the file cannot
    be written outside ``dir``. ``dir`` is created if it does not exist. The
    upload is closed before returning, whether or not the copy succeeded.

    Args:
        file: Upload to persist; it is read from position 0.
        dir: Directory to write into.

    Returns:
        Path of the written file (``dir / <sanitised file name>``).

    Raises:
        HTTPException: status 400 when the upload has no usable file name.
    """
    try:
        # Treat both '/' and '\\' as separators and keep only the last
        # component, so names such as '../../x' or 'C:\\dir\\x' cannot escape
        # `dir`.
        raw_name = (file.filename or '').replace('\\', '/')
        safe_name = Path(raw_name).name

        # `Path('..').name` is '..', and empty or '.' names have no file part.
        if safe_name in ('', '.', '..'):
            raise HTTPException(status_code=400, detail=f'Invalid file name: {file.filename}')

        dir.mkdir(parents=True, exist_ok=True)
        path = dir / safe_name

        await file.seek(0)

        async with aiofiles.open(path, 'wb') as out_file:
            # Copy in CHUNK_SIZE pieces so large uploads are never held in
            # memory as a whole.
            while chunk := await file.read(CHUNK_SIZE):
                await out_file.write(chunk)

    finally:
        # Release the upload's underlying file even when the copy fails.
        await file.close()

    return path

In [30]:
# Run the upload pipeline on the sample file: validate first (raises
# HTTPException on a disallowed type), then persist it under UPLOADS_DIR_PATH.
# `path` is the location of the stored copy and is used by the cells below.
await validate_file(file)
path = await write_file(file, UPLOADS_DIR_PATH)

In [31]:
# Route the stored upload to the matching converter based on its extension.
# The comparison is case-insensitive so a file named e.g. 'paper.PDF' is still
# sent to the PDF converter instead of falling through to the image branch.
#
# NOTE: only ONE of `tex_zip_bytes` / `text` is (re)assigned per run. Cells
# below that read the other name depend on a value left over from an earlier
# run with the other file type.
if path.suffix.lower() == '.pdf':
    tex_zip_bytes = mathpix_client.convert_pdf(file_path=path, url=None)

else:
    text = mathpix_client.convert_image(file_path=path, url=None)

# Draft of the persistence step (not enabled yet):
# math_problem = MathProblem(
#     latex=...,
#     katex=...,
#     is_inline=...,
# )
# await repository.insert_one(math_problem)

# TODO mock UploadFile in notebook and test there!

2025-07-14 05:26:32,308 - mathpix - INFO - Creating new Image: path=../.tmp/uploads/image.png
2025-07-14 05:26:32,308 - INFO - mathpix - mathpix_client.py:61 - Creating new Image: path=../.tmp/uploads/image.png
2025-07-14 05:26:32,310 - mathpix - INFO - Processing image: path=../.tmp/uploads/image.png, url=, include_line_data=False
2025-07-14 05:26:32,310 - INFO - mathpix - image.py:65 - Processing image: path=../.tmp/uploads/image.png, url=, include_line_data=False
2025-07-14 05:26:34,126 - mathpix - INFO - OCR processing successful
2025-07-14 05:26:34,126 - INFO - mathpix - image.py:83 - OCR processing successful


In [35]:
from pathlib import Path


# Persist the converted text as a .tex file.
# `text` is only assigned by the image branch of the conversion cell, so this
# cell requires that branch to have run in the current kernel session.
tex_bytes = text.encode('utf-8')

# Relative path: the file is created in the kernel's current working directory.
tex_path = Path('output.tex')
# `write_bytes` returns the number of bytes written, which the cell displays.
tex_path.write_bytes(tex_bytes)  # file now exists with the .tex content

247

In [34]:
# Inspect the stored file's name without its extension (the same value
# `extract_tex_zip` uses to name its extraction directory).
path.stem

'image'

In [22]:
import io
import zipfile

from pathlib import Path


def extract_tex_zip(tex_zip_bytes: bytes, file_path: Path) -> Path:
    """Unpack an in-memory zip archive into a directory next to ``file_path``.

    The target directory sits beside ``file_path`` and is named after its stem
    (``uploads/paper.pdf`` -> ``uploads/paper``). It is created, including any
    missing parents, before the archive is opened.

    Args:
        tex_zip_bytes: Raw bytes of the zip archive.
        file_path: Path whose parent and stem determine the target directory.

    Returns:
        The directory the archive was extracted into.
    """
    archive_stream = io.BytesIO(tex_zip_bytes)

    target_dir = file_path.parent.joinpath(file_path.stem)
    target_dir.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(archive_stream) as archive:
        archive.extractall(path=target_dir)

    return target_dir


# Unpack the PDF conversion result next to the stored upload.
# `tex_zip_bytes` is only assigned by the PDF branch of the conversion cell, so
# this cell requires that branch to have run in the current kernel session.
extract_dir = extract_tex_zip(tex_zip_bytes, path)
# Display the extraction directory.
extract_dir

PosixPath('../.tmp/uploads/07short')

In [24]:
def get_tex_file_data(dir_path: Path) -> tuple[str, bytes]:
    """Return the name and contents of a ``.tex`` file one level below ``dir_path``.

    Only the immediate subdirectories of ``dir_path`` are searched; ``.tex``
    files directly inside ``dir_path`` are ignored. Subdirectories and files
    are visited in sorted order, so the result does not depend on the
    filesystem's listing order.

    Args:
        dir_path: Directory containing one or more subdirectories.

    Returns:
        ``(file_name, file_bytes)`` of the first ``.tex`` file found.

    Raises:
        FileNotFoundError: if ``dir_path`` does not exist, or if none of its
            subdirectories contains a ``.tex`` file.
    """
    nested_dirs = sorted(p for p in dir_path.iterdir() if p.is_dir())

    for nested_dir in nested_dirs:
        # `is_file()` skips directories that merely happen to end in '.tex'.
        tex_files = sorted(
            p for p in nested_dir.iterdir() if p.is_file() and p.suffix == '.tex'
        )

        if tex_files:
            tex_file = tex_files[0]
            return tex_file.name, tex_file.read_bytes()

    # An explicit error instead of leaking StopIteration from a bare next().
    raise FileNotFoundError(f'No .tex file found in any subdirectory of {dir_path}')


# Pull the .tex file's name and raw contents out of the extracted archive.
file_name, file_bytes = get_tex_file_data(extract_dir)

In [None]:
from math_rag.core.models import MathArticle


# Wrap the extracted .tex file in a MathArticle model.
# Both id fields are left as None here; the index id is still an open TODO.
# NOTE(review): confirm that MathArticle accepts None for both id fields.
math_article = MathArticle(
    math_expression_dataset_id=None,
    math_expression_index_id=None,  # TODO
    name=file_name,
    bytes=file_bytes,
)