In [1]:
from typing import TYPE_CHECKING


# Static-analysis-only declarations: TYPE_CHECKING is False at runtime, so nothing in
# this branch is executed when the cell runs.
# NOTE(review): the container objects are presumably injected into the notebook namespace
# by the `hooks.notebook_hook` extension loaded in a later cell — confirm.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    # Bare annotations tell type checkers/IDEs the container types without binding a value.
    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [2]:
# NOTE(review): RESET is not read by any visible cell; presumably the extension loaded
# below consumes it — confirm.
RESET = False
# Load the IPython extension module `hooks.notebook_hook`.
%load_ext hooks.notebook_hook

2025-06-22 21:37:49,533 - INFO - datasets - config.py:54 - PyTorch version 2.6.0 available.


In [3]:
from pathlib import Path


# Path handed to the repository's `get_file_id` lookup; named so it can be reported on failure.
lecture_tex_path = Path('ml/lectures/L07-LogisticRegression2/2024_08_10_2174b40686820b4cb591g.tex')

google_drive_repository = infrastructure_container.google_drive_repository()

file_id = google_drive_repository.get_file_id(lecture_tex_path)

if not file_id:
    # Fail loudly with the offending path instead of a message-less ValueError.
    raise ValueError(f'No Google Drive file id found for path: {lecture_tex_path}')

# Download the file content; later cells call `.getvalue()` on the returned object.
file_content = google_drive_repository.get_file_by_id(file_id)

2025-06-22 21:37:50,168 - INFO - googleapiclient.discovery_cache - __init__.py:49 - file_cache is only supported with oauth2client<4.0.0


In [4]:
# Decode the downloaded content to text; UTF-8 is spelled out here and is also the
# default of bytes.decode(). Assumes getvalue() returns bytes — TODO confirm.
latex = file_content.getvalue().decode('utf-8')

In [4]:
from math_rag.core.models import MathArticle


# Wrap the downloaded content in a MathArticle; the dataset and index ids are left unset.
math_article = MathArticle(
    math_expression_dataset_id=None,
    index_id=None,
    name='article',
    bytes=file_content.getvalue(),
)

In [6]:
# Obtain the article parser service from the infrastructure container.
math_article_parser_service = infrastructure_container.math_article_parser_service()

In [7]:
# parse_for_index returns three values, unpacked here. Of these, only `math_nodes_` and
# `template` are used by the visible cells below; `positions` is currently unused.
math_nodes_, positions, template = math_article_parser_service.parse_for_index(math_article)

In [8]:
# Show the parsed template; math appears as `[math_placeholder | i]` markers in the output.
print(template)


Jan Šnajder, lectures, v2.0

Last time we introduced the logistic regression algorithm. We defined the model and derived the cross-entropy error function as the negative probability of the labels in the training set. We established that minimizing that error had no solution in closed form, so we turned to iterative procedures. We have considered the simplest such procedure, the gradient descent algorithm, and we applied it to logistic regression, in standard (batch) and stochastic variant. In the end, we talked about regularization, specifically [math_placeholder | 0] regularization, which we incorporated quite straightforwardly into the optimization process.

Today we'll talk a bit more about logistic regression. First, we'll consider some more efficient (read: faster) alternatives to gradient descent. Second, we'll consider the extension of binary logistic regression to multiclass logistic regression. Third, we'll look at all the models discussed thus far and see what they have in c

In [None]:
from math_rag.infrastructure.utils import TemplateChunkerUtil


# Window-size limit passed to the chunker.
# NOTE(review): the unit (characters vs. tokens) is not visible here — confirm.
max_window_size = 1000

# Split the template into chunks, then display how many were produced.
chunks = TemplateChunkerUtil.chunk(template, max_window_size)
len(chunks)

163

In [None]:
# Dump every chunk, each followed by a blank line, a dashed separator and another blank line.
for chunk in chunks:
    print(chunk, '-------', sep='\n\n', end='\n\n')

In [None]:
import re


MATH_PLACEHOLDER_TEMPLATE = '[math_placeholder | {index}]'

# Regex form of MATH_PLACEHOLDER_TEMPLATE. The `{index}` slot becomes a capture group that
# matches only canonically written integers (no leading zeros, ASCII digits), i.e. exactly
# the text that MATH_PLACEHOLDER_TEMPLATE.format(index=i) produces for an int `i`.
_MATH_PLACEHOLDER_PATTERN = re.compile(
    re.escape(MATH_PLACEHOLDER_TEMPLATE).replace(re.escape('{index}'), r'(0|-?[1-9][0-9]*)')
)


def inject_latex(chunk: str, placeholder_index_to_latex: dict[int, str]) -> str:
    """
    Replace each [math_placeholder | i] in the chunk with [<latex> | i],
    preserving the index.

    The chunk is scanned once, so text that was just injected is never re-examined:
    LaTeX that happens to contain placeholder-shaped text is inserted verbatim instead
    of being substituted a second time.

    Args:
        chunk: Text containing zero or more math placeholders.
        placeholder_index_to_latex: Mapping from placeholder index to the LaTeX
            that should be shown in its place.

    Returns:
        The chunk with every placeholder whose index is in the mapping replaced;
        placeholders with an unknown index are left unchanged.
    """

    def _substitute(match: re.Match) -> str:
        index = int(match.group(1))

        if index not in placeholder_index_to_latex:
            # Unknown index: keep the placeholder text exactly as it was.
            return match.group(0)

        return f'[{placeholder_index_to_latex[index]} | {index}]'

    # A function replacement is used (not a template string) so that backslashes in the
    # LaTeX are inserted literally rather than being interpreted as regex escapes.
    return _MATH_PLACEHOLDER_PATTERN.sub(_substitute, chunk)

In [34]:
# Placeholder index i maps to the LaTeX of the i-th parsed math node.
placeholder_index_to_latex = dict(enumerate(node.latex for node in math_nodes_))

# Re-render every chunk with the LaTeX substituted for its placeholders.
formatted_chunks = [
    inject_latex(template_chunk, placeholder_index_to_latex) for template_chunk in chunks
]

In [37]:
# Spot-check one formatted chunk to see the injected LaTeX next to its index.
print(formatted_chunks[2])

[$$
\mathbf{x} \leftarrow \mathbf{x}-\eta \nabla f(\mathrm{x})
$$ | 2]

If we introduce an index for the iterations, then we can write this as an equation:

[$$
\mathbf{x}_{t+1}=\mathbf{x}_{t}-\eta \nabla f\left(\mathbf{x}_{t}\right)
$$ | 3]

The idea with Newton's method is to take the point [$\mathbf{x}_{t}$ | 4] (the current minimum) and compute at it the quadratic approximation of the function [$f(\mathrm{x})$ | 5], and then move to the minimizer of this quadratic approximation (which is known analytically). If [$f$ | 6] is a function of one variable, this would look like this:


[image_placeholder | 1]


The black curve is the function [$f(x)$ | 7] that we minimize. We start from point [$x_{0}$ | 8]. At this point we do a quadratic approximation of the function [$f$ | 9], thus obtaining a parabola that is tangential to the function [$f$ | 10] at the point [$x_{0}$ | 11]. The search then moves to a point that minimizes the quadratic approximation of [$f$ | 12]


In [None]:
# TODO make a "preview"

## Mathpix

In [None]:
# Obtain the Mathpix client from the infrastructure container.
mathpix_client = infrastructure_container.mathpix_client()

In [None]:
# Convert the image at this URL via the Mathpix client; the call's result is the cell output.
mathpix_client.convert_image(url='https://iili.io/FKvvD0J.png')

In [None]:
from pathlib import Path


# Single definition of the PDF location, reused in the call below instead of repeating the literal.
pdf_path = Path('../.tmp/test_formulas.pdf')
mathpix_client.convert_pdf(file_path=pdf_path)

In [None]:
from math_rag.shared.utils import ZipExtractorUtil


# Archive to unpack, and the path handed to the extractor as its second argument.
# NOTE(review): the target path also ends in `.zip` — confirm it is the intended destination.
zip_path = Path('../.tmp/mathpix/downloads/data.zip')
target_path = Path('../.tmp/mathpix/data.zip')

ZipExtractorUtil.extract(zip_path, target_path)

# TODO find .tex file in extracted dir, convert to katex, save to minio