In [1]:
from typing import TYPE_CHECKING


# Static-typing aid only: TYPE_CHECKING is False when the cell actually runs, so
# nothing inside this block is imported or bound in the kernel.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    # Annotation-only declarations (no value assigned) so editors and type checkers
    # know the types of the two container names that later cells use.
    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [2]:
# Defined before the extension is loaded; presumably read by hooks.notebook_hook
# to decide whether to reset state — TODO confirm what it resets.
RESET = False
# NOTE(review): later cells use `application_container` / `infrastructure_container`
# without ever assigning them, so this extension presumably injects them — confirm.
%load_ext hooks.notebook_hook

2025-07-04 19:16:02,497 - INFO - datasets - config.py:54 - PyTorch version 2.6.0 available.


In [3]:
from pathlib import Path


# Resolve the services this notebook needs from the infrastructure container.
google_drive_repository = infrastructure_container.google_drive_repository()
math_article_parser_service = infrastructure_container.math_article_parser_service()

# Path handed to the Drive repository to look up the article's LaTeX source.
article_path = Path(
    'ml/lectures/L07-LogisticRegression2/2024_08_10_2174b40686820b4cb591g.tex'
)
file_id = google_drive_repository.get_file_id(article_path)

# Fail loudly, and say which file was missing, instead of raising an empty ValueError.
if not file_id:
    raise ValueError(f'No Google Drive file id found for {article_path}')

# `file_content` is consumed later through `file_content.getvalue()`.
file_content = google_drive_repository.get_file_by_id(file_id)

2025-06-28 19:47:06,688 - INFO - googleapiclient.discovery_cache - __init__.py:49 - file_cache is only supported with oauth2client<4.0.0


In [4]:
from math_rag.core.models import Index


# Single Index instance for this run; its `id` is stamped on the MathExpression and
# MathExpressionDescription records built in later cells.
# NOTE(review): `index` here is the model object, not an integer counter.
index = Index()

In [5]:
# Resolve every assistant from the application container up front.
# In the cells visible in this part of the notebook only the description writer and
# the description optimizer are actually called; the rest are resolved but not yet used.

# KaTeX correction assistants.
katex_corrector_assistant = application_container.katex_corrector_assistant()
katex_corrector_retrier_assistant = application_container.katex_corrector_retrier_assistant()
# Description writing / optimizing assistants.
math_expression_description_writer_assistant = (
    application_container.math_expression_description_writer_assistant()
)
math_expression_description_optimizer_assistant = (
    application_container.math_expression_description_optimizer_assistant()
)
# Comparison and relationship-description assistants.
math_expression_comparator_assistant = application_container.math_expression_comparator_assistant()
math_expression_relationship_description_writer_assistant = (
    application_container.math_expression_relationship_description_writer_assistant()
)

### 1. Parse the article and build math expressions

In [6]:
from math_rag.core.models import MathArticle, MathExpression


# Wrap the downloaded bytes in a MathArticle that is not yet tied to a dataset or index.
math_article = MathArticle(
    name='article',
    bytes=file_content.getvalue(),
    index_id=None,
    math_expression_dataset_id=None,
)

# The parser returns three values; the middle one is not needed here.
math_nodes, _, template = math_article_parser_service.parse_for_index(math_article)

# Drop leading/trailing `$` characters from each node's LaTeX to get the raw KaTeX candidates.
katexes = [node.latex.strip('$') for node in math_nodes]

In [None]:
katex_corrector_service = application_container.katex_corrector_service()

valid_katexes = await katex_corrector_service.correct(katexes, max_num_retries=3)
math_expressions = [
    MathExpression(
        math_article_id=math_article.id,
        math_expression_dataset_id=None,
        index_id=index.id,
        latex=node.latex,
        katex=katex,
        position=node.position,
        is_inline=node.is_inline,
    )
    for node, katex in zip(math_nodes, valid_katexes)
]
math_expressions.sort(key=lambda x: x.position)

### 2. Describe math expressions in context

In [None]:
# TODO
from uuid import UUID

from pydantic import BaseModel


class MathExpressionContext(BaseModel):  # TODO unused at the moment
    """Draft model tying a math expression id to a text and an index-to-KaTeX mapping.

    Not referenced by any other cell visible in this notebook yet.
    """

    # Id of the expression this context belongs to.
    math_expression_id: UUID  # TODO
    # Presumably the formatted context text around the expression — TODO confirm.
    text: str
    # Integer index -> KaTeX string; presumably the same kind of mapping as the
    # `index_to_katex` dict built in the description-writer cell — TODO confirm.
    index_to_katex: dict[int, str]


# TODO connect everything with math expression id

In [None]:
from math_rag.infrastructure.utils import TemplateContextChunkerUtil, TemplateFormatterUtil


# Split the template returned by the article parser into context chunks.
# 1000 is passed as `max_context_size`; its unit (characters vs. tokens) is not
# visible from this notebook — TODO confirm.
context_chunks = TemplateContextChunkerUtil.chunk(template, max_context_size=1000)

In [None]:
from math_rag.application.models.assistants.inputs import (
    MathExpressionDescriptionWriter as AssistantInput,
)
from math_rag.core.models import MathExpressionDescription


SLICE = 5  # TODO remove later, just for testing


index_to_katex = {i: math_expression.katex for i, math_expression in enumerate(math_expressions)}
index_to_math_expression_id = {
    i: math_expression.id for i, math_expression in enumerate(math_expressions)
}

inputs: list[AssistantInput] = []
input_id_to_math_expression_id: dict[UUID, UUID] = {}

for chunk in context_chunks[:SLICE]:
    context, indexes = TemplateFormatterUtil.format(chunk, index_to_katex, omit_wrapper=False)

    for i in indexes:
        katex = index_to_katex[index]
        input = AssistantInput(katex=f'[{i} | {katex}]', context=context)
        inputs.append(input)

        input_id_to_math_expression_id[input.id] = index_to_math_expression_id[i]

outputs = await math_expression_description_writer_assistant.concurrent_assist(inputs)

math_expression_descriptions = [
    MathExpressionDescription(
        index_id=index.id,
        math_expression_id=input_id_to_math_expression_id[output.input_id],
        description=output.description,
    )
    for output in outputs
]

In [10]:
# Look up each writer input by id so an output can be shown next to its tagged KaTeX.
input_id_to_input = {assistant_input.id: assistant_input for assistant_input in inputs}

# Per output: tagged KaTeX, description, a separator line, then a blank line.
for output in outputs:
    print(
        input_id_to_input[output.input_id].katex,
        output.description,
        '-----',
        '',
        sep='\n',
    )

[4 | \mathbf{x}_{t}]
the value of the parameter vector at iteration t in an iterative optimization process
-----

[5 | f(\mathrm{x})]
the function whose minimization is being considered in the context of optimization methods, specifically in the discussion of gradient descent and Newton's method
-----

[1 | f(\mathbf{x})]
the function being minimized in the context of optimization methods, where the input is a vector variable
-----

[4 | \mathbf{x}_{t}]
The current value of the parameter vector at iteration t in an iterative optimization algorithm.
-----

[6 | f]
the function being minimized in the context of optimization methods, specifically referenced as the objective function in both gradient descent and Newton's method
-----

[1 | f(\mathbf{x})]
A function evaluated at a vector variable, representing the objective function to be minimized in the context of optimization methods such as gradient descent and Newton's method.
-----

[8 | x_{0}]
the initial value or starting point for 

In [None]:
# Resolve the repository that stores the (non-optimized) description records.
math_expression_description_repository = (
    infrastructure_container.math_expression_description_repository()
)

# Persist every description produced by the writer assistant.
# NOTE(review): this call is not awaited, whereas `upsert_many` on the optimized
# repository is — confirm that `insert_many` is synchronous.
math_expression_description_repository.insert_many(math_expression_descriptions)

### 3. Optimize, embed and store descriptions

In [None]:
from math_rag.application.models.assistants.inputs import (
    MathExpressionDescriptionOptimizer as AssistantInput,
)


inputs = [AssistantInput(description=description) for description in math_expression_descriptions]
outputs = await math_expression_description_optimizer_assistant.concurrent_assist(inputs)
math_expression_descriptions_optimized = [output.description for output in outputs]

In [13]:
# Rebinds the lookup to the optimizer inputs. It is not read in this cell; it is kept
# so the name still refers to the most recent `inputs`.
input_id_to_input = {assistant_input.id: assistant_input for assistant_input in inputs}

# Per output: optimized description, a separator line, then a blank line.
for output in outputs:
    print(output.description, '-----', '', sep='\n')

Current value of the parameter vector at iteration t in an iterative optimization algorithm.
-----

Objective function minimized in optimization methods, specifically in gradient descent and Newton's method.
-----

Function targeted for minimization in optimization methods, particularly in gradient descent and Newton's method.
-----

Function of a variable minimized in optimization methods such as gradient descent and Newton's method.
-----

value of the parameter vector at iteration t in an iterative optimization process
-----

Function minimized using Newton's method, evaluated at variable x.
-----

L2 regularization is a penalty term in optimization that prevents overfitting by discouraging large parameter values.
-----

Standard parameter update rule in gradient descent: the current parameter vector is updated by subtracting the product of the learning rate and the gradient of a function with respect to the parameter vector.
-----

Function minimized in optimization methods with a 

In [None]:
# Resolve the default embedder; it is not called in any cell visible here yet
# (the embeddings cell is still a TODO placeholder).
default_embedder = application_container.default_embedder()

In [None]:
# TODO embeddings

# Placeholder: `...` is the Ellipsis object, not real vectors. The upsert cell passes
# this value straight to `upsert_many`, so it has to be replaced with actual
# embeddings (presumably produced with `default_embedder`) before that cell is run.
embeddings = ...

In [None]:
# Resolve the repository that stores optimized descriptions together with embeddings.
math_expression_description_optimized_repository = (
    infrastructure_container.math_expression_description_optimized_repository()
)

# Upsert the optimized descriptions alongside their embeddings.
# NOTE(review): `math_expression_descriptions_optimized` holds the bare
# `output.description` values and `embeddings` is still the Ellipsis placeholder —
# confirm the argument types `upsert_many` expects before running this cell.
await math_expression_description_optimized_repository.upsert_many(
    math_expression_descriptions_optimized, embeddings
)

### 4. Compare math expressions

In [None]:
from math_rag.application.models.assistants.inputs import (
    MathExpressionComparator as AssistantInput,
)


# Placeholder comparator input with every field set to None; real values are not wired in yet.
# NOTE(review): assigning to `input` shadows the built-in of the same name for the rest
# of the kernel session.
input = AssistantInput(katex=None, context=None, other_katex=None, other_context=None)