In [1]:
from typing import TYPE_CHECKING


# Type-checker-only declarations: `TYPE_CHECKING` is False at runtime, so this
# branch never executes and does not bind the two container names.
# NOTE(review): the containers are presumably injected into the notebook
# namespace by `hooks.notebook_hook` (loaded in the next cell) — confirm.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [2]:
# NOTE(review): RESET is presumably read by the extension loaded on the next
# line, so it is set before `%load_ext` runs — confirm against the hook.
RESET = False
%load_ext hooks.notebook_hook

2025-07-11 11:49:22,098 - INFO - datasets - config.py:54 - PyTorch version 2.6.0 available.


In [3]:
from pathlib import Path


# Resolve the Google Drive repository and the article parser from the
# infrastructure container.
google_drive_repository = infrastructure_container.google_drive_repository()
math_article_parser_service = infrastructure_container.math_article_parser_service()

# Path of the LaTeX source to load, as understood by `get_file_id`.
article_path = Path('ml/lectures/L07-LogisticRegression2/2024_08_10_2174b40686820b4cb591g.tex')
file_id = google_drive_repository.get_file_id(article_path)

if not file_id:
    # Include the path so a failed lookup is diagnosable from the traceback alone.
    raise ValueError(f'No Google Drive file id resolved for {article_path}')

# `file_content` is later read with `.getvalue()` when building the MathArticle.
file_content = google_drive_repository.get_file_by_id(file_id)

2025-07-11 11:49:22,899 - INFO - googleapiclient.discovery_cache - __init__.py:49 - file_cache is only supported with oauth2client<4.0.0


In [None]:
from math_rag.core.models import MathExpressionIndex


# Index entity whose `id` is attached to the expressions, descriptions and groups
# created in later cells. The name is notebook-global: do not reuse `index` as a
# loop variable or dictionary key elsewhere.
index = MathExpressionIndex()

In [None]:
# Assistants resolved from the application container.
katex_corrector_assistant = application_container.katex_corrector_assistant()
katex_corrector_retrier_assistant = application_container.katex_corrector_retrier_assistant()
math_expression_description_writer_assistant = (
    application_container.math_expression_description_writer_assistant()
)
math_expression_description_optimizer_assistant = (
    application_container.math_expression_description_optimizer_assistant()
)
math_expression_comparator_assistant = application_container.math_expression_comparator_assistant()
math_expression_relationship_description_writer_assistant = (
    application_container.math_expression_relationship_description_writer_assistant()
)

# Embedder, repositories and the clustering service. Note that the embedder and
# the clusterer come from the application container, while the repositories come
# from the infrastructure container.
default_embedder = application_container.default_embedder()
math_expression_description_opt_repository = (
    infrastructure_container.math_expression_description_opt_repository()
)
math_expression_description_repository = (
    infrastructure_container.math_expression_description_repository()
)
math_expression_group_repository = infrastructure_container.math_expression_group_repository()
math_expression_repository = infrastructure_container.math_expression_repository()
clusterer_service = application_container.clusterer_service()

## Nodes

### 1

In [None]:
from math_rag.core.models import MathArticle, MathExpression


# Wrap the downloaded bytes in a MathArticle that is not attached to any
# dataset or index (both ids are None).
math_article = MathArticle(
    bytes=file_content.getvalue(),
    name='article',
    math_expression_index_id=None,
    math_expression_dataset_id=None,
)

# The parser result unpacks into three values; the middle one is not used here.
math_nodes, _, template = math_article_parser_service.parse_for_index(math_article)

# Remove leading/trailing `$` characters from each node's LaTeX.
katexes = list(map(lambda math_node: math_node.latex.strip('$'), math_nodes))

In [8]:
# Show the parsed template; in the captured output math appears as
# `[math_placeholder | N]` markers.
print(template)


Jan Šnajder, lectures, v2.0

Last time we introduced the logistic regression algorithm. We defined the model and derived the cross-entropy error function as the negative probability of the labels in the training set. We established that minimizing that error had no solution in closed form, so we turned to iterative procedures. We have considered the simplest such procedure, the gradient descent algorithm, and we applied it to logistic regression, in standard (batch) and stochastic variant. In the end, we talked about regularization, specifically [math_placeholder | 0] regularization, which we incorporated quite straightforwardly into the optimization process.

Today we'll talk a bit more about logistic regression. First, we'll consider some more efficient (read: faster) alternatives to gradient descent. Second, we'll consider the extension of binary logistic regression to multiclass logistic regression. Third, we'll look at all the models discussed thus far and see what they have in c

In [None]:
katex_corrector_service = application_container.katex_corrector_service()

valid_katexes = await katex_corrector_service.correct(katexes, max_num_retries=3)
math_expressions = [
    MathExpression(
        math_article_id=math_article.id,
        math_expression_dataset_id=None,
        math_expression_group_id=None,
        math_expression_index_id=index.id,
        latex=node.latex,
        katex=katex,
        position=node.position,
        is_inline=node.is_inline,
    )
    for node, katex in zip(math_nodes, valid_katexes)
]
math_expressions.sort(key=lambda x: x.position)

### 2

In [None]:
# TODO
from uuid import UUID

from pydantic import BaseModel


class MathArticleTemplate(BaseModel):
    """Pydantic model pairing a template text with a math article id and a math expression index id.

    All fields are required; `id` has no default and must be supplied by the caller.

    NOTE(review): `text` presumably holds the article with math replaced by
    `[math_placeholder | N]` markers — confirm.
    """

    id: UUID
    math_article_id: UUID
    math_expression_index_id: UUID
    text: str


class MathArticleTemplateChunk(BaseModel):
    """Pydantic model pairing a chunk of text with the id of a math article.

    All fields are required; `id` has no default and must be supplied by the caller.

    NOTE(review): unlike the sibling template models in this cell, there is no
    `math_expression_index_id` field — confirm whether that is intentional.
    """

    id: UUID
    math_article_id: UUID
    text: str


class MathExpressionContextTemplate(BaseModel):
    """Pydantic model pairing a context text with math expression ids and an index id.

    All fields are required; `id` has no default and must be supplied by the caller.

    NOTE(review): how `math_expression_id` and `math_expression_ids` relate is
    not visible here (the field is still marked TODO) — confirm.
    """

    id: UUID
    math_expression_id: UUID
    math_expression_ids: list[UUID]  # TODO
    math_expression_index_id: UUID
    text: str

In [None]:
from math_rag.infrastructure.utils import (
    TemplateChunkerUtil,
    TemplateContextChunkerUtil,
    TemplateFormatterUtil,
    TemplateIndexFinderUtil,
)


# Split the parsed template two ways: `contexts` feed the description-writer
# inputs, `chunks` are iterated in the Relationships section.
# NOTE(review): the units of the size limits (characters vs. tokens) are not
# visible from this notebook — confirm against the util implementations.
contexts = TemplateContextChunkerUtil.chunk(template, max_context_size=1000)
chunks = TemplateChunkerUtil.chunk(template, max_window_size=2048, max_padding=256)

In [None]:
# Inspect the second context produced by the context chunker.
print(contexts[1])

("above consideration, we can conclude that the batch gradient descent could be improved if we take into account not only the slope (gradient) but also the curvature (the change in gradient, i.e., the second derivative) of the error function. Such optimization methods are referred to as second-order optimization, as opposed to first-order optimization methods, such as gradient descent. The basic second-order optimization method is the Newton's method.\n\n\nConsider minimization of function [math_placeholder | 1]. We know that the parameter update in gradient descent is as follows:\n\n[math_placeholder | 2]\n\nIf we introduce an index for the iterations, then we can write this as an equation:\n\n[math_placeholder | 3]\n\nThe idea with Newton's method is to take the point [math_placeholder | 4] (the current minimum) and compute at it the quadratic approximation of the function [math_placeholder | 5], and then move to the minimizer of this quadratic approximation (which is known analytica

In [None]:
from math_rag.application.models.assistants.inputs import (
    MathExpressionDescriptionWriter as AssistantInput,
)
from math_rag.core.models import MathExpressionDescription
from math_rag.infrastructure.constants.services import MATH_TEMPLATE


SLICE = 5  # TODO remove later, just for testing


# Lookup tables keyed by each expression's ordinal in the position-sorted
# `math_expressions` list.
# NOTE(review): assumes that ordinal equals the N of the template's
# `[math_placeholder | N]` markers — confirm against the parser.
index_to_katex = {i: math_expression.katex for i, math_expression in enumerate(math_expressions)}
index_to_math_expression_id = {
    i: math_expression.id for i, math_expression in enumerate(math_expressions)
}

# Accumulators filled by the loop below: the assistant inputs and, for each
# input id, the id of the math expression it describes.
inputs: list[AssistantInput] = []
input_id_to_math_expression_id: dict[UUID, UUID] = {}

# Build one assistant input per (context, placeholder index) pair for the
# first SLICE contexts.
for context in contexts[:SLICE]:
    formatted_context, indexes = TemplateFormatterUtil.format(
        context, index_to_katex, omit_wrapper=False
    )

    for i in indexes:
        # Key the lookup by the placeholder index `i`. The notebook-global `index`
        # is the MathExpressionIndex object and is never a key of this dict.
        katex = index_to_katex[i]
        # Named `assistant_input` so the builtin `input` is not rebound at
        # notebook scope.
        assistant_input = AssistantInput(
            katex=MATH_TEMPLATE.format(katex=katex, index=i), context=formatted_context
        )
        inputs.append(assistant_input)

        # Remember which expression this input describes so outputs can be
        # mapped back through their `input_id`.
        input_id_to_math_expression_id[assistant_input.id] = index_to_math_expression_id[i]

# Send all inputs through the description writer's `concurrent_assist`.
outputs = await math_expression_description_writer_assistant.concurrent_assist(inputs)

# Turn each output into a MathExpressionDescription, resolving the owning
# expression through the input id recorded while the inputs were built.
# NOTE(review): the same expression can occur in several contexts, in which case
# it receives several descriptions (visible as repeats in the printed output).
math_expression_descriptions = [
    MathExpressionDescription(
        math_expression_index_id=index.id,
        math_expression_id=input_id_to_math_expression_id[output.input_id],
        description=output.description,
    )
    for output in outputs
]

In [10]:
# Index the inputs by id so each output can be shown next to the KaTeX it describes.
input_id_to_input = {
    assistant_input.id: assistant_input for assistant_input in inputs
}

for output in outputs:
    # One record per output: KaTeX, description, separator line, blank line.
    print(
        input_id_to_input[output.input_id].katex,
        output.description,
        '-----',
        '',
        sep='\n',
    )

[4 | \mathbf{x}_{t}]
the value of the parameter vector at iteration t in an iterative optimization process
-----

[5 | f(\mathrm{x})]
the function whose minimization is being considered in the context of optimization methods, specifically in the discussion of gradient descent and Newton's method
-----

[1 | f(\mathbf{x})]
the function being minimized in the context of optimization methods, where the input is a vector variable
-----

[4 | \mathbf{x}_{t}]
The current value of the parameter vector at iteration t in an iterative optimization algorithm.
-----

[6 | f]
the function being minimized in the context of optimization methods, specifically referenced as the objective function in both gradient descent and Newton's method
-----

[1 | f(\mathbf{x})]
A function evaluated at a vector variable, representing the objective function to be minimized in the context of optimization methods such as gradient descent and Newton's method.
-----

[8 | x_{0}]
the initial value or starting point for 

In [None]:
# math_expression_description_repository.insert_many(math_expression_descriptions)

### 3

In [None]:
from math_rag.application.models.assistants.inputs import (
    MathExpressionDescriptionOptimizer as AssistantInput,
)
from math_rag.application.utils import InputCreatorUtil
from math_rag.core.models import MathExpressionDescriptionOpt


inputs, input_id_to_item = InputCreatorUtil.create(
    math_expression_descriptions, lambda x: AssistantInput(description=x.description)
)
outputs = await math_expression_description_optimizer_assistant.concurrent_assist(inputs)
math_expression_descriptions_opt = [
    MathExpressionDescriptionOpt(
        math_expression_id=input_id_to_item[output.input_id].math_expression_id,
        math_expression_description_id=input_id_to_item[output.input_id].id,
        math_expression_index_id=index.id,
        description=output.description,
    )
    for output in outputs
]

In [13]:
# NOTE(review): this lookup is rebuilt for the current `inputs` but is not read
# in this cell.
input_id_to_input = {
    assistant_input.id: assistant_input for assistant_input in inputs
}

for output in outputs:
    # One record per output: description, separator line, blank line.
    print(output.description, '-----', '', sep='\n')

Current value of the parameter vector at iteration t in an iterative optimization algorithm.
-----

Objective function minimized in optimization methods, specifically in gradient descent and Newton's method.
-----

Function targeted for minimization in optimization methods, particularly in gradient descent and Newton's method.
-----

Function of a variable minimized in optimization methods such as gradient descent and Newton's method.
-----

value of the parameter vector at iteration t in an iterative optimization process
-----

Function minimized using Newton's method, evaluated at variable x.
-----

L2 regularization is a penalty term in optimization that prevents overfitting by discouraging large parameter values.
-----

Standard parameter update rule in gradient descent: the current parameter vector is updated by subtracting the product of the learning rate and the gradient of a function with respect to the parameter vector.
-----

Function minimized in optimization methods with a 

In [None]:
from math_rag.application.models.embedders import EmbedderInput


# One EmbedderInput per optimized description, embedded via `concurrent_embed`.
# NOTE: this rebinds the notebook-global `inputs`, `input_id_to_item` and
# `outputs` that earlier cells also assign.
inputs, input_id_to_item = InputCreatorUtil.create(
    math_expression_descriptions_opt, lambda x: EmbedderInput(text=x.description)
)
outputs = await default_embedder.concurrent_embed(inputs)

In [None]:
from more_itertools import unzip


# Embedding for each embedder input, keyed by the input id carried on the output.
input_id_to_embedding = {output.input_id: output.embedding for output in outputs}


# Materialize two aligned lists in the order of `inputs`. Plain comprehensions
# replace `more_itertools.unzip`, which returns lazy one-shot iterators and an
# empty tuple for empty input (making the two-name unpacking raise ValueError).
descriptions = [input_id_to_item[embedder_input.id] for embedder_input in inputs]
embeddings = [input_id_to_embedding[embedder_input.id] for embedder_input in inputs]

# TODO save to the mongo as well
# await math_expression_description_opt_repository.upsert_many(descriptions, embeddings)

### 4

In [None]:
# Pass the clusterer's bound `cluster` method to the repository's `cluster` call.
# NOTE(review): presumably the repository applies the callback to its stored
# embeddings and returns groups of math expression ids — confirm.
callback = clusterer_service.cluster
clusters = await math_expression_description_opt_repository.cluster(callback)

In [None]:
import os

from sklearn.datasets import make_blobs


# Keep this assignment before `import umap`.
# NOTE(review): presumably the variable is read when umap (via numba) is first
# imported, which is why the import is placed mid-cell — confirm.
os.environ['NUMBA_CPU_FEATURES'] = str()  # avoid kernel crash on arm
import umap


# synthetic data
# 500 samples with 10 features drawn around 5 centers; `y` holds each sample's
# blob label.
X, y = make_blobs(
    n_samples=500,
    centers=5,
    n_features=10,
    cluster_std=1.0,
    random_state=42,
)

# Reduce to 2 components for plotting. `random_state=None` leaves UMAP unseeded,
# so the layout can differ between runs even though the blobs are seeded.
reducer = umap.UMAP(
    n_components=2,
    metric='euclidean',
    random_state=None,
)
X_umap = reducer.fit_transform(X)

In [None]:
import pandas as pd
import plotly.express as px


# Tabulate the two UMAP components together with each sample's blob label.
df = pd.DataFrame(
    {
        'UMAP1': X_umap[:, 0],
        'UMAP2': X_umap[:, 1],
        'cluster': y,
    }
)

# Scatter plot of the 2-D projection, colored by the `cluster` column.
fig = px.scatter(
    df,
    x='UMAP1',
    y='UMAP2',
    color='cluster',
    hover_data=['cluster'],
)
fig.show()

### 5

In [None]:
from itertools import combinations

from math_rag.application.models.assistants.inputs import MathExpressionComparator as AssistantInput
from math_rag.core.models import MathExpressionGroup


# NOTE: each cluster must consist of math expression ids
for cluster in clusters:
    # Rebinds the notebook-global `descriptions` assigned in an earlier cell.
    descriptions = await math_expression_description_opt_repository.find_many(cluster)

    # NOTE(review): placeholder — every unordered pair is enumerated, but nothing
    # is done with the pairs yet.
    for pair in list(combinations(descriptions, 2)):
        pass

    # TODO how to get context

    # NOTE(review): work in progress — inputs are built from every entry of
    # `math_expression_descriptions` (not from this cluster's pairs) and all
    # comparator fields are None, so `outputs` is not meaningful yet.
    inputs, input_id_to_item = InputCreatorUtil.create(
        math_expression_descriptions,
        lambda x: AssistantInput(katex=None, context=None, other_katex=None, other_context=None),
    )
    outputs = await math_expression_comparator_assistant.concurrent_assist(inputs)

    # A group is constructed per cluster but not persisted while the repository
    # calls below stay commented out.
    group = MathExpressionGroup(math_expression_index_id=index.id)
    # await math_expression_group_repository.insert_one(group)
    # await math_expression_repository.update_group_id(cluster, group.id)

In [2]:
import math


# Number of unordered pairs among 50 items.
# NOTE(review): presumably a sizing check for pairwise comparison within a
# 50-element cluster — confirm.
math.comb(50, 2)

1225

## Relationships

### 1

In [None]:
from math_rag.application.models.assistants.inputs import (
    MathExpressionRelationshipDescriptionWriter as AssistantInput,
)


# TODO MISSING RELATIONSHIP DETECTOR PROMPT???

# TODO
# iterate over chunks
# send input for each (previous index, last index) pair

# TODO add direction for each relationship

# NOTE(review): stub — `indexes` is computed but not used, the inputs are built
# from `math_expression_descriptions` rather than from the chunk, and the
# AssistantInput fields are literal Ellipsis placeholders.
for chunk in chunks:
    indexes = TemplateIndexFinderUtil.find(chunk)

    inputs, input_id_to_item = InputCreatorUtil.create(
        math_expression_descriptions,
        lambda x: AssistantInput(chunk=..., source=..., target=...),
    )

In [None]:
from math_rag.core.models import MathExpressionRelationship

In [None]:
from math_rag.core.models import MathExpressionRelationshipDescription