In [None]:
from typing import TYPE_CHECKING


# Type-checker-only declarations: `TYPE_CHECKING` is False at runtime, so nothing
# inside this block executes when the cell is run.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    # Bare annotations (no value assigned) so static tools know the container types.
    # NOTE(review): the container objects themselves are presumably placed into the
    # notebook namespace by the `hooks.notebook_hook` extension loaded in the next
    # cell — confirm.
    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [None]:
# NOTE(review): `RESET` is not read by any visible cell; presumably the hook loaded
# below consumes it — confirm what a True value resets.
RESET = False
# IPython line magic: loads the `hooks.notebook_hook` extension into this kernel.
%load_ext hooks.notebook_hook

In [None]:
from pathlib import Path


# Resolve the services used by this notebook from the infrastructure container.
google_drive_repository = infrastructure_container.google_drive_repository()
math_article_parser_service = infrastructure_container.math_article_parser_service()

# Path handed to the repository lookup; named so the error below can report it.
tex_file_path = Path(
    'ml/lectures/L07-LogisticRegression2/2024_08_10_2174b40686820b4cb591g.tex'
)
file_id = google_drive_repository.get_file_id(tex_file_path)

# Fail fast with an actionable message when the lookup yields no usable id.
if not file_id:
    raise ValueError(f'Google Drive file id not found for path: {tex_file_path}')

# Fetch the file by id; a later cell calls `.getvalue()` on the result.
file_content = google_drive_repository.get_file_by_id(file_id)

In [None]:
from math_rag.core.models import MathArticle, MathExpression


# Wrap the downloaded file bytes in a MathArticle; both id fields are left as None.
math_article = MathArticle(
    math_expression_dataset_id=None,
    index_id=None,
    name='article',
    bytes=file_content.getvalue(),
)
# `parse_for_index` is unpacked into three values; the second one is discarded.
math_nodes_, _, template = math_article_parser_service.parse_for_index(math_article)
# NOTE(review): `...` (Ellipsis) is a placeholder, not a list. Until it is replaced with
# the expressions derived from `math_nodes_`, the `enumerate` call on the next statement
# raises TypeError because Ellipsis is not iterable.
math_expressions: list[MathExpression] = ...  # NOTE obtained from math_nodes_
# Map each expression's position in `math_expressions` to its `katex` attribute.
index_to_katex = {i: math_expression.katex for i, math_expression in enumerate(math_expressions)}

In [None]:
from math_rag.infrastructure.utils import TemplateContextChunkerUtil, TemplateFormatterUtil


# Split the parsed template into pieces bounded by `max_context_size`.
# NOTE(review): the unit of `max_context_size` (characters vs. tokens) is not visible
# here — confirm against TemplateContextChunkerUtil.
template_chunks = TemplateContextChunkerUtil.chunk(
    template,
    max_context_size=1000,
)

# Format every template piece with the index -> KaTeX mapping, keeping the wrapper
# (`omit_wrapper=False`).
chunks = [
    TemplateFormatterUtil.format(template_chunk, index_to_katex, omit_wrapper=False)
    for template_chunk in template_chunks
]

# TODO: send the chunks to the LLM through the following steps:
# 1. MathExpressionDescriptionWriter
# 2. MathExpressionDescriptionRefiner
# 3. MathExpressionLinker (?) -> checks whether two descriptions are actually the same
#    (embedding clustering beforehand)
#


class MathExpressionDescription:
    """Placeholder type with no fields or behavior defined yet.

    NOTE(review): presumably meant to hold the LLM-written description of a
    math expression — confirm once attributes are added.
    """

In [None]:
# Prompt pair for writing a description of a single math expression from its context.
# NOTE(review): presumably the "MathExpressionDescriptionWriter" step from the TODO
# list — confirm.
# NOTE: later cells reassign both `_SYSTEM_PROMPT_TEMPLATE` and `_USER_PROMPT_TEMPLATE`,
# so after running every cell only the last pair remains bound to these names.

# System message: sets the writer role and its constraints; contains no placeholders.
_SYSTEM_PROMPT_TEMPLATE = """
You are an expert math-expression description writer.

Your task is to produce a precise, self-contained description of a target mathematical expression, based strictly on the surrounding context.

### Instructions:
- Be concise and unambiguous.
- Only describe what can be inferred from the given context.
- Avoid introducing any external assumptions or definitions.
"""

# User message: expects the placeholders `{katex}` (the target expression) and
# `{context}` (the expression with its surrounding text) to be filled in.
_USER_PROMPT_TEMPLATE = """
### Math expression of interest:
{katex}

### Math expression with surrounding context:
{context}

### Description:
"""

In [None]:
# Prompt pair for condensing an existing description for embedding-based retrieval.
# NOTE(review): presumably the "MathExpressionDescriptionRefiner" step from the TODO
# list — confirm.
# NOTE: this cell rebinds the same two names used by the previous prompt cell.

# System message: sets the refiner role and its constraints; contains no placeholders.
_SYSTEM_PROMPT_TEMPLATE = """
You are an expert at optimizing text descriptions for vector-search embedding.

Your task is to refine a provided description by removing all non-essential phrasing and meta commentary, while preserving every factual detail exactly as given.

### Instructions:
- Eliminate filler language and introductory or self-referential statements (e.g., “This description…”).
- Retain all information and nuance present in the input.
- Produce a concise, information-dense output optimized for embedding and retrieval.
"""

# User message: expects the single placeholder `{description}` to be filled in.
_USER_PROMPT_TEMPLATE = """
### Original description:
{description}

### Optimized description:
"""

In [None]:
_SYSTEM_PROMPT_TEMPLATE = """
You are an expert math-expression comparator.

Your task is to decide whether two given mathematical expressions represent exactly the same entity, based solely on their surrounding contexts.

### Instructions:
- Rely only on information inferable from the two contexts.
- Determine exact equivalence: return true if and only if they are the same entity.
- Provide a concise reason justifying your decision.
- Do not introduce external assumptions or definitions.
- Output must be valid JSON with keys "identical" (boolean) and "reason" (string).
"""

_USER_PROMPT_TEMPLATE = """
### First math expression:
{katex1}

### Context for first expression:
{context1}

### Second math expression:
{katex2}

### Context for second expression:
{context2}

### Decision (JSON):
{
  "identical": <true/false>,
  "reason": "<your explanation here>"
}
"""