In [1]:
from typing import TYPE_CHECKING


# Static-typing-only declarations: nothing inside this branch executes at
# runtime, so it neither imports the containers nor creates the two names.
# Later cells use `application_container` and `infrastructure_container`
# without assigning them in this notebook.
# NOTE(review): presumably the `hooks.notebook_hook` extension loaded in the
# next cell injects both objects into the kernel namespace — confirm.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [2]:
# NOTE(review): RESET is not read by any cell visible in this notebook;
# presumably the extension loaded below consumes it — confirm against
# hooks.notebook_hook before changing or removing it.
RESET = False
# Load the project's IPython extension. It must run before any cell that uses
# `application_container` or `infrastructure_container`, because this notebook
# never assigns those names itself.
%load_ext hooks.notebook_hook

2025-06-18 20:57:27,717 - INFO - PyTorch version 2.6.0 available.


### KaTeX correction

In [3]:
# Sample expression that fails to parse: `\w` is reported by KaTeX as an
# undefined control sequence (see the message below).
incorrect_katex = r'd\omega = \theta \w \omega'

# The parser message recorded for the expression above. The literal is split
# after the position marker purely for line length; adjacent raw strings are
# concatenated at compile time, so the value is a single string.
error = (
    r'KaTeX parse error: Undefined control sequence: \w at position 18: '
    r'…omega = \theta \̲w̲ ̲\omega'
)

In [4]:
from math_rag.application.models.assistants import KatexCorrectorAssistantInput


# Pair the failing expression with the parser error it produced; the next cell
# passes this object to the corrector assistant.
# NOTE(review): this rebinds `input`, shadowing the builtin of the same name
# for the rest of the kernel session. Renaming it requires updating the cell
# that calls `assist(input)` at the same time.
input = KatexCorrectorAssistantInput(katex=incorrect_katex, error=error)

In [5]:
katex_corrector_assistant = application_container.katex_corrector_assistant()

output = await katex_corrector_assistant.assist(input)
corrected_katex = output.katex
print(corrected_katex)
display(Math(corrected_katex))

d\omega = \theta \wedge \omega


NameError: name 'Math' is not defined

### Dataset

In [None]:
# Resolve the Hugging Face client and credentials from the infrastructure
# container, so no username or token is written into the notebook itself.
# NOTE(review): `hugging_face_api` is not used by any cell visible in this
# notebook — confirm whether it is still needed.
hugging_face_api = infrastructure_container.hugging_face_api()
hugging_face_username = infrastructure_container.config.hugging_face.username()
hugging_face_token = infrastructure_container.config.hugging_face.token()

In [None]:
from datasets import load_dataset
from datasets.download import DownloadConfig


# Dataset repository under the configured Hugging Face account.
repo_id = f'{hugging_face_username}/mathexpressiondataset'

# Download settings passed through to `load_dataset`: up to 3 retries, with
# tqdm progress output disabled.
download_config = DownloadConfig(
    max_retries=3,
    disable_tqdm=True,
)

# `split=None` selects no single split; later cells index the result by split
# name (`dataset_dict['train']`).
# NOTE(review): `trust_remote_code=True` presumably lets code shipped in the
# dataset repository run locally — confirm it is still required for this
# repository and drop it if the dataset loads without it.
dataset_dict = load_dataset(
    path=repo_id,
    split=None,
    download_config=download_config,
    token=hugging_face_token,
    trust_remote_code=True,
)

In [None]:
from typing import cast

from datasets import ClassLabel


# `cast` only informs the type checker; it performs no runtime conversion, so
# this assumes the `label` feature of the train split really is a ClassLabel.
class_label = cast(ClassLabel, dataset_dict['train'].features['label'])
# Bare last expression: the notebook displays the list of label names.
class_label.names

['equality', 'inequality', 'constant', 'variable', 'other']

In [None]:
# Local import keeps this cell runnable on its own.
from itertools import islice

# Number of training rows to print. Set to None to print every row, which is
# what this cell did before the preview limit was added.
PREVIEW_ROWS = 5

# Print a bounded preview instead of dumping the whole split into the cell
# output; `islice` stops after PREVIEW_ROWS items (or never, when it is None).
for example in islice(dataset_dict['train'], PREVIEW_ROWS):
    print(example)

### OpenAI finish reasons

In [None]:
# Fixture: one recorded response whose only choice stopped with
# finish_reason 'length'. The next cell feeds `['response']['body']` into
# `ChatCompletion` to see how the OpenAI parsing helper reacts.
# NOTE(review): the `batch_req_` id and `custom_id` suggest this is a line from
# a Batch API output file, and the message content looks like a placeholder
# substituted for the real truncated text — confirm.
response_with_length_finish_reason = {
    'id': 'batch_req_68338415d224819095e7e42c4aeac8f8',
    'custom_id': 'b9b237e5-d1c8-4348-9c90-f29e998228a6',
    'response': {
        'status_code': 200,
        'request_id': 'a12fbd18fe2366571f083c2b6e707c96',
        'body': {
            'id': 'chatcmpl-BbB9Kx0P0WJlR2r4I1SuRkYa2Mvn0',
            'object': 'chat.completion',
            'created': 1748200694,
            'model': 'gpt-4.1-nano-2025-04-14',
            'choices': [
                {
                    'index': 0,
                    'message': {
                        'role': 'assistant',
                        'content': 'some too long content...',
                        'refusal': None,
                        'annotations': [],
                    },
                    'logprobs': None,
                    # The value under test: generation stopped at the length limit.
                    'finish_reason': 'length',
                }
            ],
            'usage': {
                'prompt_tokens': 285,
                'completion_tokens': 1024,
                'total_tokens': 1309,
                'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0},
                'completion_tokens_details': {
                    'reasoning_tokens': 0,
                    'audio_tokens': 0,
                    'accepted_prediction_tokens': 0,
                    'rejected_prediction_tokens': 0,
                },
            },
            'service_tier': 'default',
            'system_fingerprint': 'fp_eede8f0d45',
        },
    },
    'error': None,
}

In [None]:
from openai import NOT_GIVEN
from openai.lib._parsing._completions import (
    parse_chat_completion,
)
from openai.types.chat import ChatCompletion


# Local import keeps this cell runnable on its own; both exception classes are
# exported by the `openai` package that the cell already imports from.
from openai import ContentFilterFinishReasonError, LengthFinishReasonError

# Rebuild a typed completion from the recorded response body.
chat_completion = ChatCompletion(**response_with_length_finish_reason['response']['body'])

# Report every choice that stopped for a reason other than normal completion.
for choice in chat_completion.choices:
    if choice.finish_reason in ('length', 'content_filter'):
        print(choice.finish_reason)

# The recorded run of this cell raised LengthFinishReasonError from
# `parse_chat_completion`. That error is the point of the demonstration, so it
# is caught and printed instead of aborting a Restart & Run All before the
# cells that follow. Any other exception still propagates.
try:
    target = parse_chat_completion(
        response_format=NOT_GIVEN,
        input_tools=NOT_GIVEN,
        chat_completion=chat_completion,
    )
except (LengthFinishReasonError, ContentFilterFinishReasonError) as finish_reason_error:
    print(f'{type(finish_reason_error).__name__}: {finish_reason_error}')

length


LengthFinishReasonError: Could not parse response content as the length limit was reached - CompletionUsage(completion_tokens=1024, prompt_tokens=285, total_tokens=1309, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))

### Label Studio

In [4]:
from uuid import UUID


# Resolve the two services from the application container: one imports label
# tasks, the other exports the resulting labels (used in the next two cells).
importer_service = application_container.math_expression_label_task_importer_service()
exporter_service = application_container.math_expression_label_exporter_service()

In [5]:
# Import the 'test' split of the given dataset as label tasks and keep the
# returned project id for the export cell. In the recorded run this created a
# project and imported 2 tasks.
# NOTE(review): the meaning of the first positional argument (None) is not
# visible from this notebook — confirm it and pass it by keyword.
# NOTE(review): the dataset UUID is hard-coded; it must exist in the target
# environment for this cell to be reproducible.
project_id = await importer_service.import_tasks(
    None, dataset_id=UUID('1260dfdb-8c6a-4001-a007-1c60766a6754'), split_name='test'
)

2025-06-18 20:57:44,787 - INFO - Project mathexpressionlabeltask created
2025-06-18 20:57:44,817 - INFO - Imported 2 tasks into mathexpressionlabeltask


In [11]:
# Export the labels for the project created by the import cell.
# NOTE(review): in the recorded run both tasks came back labelled, so
# presumably the annotation happened between the import and this cell — a
# fresh Run All may return something different; confirm.
labels = await exporter_service.export(project_id)
# Bare last expression: the notebook displays the exported label objects.
labels

[MathExpressionLabel(id=UUID('4f847ae9-000c-4c6c-824e-036a6e7cf99e'), math_expression_id=UUID('d5bf6a13-502a-48e1-a508-31c75c30b0ec'), math_expression_dataset_id=UUID('1260dfdb-8c6a-4001-a007-1c60766a6754'), index_id=None, timestamp=datetime.datetime(2025, 6, 18, 15, 6, 59, 357000, tzinfo=TzInfo(UTC)), value=<MathExpressionLabelEnum.INEQUALITY: 'inequality'>),
 MathExpressionLabel(id=UUID('8754c627-1fc4-4107-8de3-a4c0629cfa2d'), math_expression_id=UUID('d5e70102-2a3b-431c-8b11-70f93e2440c2'), math_expression_dataset_id=UUID('1260dfdb-8c6a-4001-a007-1c60766a6754'), index_id=None, timestamp=datetime.datetime(2025, 6, 18, 15, 6, 59, 357000, tzinfo=TzInfo(UTC)), value=<MathExpressionLabelEnum.VARIABLE: 'variable'>)]