In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import nest_asyncio


sys.path.insert(0, os.path.abspath('..'))
nest_asyncio.apply()

In [2]:
import logging


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logging.getLogger('pylatexenc.latexwalker').setLevel(logging.ERROR)
# logging.getLogger('httpx').setLevel(logging.WARNING)
# logging.getLogger('openai').setLevel(logging.WARNING)

In [3]:
from math_rag.infrastructure.containers import InfrastructureContainer


RESET = False

# containers
infrastructure_container = InfrastructureContainer()
infrastructure_container.init_resources()
infrastructure_container.wire(modules=[__name__])

application_container = infrastructure_container.application_container()
application_container.init_resources()
application_container.wire(modules=[__name__])

# seed
for object_seeder in infrastructure_container.object_seeders():
    object_seeder.seed(reset=RESET)

for document_seeder in infrastructure_container.document_seeders():
    await document_seeder.seed(reset=RESET)

for embedding_seeder in infrastructure_container.embedding_seeders():
    await embedding_seeder.seed(reset=RESET)

# index
for document_indexer in infrastructure_container.document_indexers():
    await document_indexer.index(reset=RESET)

2025-06-16 19:13:08,934 - INFO - PyTorch version 2.6.0 available.
2025-06-16 19:13:09,247 - INFO - HTTP Request: GET http://qdrant:6333 "HTTP/1.1 200 OK"
2025-06-16 19:13:09,251 - INFO - HTTP Request: GET http://qdrant:6333/collections/mathexpressiondescriptionembedding/exists "HTTP/1.1 200 OK"


In [None]:
from uuid import UUID

from math_rag.core.enums import MathExpressionDatasetBuildStage
from math_rag.core.models import MathExpressionDataset


math_expression_dataset_builder_service = (
    application_container.math_expression_dataset_builder_service()
)
math_expression_dataset_repository = application_container.math_expression_dataset_repository()


current_dataset = MathExpressionDataset(
    # build_from_dataset_id=UUID('dd5b1e2d-dc0c-4ae9-9f24-2fd2d37f42fa'),
    # build_from_stage=MathExpressionDatasetBuildStage.LOAD_MATH_EXPRESSION_SAMPLES,
)
await math_expression_dataset_repository.insert_one(current_dataset)
await math_expression_dataset_builder_service.build(current_dataset)

In [2]:
from math_rag.application.models.assistants import MathExpressionLabelerAssistantOutput


MathExpressionLabelerAssistantOutput.model_json_schema()

{'$defs': {'MathExpressionLabelEnum': {'enum': ['equality',
    'inequality',
    'constant',
    'variable',
    'other'],
   'title': 'MathExpressionLabelEnum',
   'type': 'string'}},
 'properties': {'label': {'$ref': '#/$defs/MathExpressionLabelEnum'}},
 'required': ['label'],
 'title': 'MathExpressionLabelerAssistantOutput',
 'type': 'object'}

### Correct

In [6]:
incorrect_katex = r'd\omega = \theta \w \omega'
error = (
    r'KaTeX parse error: Undefined control sequence: \w at position 18: …omega = \theta \̲w̲ ̲\omega'
)

In [7]:
from math_rag.application.models.assistants import KatexCorrectorAssistantInput


input = KatexCorrectorAssistantInput(katex=incorrect_katex, error=error)

In [None]:
katex_corrector_assistant = application_container.katex_corrector_assistant()

output = await katex_corrector_assistant.assist(input)
corrected_katex = output.katex
print(corrected_katex)
display(Math(corrected_katex))

### Dataset

In [4]:
hugging_face_api = infrastructure_container.hugging_face_api()
hugging_face_username = infrastructure_container.config.hugging_face.username()
hugging_face_token = infrastructure_container.config.hugging_face.token()

In [6]:
from datasets import load_dataset
from datasets.download import DownloadConfig


repo_id = f'{hugging_face_username}/mathexpressiondataset'

download_config = DownloadConfig(
    max_retries=3,
    disable_tqdm=True,
)

dataset_dict = load_dataset(
    path=repo_id,
    split=None,
    download_config=download_config,
    token=hugging_face_token,
    trust_remote_code=True,
)

In [8]:
from typing import cast

from datasets import ClassLabel


class_label = cast(ClassLabel, dataset_dict['train'].features['label'])
class_label.names

['equality', 'inequality', 'constant', 'variable', 'other']

In [None]:
for x in dataset_dict['train']:
    print(x)

### OpenAI finish reasons

In [4]:
response_with_length_finish_reason = {
    'id': 'batch_req_68338415d224819095e7e42c4aeac8f8',
    'custom_id': 'b9b237e5-d1c8-4348-9c90-f29e998228a6',
    'response': {
        'status_code': 200,
        'request_id': 'a12fbd18fe2366571f083c2b6e707c96',
        'body': {
            'id': 'chatcmpl-BbB9Kx0P0WJlR2r4I1SuRkYa2Mvn0',
            'object': 'chat.completion',
            'created': 1748200694,
            'model': 'gpt-4.1-nano-2025-04-14',
            'choices': [
                {
                    'index': 0,
                    'message': {
                        'role': 'assistant',
                        'content': 'some too long content...',
                        'refusal': None,
                        'annotations': [],
                    },
                    'logprobs': None,
                    'finish_reason': 'length',
                }
            ],
            'usage': {
                'prompt_tokens': 285,
                'completion_tokens': 1024,
                'total_tokens': 1309,
                'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0},
                'completion_tokens_details': {
                    'reasoning_tokens': 0,
                    'audio_tokens': 0,
                    'accepted_prediction_tokens': 0,
                    'rejected_prediction_tokens': 0,
                },
            },
            'service_tier': 'default',
            'system_fingerprint': 'fp_eede8f0d45',
        },
    },
    'error': None,
}

In [None]:
from openai import NOT_GIVEN
from openai.lib._parsing._completions import (
    parse_chat_completion,
)
from openai.types.chat import ChatCompletion


chat_completion = ChatCompletion(**response_with_length_finish_reason['response']['body'])

for choice in chat_completion.choices:
    if choice.finish_reason == 'length' or choice.finish_reason == 'content_filter':
        print(choice.finish_reason)
        pass

target = parse_chat_completion(
    response_format=NOT_GIVEN,
    input_tools=NOT_GIVEN,
    chat_completion=chat_completion,
)

length


LengthFinishReasonError: Could not parse response content as the length limit was reached - CompletionUsage(completion_tokens=1024, prompt_tokens=285, total_tokens=1309, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))

### Label Studio

In [4]:
hugging_face_api = infrastructure_container.hugging_face_api()
hugging_face_username = infrastructure_container.config.hugging_face.username()

In [5]:
from uuid import UUID

from label_studio_sdk import Project
from label_studio_sdk.label_interface import LabelInterface
from label_studio_sdk.label_interface.create import choices

from math_rag.core.models import MathExpressionSample


async_label_studio = infrastructure_container.async_label_studio()
dataset_loader_service = infrastructure_container.dataset_loader_service()

In [6]:
# ——— 1. Load your dataset
dataset_name = 'mathexpressiondataset'
repo_id = f'{hugging_face_username}/{dataset_name}'

split_name_to_samples, _ = dataset_loader_service.load(
    dataset_id=UUID('017d10b1-3001-4aaf-966d-1ace1e6e07b9'),
    dataset_name=dataset_name,
    dataset_metadata_file_name=None,
    sample_type=MathExpressionSample,
    max_retries=3,
)
samples = split_name_to_samples['test']

In [6]:
async_pager = await async_label_studio.projects.list()

2025-06-16 19:13:27,797 - INFO - HTTP Request: POST http://host.docker.internal:8080//api/token/refresh/ "HTTP/1.1 200 OK"
2025-06-16 19:13:27,812 - INFO - HTTP Request: GET http://host.docker.internal:8080/api/projects/?page=1 "HTTP/1.1 200 OK"


In [7]:
from typing import cast


name = 'math-rag'
project = None

async for p in async_pager:
    p = cast(Project, p)
    if p.title == name:
        project = p

2025-06-16 19:13:29,797 - INFO - HTTP Request: GET http://host.docker.internal:8080/api/projects/?page=2 "HTTP/1.1 404 Not Found"


In [8]:
project

Project(id=6, title='math-rag', description='', label_config='<View>\n  <HyperText name="latex" value="$latex" toName="latex"/>\n  <Choices name="label" toName="latex">\n    <Choice value="Positive"/>\n    <Choice value="Negative"/>\n  </Choices>\n</View>', expert_instruction='', show_instruction=False, show_skip_button=True, enable_empty_annotation=True, show_annotation_history=False, organization=1, prompts=None, color='#FFFFFF', maximum_annotations=1, annotation_limit_count=None, annotation_limit_percent=None, is_published=False, model_version='', is_draft=False, created_by=UserSimple(id=1, first_name='', last_name='', email='luka.panic2002@gmail.com', avatar=None), created_at=datetime.datetime(2025, 6, 15, 20, 12, 3, 639966, tzinfo=TzInfo(UTC)), min_annotations_to_start_training=0, start_training_on_annotation_update=False, show_collab_predictions=True, num_tasks_with_annotations=0, task_number=2, useful_annotation_number=0, ground_truth_number=0, skipped_annotations_number=0, tota

In [9]:
label_config = LabelInterface.create(
    {
        'latex': 'HyperText',
        'label': choices(['Positive', 'Negative']),
    }
)

In [10]:
if not project:
    project = await async_label_studio.projects.create(title=name, label_config=label_config)
    print(f'Created project {name} (ID {project.id})')

else:
    await async_label_studio.projects.update(project.id, label_config=label_config)
    print(f'Updated template in existing project {name} (ID {project.id})')

2025-06-16 19:13:35,434 - INFO - HTTP Request: PATCH http://host.docker.internal:8080/api/projects/6/ "HTTP/1.1 200 OK"


Updated template in existing project math-rag (ID 6)


In [11]:
task = """
<span class="katex">
  <span class="katex-html" aria-hidden="true">
    <span class="strut" style="height:0.68333em;"></span>
    <span class="mord">0</span><span class="mord">.</span><span class="mord">9</span><span class="mord">6</span><span class="mord">1</span>
    <span class="mord">
      <span class="mopen">(</span>
      <span class="mord">±</span><span class="mord">0</span><span class="mord">.</span><span class="mord">0</span><span class="mord">0</span><span class="mord">5</span>
      <span class="mclose">)</span>
    </span>
  </span>
</span>
""".strip()

task = '<span class="katex"><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8389em;vertical-align:-0.1944em;"></span><span class="mord">0.961</span><span class="mord mathnormal">p</span><span class="mord mathnormal">m</span><span class="mord">0.005</span></span></span></span>'

request = [{'latex': task}]

In [None]:
# TODO LabelStudioPublisherService
# TODO LabelStudioLoaderService

In [12]:
# ——— 4. Import tasks programmatically
response = await async_label_studio.projects.import_tasks(id=project.id, request=request)
print(f'Imported {response.task_count} tasks into {name}')

2025-06-16 19:13:40,372 - INFO - HTTP Request: POST http://host.docker.internal:8080/api/projects/6/import "HTTP/1.1 201 Created"


Imported 1 tasks into math-rag


In [59]:
# 5. export labeled tasks
# project_ext = await label_studio.projects.get(id=proj.id)

export_snapshot = await async_label_studio.projects.exports.create(project_id=project.id)

2025-06-15 20:00:53,183 - INFO - HTTP Request: POST http://host.docker.internal:8080/api/projects/5/exports "HTTP/1.1 201 Created"


In [60]:
export = await async_label_studio.projects.exports.get(
    project_id=project.id, export_pk=export_snapshot.id
)

# TODO check export.status in loop
print(export.status)

match export.status:
    case 'created':
        pass

    case 'in_progress':
        pass

    case 'failed':
        pass

    case 'completed':
        pass

2025-06-15 20:01:10,888 - INFO - HTTP Request: GET http://host.docker.internal:8080/api/projects/5/exports/1 "HTTP/1.1 200 OK"


completed


In [61]:
import json

from io import BytesIO


buffer = BytesIO()

async for chunk in async_label_studio.projects.exports.download(
    project_id=project.id, export_pk=export_snapshot.id, export_type='JSON'
):
    buffer.write(chunk)

buffer.seek(0)

text = buffer.read().decode('utf-8')
data = json.loads(text)

data

2025-06-15 20:01:24,154 - INFO - HTTP Request: GET http://host.docker.internal:8080/api/projects/5/exports/1/download?exportType=JSON "HTTP/1.1 200 OK"


[{'id': 1,
  'annotations': [{'id': 4,
    'completed_by': 1,
    'result': [{'value': {'choices': ['Positive']},
      'id': 'n6dSAejmv2',
      'from_name': 'label',
      'to_name': 'latex',
      'type': 'choices',
      'origin': 'manual'}],
    'was_cancelled': False,
    'ground_truth': False,
    'created_at': '2025-06-15T19:58:38.319145Z',
    'updated_at': '2025-06-15T19:58:38.319159Z',
    'draft_created_at': '2025-06-15T19:58:21.765456Z',
    'lead_time': 8.174999999999999,
    'prediction': {},
    'result_count': 1,
    'unique_id': '36225ed4-2a7a-487c-a6f1-056c9a6988e5',
    'import_id': None,
    'last_action': None,
    'bulk_created': False,
    'task': 1,
    'project': 5,
    'updated_by': 1,
    'parent_prediction': None,
    'parent_annotation': None,
    'last_created_by': None}],
  'drafts': [],
  'predictions': [],
  'data': {'id': '2483fd27-c35d-41bd-b1ba-027dd9a114e0',
   'math_expression_dataset_id': '017d10b1-3001-4aaf-966d-1ace1e6e07b9',
   'timestamp': '2