In [3]:
%load_ext autoreload
%autoreload 2

import os
import sys

import nest_asyncio


# Put the parent of the current working directory at the front of sys.path
# (presumably so the `math_rag` imports in later cells resolve — TODO confirm).
# Any existing entry for that directory is dropped first, so re-running this
# cell keeps exactly one copy instead of stacking duplicates.
_parent_dir = os.path.abspath('..')
sys.path[:] = [entry for entry in sys.path if entry != _parent_dir]
sys.path.insert(0, _parent_dir)
# Apply the nest_asyncio patch.
# NOTE(review): presumably required so library code can drive an event loop
# while the notebook kernel's own loop is already running — confirm against the
# nest_asyncio documentation.
nest_asyncio.apply()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import logging


# Root logger: INFO threshold, "<timestamp> - <level> - <message>" records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Selected third-party loggers get a stricter threshold than the root logger.
_logger_thresholds = {
    'pylatexenc.latexwalker': logging.ERROR,
    'httpx': logging.WARNING,
    'openai': logging.WARNING,
}
for _logger_name, _threshold in _logger_thresholds.items():
    logging.getLogger(_logger_name).setLevel(_threshold)

In [5]:
from math_rag.infrastructure.containers import InfrastructureContainer


RESET = False

infrastructure_container = InfrastructureContainer()
infrastructure_container.init_resources()

# seeders
math_article_seeder = infrastructure_container.math_article_seeder()
math_expression_seeder = infrastructure_container.math_expression_seeder()
math_expression_classification_seeder = (
    infrastructure_container.math_expression_classification_seeder()
)
em_failed_request_seeder = infrastructure_container.em_failed_request_seeder()
llm_failed_request_seeder = infrastructure_container.llm_failed_request_seeder()

math_article_seeder.seed(reset=RESET)
await math_expression_seeder.seed(reset=RESET)
await math_expression_classification_seeder.seed(reset=RESET)
await em_failed_request_seeder.seed(reset=RESET)
await llm_failed_request_seeder.seed(reset=RESET)

# repositories
math_article_repository = infrastructure_container.math_article_repository()
math_expression_repository = infrastructure_container.math_expression_repository()
math_expression_classification_repository = (
    infrastructure_container.math_expression_classification_repository()
)
google_file_repository = infrastructure_container.google_file_repository()

# other
katex_corrector_assistant = infrastructure_container.katex_corrector_assistant()

2025-05-20 09:19:08,451 - INFO - PyTorch version 2.6.0 available.


TypeError: Too few arguments for typing.AsyncGenerator; actual 1, expected 2

In [None]:
# await math_expression_seeder.seed(reset=True)
# await llm_failed_request_seeder.seed(reset=True)

### Load

In [None]:
from math_rag.application.enums.arxiv import MathCategory


# Resolve the article loader service from the container.
math_article_loader_service = infrastructure_container.math_article_loader_service()
# The load call below is left disabled; uncomment it to run the load step.
# NOTE(review): 200 is presumably a per-run article limit — confirm against
# the service's `load` signature.
# await math_article_loader_service.load(MathCategory, 200)

### Parse

In [4]:
# Resolve the expression loader service from the container and run it.
math_expression_loader_service = (
    infrastructure_container.math_expression_loader_service()
)

# The recorded run of this call logged requests to a KaTeX validation endpoint
# and the OpenAI API, then ended in a ReadTimeout.
await math_expression_loader_service.load()

2025-05-20 08:08:38,652 - INFO - HTTP Request: POST http://host.docker.internal:7025/validate-many "HTTP/1.1 200 OK"
2025-05-20 08:08:38,654 - INFO - Validated KaTeX: 586/641
2025-05-20 08:08:39,431 - INFO - HTTP Request: GET https://api.openai.com/v1/models "HTTP/1.1 200 OK"
2025-05-20 08:08:42,539 - INFO - HTTP Request: GET https://api.openai.com/v1/models "HTTP/1.1 200 OK"
2025-05-20 08:08:43,699 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-20 08:08:43,781 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-20 08:08:43,797 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-20 08:08:43,809 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-20 08:08:43,829 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-20 08:08:43,842 - INFO - HTTP Request: POST http

ReadTimeout: 

### Label

In [None]:
# TODO


### Display

In [None]:
from IPython.display import Math, display


# Render the first MAX_DISPLAYED_NODES math nodes as typeset math.
# NOTE(review): `math_nodes` is not defined in any visible cell of this
# notebook, so this cell depends on leftover kernel state — define or load it
# before this point so Restart & Run All works.
MAX_DISPLAYED_NODES = 100

for latex_math_node in math_nodes[:MAX_DISPLAYED_NODES]:
    # latex_verbatim() supplies the LaTeX string handed to IPython's Math.
    display(Math(latex_math_node.latex_verbatim()))

### Correct

In [6]:
# Sample input for the KaTeX corrector: an expression using the control
# sequence `\w`, and the parse error message reported for it.
incorrect_katex = r'd\omega = \theta \w \omega'
error = r'KaTeX parse error: Undefined control sequence: \w at position 18: …omega = \theta \̲w̲ ̲\omega'

In [7]:
from math_rag.application.models.assistants import KatexCorrectorAssistantInput


# Bundle the broken expression and its error message for the corrector.
# NOTE(review): this name shadows the builtin `input` for the rest of the
# session; later cells read it, so rename it in all of them together.
input = KatexCorrectorAssistantInput(katex=incorrect_katex, error=error)

In [None]:
# Run the corrector once, then show the corrected expression both as plain
# text and as rendered math. `display` and `Math` come from the IPython.display
# import in the Display section above.
output = await katex_corrector_assistant.assist(input)
corrected_katex = output.katex
print(corrected_katex)
display(Math(corrected_katex))

In [None]:
import copy


inputs = [copy.deepcopy(input) for _ in range(5000)]
outputs = await katex_corrector_assistant.concurrent_assist(inputs)

### Dataset

#### Upload

In [4]:
from math_rag.application.models.datasets import (
    MathExpressionDataset,
    MathExpressionSample,
)
from math_rag.core.enums import MathExpressionLabelEnum


# Ten toy expressions, 'x + 0 = 5' through 'x + 9 = 5', all labelled EQUALITY.
latex_expressions = [f'x + {offset} = 5' for offset in range(10)]

samples = [
    MathExpressionSample(latex=latex, label=MathExpressionLabelEnum.EQUALITY)
    for latex in latex_expressions
]
dataset = MathExpressionDataset(samples)

In [5]:
from math_rag.application.assistants.prompts import MATH_EXPRESSION_LABELER_PROMPT
from math_rag.application.models.datasets import (
    DatasetMetadataFile,
    DatasetSplitSettings,
)


# 80/10/10 train/validate/test ratios with a fixed seed of 42.
settings = DatasetSplitSettings(
    train_ratio=0.8,
    validate_ratio=0.1,
    test_ratio=0.1,
    seed=42,
)

# Dump the labeler prompt as indented JSON and wrap its UTF-8 bytes in a
# metadata file named 'prompt.json'.
json_str = MATH_EXPRESSION_LABELER_PROMPT.model_dump_json(indent=4)
content = bytes(json_str, 'utf-8')
metadata_file = DatasetMetadataFile(content=content, name='prompt.json')

In [6]:
# Resolve the publisher service and publish the toy dataset, passing the
# sample type, the split settings and the prompt metadata file.
# The recorded run logged that the target dataset already existed and then
# uploaded shards three times.
dataset_publisher_service = infrastructure_container.dataset_publisher_service()
dataset_publisher_service.publish(
    dataset, MathExpressionSample, settings, metadata_file
)

2025-05-14 21:12:31,414 - INFO - Dataset KebabSeller/mathexpressiondataset already exists


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

#### Download

In [7]:
from datasets import load_dataset
from datasets.download import DownloadConfig
from decouple import config


# Hugging Face account name and access token, read through python-decouple.
HF_USERNAME = config('HF_USERNAME', default=None)
HF_TOKEN = config('HF_TOKEN', default=None)

# Without a username the f-string below would silently build the repo id
# 'None/mathexpressiondataset'; fail early with an actionable message instead.
if not HF_USERNAME:
    raise ValueError(
        'HF_USERNAME is not set; cannot resolve the dataset repository id'
    )

download_config = DownloadConfig(
    max_retries=3,
    disable_tqdm=True,
)

# split=None asks for every split; the next cell indexes the result by 'train'.
# trust_remote_code is deliberately not passed: the recorded download fetched
# only README.md and parquet shards, so there is no loading script to trust,
# and enabling it would let code from the Hub repository run locally.
dataset_dict = load_dataset(
    path=f'{HF_USERNAME}/mathexpressiondataset',
    split=None,
    download_config=download_config,
    token=HF_TOKEN,
)

README.md:   0%|          | 0.00/641 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

validate-00000-of-00001.parquet:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8 [00:00<?, ? examples/s]

Generating validate split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1 [00:00<?, ? examples/s]

In [8]:
from typing import cast

from datasets import ClassLabel


# Read the 'label' feature of the train split; the cast only narrows the static
# type to ClassLabel and does nothing at runtime.
class_label = cast(ClassLabel, dataset_dict['train'].features['label'])
# Bare last expression so the notebook displays the label names.
class_label.names

['equality', 'inequality', 'constant', 'variable', 'other']