In [1]:
from typing import TYPE_CHECKING


if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [2]:
# NOTE(review): RESET is not read by any visible cell; presumably it is consumed by
# hooks.notebook_hook when the extension loads — confirm.
RESET = True
# NOTE(review): application_container / infrastructure_container are used in later
# cells but never assigned in the visible ones; presumably this extension injects
# them into the notebook namespace — confirm.
%load_ext hooks.notebook_hook

2025-06-22 16:15:10,203 - INFO - PyTorch version 2.6.0 available.


In [3]:
# Services resolved from the application container.
# importer_service is only referenced by the commented-out import call further down;
# exporter_service provides the human labels.
importer_service = application_container.math_expression_label_task_importer_service()
exporter_service = application_container.math_expression_label_exporter_service()

# Repositories resolved from the infrastructure container; they are queried below for
# the stored test results and the stored labels.
result_repository = infrastructure_container.math_expression_dataset_test_result_repository()
label_repository = infrastructure_container.math_expression_label_repository()

In [4]:
from uuid import UUID


# Dataset and split that every query in this notebook is restricted to.
dataset_id = UUID(hex='7cdbb987-0994-4d59-8d9c-5e827e000769')
split_name = 'test'

In [5]:
# The import call is left commented out and the project id is hardcoded instead.
# NOTE(review): presumably 38 is the id returned by an earlier run of this import and
# re-running it would create another project — confirm before uncommenting.
# project_id = await importer_service.import_tasks(None, dataset_id=dataset_id, split_name=split_name)
project_id = 38

In [6]:
# Export the labels of project `project_id`; these serve as the reference labels
# (y_true) in every evaluation cell of the Results section.
human_labels = await exporter_service.export(project_id)

In [7]:
# Ids of the human-labelled expressions, in export order; used to filter the label query below.
math_expression_ids = [human_label.math_expression_id for human_label in human_labels]

In [24]:
gpt_4_1_nano_labels = await label_repository.find_many(
    filter=dict(math_expression_dataset_id=dataset_id, math_expression_id=math_expression_ids)
)

In [13]:
results = await result_repository.find_many(
    filter=dict(
        math_expression_dataset_id=dataset_id, math_expression_dataset_split_name=split_name
    )
)

# llama_labels = results[0].math_expression_labels
# gpt_4_1_labels = results[1].math_expression_labels
gpt_4_1_mini_labels = results[0].math_expression_labels
gpt_4_1_labels = results[1].math_expression_labels
o3_labels = results[8].math_expression_labels

In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from math_rag.core.enums import MathExpressionLabelEnum
from math_rag.core.models import MathExpressionLabel

In [15]:
def _sort_by_math_expression_id(labels: list[MathExpressionLabel]) -> list[MathExpressionLabel]:
    return sorted(
        labels,
        key=lambda label: label.math_expression_id,
    )


def prepare(math_expression_labels: list[MathExpressionLabel]) -> list[str]:
    """Turn labels into a list of plain label values ordered by expression id.

    Args:
        math_expression_labels: labels to convert; each one exposes
            ``math_expression_id`` and ``value.value``.

    Returns:
        ``label.value.value`` for every label, in ascending ``math_expression_id``
        order, so that two label sets covering the same expressions line up
        position by position.
    """
    # Named `ordered_labels` rather than `sorted` so the builtin is not shadowed
    # inside this function.
    ordered_labels = _sort_by_math_expression_id(math_expression_labels)

    return [label.value.value for label in ordered_labels]


def evaluate_multiclass_labels(
    y_true: list[str],
    y_pred: list[str],
    labels: list[str],
):
    """Print accuracy, the per-class report and the confusion matrix for ``y_pred``.

    Args:
        y_true: reference label of each sample.
        y_pred: predicted label of each sample, in the same order as ``y_true``.
        labels: class names; passed as both ``labels`` and ``target_names`` of the
            report and as ``labels`` of the confusion matrix.

    Returns:
        None. The three metrics are printed in the order accuracy, report, matrix.
    """
    # All three metrics are computed before anything is printed.
    accuracy = accuracy_score(y_true, y_pred)
    per_class_report = classification_report(
        y_true, y_pred, labels=labels, target_names=labels, zero_division=0
    )
    matrix = confusion_matrix(y_true, y_pred, labels=labels)

    for section in (accuracy, per_class_report, matrix):
        print(section)

In [17]:
# Every label value of the enum, in definition order; fixes the class order of the reports.
labels = [member.value for member in MathExpressionLabelEnum]

## Results

In [25]:
# prepare() sorts each label list by math_expression_id independently, so the
# position-by-position pairing is only valid when both lists cover exactly the same
# expressions. Fail loudly instead of silently scoring misaligned pairs.
assert sorted(label.math_expression_id for label in human_labels) == sorted(
    label.math_expression_id for label in gpt_4_1_nano_labels
), 'human and gpt_4_1_nano labels cover different math expressions'

# Human labels are the reference; gpt_4_1_nano labels are the predictions.
y_true = prepare(human_labels)
y_pred = prepare(gpt_4_1_nano_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

0.6289592760180995
              precision    recall  f1-score   support

    equality       1.00      0.74      0.85        42
  inequality       1.00      1.00      1.00         7
    constant       0.28      1.00      0.44        17
    variable       0.44      1.00      0.61        22
       other       0.86      0.47      0.60       133

    accuracy                           0.63       221
   macro avg       0.72      0.84      0.70       221
weighted avg       0.81      0.63      0.65       221

[[31  0  1  0 10]
 [ 0  7  0  0  0]
 [ 0  0 17  0  0]
 [ 0  0  0 22  0]
 [ 0  0 43 28 62]]


In [26]:
# prepare() sorts each label list by math_expression_id independently, so the
# position-by-position pairing is only valid when both lists cover exactly the same
# expressions. Fail loudly instead of silently scoring misaligned pairs.
assert sorted(label.math_expression_id for label in human_labels) == sorted(
    label.math_expression_id for label in gpt_4_1_mini_labels
), 'human and gpt_4_1_mini labels cover different math expressions'

# Human labels are the reference; gpt_4_1_mini labels are the predictions.
y_true = prepare(human_labels)
y_pred = prepare(gpt_4_1_mini_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

0.8144796380090498
              precision    recall  f1-score   support

    equality       1.00      1.00      1.00        42
  inequality       1.00      1.00      1.00         7
    constant       0.30      1.00      0.46        17
    variable       1.00      0.95      0.98        22
       other       0.99      0.70      0.82       133

    accuracy                           0.81       221
   macro avg       0.86      0.93      0.85       221
weighted avg       0.94      0.81      0.85       221

[[42  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0 17  0  0]
 [ 0  0  0 21  1]
 [ 0  0 40  0 93]]


In [27]:
# prepare() sorts each label list by math_expression_id independently, so the
# position-by-position pairing is only valid when both lists cover exactly the same
# expressions. Fail loudly instead of silently scoring misaligned pairs.
assert sorted(label.math_expression_id for label in human_labels) == sorted(
    label.math_expression_id for label in gpt_4_1_labels
), 'human and gpt_4_1 labels cover different math expressions'

# Human labels are the reference; gpt_4_1 labels are the predictions.
y_true = prepare(human_labels)
y_pred = prepare(gpt_4_1_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

0.8235294117647058
              precision    recall  f1-score   support

    equality       1.00      1.00      1.00        42
  inequality       1.00      1.00      1.00         7
    constant       0.32      1.00      0.49        17
    variable       1.00      0.86      0.93        22
       other       0.97      0.73      0.83       133

    accuracy                           0.82       221
   macro avg       0.86      0.92      0.85       221
weighted avg       0.93      0.82      0.85       221

[[42  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0 17  0  0]
 [ 0  0  0 19  3]
 [ 0  0 36  0 97]]


In [28]:
# prepare() sorts each label list by math_expression_id independently, so the
# position-by-position pairing is only valid when both lists cover exactly the same
# expressions. Fail loudly instead of silently scoring misaligned pairs.
assert sorted(label.math_expression_id for label in human_labels) == sorted(
    label.math_expression_id for label in o3_labels
), 'human and o3 labels cover different math expressions'

# Human labels are the reference; o3 labels are the predictions.
y_true = prepare(human_labels)
y_pred = prepare(o3_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

0.8280542986425339
              precision    recall  f1-score   support

    equality       1.00      1.00      1.00        42
  inequality       0.88      1.00      0.93         7
    constant       0.31      0.94      0.46        17
    variable       1.00      1.00      1.00        22
       other       0.99      0.72      0.83       133

    accuracy                           0.83       221
   macro avg       0.83      0.93      0.85       221
weighted avg       0.94      0.83      0.86       221

[[42  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0 16  0  1]
 [ 0  0  0 22  0]
 [ 0  1 36  0 96]]


In [4]:
# Scratch arithmetic; the cell displays the value of the expression.
# NOTE(review): presumably a cost estimate (40,000 expressions x 200 tokens each,
# priced at 0.4 per 1,000,000 tokens) — confirm the meaning and units of each factor.
40_000 * 200 / 1_000_000 * 0.4

3.2