In [1]:
from typing import TYPE_CHECKING


# Type-only declarations so editors and type checkers know the container names
# used in later cells. Nothing in this block executes at runtime, because
# TYPE_CHECKING is False outside of static analysis.
# NOTE(review): the two containers are presumably injected into the notebook
# namespace by the `hooks.notebook_hook` extension loaded in the next cell — confirm.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [2]:
# NOTE(review): RESET is not read by any cell in this notebook; presumably it is
# consumed by the extension loaded below, so it must be set first — confirm.
RESET = False
# Load the project's IPython extension.
%load_ext hooks.notebook_hook

2025-07-06 11:23:21,157 - INFO - datasets - config.py:54 - PyTorch version 2.6.0 available.


In [3]:
# Resolve the services and repositories used below from the two containers.
# `importer_service` is only referenced by the commented-out import step.
importer_service = application_container.math_expression_label_task_importer_service()
exporter_service = application_container.math_expression_label_exporter_service()

# Repositories queried for stored test results and stored labels.
result_repository = infrastructure_container.math_expression_dataset_test_result_repository()
label_repository = infrastructure_container.math_expression_label_repository()

In [4]:
from uuid import UUID


# Dataset and split under evaluation; both are used as repository filters in later cells.
dataset_id = UUID('b5887579-5742-4e0f-91ce-2fae26bf2c01')
split_name = 'test'

In [5]:
# The import step is left commented out; the id of an existing project is
# hard-coded instead.
# NOTE(review): 39 is presumably the project created by an earlier
# `import_tasks` run for this dataset/split — confirm before re-running.
# project_id = await importer_service.import_tasks(None, dataset_id=dataset_id, split_name=split_name)
project_id = 39

# Export the labels of that project; they serve as ground truth (`y_true`)
# in every evaluation cell below.
human_labels = await exporter_service.export(project_id)

In [None]:
math_expression_ids = [label.math_expression_id for label in human_labels]

gpt_4_1_mini_labels = await label_repository.find_many(
    filter=dict(math_expression_dataset_id=dataset_id, math_expression_id=math_expression_ids)
)

In [21]:
# Fetch every stored test result for this dataset and split.
results = await result_repository.find_many(
    filter=dict(
        math_expression_dataset_id=dataset_id, math_expression_dataset_split_name=split_name
    )
)

# NOTE(review): the runs are picked by position, which silently selects the
# wrong model if the repository's ordering changes or new results are stored.
# Prefer selecting by an identifying field of the result — none is visible
# here, so confirm which field identifies a run.
llama_labels = results[0].math_expression_labels
gpt_4_1_labels = results[1].math_expression_labels
gpt_4_1_nano_labels = results[2].math_expression_labels
llama_3fbdb8c2_labels = results[8].math_expression_labels  # [3, 4] had wrong config

In [7]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

from math_rag.core.enums import MathExpressionLabelEnum
from math_rag.core.models import MathExpressionLabel

In [32]:
def _sort_by_math_expression_id(labels: list[MathExpressionLabel]) -> list[MathExpressionLabel]:
    """Return a new list of `labels` in ascending `math_expression_id` order.

    The input list is left untouched; ties keep their original relative order.
    """
    ordered = list(labels)
    ordered.sort(key=lambda item: item.math_expression_id)

    return ordered


def prepare(math_expression_labels: list[MathExpressionLabel]) -> list[str]:
    """Turn label objects into a list of raw label values ordered by expression id.

    Labels are sorted by `math_expression_id`, so two label sets that cover the
    same expressions line up element-by-element when passed as `y_true` and
    `y_pred`.

    NOTE(review): alignment relies on both label sets containing exactly the
    same expression ids; this function cannot detect a mismatch — verify at
    the call site.

    Args:
        math_expression_labels: labels to convert; the list is not modified.

    Returns:
        `label.value.value` for each label, in ascending `math_expression_id` order.
    """
    # Named `ordered_labels` rather than `sorted` so the builtin is not shadowed.
    ordered_labels = _sort_by_math_expression_id(math_expression_labels)

    return [label.value.value for label in ordered_labels]


def evaluate_multiclass_labels(
    y_true: list[str],
    y_pred: list[str],
    labels: list[str],
):
    """Print multiclass classification metrics for `y_pred` against `y_true`.

    Prints accuracy, balanced accuracy, macro-averaged precision/recall/F1,
    the per-class classification report and the confusion matrix.

    Args:
        y_true: ground-truth label values.
        y_pred: predicted label values, aligned element-by-element with `y_true`.
        labels: the full set of label values; fixes the class set and order used
            by the macro scores, the report and the confusion matrix.

    Returns:
        None; all results are printed.
    """
    # Use the same class set and zero-division policy as the report below, so the
    # standalone macro scores always agree with the report's "macro avg" row,
    # even when a class is missing from `y_true` and `y_pred`.
    macro_kwargs = dict(labels=labels, average='macro', zero_division=0)

    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **macro_kwargs)
    recall = recall_score(y_true, y_pred, **macro_kwargs)
    f1 = f1_score(y_true, y_pred, **macro_kwargs)
    report = classification_report(
        y_true,
        y_pred,
        labels=labels,
        target_names=labels,
        zero_division=0,
    )
    # Rows are true classes and columns are predicted classes, in `labels` order.
    cm = confusion_matrix(
        y_true,
        y_pred,
        labels=labels,
    )

    print(f'accuracy: {accuracy}')
    print(f'balanced_accuracy: {balanced_accuracy}')
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'f1: {f1}')
    print(report)
    print(cm)

In [24]:
# All label values in enum declaration order; this fixes the row/column order
# of the classification report and confusion matrix in every evaluation cell.
labels = [e.value for e in MathExpressionLabelEnum]

### New results

In [21]:
# Human labels (ground truth) vs. the labels loaded into `gpt_4_1_mini_labels`.
y_true = prepare(human_labels)
y_pred = prepare(gpt_4_1_mini_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

accuracy: 0.8921334922526818
balanced_accuracy: 0.9021598734660079
precision: 0.8079103519713113
recall: 0.9021598734660079
f1: 0.8482130246869071
              precision    recall  f1-score   support

    equality       0.94      0.94      0.94       937
  inequality       0.74      0.96      0.84       311
    constant       0.75      1.00      0.86        15
    variable       0.67      0.74      0.70       236
       other       0.94      0.88      0.91      1857

    accuracy                           0.89      3356
   macro avg       0.81      0.90      0.85      3356
weighted avg       0.90      0.89      0.89      3356

[[ 880   15    0    0   42]
 [   5  298    0    1    7]
 [   0    0   15    0    0]
 [   1    0    0  174   61]
 [  53   89    5   83 1627]]


In [22]:
# Human labels (ground truth) vs. the labels of stored test result #0 (`llama_labels`).
y_true = prepare(human_labels)
y_pred = prepare(llama_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

accuracy: 0.6129320619785459
balanced_accuracy: 0.722245539260162
precision: 0.5350198160088759
recall: 0.722245539260162
f1: 0.485150573845835
              precision    recall  f1-score   support

    equality       0.94      0.15      0.27       937
  inequality       0.38      0.95      0.55       311
    constant       0.20      1.00      0.33        15
    variable       0.41      0.73      0.53       236
       other       0.74      0.77      0.75      1857

    accuracy                           0.61      3356
   macro avg       0.54      0.72      0.49      3356
weighted avg       0.74      0.61      0.58      3356

[[ 145  299   42   13  438]
 [   0  297    1    1   12]
 [   0    0   15    0    0]
 [   1    3    0  173   59]
 [   8  174   18  230 1427]]


In [23]:
# Human labels (ground truth) vs. the labels of stored test result #1 (`gpt_4_1_labels`).
y_true = prepare(human_labels)
y_pred = prepare(gpt_4_1_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

accuracy: 0.8888557806912991
balanced_accuracy: 0.8569899570718409
precision: 0.8202673861072635
recall: 0.8569899570718409
f1: 0.8203052644965687
              precision    recall  f1-score   support

    equality       0.94      0.91      0.93       937
  inequality       0.71      0.95      0.81       311
    constant       0.68      1.00      0.81        15
    variable       0.86      0.51      0.64       236
       other       0.91      0.91      0.91      1857

    accuracy                           0.89      3356
   macro avg       0.82      0.86      0.82      3356
weighted avg       0.89      0.89      0.89      3356

[[ 857   27    0    0   53]
 [   7  295    0    1    8]
 [   0    0   15    0    0]
 [   1    0    0  120  115]
 [  43   92    7   19 1696]]


In [24]:
# Human labels (ground truth) vs. the labels of stored test result #2 (`gpt_4_1_nano_labels`).
y_true = prepare(human_labels)
y_pred = prepare(gpt_4_1_nano_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

accuracy: 0.6367699642431466
balanced_accuracy: 0.8026210410949343
precision: 0.5858758081096148
recall: 0.8026210410949343
f1: 0.5852453180940094
              precision    recall  f1-score   support

    equality       0.98      0.54      0.69       937
  inequality       0.71      0.95      0.81       311
    constant       0.23      1.00      0.37        15
    variable       0.24      0.94      0.38       236
       other       0.77      0.59      0.67      1857

    accuracy                           0.64      3356
   macro avg       0.59      0.80      0.59      3356
weighted avg       0.78      0.64      0.67      3356

[[ 503   38   26   61  309]
 [   3  294    0    6    8]
 [   0    0   15    0    0]
 [   1    2    1  221   11]
 [   5   79   24  645 1104]]


### Fine-tuning job `3fbdb8c2-e6f3-407c-a212-50c312dada76` results

In [33]:
# Human labels (ground truth) vs. the labels of stored test result #8
# (`llama_3fbdb8c2_labels`).
y_true = prepare(human_labels)
y_pred = prepare(llama_3fbdb8c2_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

# Reference numbers from earlier runs, kept for comparison with the output below.
#
# NOTE: without fine-tuning
#
# accuracy: 0.6129320619785459
# balanced_accuracy: 0.722245539260162
# precision: 0.5350198160088759
# recall: 0.722245539260162
# f1: 0.485150573845835

# NOTE: when weights are not merged properly
# accuracy: 0.23450536352800955
# balanced_accuracy: 0.393903190098157
# precision: 0.37471870336836843
# recall: 0.393903190098157
# f1: 0.15413872393249559
#
# NOTE(review): the saved output of this cell (accuracy ~0.2342) is almost
# identical to the "not merged properly" numbers above and far below the
# non-fine-tuned baseline — the weight merge may still be wrong; confirm.

accuracy: 0.23420738974970204
balanced_accuracy: 0.39261231715058964
precision: 0.3761494880102315
recall: 0.39261231715058964
f1: 0.15490696956950198
              precision    recall  f1-score   support

    equality       0.86      0.08      0.15       937
  inequality       0.13      0.84      0.22       311
    constant       0.02      0.80      0.04        15
    variable       0.20      0.01      0.02       236
       other       0.67      0.23      0.35      1857

    accuracy                           0.23      3356
   macro avg       0.38      0.39      0.15      3356
weighted avg       0.64      0.23      0.26      3356

[[  76  674  156    0   31]
 [   2  261   35    0   13]
 [   0    0   12    0    3]
 [   0   52   13    2  169]
 [  10 1061  343    8  435]]


### Old results

In [25]:
# NOTE(review): the saved output under this cell (221 samples, a bare accuracy
# line) does not match what `evaluate_multiclass_labels` prints now, so it was
# produced by an earlier version of the code; re-running this cell overwrites
# it and duplicates the gpt_4_1_nano evaluation earlier in the notebook.
y_true = prepare(human_labels)
y_pred = prepare(gpt_4_1_nano_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

0.6289592760180995
              precision    recall  f1-score   support

    equality       1.00      0.74      0.85        42
  inequality       1.00      1.00      1.00         7
    constant       0.28      1.00      0.44        17
    variable       0.44      1.00      0.61        22
       other       0.86      0.47      0.60       133

    accuracy                           0.63       221
   macro avg       0.72      0.84      0.70       221
weighted avg       0.81      0.63      0.65       221

[[31  0  1  0 10]
 [ 0  7  0  0  0]
 [ 0  0 17  0  0]
 [ 0  0  0 22  0]
 [ 0  0 43 28 62]]


In [26]:
# NOTE(review): the saved output under this cell (221 samples, a bare accuracy
# line) does not match what `evaluate_multiclass_labels` prints now, so it was
# produced by an earlier version of the code; re-running this cell overwrites
# it and duplicates the gpt_4_1_mini evaluation earlier in the notebook.
y_true = prepare(human_labels)
y_pred = prepare(gpt_4_1_mini_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

0.8144796380090498
              precision    recall  f1-score   support

    equality       1.00      1.00      1.00        42
  inequality       1.00      1.00      1.00         7
    constant       0.30      1.00      0.46        17
    variable       1.00      0.95      0.98        22
       other       0.99      0.70      0.82       133

    accuracy                           0.81       221
   macro avg       0.86      0.93      0.85       221
weighted avg       0.94      0.81      0.85       221

[[42  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0 17  0  0]
 [ 0  0  0 21  1]
 [ 0  0 40  0 93]]


In [27]:
# NOTE(review): the saved output under this cell (221 samples, a bare accuracy
# line) does not match what `evaluate_multiclass_labels` prints now, so it was
# produced by an earlier version of the code; re-running this cell overwrites
# it and duplicates the gpt_4_1 evaluation earlier in the notebook.
y_true = prepare(human_labels)
y_pred = prepare(gpt_4_1_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

0.8235294117647058
              precision    recall  f1-score   support

    equality       1.00      1.00      1.00        42
  inequality       1.00      1.00      1.00         7
    constant       0.32      1.00      0.49        17
    variable       1.00      0.86      0.93        22
       other       0.97      0.73      0.83       133

    accuracy                           0.82       221
   macro avg       0.86      0.92      0.85       221
weighted avg       0.93      0.82      0.85       221

[[42  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0 17  0  0]
 [ 0  0  0 19  3]
 [ 0  0 36  0 97]]


In [28]:
# NOTE(review): `o3_labels` is not assigned in any cell shown in this notebook,
# so this cell raises NameError on a fresh kernel; the saved output below comes
# from an earlier session. Either load the o3 labels explicitly or drop the cell.
y_true = prepare(human_labels)
y_pred = prepare(o3_labels)

evaluate_multiclass_labels(y_true, y_pred, labels)

0.8280542986425339
              precision    recall  f1-score   support

    equality       1.00      1.00      1.00        42
  inequality       0.88      1.00      0.93         7
    constant       0.31      0.94      0.46        17
    variable       1.00      1.00      1.00        22
       other       0.99      0.72      0.83       133

    accuracy                           0.83       221
   macro avg       0.83      0.93      0.85       221
weighted avg       0.94      0.83      0.86       221

[[42  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0 16  0  1]
 [ 0  0  0 22  0]
 [ 0  1 36  0 96]]


In [4]:
# NOTE(review): scratch arithmetic with unexplained constants — presumably a cost
# estimate (40,000 items x 200 tokens each, at 0.4 per 1M tokens); confirm the
# meaning of each factor or move them into named constants.
40_000 * 200 / 1_000_000 * 0.4

3.2