In [1]:
from typing import TYPE_CHECKING


# Type-only declarations: `typing.TYPE_CHECKING` is False at runtime, so nothing in this
# block executes when the cell runs. The two annotations only tell editors and type checkers
# what `application_container` and `infrastructure_container` are; the names themselves
# must be bound elsewhere before later cells use them.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [2]:
# NOTE(review): RESET is not read by any cell visible in this notebook; presumably the
# extension loaded below consults it — confirm against hooks/notebook_hook.
RESET = False
# NOTE(review): later cells use `application_container` / `infrastructure_container` without
# ever assigning them, so this extension presumably injects them into the namespace — confirm.
%load_ext hooks.notebook_hook

2025-07-15 07:35:46,792 - INFO - datasets - config.py:54 - PyTorch version 2.6.0 available.


### Cleanup

In [3]:
# Resolve every repository the cleanup cells below operate on from the infrastructure
# container. The names bound here are referenced directly by later cells.
math_expression_description_opt_embedding_repository = (
    infrastructure_container.math_expression_description_opt_embedding_repository()
)
math_expression_description_opt_repository = (
    infrastructure_container.math_expression_description_opt_repository()
)
math_expression_description_repository = (
    infrastructure_container.math_expression_description_repository()
)
math_expression_group_repository = infrastructure_container.math_expression_group_repository()
# The two graph repository providers are awaited, unlike the other providers in this cell.
math_expression_group_graph_repository = (
    await infrastructure_container.math_expression_group_graph_repository()
)
math_expression_graph_repository = await infrastructure_container.math_expression_graph_repository()
math_expression_repository = infrastructure_container.math_expression_repository()

math_expression_relationship_repository = (
    infrastructure_container.math_expression_relationship_repository()
)
math_expression_relationship_description_repository = (
    infrastructure_container.math_expression_relationship_description_repository()
)
math_expression_context_repository = infrastructure_container.math_expression_context_repository()
math_article_chunk_repository = infrastructure_container.math_article_chunk_repository()
math_expression_index_repository = infrastructure_container.math_expression_index_repository()
task_repository = infrastructure_container.task_repository()

In [8]:
from uuid import UUID


# index_id_to_remove = UUID('TODO')
index_id_to_remove = UUID('6fcd9f59-e5c7-4fd6-a3d6-6355b044756a')
common_filter = {'math_expression_index_id': index_id_to_remove}

await math_expression_index_repository.delete_one(filter={'id': index_id_to_remove})

await math_expression_repository.delete_many(filter=common_filter.copy())
await math_expression_context_repository.delete_many(filter=common_filter.copy())
await math_expression_description_repository.delete_many(filter=common_filter.copy())
await math_expression_description_opt_repository.delete_many(filter=common_filter.copy())
await math_expression_group_repository.delete_many(filter=common_filter.copy())
await math_article_chunk_repository.delete_many(filter=common_filter.copy())
await math_expression_relationship_repository.delete_many(filter=common_filter.copy())
await math_expression_relationship_description_repository.delete_many(filter=common_filter.copy())

await math_expression_description_opt_embedding_repository.clear()
await math_expression_graph_repository.clear()
await math_expression_group_graph_repository.clear()



In [7]:
await task_repository.delete_one(filter={'id': UUID('8a69638a-f35f-4556-ab48-00ae49ac1502')})

1

### Playground

In [None]:
from pathlib import Path

from math_rag.core.models import MathArticle


# Fetch one `.tex` lecture file from Google Drive and wrap its bytes in a MathArticle
# for experimentation in the cells below.
google_drive_repository = infrastructure_container.google_drive_repository()
math_article_parser_service = infrastructure_container.math_article_parser_service()

article_path = Path('ml/lectures/L07-LogisticRegression2/2024_08_10_2174b40686820b4cb591g.tex')
file_id = google_drive_repository.get_file_id(article_path)

if not file_id:
    # Name the path that could not be resolved so the traceback is self-explanatory.
    raise ValueError(f'No Google Drive file id found for {article_path}')

file_content = google_drive_repository.get_file_by_id(file_id)

math_article = MathArticle(
    math_expression_dataset_id=None,
    math_expression_index_id=None,
    name='article',
    # NOTE(review): relies on get_file_by_id returning an object with getvalue()
    # (presumably an io.BytesIO) — confirm against the repository.
    bytes=file_content.getvalue(),
)

2025-07-13 16:44:33,634 - INFO - googleapiclient.discovery_cache - __init__.py:49 - file_cache is only supported with oauth2client<4.0.0


In [None]:
# Resolve the grouper service and the description embedding repository used by the
# grouping and visualisation cells below.
grouper_service = application_container.grouper_service()
# Same provider that the cleanup section binds to
# `math_expression_description_opt_embedding_repository`, under a shorter name.
embedding_repository = (
    infrastructure_container.math_expression_description_opt_embedding_repository()
)

In [None]:
from qdrant_client.http.models import Record


grouped_descriptions = await embedding_repository.group(grouper_service.group)
grouped_records: list[list[Record]] = []

for descriptions in grouped_descriptions:
    ids = [x.id for x in descriptions]
    records = await embedding_repository.client.retrieve(
        collection_name=embedding_repository.collection_name,
        ids=[str(id) for id in ids],
        with_payload=True,
        with_vectors=True,
    )

    for record in records:
        # remove some data for a clener diagram
        record.payload['text'] = record.payload['text'][:50]
        record.payload.pop('math_expression_description_id')
        record.payload.pop('math_expression_index_id')
        record.payload.pop('timestamp')

    grouped_records.append(records)

In [19]:
import os

import pandas as pd
import plotly.express as px

from sklearn.datasets import make_blobs


# Set to the empty string ahead of the `import umap` on the next line.
os.environ['NUMBA_CPU_FEATURES'] = ''  # avoid kernel crash on arm
import umap

#### Example data

In [20]:
# synthetic data
X, y = make_blobs(
    n_samples=500,
    centers=5,
    n_features=10,
    cluster_std=1.0,
    random_state=42,
)

reducer = umap.UMAP(
    n_components=2,
    metric='euclidean',
    random_state=None,
)
X_umap = reducer.fit_transform(X)

In [21]:
# One row per synthetic point: its two UMAP coordinates plus the blob label it was drawn from.
df = pd.DataFrame(X_umap, columns=['UMAP1', 'UMAP2']).assign(cluster=y)

# Scatter of the 2-D embedding, coloured by blob label.
fig = px.scatter(data_frame=df, x='UMAP1', y='UMAP2', color='cluster', hover_data=['cluster'])
fig.show()

#### Real data

In [22]:
# Flatten the per-group record lists; `cluster_labels[i]` is the index of the group that
# `records[i]` came from, so the three lists stay aligned.
records = [record for group in grouped_records for record in group]
vectors = [record.vector for record in records]
cluster_labels = [group_index for group_index, group in enumerate(grouped_records) for _ in group]

# figure out which payload keys exist across all records
# (`or {}` matches the plotting cell, which also tolerates a record without a payload)
payload_keys = set().union(*((record.payload or {}).keys() for record in records))

# UMAP is stochastic; a fixed seed keeps the layout identical between runs so plots can be
# compared. A fixed seed makes umap-learn run single-threaded.
reducer = umap.UMAP(
    n_components=2,
    metric='cosine',
    random_state=42,
)
X_umap = reducer.fit_transform(vectors)

In [23]:
# Build one row per record: UMAP coordinates, group index, record id and every payload field.
rows = []
# The coordinates are deliberately not named `x` / `y`: `y` holds the labels of the
# synthetic-data example, and overwriting it would break a re-run of that example's plot.
for umap_x, umap_y, label, record in zip(X_umap[:, 0], X_umap[:, 1], cluster_labels, records):
    row = {
        'UMAP_1': umap_x,
        'UMAP_2': umap_y,
        'cluster': label,
        'id': record.id,
    }
    row.update(record.payload or {})  # add all payload fields
    rows.append(row)

df = pd.DataFrame(rows)

fig = px.scatter(
    df,
    x='UMAP_1',
    y='UMAP_2',
    color='cluster',
    # sorted() gives the hover fields a stable order; iterating the set directly does not.
    hover_data=sorted(payload_keys) + ['id', 'cluster'],
)
fig.show()

In [None]:
# len(math_expression_relationships)
# 2389 < 2735 because the LLM decided that some of them are not connected
# gpt 4o: 2389
# gpt 4o nano: 2692 (bad)