# Index

In [2]:
from src.data.index_and_search import index_df, get_db_object
from loguru import logger

# Run in batches to avoid memory issues - index `batch_size` rows (100 by default) of the df at a time
def index(df_to_index, batch_size: int = 100) -> list:
    """Index a DataFrame into the "math_questions" collection in batches.

    Rows are handed to ``index_df`` ``batch_size`` at a time. A batch whose
    ``index_df`` call raises is logged and skipped so the remaining batches
    are still attempted; its starting offset is recorded and returned so the
    caller can re-submit exactly the failed slices instead of losing them
    unnoticed in the cell output.

    Args:
        df_to_index: DataFrame to index. "question_description" is passed to
            ``index_df`` as both ``index_by_col`` and ``id_col`` (with
            ``need_to_embed_col=True``), so the frame is expected to carry
            that column.
        batch_size: Number of rows per ``index_df`` call. Must be >= 1.

    Returns:
        Positional start offsets (into ``df_to_index``) of the batches whose
        ``index_df`` call raised. Empty when every batch succeeded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently index nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    n = len(df_to_index)
    failed_starts = []
    for i in range(0, n, batch_size):
        batch_df = df_to_index.iloc[i:i + batch_size]
        print(f"--- {i} / {n} ---")
        try:
            index_df(
                df=batch_df,
                index_by_col="question_description",
                need_to_embed_col=True,
                id_col="question_description",
                collection_name="math_questions",
            )
        except Exception as e:
            # Best-effort on purpose: one failing batch must not abort the
            # whole run, but remember it so it can be retried afterwards.
            logger.error(f"Error indexing batch starting at row {i}: {e}")
            failed_starts.append(i)

    if failed_starts:
        # Per-batch errors are easy to miss between progress bars; repeat
        # them once at the end.
        logger.warning(
            f"{len(failed_starts)} batch(es) failed; starting rows: {failed_starts}"
        )
    return failed_starts


# Preprocess + Indexing - take a window of rows from each math full dataset (originally the first 20,000 rows; the cell below currently indexes rows 25,500-27,000)

In [2]:
# Get a handle to the question database through the project helper.
db = get_db_object()
# Sanity check: print sample records from the "math_questions" collection
# (the second argument is presumably the number of examples - TODO confirm).
db.print_example("math_questions", 2)

Example from collection 'math_questions': 
([Record(id='000e2a02-ca5c-5d79-b01c-b093ecae91fb', payload={'question': "b'What is the second derivative of i**5 + 54*i**3*t + 2*i*t + 103*i + 2 wrt i?\\n'", 'answer': "b'20*i**3 + 324*i*t\\n'", 'module': 'calculus__differentiate', 'question_description': "Topic: calculus__differentiate\nQuestion: b'What is the second derivative of i**5 + 54*i**3*t + 2*i*t + 103*i + 2 wrt i?\\n'"}, vector=None, shard_key=None, order_value=None), Record(id='000e5735-9c20-5e1e-bf11-aaf35f819c0e', payload={'question': "b'Solve 27*i - 277 - 133 = -32 for i.\\n'", 'answer': "b'14\\n'", 'module': 'algebra__linear_1d', 'question_description': "Topic: algebra__linear_1d\nQuestion: b'Solve 27*i - 277 - 133 = -32 for i.\\n'"}, vector=None, shard_key=None, order_value=None)], '000fe6b6-eb2c-57f9-9a5b-b4df0a6809f3')


In [None]:
from src.utils.folders_utils import get_repo_folder
import pandas as pd
from pathlib import Path

repo_folder = get_repo_folder()
print(f"Repository folder: {repo_folder}")

# Folder holding the per-topic math CSV exports.
directory = Path(repo_folder / "src/data/DB_questions/Math/math_dataset_csvs")

# Positional row window (end-exclusive) taken from every CSV in this run.
# Change these two values to index a different window on the next run.
ROW_START = 25_500
ROW_STOP = 27_000

# Files whose name contains any of these substrings are not indexed.
SKIP_NAME_PARTS = (
    "algebra__linear_1d_full.csv",
    "mini",
    "algebra__polynomial_roots_full",
)

# Frames that were sent to the index, kept for inspection in later cells.
dfs = []

# glob() yields files in no guaranteed order; sort so every run processes
# the CSVs in the same sequence.
for file_path in sorted(directory.glob("*.csv")):
    if any(part in file_path.name for part in SKIP_NAME_PARTS):
        continue
    print(f"Processing {file_path.name}...")

    # .copy() so the column added below is written to an independent frame
    # rather than to a slice of the full CSV frame.
    df = pd.read_csv(file_path).iloc[ROW_START:ROW_STOP].copy()

    # A CSV with fewer than ROW_START rows yields an empty window; there is
    # nothing to index for it.
    if df.empty:
        print(f"Skipping {file_path.name}: no rows in [{ROW_START}:{ROW_STOP}]")
        continue

    # This text is both embedded and used as the id column by `index`, so it
    # is built exactly as before to keep ids stable across runs.
    df["question_description"] = df.apply(
        lambda row: f"Topic: {row['module']}\nQuestion: {row['question']}", axis=1
    )

    # Record the frame before indexing so it is available even if the run is
    # interrupted part-way through.
    dfs.append(df)
    index(df)

Repository folder: /Users/kereng/Projects/PrivateTeacherAgent
Processing numbers__gcd_full.csv...
--- 0 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:25<00:00,  3.86it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 19754.63it/s]


Collection 'math_questions' size: count=53003
--- 100 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:22<00:00,  4.44it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 10580.72it/s]


Collection 'math_questions' size: count=53103
--- 200 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:20<00:00,  4.81it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 10816.75it/s]


Collection 'math_questions' size: count=53203
--- 300 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:21<00:00,  4.71it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 12287.76it/s]


Collection 'math_questions' size: count=53303
--- 400 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:21<00:00,  4.62it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 12986.67it/s]


Collection 'math_questions' size: count=53403
--- 500 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:20<00:00,  4.98it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 11658.94it/s]


Collection 'math_questions' size: count=53503
--- 600 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.14it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 12632.30it/s]


Collection 'math_questions' size: count=53603
--- 700 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:28<00:00,  3.48it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 13213.74it/s]


Collection 'math_questions' size: count=53703
--- 800 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:22<00:00,  4.44it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 13327.52it/s]


Collection 'math_questions' size: count=53803
--- 900 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:23<00:00,  4.18it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 28248.28it/s]


Collection 'math_questions' size: count=53903
--- 1000 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:20<00:00,  4.88it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 18332.55it/s]


Collection 'math_questions' size: count=54003
--- 1100 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.15it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 37065.25it/s]


Collection 'math_questions' size: count=54103
--- 1200 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:19<00:00,  5.07it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 14412.92it/s]


Collection 'math_questions' size: count=54203
--- 1300 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:21<00:00,  4.63it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 9850.41it/s]


Collection 'math_questions' size: count=54303
--- 1400 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:22<00:00,  4.41it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 9867.79it/s]


Collection 'math_questions' size: count=54403
Processing polynomials__expand_full.csv...
--- 0 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:23<00:00,  4.27it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 12523.68it/s]


Collection 'math_questions' size: count=54503
--- 100 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:21<00:00,  4.75it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 9632.34it/s]


Collection 'math_questions' size: count=54603
--- 200 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.09it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 11752.37it/s]


Collection 'math_questions' size: count=54703
--- 300 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:22<00:00,  4.46it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 9794.28it/s]


Collection 'math_questions' size: count=54803
--- 400 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.01it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 8954.15it/s]


Collection 'math_questions' size: count=54903
--- 500 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:22<00:00,  4.38it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 22025.44it/s]
[32m2025-08-14 13:38:27.295[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mindex[0m:[36m19[0m - [31m[1mError indexing batch starting at row 500: The write operation timed out[0m


--- 600 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:21<00:00,  4.75it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 30321.00it/s]


Collection 'math_questions' size: count=55003
--- 700 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.07it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 35959.40it/s]


Collection 'math_questions' size: count=55103
--- 800 / 1500 ---


Generating embeddings: 100%|██████████| 100/100 [00:21<00:00,  4.74it/s]
Inserting data into math_questions: 100%|██████████| 100/100 [00:00<00:00, 28842.69it/s]


In [None]:
# Show the first collected frame; guard against an empty `dfs`, which would
# otherwise raise IndexError.
dfs[0] if dfs else None
