# Index

In [23]:
from src.data.index_and_search import index_df, get_db_object
from loguru import logger

def index(
    df_to_index,
    batch_size=50,
    index_by_col="question_description",
    id_col="question_description",
    collection_name="math_questions",
    need_to_embed_col=True,
):
    """Index a DataFrame in fixed-size batches via ``index_df``.

    The frame is cut into consecutive slices of ``batch_size`` rows and each
    slice is handed to ``index_df`` on its own, to avoid memory issues on
    large frames. A batch that raises is logged and skipped so that a single
    failure does not abort the whole run; the starting offsets of the failed
    batches are returned so the caller can retry them.

    Args:
        df_to_index: Frame to index; must support ``len()`` and ``.iloc``
            slicing.
        batch_size: Number of rows sent to ``index_df`` per call (default 50).
        index_by_col: Forwarded to ``index_df`` as ``index_by_col``.
        id_col: Forwarded to ``index_df`` as ``id_col``.
        collection_name: Forwarded to ``index_df`` as ``collection_name``.
        need_to_embed_col: Forwarded to ``index_df`` as ``need_to_embed_col``.

    Returns:
        list[int]: Starting row offsets of the batches whose ``index_df``
        call raised. Empty when every batch succeeded.

    Raises:
        ValueError: If ``batch_size`` is not positive. A negative step would
            otherwise make ``range`` empty and silently index nothing.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    n = len(df_to_index)
    failed_offsets = []
    for start in range(0, n, batch_size):
        batch_df = df_to_index.iloc[start:start + batch_size]
        print(f"--- {start} / {n} ---")
        try:
            index_df(
                df=batch_df,
                index_by_col=index_by_col,
                need_to_embed_col=need_to_embed_col,
                id_col=id_col,
                collection_name=collection_name,
            )
        except Exception as e:
            # Best-effort: keep going with the next batch, but remember which
            # one failed instead of losing it in the log output.
            logger.error(f"Error indexing batch starting at row {start}: {e}")
            failed_offsets.append(start)
    return failed_offsets


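# Preprocessing and indexing

Take the first 20,000 rows of each full math dataset CSV, build the `question_description` text, and index it. The `algebra__linear_1d_full`, `algebra__polynomial_roots_full`, and `mini` files are skipped.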

In [21]:
# Get the DB object via the helper imported from src.data.index_and_search.
db = get_db_object()
# Quick sanity check: print 2 example records from the "math_questions" collection.
db.print_example("math_questions", 2)

Example from collection 'math_questions': 
([Record(id='000e5735-9c20-5e1e-bf11-aaf35f819c0e', payload={'question': "b'Solve 27*i - 277 - 133 = -32 for i.\\n'", 'answer': "b'14\\n'", 'module': 'algebra__linear_1d', 'question_description': "Topic: algebra__linear_1d\nQuestion: b'Solve 27*i - 277 - 133 = -32 for i.\\n'"}, vector=None, shard_key=None, order_value=None), Record(id='00105762-8f71-55a6-9078-6be13cbc0110', payload={'question': "b'Solve 49 + 87 = 8*y + 48 for y.\\n'", 'answer': "b'11\\n'", 'module': 'algebra__linear_1d', 'question_description': "Topic: algebra__linear_1d\nQuestion: b'Solve 49 + 87 = 8*y + 48 for y.\\n'"}, vector=None, shard_key=None, order_value=None)], '001210d9-ce79-5f2c-89f2-c7e93e89ec0f')


In [24]:
from src.utils.folders_utils import get_repo_folder
import pandas as pd
from pathlib import Path

repo_folder = get_repo_folder()
print(f"Repository folder: {repo_folder}")

# Path(...) first, so this works whether get_repo_folder() returns a str or a Path.
directory = Path(repo_folder) / "src/data/DB_questions/Math/math_dataset_csvs"

# Cap on the number of rows taken from each CSV.
MAX_ROWS_PER_FILE = 20_000

# A file is skipped when its name contains any of these substrings.
SKIPPED_NAME_PARTS = (
    "algebra__linear_1d_full.csv",
    "mini",
    "algebra__polynomial_roots_full",
)

# Every frame that gets indexed is also kept here for later inspection.
dfs = []

# Sorted so the files are processed in the same order on every run.
for file_path in sorted(directory.glob("*.csv")):
    if any(part in file_path.name for part in SKIPPED_NAME_PARTS):
        continue
    print(f"Processing {file_path.name}...")

    # nrows stops parsing at the cap and returns a fresh frame, so adding a
    # column below does not operate on a slice of a larger frame.
    df = pd.read_csv(file_path, nrows=MAX_ROWS_PER_FILE)
    if df.empty:
        # Nothing to index; a row-wise apply on an empty frame also would not
        # yield a single column to assign.
        print(f"Skipping {file_path.name}: no rows")
        continue

    # Text passed to index(): the topic (module) followed by the raw question.
    df["question_description"] = df.apply(
        lambda row: f"Topic: {row['module']}\nQuestion: {row['question']}",
        axis=1,
    )

    dfs.append(df)
    index(df)

Repository folder: C:\Users\ordad\PycharmProjects\PrivateTeacherAgent
Processing arithmetic__add_or_sub_full.csv...
--- 0 / 20000 ---


Generating embeddings: 100%|██████████| 50/50 [00:11<00:00,  4.48it/s]
Inserting data into math_questions: 100%|██████████| 50/50 [00:00<00:00, 2949.33it/s]


Collection 'math_questions' size: count=13155
--- 50 / 20000 ---


Generating embeddings: 100%|██████████| 50/50 [00:11<00:00,  4.51it/s]
Inserting data into math_questions: 100%|██████████| 50/50 [00:00<00:00, 5570.27it/s]
[32m2025-08-14 09:29:48.601[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mindex[0m:[36m19[0m - [31m[1mError indexing batch starting at row 50: The write operation timed out[0m


--- 100 / 20000 ---


Generating embeddings: 100%|██████████| 50/50 [00:10<00:00,  4.58it/s]
Inserting data into math_questions: 100%|██████████| 50/50 [00:00<00:00, 7162.65it/s]
[32m2025-08-14 09:30:16.096[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mindex[0m:[36m19[0m - [31m[1mError indexing batch starting at row 100: The write operation timed out[0m


--- 150 / 20000 ---


Generating embeddings:  12%|█▏        | 6/50 [00:02<00:20,  2.15it/s]


KeyboardInterrupt: 

In [1]:
# Preview the first DataFrame collected in `dfs`. The guard keeps a run with an
# empty `dfs` from failing here with an IndexError, and .head() avoids
# displaying the whole frame.
dfs[0].head() if dfs else print("`dfs` is empty - nothing to preview")


Repository folder: /Users/kereng/Projects/PrivateTeacherAgent
/Users/kereng/Projects/PrivateTeacherAgent/src/data/DB_questions/Math/math_dataset_csvs/algebra__linear_1d_full.csv


Unnamed: 0,question,answer,module
0,b'Solve 24 = 1601*c - 1605*c for c.\n',b'-6\n',algebra__linear_1d
1,b'Solve 657 = -220*t + 1086*t + 22307 for t.\n',b'-25\n',algebra__linear_1d
2,b'Solve -11*y - 263*y + 3162 = -88*y for y.\n',b'17\n',algebra__linear_1d
3,b'Solve 0 = -11*b - 4148 + 4225 for b.\n',b'7\n',algebra__linear_1d
4,b'Solve 65*l - 361 + 881 = 0 for l.\n',b'-8\n',algebra__linear_1d
