# Index

In [None]:
from src.data.index_and_search import index_df, get_db_object
from loguru import logger

# Run in batches to avoid memory issues - index `batch_size` rows of the df at a time.
def index(df_to_index, batch_size: int = 100) -> None:
    """Index a DataFrame into the "math_questions" collection, batch by batch.

    Each consecutive slice of ``batch_size`` rows is handed to ``index_df`` with
    ``question_description`` passed as both ``index_by_col`` and ``id_col`` and
    ``need_to_embed_col=True``. A batch that raises is logged and skipped so one
    bad batch does not abort the remaining ones.

    Args:
        df_to_index: DataFrame containing a ``question_description`` column.
        batch_size: Number of rows sent to ``index_df`` per call; must be > 0.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step would make range() empty and silently index nothing;
    # zero would fail with an unrelated-looking range() error. Fail clearly instead.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    n = len(df_to_index)
    failed_starts = []
    for i in range(0, n, batch_size):
        batch_df = df_to_index.iloc[i:i + batch_size]
        print(f"--- {i} / {n} ---")
        try:
            # NOTE(review): the question text doubles as the record id, so duplicate
            # questions presumably map to the same record - confirm in index_df.
            index_df(
                df=batch_df,
                index_by_col="question_description",
                need_to_embed_col=True,
                id_col="question_description",
                collection_name="math_questions",
            )
        except Exception as e:
            # Best effort: record the failure and keep going with the next batch.
            logger.error(f"Error indexing batch starting at row {i}: {e}")
            failed_starts.append(i)

    # Individual errors are easy to miss among the progress prints of a long run,
    # so repeat them once at the end.
    if failed_starts:
        logger.warning(
            f"{len(failed_starts)} batch(es) failed to index; starting rows: {failed_starts}"
        )


# Preprocess + Indexing - index a 1,000-row slice (rows 27,000-27,999) from each full math dataset

In [None]:
# Obtain the database handle provided by src.data.index_and_search.
db = get_db_object()
# Quick sanity check on the "math_questions" collection (the collection name index() passes to index_df).
# NOTE(review): the second argument (2) is presumably the number of examples to print - confirm against print_example.
db.print_example("math_questions", 2)

In [None]:
from src.utils.folders_utils import get_repo_folder
import pandas as pd
from pathlib import Path

repo_folder = get_repo_folder()
print(f"Repository folder: {repo_folder}")

# Wrap before joining so this works whether get_repo_folder() returns a str or a Path.
directory = Path(repo_folder) / "src/data/DB_questions/Math/math_dataset_csvs"

# CSV files whose name contains any of these fragments are not indexed by this cell.
SKIP_NAME_PARTS = (
    "algebra__linear_1d_full.csv",
    "mini",
    "algebra__polynomial_roots_full",
)
# Half-open row window [ROW_START, ROW_STOP) taken from every CSV.
ROW_START, ROW_STOP = 27000, 28000

# Processed frames, kept so they can be inspected after the loop (e.g. dfs[0]).
dfs = []

# Loop through all CSV files; sorted() makes the processing order reproducible.
for file_path in sorted(directory.glob("*.csv")):
    if any(part in file_path.name for part in SKIP_NAME_PARTS):
        continue
    print(f"Processing {file_path.name}...")

    # .copy() detaches the slice from the full frame so adding a column below
    # does not trigger pandas' chained-assignment warning.
    df = pd.read_csv(file_path).iloc[ROW_START:ROW_STOP].copy()
    if df.empty:
        # File has fewer than ROW_START rows: nothing to preprocess or index.
        print(f"Skipping {file_path.name}: no rows in [{ROW_START}:{ROW_STOP})")
        continue

    # Build the text that gets embedded and used as id by index(). A plain
    # comprehension over the two columns avoids a slow row-wise DataFrame.apply.
    df["question_description"] = [
        f"Topic: {module}\nQuestion: {question}"
        for module, question in zip(df["module"], df["question"])
    ]

    dfs.append(df)
    index(df)

In [None]:
# Display the first collected dataframe; evaluates to None (no output) instead of
# raising IndexError when dfs is empty.
dfs[0] if dfs else None
