# Preprocess

In [None]:
from src.utils.folders_utils import get_repo_folder
import pandas as pd

# Look up the repository root through the project helper and echo it.
repo_folder = get_repo_folder()
print(f"Repository folder: {repo_folder}")

# Path of the SAT world/US history question CSV, relative to the repo root.
sat_path = repo_folder / "src/data/DB_questions/SAT/sat_world_and_us_history.csv"
print(sat_path)

# Load the raw questions and preview the first rows.
df = pd.read_csv(sat_path)
df.head()

In [None]:
# Inspect the loaded questions frame.
df

In [None]:
# Normalize column names: "prompt" -> "question" and "subject" -> "topic".
# The result is assigned instead of using inplace=True; rename() ignores labels
# that are no longer present, so re-running this cell is harmless.
df = df.rename(columns={"prompt": "question", "subject": "topic"})

# Show which topics are present. The values are read from the renamed column so
# a second run of this cell does not raise KeyError('subject'), and they are
# printed because a bare expression in the middle of a cell is never displayed.
print(df["topic"].unique())

# Combined text for each row: the topic followed by the question.
df["question_description"] = "Topic: " + df["topic"] + "\nQuestion: " + df["question"]

df

In [None]:
# Columns kept for indexing: the combined description, its two source columns,
# the five answer-option columns, and the answer column.
select_cols_to_index = (
    ["question_description", "topic", "question"]
    + ["A", "B", "C", "D", "E"]
    + ["answer"]
)

# Project the frame onto those columns and export it (without the index) to a
# CSV in the current working directory.
df_to_index = df[select_cols_to_index]
df_to_index.to_csv("sat_questions_to_index.csv", index=False)

df_to_index

In [None]:
# Reload the exported questions and report the row count before cleaning.
df_to_index = pd.read_csv("sat_questions_to_index.csv")
print(len(df_to_index))

# Discard rows that lack a question or an answer, then renumber the index
# and report the row count after cleaning.
df_to_index = (
    df_to_index
    .dropna(subset=["question", "answer"])
    .reset_index(drop=True)
)
print(len(df_to_index))

# Overwrite the export with the cleaned rows.
df_to_index.to_csv("sat_questions_to_index.csv", index=False)

In [None]:
# Filter by payload size: keep only rows whose string fields, summed over all
# columns, stay within the character budget.
MAX_PAYLOAD_CHARS = 3000  # named constant instead of an inline magic number

# Total characters per row, computed once. Missing cells yield NaN from
# .str.len() and are skipped by sum().
row_char_counts = df_to_index.apply(lambda row: row.str.len().sum(), axis=1)

print(len(df_to_index))
filtered = df_to_index[row_char_counts <= MAX_PAYLOAD_CHARS].reset_index(drop=True)
print(len(filtered))

# NOTE(review): `filtered` is not written anywhere, so the size filter does not
# affect the CSV that is read back for indexing. The disabled save below also
# targets "SAT_questions_to_index.csv", whose casing differs from the
# "sat_questions_to_index.csv" file used elsewhere in this notebook -- confirm
# which behavior is intended before enabling it.
# filtered.to_csv("SAT_questions_to_index.csv", index=False)

In [None]:
# Total string length of every row, summed across all columns.
df_to_index.apply(lambda row: row.str.len().sum(), axis=1)

In [None]:
# Sanity check for the payload-size filter:
#   df_to_index[df_to_index.apply(lambda x: x.str.len().sum() <= 3000, axis=1)]
# The second question is repeated 50 times (72 * 50 = 3600 characters) so that it
# genuinely exceeds the 3000-character limit; a single copy is only 72 characters
# long and would never exercise the cut-off.
simple_df = pd.DataFrame({
    "question_description": [
        "This is a short question.",
        "This is a very long question that exceeds the limit of 3000 characters. " * 50,
    ],
    "topic": ["History", "Geography"],
})

# Per-row character totals across both columns.
simple_lengths = simple_df.apply(lambda row: row.str.len().sum(), axis=1)

# Only the short row may pass the 3000-character cut-off.
assert (simple_lengths <= 3000).tolist() == [True, False]

simple_lengths

# Index

In [1]:
import pandas as pd

# Read the exported questions CSV from the current working directory.
df_to_index = pd.read_csv("sat_questions_to_index.csv")

# Display the frame that the indexing cell below iterates over.
df_to_index

Unnamed: 0,question_description,topic,question,A,B,C,D,E,answer
0,Topic: world_history\nQuestion: Sumer and Egyp...,world_history,Sumer and Egypt were similar in all of the fol...,agricultural dependence on the silt left behin...,belief in a polytheistic religion centered on ...,enjoyment of protection from invasion by natur...,development of distinct systems of written lan...,engagement in trade with other nearby cultures,C
1,Topic: world_history\nQuestion: A follower of ...,world_history,A follower of the Chinese philosophy of Daoism...,Government intrusiveness should be kept to a m...,Government works best when run by well-educate...,Government should be focused on strengthening ...,Governments that allow too much freedom are do...,Governments have an obligation to suppress ind...,A
2,Topic: world_history\nQuestion: Which of these...,world_history,Which of these was NOT a characteristic of ear...,Ability to make and use simple tools,Tendency to live in permanent settlements,Reliance on hunting and gathering techniques,Propensity for creating artistic cave drawings,Mastery of fire for cooking and heating purposes,B
3,Topic: world_history\nQuestion: Buddhism was f...,world_history,Buddhism was founded in part as a response to ...,Islam,Daoism,Judaism,Hinduism,Christianity,D
4,Topic: world_history\nQuestion: The Kush city ...,world_history,The Kush city of Meroe rose to prominence main...,salt,iron,gold,grain,silver,B
...,...,...,...,...,...,...,...,...,...
1374,Topic: us_history\nQuestion: The Election of 2...,us_history,The Election of 2000 was significant because,the Supreme Court of Florida decided the elect...,"Katherine Harris, a Democrat, validated the el...",Al Gore won more popular votes than George W. ...,"Jeb Bush, the brother of George W. Bush, was g...",George W. Bush was the first son to follow his...,C
1375,"Topic: us_history\nQuestion: In his 2001 ""War ...",us_history,"In his 2001 ""War on Terror"" speech to Congress...",immediately close all terrorist camps.,release all foreign prisoners.,transfer all terrorists on Afghani soil to the...,hand over Saddam Hussein.,give U.S. troops access to terror camps to ens...,D
1376,Topic: us_history\nQuestion: Mortgage-backed s...,us_history,Mortgage-backed securities are,bundles of subprime mortgages traded like stocks.,low-interest loans offered to people with trou...,another name for adjustable-rate mortgages.,recession-proof investments.,stable assets.,A
1377,Topic: us_history\nQuestion: George W. Bush's ...,us_history,George W. Bush's immigration reform policy can...,amnesty for all people who immigrated to the U...,"unrealistic, in that it refused to acknowledge...",a policy to deport immediately all illegal imm...,a pathway to citizenship for illegal immigrant...,a way to safeguard American jobs for U.S. citi...,D


In [5]:
from src.data.index_and_search import index_df
from loguru import logger

# Index in fixed-size batches to avoid memory issues.
BATCH_SIZE = 100  # rows passed to index_df per call

n = len(df_to_index)
failed_batch_starts = []  # start offsets of batches whose index_df call raised

for i in range(0, n, BATCH_SIZE):
    batch_df = df_to_index.iloc[i:i + BATCH_SIZE]
    print(f"--- {i} / {n} ---")
    try:
        index_df(
            df=batch_df,
            index_by_col="question_description",
            need_to_embed_col=True,
            id_col="question_description",
            collection_name="sat_questions",
        )
    except Exception as e:
        # Best-effort: keep indexing the remaining batches, but remember which
        # batch failed and log the full traceback instead of only the message.
        failed_batch_starts.append(i)
        logger.exception(f"Error indexing batch starting at row {i}: {e}")

# Surface skipped batches explicitly so a partially indexed collection is not
# mistaken for a complete one.
if failed_batch_starts:
    logger.error(
        f"{len(failed_batch_starts)} batch(es) were not indexed; "
        f"starting rows: {failed_batch_starts}"
    )


[32m2025-08-07 18:48:16.340[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


--- 0 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:23<00:00,  4.20it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 16453.41it/s]
[32m2025-08-07 18:48:44.999[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=100
--- 100 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.15it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 20129.12it/s]
[32m2025-08-07 18:49:15.046[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=200
--- 200 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.05it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 14112.26it/s]
[32m2025-08-07 18:49:46.654[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=300
--- 300 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:23<00:00,  4.32it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 21336.37it/s]
[32m2025-08-07 18:50:14.173[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=400
--- 400 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.16it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 15391.94it/s]
[32m2025-08-07 18:50:44.099[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=500
--- 500 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:25<00:00,  3.94it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 10329.27it/s]
[32m2025-08-07 18:51:14.232[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=600
--- 600 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.16it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 26324.63it/s]
[32m2025-08-07 18:51:42.783[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=700
--- 700 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.16it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 17253.41it/s]
[32m2025-08-07 18:52:11.241[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=800
--- 800 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:23<00:00,  4.23it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 9635.88it/s]
[32m2025-08-07 18:52:39.199[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=900
--- 900 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:25<00:00,  3.94it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 12159.52it/s]
[32m2025-08-07 18:53:09.199[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=1000
--- 1000 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:23<00:00,  4.30it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 5732.27it/s]
[32m2025-08-07 18:53:36.744[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=1100
--- 1100 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:24<00:00,  4.04it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 13813.87it/s]
[32m2025-08-07 18:54:06.307[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=1200
--- 1200 / 1379 ---


Generating embeddings: 100%|██████████| 100/100 [00:23<00:00,  4.26it/s]
Inserting data into sat_questions: 100%|██████████| 100/100 [00:00<00:00, 14028.71it/s]
[32m2025-08-07 18:54:34.229[0m | [1mINFO    [0m | [36msrc.data.index_and_search[0m:[36mindex_df[0m:[36m172[0m - [1mEmbedding col 'question_description'...[0m


Collection 'sat_questions' size: count=1300
--- 1300 / 1379 ---


Generating embeddings: 100%|██████████| 79/79 [00:24<00:00,  3.26it/s]
Inserting data into sat_questions: 100%|██████████| 79/79 [00:00<00:00, 33690.90it/s]


Collection 'sat_questions' size: count=1379
