# Preprocess

In [None]:
from datasets import load_dataset

# Full dataset: every split returned together in one object
ds = load_dataset(path="derek-thomas/ScienceQA")

# Single split: only the "test" portion
ds_test = load_dataset(path="derek-thomas/ScienceQA", split="test")


In [None]:
# Display the summary of the test split loaded above.
ds_test

In [None]:
# Column schema of the train split.
print(ds["train"].features)
# First example of the test split.
print(ds["test"][0])

In [None]:
from datasets import load_dataset
import pandas as pd

# Load the train split under its own name. Rebinding `ds` here would replace the
# all-splits object loaded in the first cell with a single split, so re-running
# the feature-inspection cell (`ds["train"].features`) would no longer work.
ds_train = load_dataset("derek-thomas/ScienceQA", split="train")

# Optional text columns copied into every row, with the value used only when an
# example lacks the key entirely. A key that is present but holds None or ""
# is written through unchanged (pandas reads empty CSV fields back as NaN).
optional_field_defaults = {
    "lecture": "",
    "solution": "There is no solution provided.",
    "hint": "There is no hint provided.",
    "task": "",
    "difficulty": "unknown",
    "subject": "unknown",
    "topic": "unknown",
    "category": "unknown",
    "skill": "unknown",
}

rows = []
for i, item in enumerate(ds_train):
    # Examples whose "image" value is None are skipped, i.e. only examples
    # that carry an image are kept.
    # NOTE(review): the output file is named "text_only" - confirm that this
    # filter direction is the intended one.
    if "image" in item and item["image"] is None:
        continue
    rows.append({
        "question": item["question"],
        "choices": "|".join(item["choices"]),  # pipe-separated; a JSON list is an alternative
        "answer_index": item["answer"],
        **{field: item.get(field, default) for field, default in optional_field_defaults.items()},
    })
    # Preview: `i` counts every example (skipped ones included), so this prints
    # the kept examples found among the first 10 of the split.
    if i < 10:
        print(f"Item {i}: {item['question']} | Choices: {item['choices']} | Answer Index: {item['answer']} \n Lecture: {item.get('lecture', '')} \n Solution: {item.get('solution', '')}\n")


df = pd.DataFrame(rows)
# Context-prefixed question text: "<category> - <topic> - <skill> - <question>".
df['question_full'] = df['category'] + " - " + df['topic'] + " - " + df['skill'] + " - " + df['question']

# Persist the extracted text columns (the image itself is not written).
df.to_csv("scienceqa_text_only.csv", index=False)


In [None]:
import pandas as pd

# Read back the CSV produced by the extraction cell above. The path must match
# that cell's `df.to_csv("scienceqa_text_only.csv", ...)` call; reading a
# differently named file makes a fresh Restart & Run All fail (or silently pick
# up a stale file left on disk).
check_df = pd.read_csv("scienceqa_text_only.csv")
# Labelled, multi-line description: one "Category/Topic/Skill/Question" line each.
# If any of the four source values is missing (NaN), the whole description is NaN.
check_df['question_description'] = "Category: " + check_df['category'] + "\nTopic: " + check_df['topic'] + "\nSkill: " + check_df['skill'] + "\nQuestion: " + check_df['question']
check_df

In [None]:

# Print the description at row position 60 as a spot check of the formatted text.
print(check_df['question_description'].iloc[60])

In [None]:
# Distinct values found in the `task` column.
check_df['task'].unique()

In [None]:
# Distinct values found in the `difficulty` column.
check_df['difficulty'].unique()

In [None]:
# Number of rows whose `solution` value is not missing.
int(check_df['solution'].notna().sum())

In [None]:
# Show every column of the row at position 3.
check_df.iloc[3]

In [None]:
# Word count of each lecture (whitespace-separated tokens).
# Missing lectures come back from read_csv as NaN, not None, so an
# `is not None` check lets them through and str(nan) is counted as the
# one-word string "nan". pd.notna treats both NaN and None as missing,
# so a missing lecture now counts as 0 words.
check_df['lecture_length'] = check_df['lecture'].apply(
    lambda x: len(str(x).split()) if pd.notna(x) else 0
)
# Summary statistics of the lecture lengths.
print(check_df['lecture_length'].describe())

In [None]:
# Make the label column's meaning explicit: answer_index -> correct_answer_index
check_df = check_df.rename(columns={'answer_index': 'correct_answer_index'})

# Columns carried into the file that will be indexed, in output order
select_columns = [
    'question_description',
    'category', 'topic', 'skill',
    'lecture',
    'question', 'choices', 'correct_answer_index',
    'solution', 'hint',
]

# Keep just those columns, save them, and display the result
df_to_index = check_df.loc[:, select_columns]
df_to_index.to_csv("science_questions_to_index.csv", index=False)
df_to_index

In [None]:
import pandas as pd
# Reload the saved selection from disk and display it.
df_to_index = pd.read_csv("science_questions_to_index.csv")
df_to_index

In [None]:
# Start again from the file on disk
df_to_index = pd.read_csv("science_questions_to_index.csv")

# Remove rows lacking a question or a correct answer index;
# the row count is printed before and after.
print(len(df_to_index))
df_to_index = (
    df_to_index
    .dropna(subset=["question", "correct_answer_index"])
    .reset_index(drop=True)
)
print(len(df_to_index))

# The cleaned frame is not written back to disk in this cell.

In [None]:
# Payload-size filter: keep a row only when the string lengths of all its
# columns add up to at most 3000 characters. Row counts are printed before and after.
print(len(df_to_index))
within_limit = df_to_index.apply(lambda row: row.str.len().sum() <= 3000, axis=1)
filtered = df_to_index[within_limit].reset_index(drop=True)
print(len(filtered))

# Overwrite the file on disk with the size-filtered rows
filtered.to_csv("science_questions_to_index.csv", index=False)

# Index

In [None]:
import pandas as pd
# Load the prepared questions file from disk and display it.
df_to_index = pd.read_csv("science_questions_to_index.csv")
df_to_index

In [None]:
# De-duplicate on question_description, keeping the first occurrence of each
# value; the row count is printed before and after.
print(len(df_to_index))
dedup = df_to_index[~df_to_index.duplicated(subset=["question_description"])]
print(len(dedup))

# Renumber the rows and overwrite the file on disk
df_to_index = dedup.reset_index(drop=True)
df_to_index.to_csv("science_questions_to_index.csv", index=False)

In [None]:
from src.data.index_and_search import index_df
from loguru import logger

# Index the frame in slices of 100 rows (batching is meant to avoid memory issues).
# Arguments shared by every index_df call; only the slice of rows changes.
index_kwargs = dict(
    index_by_col="question_description",
    need_to_embed_col=True,
    id_col="question_description",
    collection_name="science_questions",
)

n = len(df_to_index)
for i in range(0, n, 100):
    print(f"--- {i} / {n} ---")
    batch_df = df_to_index.iloc[i:i+100]
    try:
        index_df(df=batch_df, **index_kwargs)
    except Exception as e:
        # A failed batch is logged and skipped so the remaining batches still run.
        logger.error(f"Error indexing batch starting at row {i}: {e}")


In [16]:
from src.data.index_and_search import DB

db = DB()

# Free-text query laid out with the same "Category / Topic / Skill" labels
# used in the indexed question descriptions.
query = """
Category: Geography
Topic: maps
Skill: reading maps
"""

# Request 4 results from the science collection and print them
results = db.search_by_query_vec(
    query=query,
    collection_name="science_questions",
    top_k=4,
)
print(f"Search results for query '{query}': \n{results}")


Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.10s/it]


Search results for query '
Category: Geography
Topic: maps
Skill: reading maps
': 
[{'question_description': 'Category: Geography\nTopic: geography\nSkill: Identify oceans and continents\nQuestion: Which continent is highlighted?', 'category': 'Geography', 'topic': 'geography', 'skill': 'Identify oceans and continents', 'lecture': 'A continent is one of the seven largest areas of land on earth.', 'question': 'Which continent is highlighted?', 'choices': 'Africa|Asia|Europe', 'correct_answer_index': 2, 'solution': 'This continent is Europe.', 'hint': None}, {'question_description': 'Category: Maps\nTopic: geography\nSkill: Read a map: cardinal directions\nQuestion: Which of these states is farthest south?', 'category': 'Maps', 'topic': 'geography', 'skill': 'Read a map: cardinal directions', 'lecture': 'Maps have four cardinal directions, or main directions. Those directions are north, south, east, and west.\nA compass rose is a set of arrows that point to the cardinal directions. A com

In [17]:
# Inspect the first search result on its own.
results[0]

{'question_description': 'Category: Geography\nTopic: geography\nSkill: Identify oceans and continents\nQuestion: Which continent is highlighted?',
 'category': 'Geography',
 'topic': 'geography',
 'skill': 'Identify oceans and continents',
 'lecture': 'A continent is one of the seven largest areas of land on earth.',
 'question': 'Which continent is highlighted?',
 'choices': 'Africa|Asia|Europe',
 'correct_answer_index': 2,
 'solution': 'This continent is Europe.',
 'hint': None}

In [18]:
# Inspect the second search result on its own.
results[1]

{'question_description': 'Category: Maps\nTopic: geography\nSkill: Read a map: cardinal directions\nQuestion: Which of these states is farthest south?',
 'category': 'Maps',
 'topic': 'geography',
 'skill': 'Read a map: cardinal directions',
 'lecture': 'Maps have four cardinal directions, or main directions. Those directions are north, south, east, and west.\nA compass rose is a set of arrows that point to the cardinal directions. A compass rose usually shows only the first letter of each cardinal direction.\nThe north arrow points to the North Pole. On most maps, north is at the top of the map.',
 'question': 'Which of these states is farthest south?',
 'choices': 'Colorado|Maine|South Dakota|Massachusetts',
 'correct_answer_index': 0,
 'solution': 'To find the answer, look at the compass rose. Look at which way the south arrow is pointing. Colorado is farthest south.',
 'hint': None}

In [19]:
# Inspect the third search result on its own.
results[2]

{'question_description': 'Category: Geography\nTopic: geography\nSkill: Read a map: cardinal directions\nQuestion: Which of these states is farthest south?',
 'category': 'Geography',
 'topic': 'geography',
 'skill': 'Read a map: cardinal directions',
 'lecture': 'Maps have four cardinal directions, or main directions. Those directions are north, south, east, and west.\nA compass rose is a set of arrows that point to the cardinal directions. A compass rose usually shows only the first letter of each cardinal direction.\nThe north arrow points to the North Pole. On most maps, north is at the top of the map.',
 'question': 'Which of these states is farthest south?',
 'choices': 'Nebraska|Michigan|Vermont|Maine',
 'correct_answer_index': 0,
 'solution': 'To find the answer, look at the compass rose. Look at which way the south arrow is pointing. Nebraska is farthest south.',
 'hint': None}

In [20]:
# Inspect the fourth search result on its own.
results[3]

{'question_description': 'Category: Maps\nTopic: geography\nSkill: Use lines of latitude and longitude\nQuestion: Which of these continents does the equator intersect?',
 'category': 'Maps',
 'topic': 'geography',
 'skill': 'Use lines of latitude and longitude',
 'lecture': 'Lines of latitude and lines of longitude are imaginary lines drawn on some globes and maps. They can help you find places on globes and maps.\nLines of latitude show how far north or south a place is. We use units called degrees to describe how far a place is from the equator. The equator is the line located at 0° latitude. We start counting degrees from there.\nLines north of the equator are labeled N for north. Lines south of the equator are labeled S for south. Lines of latitude are also called parallels because each line is parallel to the equator.\nLines of longitude are also called meridians. They show how far east or west a place is. We use degrees to help describe how far a place is from the prime meridian.

In [4]:
from src.data.index_and_search import DB

db = DB()

# Report the size of each collection, one call per name, in this order
for collection in ("science_questions", "sat_questions", "history_questions", "math_questions"):
    db.print_collection_size(collection)

Collection 'science_questions' size: count=1637
Collection 'sat_questions' size: count=1379
Collection 'history_questions' size: count=66300
Collection 'math_questions' size: count=41405
