# Preprocess

In [None]:
from src.utils.folders_utils import get_repo_folder
import json
import pandas as pd

# NOTE(review): get_repo_folder() is assumed to return a pathlib-style path
# (it is joined with `/` below) — confirm against src.utils.folders_utils.
repo_folder = get_repo_folder()
print(f"Repository folder: {repo_folder}")

chr_america_path = repo_folder / "src/data/DB_questions/History/ChroniclingAmericaQA/train.json"
print(chr_america_path)

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform's locale default, which can differ between machines and break
# or silently corrupt non-ASCII text.
with open(chr_america_path, "r", encoding="utf-8") as f:
    data = json.load(f)  # the whole file is parsed into memory at once

# One DataFrame row per record in the parsed JSON.
df = pd.DataFrame(data)
df.head()

In [None]:
# Rich display of the loaded frame (how much is rendered depends on pandas' display options).
df

In [None]:
# Helper that recovers the area name from a `para_id`.
# A para_id is an area name (any number of underscore-separated words)
# followed by one or more numeric parts,
# e.g. "New_York_1" -> "New_York", "California_2" -> "California".

def extract_area(para_id):
    """Return `para_id` with every trailing all-digit, underscore-separated part removed.

    Args:
        para_id: Identifier string such as "New_York_1".

    Returns:
        The leading parts re-joined with "_" (an empty string if every part
        is numeric).
    """
    tokens = para_id.split('_')
    # Walk backwards to find where the numeric suffix starts.
    end = len(tokens)
    while end > 0 and tokens[end - 1].isdigit():
        end -= 1
    return '_'.join(tokens[:end])

# Derive the area label for every row, then list the distinct labels to eyeball the parsing.
df['US_state'] = df['para_id'].apply(extract_area)
df['US_state'].unique()

In [None]:
# Preview the identifier, text fields, derived state label and publication date side by side.
df.loc[:, ['para_id', 'context', 'question', 'answer', 'US_state', 'publication_date']]

In [None]:
# Show the first row's value for each core text field, one field per line.
for column in ('para_id', 'context', 'question', 'answer'):
    print(f"{column}: {df[column].iloc[0]}")


In [None]:
# Word count of each context (whitespace-delimited tokens), stored as a new column,
# followed by summary statistics of that column.
df['context_len'] = df['context'].map(lambda text: len(text.split()))
print(f"Average context length: {df['context_len'].mean()} words")
print(f"Max context length: {df['context_len'].max()} words")
print(f"Min context length: {df['context_len'].min()} words")

In [None]:
# Keep only rows whose context is shorter than 220 words; renumber the index from 0.
df_filtered = df.loc[df['context_len'] < 220].reset_index(drop=True)
print(f"Number of rows with context_len < 220: {df_filtered.shape[0]}")

In [None]:
# Compose a single text field per row: the state label on the first line, the question on the second.
df_filtered['question_description'] = (
    'US State: ' + df_filtered['US_state']
    + '\nQuestion: ' + df_filtered['question']
)

# Columns written to disk, in output order.
columns_to_keep = ['question_description', 'US_state', 'publication_date', 'context', 'question', 'answer']

df_to_index = df_filtered.loc[:, columns_to_keep]
# Persist the selection; the positional index is not written.
df_to_index.to_csv("history_questions_to_index.csv", index=False)
print("Data saved to history_questions_to_index.csv")

In [None]:
# Reload the saved CSV and discard rows that lack a question or an answer.
df_to_index = pd.read_csv("history_questions_to_index.csv")
print(len(df_to_index))  # row count before dropping

df_to_index = (
    df_to_index
    .dropna(subset=["question", "answer"])
    .reset_index(drop=True)
)
print(len(df_to_index))  # row count after dropping

# df_to_index.to_csv("history_questions_to_index.csv", index=False)

In [None]:
# Payload-size filter: keep a row only if the string lengths of its cells,
# summed across all columns, total at most 3000 characters.
print(len(df_to_index))
filtered = df_to_index.loc[
    df_to_index.apply(lambda row: row.str.len().sum() <= 3000, axis=1)
].reset_index(drop=True)
print(len(filtered))

# Overwrite the CSV with the size-filtered rows.
filtered.to_csv("history_questions_to_index.csv", index=False)

# Index

In [1]:
import pandas as pd
# Load the CSV produced by the Preprocess section and display it.
df_to_index = pd.read_csv("history_questions_to_index.csv")
df_to_index

Unnamed: 0,question_description,US_state,publication_date,context,question,answer
0,US State: New_Hampshire\nQuestion: Who is the ...,New_Hampshire,1807-08-04,Aiscellaneous Repository. From the Albany Regi...,"Who is the author of the book, ""Horrors of Sla...",WILLIAM RAY
1,US State: New_Hampshire\nQuestion: What is the...,New_Hampshire,1807-08-04,Upon the correction of this remedy the stomach...,What is the number of the agency that sells Bi...,48
2,US State: New_Hampshire\nQuestion: Who is the ...,New_Hampshire,1807-08-04,Upon the correction of this remedy the stomach...,Who is the Vendor of Bitters in Portsmouth?,CHARLES PEIRCE
3,US State: New_Hampshire\nQuestion: Who receive...,New_Hampshire,1807-08-04,"Also FOR SALE AS ABOVE, NEW GOODS, STEPHEN HAR...",Who received a large assortment of JEWELRY and...,STEPHEN HARDY
4,US State: New_Hampshire\nQuestion: How much wa...,New_Hampshire,1807-08-04,At a meeting of the committee of the : subscri...,How much was the third assessment of St. John'...,Twenty Dollars
...,...,...,...,...,...,...
172055,US State: North_Carolina\nQuestion: Who is the...,North_Carolina,1918-11-30,Mr. Leslie Ray of Texas and Mr. Grover Sherril...,Who is the president of Catawba College?,Leslie Ray
172056,US State: North_Carolina\nQuestion: Who was th...,North_Carolina,1918-11-30,Mr. Leslie Ray of Texas and Mr. Grover Sherril...,Who was the president of Catawba College?,Ida Troilinger
172057,US State: North_Carolina\nQuestion: How long h...,North_Carolina,1918-11-30,There are plenty of such gifts; you'll find lo...,How long has CHICHESTER'S DIAMOND BRAND PILLS ...,25 years
172058,US State: North_Carolina\nQuestion: On what da...,North_Carolina,1918-11-30,There are plenty of such gifts; you'll find lo...,On what day of the week is Trinity Lutheran sc...,"November 24,1918"


In [2]:
# Inspect the composed text of the first row (state label, newline, question).
df_to_index['question_description'].iloc[0]

'US State: New_Hampshire\nQuestion: Who is the author of the book, "Horrors of Slavery, or the American Turf in Tripoli"?'

In [3]:
# Duplicate check on `question_description`: print the row count before and
# after keeping only the first occurrence of each value.
print(len(df_to_index))
dedup = df_to_index.drop_duplicates(subset="question_description", keep="first")
print(len(dedup))

# # save dedup to csv
# dedup.to_csv("history_questions_to_index.csv", index=False)
# print("Data saved to history_questions_to_index.csv after deduplication")

172060
172060


In [None]:
from src.data.index_and_search import index_df
from loguru import logger

# Index the frame in fixed-size batches (the original note gives memory
# pressure as the reason for batching).
BATCH_SIZE = 100  # rows sent to index_df per call

n = len(df_to_index)
# Start offsets of batches that raised, kept so they can be retried after the run
# instead of having to be dug out of the log.
failed_batch_starts = []

for i in range(0, n, BATCH_SIZE):
    batch_df = df_to_index.iloc[i:i + BATCH_SIZE]
    print(f"--- {i} / {n} ---")
    try:
        index_df(
            df=batch_df,
            index_by_col="question_description",
            need_to_embed_col=True,
            id_col="question_description",
            collection_name="history_questions",
        )
    except Exception as e:
        # Deliberately best-effort: one failing batch must not abort the whole
        # run, but it is recorded so the failure is not lost.
        logger.error(f"Error indexing batch starting at row {i}: {e}")
        failed_batch_starts.append(i)

# Consolidated report of everything that still needs (re-)indexing.
if failed_batch_starts:
    logger.warning(
        f"{len(failed_batch_starts)} batch(es) failed to index; "
        f"starting rows: {failed_batch_starts}"
    )
