In [None]:
# BEIR data loading and evaluation
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

# lotus model
import pandas as pd
import lotus
from lotus.models import LM, SentenceTransformersRM, CrossEncoderReranker
from lotus.vector_store import FaissVS
import numpy as np

import logging
import pathlib, os
import time

In [17]:
"""Initialize Lotus components and configure settings."""
# The DeepSeek API key must be supplied through the environment (e.g. exported
# in the shell that launches Jupyter). Credentials are never written into the
# notebook, because notebooks and their outputs get shared and committed.
if not os.environ.get('DEEPSEEK_API_KEY'):
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set. Export it before starting the kernel "
        "instead of pasting the key into the notebook."
    )

# Language model used for query rewriting and filtering.
lm = LM(model="deepseek/deepseek-chat")
# Embedding model used to build and query the semantic index.
rm = SentenceTransformersRM(model="intfloat/e5-base-v2")
# Cross-encoder reranker registered with Lotus.
reranker = CrossEncoderReranker(model="mixedbread-ai/mxbai-rerank-large-v1")
# FAISS-backed vector store holding the embeddings.
vs = FaissVS()
# Configure all components
lotus.settings.configure(lm=lm, rm=rm, reranker=reranker, vs=vs)


2025-05-08 13:30:43,338 - INFO - Use pytorch device_name: cuda
2025-05-08 13:30:43,339 - INFO - Load pretrained SentenceTransformer: intfloat/e5-base-v2
2025-05-08 13:30:45,530 - INFO - Use pytorch device: cuda


In [2]:
# Set up root logging: INFO level, timestamped messages, emitted through the
# LoggingHandler imported from beir.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
# Module-level logger used by the rest of the notebook.
logger = logging.getLogger(__name__)

In [3]:
#### Download the BEIR dataset archive and unzip it (skipped when already on disk)
dataset = "fever"
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
out_dir = os.path.abspath("datasets")

# Only touch the network when the unzipped dataset folder is missing, so a
# fresh checkout can run top to bottom while an existing copy is reused as-is.
if not os.path.isdir(os.path.join(out_dir, dataset)):
    util.download_and_unzip(url, out_dir)

In [4]:
# Load the dev split from the folder that matches `dataset`, so changing the
# dataset name in the configuration cell is enough to switch benchmarks.
corpus, queries, qrels = GenericDataLoader(data_folder=f"{out_dir}/{dataset}/").load(split="dev")
logger.info(f"Dataset loaded. Corpus size: {len(corpus)}, Queries size: {len(queries)}")

2025-05-08 12:20:21,570 - INFO - Loading Corpus...
100%|██████████| 5416568/5416568 [00:35<00:00, 153100.88it/s]
2025-05-08 12:20:59,607 - INFO - Loaded 5416568 DEV Documents.
2025-05-08 12:20:59,852 - INFO - Doc Example: {'text': 'The following are the football ( soccer ) events of the year 1928 throughout the world .', 'title': '1928 in association football'}
2025-05-08 12:20:59,853 - INFO - Loading Queries...
2025-05-08 12:21:00,505 - INFO - Loaded 6666 DEV Queries.
2025-05-08 12:21:00,506 - INFO - Query Example: Fox 2000 Pictures released the film Soul Food.
2025-05-08 12:21:00,508 - INFO - Dataset loaded. Corpus size: 5416568, Queries size: 6666


In [12]:
# === Convert corpus to DataFrame ===
# Keep the corpus key as `doc_id` next to the text so every retrieved row can
# be traced back to its source document (and matched against `qrels`).
corpus_df = pd.DataFrame([{
    "doc_id": doc_id,
    "text": doc['text']
} for doc_id, doc in corpus.items()])
# Fixed-seed 1000-document subset with a fresh 0..999 index.
corpus_df_1000 = corpus_df.sample(n=1000, random_state=42).reset_index(drop=True)


In [13]:
# Build the semantic index over the `text` column of the sampled corpus and
# store it under the dataset folder.
index_dir = f"{out_dir}/fever/index_1000/"
corpus_df_1000 = corpus_df_1000.sem_index('text', index_dir)

100%|██████████| 16/16 [00:14<00:00,  1.13it/s]


In [21]:
# === Build the query table: one row per query (id + claim text) ===
# Then draw a fixed-seed sample of 10 claims and renumber the rows from 0.
queries_df = (
    pd.DataFrame(list(queries.items()), columns=["query_id", "claim"])
    .sample(n=10, random_state=42)
    .reset_index(drop=True)
)


In [15]:
# Load the persisted index for the 1000-document sample into the vector store.
index_dir = f"{out_dir}/fever/index_1000/"
vs.load_index(index_dir)

In [22]:
try:
    # Step 1: Query rewriting
    # The LM turns each claim into a search question; this notebook reads the
    # result from the `_map` column.
    logger.info("Performing query rewriting...")
    queries_df = queries_df.sem_map("Rewrite the claim into a semantic search query question to retrieve relevant Wikipedia evidence. Claim: {claim}. Only respond with the query and do not need include site:wikipedia.org.")
    logger.info("Query rewriting completed successfully")
except Exception as e:
    logger.error(f"Error during query rewriting: {e}")
    logger.warning("Falling back to original claim text for retrieval")
    # Actually perform the announced fallback: downstream cells read `_map`,
    # so populate it with the unmodified claim when rewriting failed.
    queries_df = queries_df.assign(_map=queries_df["claim"])
# Accumulator for the per-claim retrieval results.
results = []
# Quick sanity check of one claim and its rewritten query.
print(queries_df.iloc[1])


2025-05-08 13:41:45,340 - INFO - Performing query rewriting...
Mapping: 100%|██████████ 10/10 LM calls [00:05<00:00,  1.82it/s]
2025-05-08 13:41:50,841 - INFO - Query rewriting completed successfully


query_id                                        139059
claim       Murda Beatz was born on February 21, 1994.
_map                        When was Murda Beatz born?
Name: 1, dtype: object


In [None]:
# Step 2: Document retrieval for each claim
# Start from an empty list so re-running this cell does not duplicate entries.
results = []
for idx, row in queries_df.iterrows():
    query_id = row["query_id"]
    claim = row["claim"]
    # Use the rewritten query when present; otherwise search with the raw claim.
    query_text = row.get("_map", claim)
    if not isinstance(query_text, str) or not query_text.strip():
        query_text = claim

    logger.info(f"Processing claim {idx+1}/{len(queries_df)}: ID={query_id}")
    logger.info(f"Original claim: {claim}")
    logger.info(f"Mapped query: {query_text}")

    # Search the corpus for documents relevant to the query
    search_results = corpus_df_1000.sem_search(
        'text',
        f"Which text answer this question: {query_text}?",
        K=5,
    )
    logger.info(f"Retrieved {len(search_results)} documents")

    logger.info(f'Started filtering process for claim: {claim}')
    # The instruction must reference the column as `{text}` so the retrieved
    # passage is actually shown to the LM; the doubled braces keep the
    # placeholder literal inside this f-string.
    supporting_search_results = search_results.sem_filter(
        f"{{text}} provides evidence for the claim: {claim}"
    )
    # Split hits by row index instead of a `doc_id` column, so this works
    # whether or not the corpus frame carries an id column.
    # NOTE(review): assumes sem_search/sem_filter keep the source row labels - confirm.
    is_supporting = search_results.index.isin(supporting_search_results.index)
    unsupporting_search_results = search_results[~is_supporting]
    results.append({
        "claim": claim,
        "mapped_query": query_text,
        "supporting_texts": "\n---\n".join(supporting_search_results['text'].tolist()),
        "non_supporting_texts": "\n---\n".join(unsupporting_search_results['text'].tolist()),
    })
