## Step 1: Preprocessing data

In [1]:
import pandas as pd
import os
import re

**Load Multi-hop Corpus**

In [12]:
with open('VDT2025_Multihop_RAG/multihoprag_corpus.txt', 'r') as f:
    # Read the entire file content
    corpus = f.read()

# Remove <endofpassage> tags
cleaned_corpus = corpus.replace('<endofpassage>', '')

# Split by Title
entries = re.split(r'Title:', corpus)
data = []
for entry in entries:
    if not entry.strip():
        continue
    # Split title and passage, keeping 'Passage:' and '<endofpassage>'
    title_match = re.match(r'([^\n]+)\n(Passage:.*?)(?=Title:|$)', entry, re.DOTALL)
    if title_match:
        title = title_match.group(1).strip()
        passage = title_match.group(2).strip()
        data.append([title, passage])


In [13]:
df = pd.DataFrame(data, columns=['title_name', 'content'])
def extract_passages(corpus_text):
    """
    Extracts all passages (including 'Passage:' and '<endofpassage>') from the corpus text.
    Returns a list of passage strings.
    """
    pattern = r'Passage:(.*?<endofpassage>)'
    passages = re.findall(pattern, corpus_text, re.DOTALL)
    # Add back the 'Passage:' prefix to each passage
    passages = [p.replace('<endofpassage>', '').strip() for p in passages]
    combined_passage = " ".join(passages)
    return combined_passage

# Apply to the content of the DataFrame
df['content'] = df['content'].apply(extract_passages)
print("Number of titles:", len(df))
df.head()

Number of titles: 609


Unnamed: 0,title_name,content
0,200+ of the best deals from Amazon's Cyber Mon...,"Table of Contents Table of Contents Echo, Fire..."
1,ASX set to drop as Wall Street’s September slu...,"ETF provider Betashares, which manages $30 bil..."
2,Amazon sellers sound off on the FTC's 'long-ov...,A worker sorts out parcels in the outbound doc...
3,"Christmas Day preview: 49ers, Ravens square of...","Christmas Day isn't just for the NBA, as the N..."
4,"Raiders vs. Lions live score, updates, highlig...",The Lions just needed to get themselves back i...


**Load Multi-hop RAG**

In [14]:
import pandas as pd
import json

# Load the JSON file
with open("VDT2025_Multihop_RAG/MultiHopRAG.json", "r") as f:
    query_data = json.load(f)

# Convert to DataFrame
query_df = pd.DataFrame(query_data)
query_df = query_df.drop('question_type', axis=1)
print("Number of samples to query:", len(query_df))
print(query_df.head())

Number of samples to query: 2556
                                               query              answer  \
0  Who is the individual associated with the cryp...   Sam Bankman-Fried   
1  Which individual is implicated in both inflati...        Donald Trump   
2  Who is the figure associated with generative A...          Sam Altman   
3  Do the TechCrunch article on software companie...                 Yes   
4  Which online betting platform provides a welco...  Caesars Sportsbook   

                                       evidence_list  
0  [{'title': 'The FTX trial is bigger than Sam B...  
1  [{'title': 'Donald Trump defrauded banks with ...  
2  [{'title': 'OpenAI's ex-chairman accuses board...  
3  [{'title': 'Here’s how Rainforest, a budding S...  
4  [{'title': '2023 Kentucky online sports bettin...  


## Step 2: Indexing

In [None]:
#### INDEXING ####

# Load blog
from langchain_community.document_loaders import DataFrameLoader
loader = DataFrameLoader(df, page_content_column="content")
blog_docs = loader.load()

# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

# Make splits
splits = text_splitter.split_documents(blog_docs)

# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

In [164]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

## Step 3: Decomposition

In [212]:
from langchain.prompts import ChatPromptTemplate

# Decomposition
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (5 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [213]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# LLM
llm = ChatOpenAI(temperature=0)

# Chain
generate_queries_decomposition = ( prompt_decomposition | llm | StrOutputParser() | (lambda x: x.split("\n")))

### Answer individually

In [280]:
start = 0
end = 1
pred_responses = []

for idx in range(start, end):
    # Run
    question = query_df['query'][idx]
    questions = generate_queries_decomposition.invoke({"question":question})
    
    import textwrap

    # For a single long string
    print(f"### START EXAMPLE {idx + 1} ###")
    print(textwrap.fill(f"Question: {question}", width=100))
    print()
    print("DECOMPOSITION:")

    # For a list of strings
    for q in questions:
        print(textwrap.fill(q, width=100))
        print("-" * 40)
    
    # Answer each sub-question individually 

    from langchain import hub
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_core.runnables import RunnablePassthrough, RunnableLambda
    from langchain_core.output_parsers import StrOutputParser
    from langchain_openai import ChatOpenAI

    # RAG prompt
    prompt_rag = hub.pull("rlm/rag-prompt")

    total_retrieved_docs = []
    def retrieve_and_rag(question,prompt_rag,sub_question_generator_chain):
        """RAG on each sub-question"""
        
        # Use our decomposition / 
        sub_questions = sub_question_generator_chain.invoke({"question":question})
        
        # Initialize a list to hold RAG chain results
        rag_results = []
        
        for sub_question in sub_questions:
            
            # Retrieve documents for each sub-question
            retrieved_docs = retriever.invoke(sub_question)
            for doc in retrieved_docs:
                title = doc.metadata["title_name"]
                fact = doc.page_content
                doc_content = {'title': title, 'fact': fact}
                total_retrieved_docs.append(doc_content)
            
            # Use retrieved documents and sub-question in RAG chain
            answer = (prompt_rag | llm | StrOutputParser()).invoke({"context": retrieved_docs, 
                                                                    "question": sub_question})
            rag_results.append(answer)
        
        return rag_results,sub_questions

    # Wrap the retrieval and RAG process in a RunnableLambda for integration into a chain
    answers, questions = retrieve_and_rag(question, prompt_rag, generate_queries_decomposition)

    def format_qa_pairs(questions, answers):
        """Format Q and A pairs"""
        
        formatted_string = ""
        for i, (question, answer) in enumerate(zip(questions, answers), start=1):
            formatted_string += f"Question {i}: {question}\nAnswer {i}: {answer}\n\n"
        return formatted_string.strip()

    context = format_qa_pairs(questions, answers)

    # Prompt
    template = """Here is a set of Q+A pairs:

    {context}

    Use these to synthesize an answer to the question: {question}
    Put the final answer clearly at the end of your response, using the following format: <your_answer>.
    For Yes/No question, only respond <Yes> or <No>.
    For question which you think not possible to determine the answer, respond: <Insufficient information.>
    """

    prompt = ChatPromptTemplate.from_template(template)

    final_rag_chain = (
        prompt
        | llm
        | StrOutputParser()
    )

    answer = final_rag_chain.invoke({"context":context,"question":question})
    # For a single long string
    print(textwrap.fill(f"Answer: {answer}", width=100))
    print(f"### END EXAMPLE {idx + 1} ###")
    print()

    pred_response = {
        "query": question,
        "answer": answer,
        "evidence_list": total_retrieved_docs
    }

    pred_responses.append(pred_response)

### START EXAMPLE 1 ###
Question: Who is the individual associated with the cryptocurrency industry facing a criminal trial
on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by
prosecutors of committing fraud for personal gain?

DECOMPOSITION:
1. What are the specific fraud and conspiracy charges faced by the individual in the cryptocurrency
industry?
----------------------------------------
2. What evidence do prosecutors have against the individual in the criminal trial for fraud and
conspiracy?
----------------------------------------
3. How has the cryptocurrency industry been impacted by the criminal trial of this individual?
----------------------------------------
4. What is the background and history of the individual in the cryptocurrency industry prior to
facing criminal charges?
----------------------------------------
5. Are there any updates or developments in the case reported by other news sources besides The
Verge and TechCrun

## Step 4: Evaluation

In [215]:
# Function to check if there is an intersection of words between two strings
def has_intersection(a, b):
    a_words = set(a.split())
    b_words = set(b.split())
    return len(a_words.intersection(b_words)) > 0

# Function to calculate evaluation metrics
def calculate_metrics(pred_list, gold_list):
    tp = sum(1 for pred, gold in zip(pred_list, gold_list) if has_intersection(pred.lower(), gold.lower()))
    fp = sum(1 for pred, gold in zip(pred_list, gold_list) if not has_intersection(pred.lower(), gold.lower()))
    fn = len(gold_list) - tp
    tn = len(pred_list) - tp 

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0

    return precision, recall, f1, accuracy

In [216]:
responses_df = pd.DataFrame(pred_responses)
print(responses_df.head())
responses_df.to_json("output.json", orient="records", indent=4, force_ascii=False)

                                               query  \
0  Who is the individual associated with the cryp...   
1  Which individual is implicated in both inflati...   
2  Who is the figure associated with generative A...   
3  Do the TechCrunch article on software companie...   
4  Which online betting platform provides a welco...   

                                              answer  \
0  The individual associated with the cryptocurre...   
1  The individual implicated in both inflating th...   
2  The figure associated with generative AI techn...   
3  The TechCrunch article on software companies r...   
4  DraftKings Sportsbook is the online betting pl...   

                                       evidence_list  
0  [{'title': 'The FTX trial is bigger than Sam B...  
1  [{'title': 'The $777 million surprise: Donald ...  
2  [{'title': 'Amazon’s Tye Brady discusses gener...  
3  [{'title': 'Sam Altman backs teens’ startup, G...  
4  [{'title': 'Vermont Sportsbook Promos and Spor..

In [217]:
evaluate_samples = len(responses_df["answer"])
pred_list = responses_df["answer"][:evaluate_samples].tolist()
gold_list = query_df["answer"][:evaluate_samples].tolist()

In [218]:
import re

def extract_boxed_answer(text):
    """
    Extracts the answer in the format <answer> from a string.
    Returns the answer inside the angle brackets, or None if not found.
    """
    match = re.search(r"<([^<>]+)>", text)
    if match:
        return match.group(1).strip()
    return text.replace(".", " ")

pred_list = [extract_boxed_answer(p) for p in pred_list]

In [219]:
for pred in pred_list:
    print(textwrap.fill(pred, width=100))
    print("-" * 40)

The individual associated with the cryptocurrency industry facing a criminal trial on fraud and
conspiracy charges, as reported by both The Verge and TechCrunch, and accused by prosecutors of
committing fraud for personal gain is Sam Bankman-Fried
----------------------------------------
Donald Trump
----------------------------------------
Sam Altman
----------------------------------------
Yes
----------------------------------------
DraftKings Sportsbook
----------------------------------------
Sam Bankman-Fried
----------------------------------------
Yes
----------------------------------------
Yes
----------------------------------------
OpenAI
----------------------------------------
Google
----------------------------------------
The character portrayed by Sridevi awarded the Congressional Gold Medal
----------------------------------------
Yes
----------------------------------------
Yes
----------------------------------------
Insufficient information.
-----------------------

In [220]:
calculate_metrics(pred_list, gold_list)

(0.78, 0.78, 0.78, 0.6944444444444444)

### Calculate Precision & Recall Retrieval

In [227]:
import json
import pandas as pd

# Load MultiHopRAG.json
with open("VDT2025_Multihop_RAG/MultiHopRAG.json", "r") as f:
    multihoprag_data = json.load(f)

# Convert to DataFrame
multihoprag_df = pd.DataFrame(multihoprag_data)

# Show the first few rows
print(multihoprag_df.head())

                                               query              answer  \
0  Who is the individual associated with the cryp...   Sam Bankman-Fried   
1  Which individual is implicated in both inflati...        Donald Trump   
2  Who is the figure associated with generative A...          Sam Altman   
3  Do the TechCrunch article on software companie...                 Yes   
4  Which online betting platform provides a welco...  Caesars Sportsbook   

      question_type                                      evidence_list  
0   inference_query  [{'title': 'The FTX trial is bigger than Sam B...  
1   inference_query  [{'title': 'Donald Trump defrauded banks with ...  
2   inference_query  [{'title': 'OpenAI's ex-chairman accuses board...  
3  comparison_query  [{'title': 'Here’s how Rainforest, a budding S...  
4   inference_query  [{'title': '2023 Kentucky online sports bettin...  


In [274]:
import json
import pandas as pd

# Load MultiHopRAG.json
with open("4_decomposition.json", "r") as f:
    load_responses = json.load(f)

# Convert to DataFrame
load_responses_df = pd.DataFrame(load_responses)

# Show the first few rows
print(load_responses_df.head())

                                               query  \
0  Who is the individual associated with the cryp...   
1  Which individual is implicated in both inflati...   
2  Who is the figure associated with generative A...   
3  Do the TechCrunch article on software companie...   
4  Which online betting platform provides a welco...   

                                              answer  \
0  The individual associated with the cryptocurre...   
1  The individual implicated in both inflating th...   
2  The figure associated with generative AI techn...   
3  The TechCrunch article on software companies d...   
4  The online betting platform that provides a we...   

                                       evidence_list  
0  [{'title': 'The FTX trial is bigger than Sam B...  
1  [{'title': 'The $777 million surprise: Donald ...  
2  [{'title': 'Amazon’s Tye Brady discusses gener...  
3  [{'title': 'Adobe: Thanksgiving US online sale...  
4  [{'title': 'Vermont Sportsbook Promos and Spor..

In [275]:
pred_evidences = load_responses_df["evidence_list"].tolist()
gold_evidences = []
for item in multihoprag_df["evidence_list"][:50].tolist():
    gold_evidence = []
    for d in item:
        gold_evidence.append({"title": d["title"], "fact": d["fact"]})
    gold_evidences.append(gold_evidence)

In [276]:
pred_evidences[0]

[{'title': 'The FTX trial is bigger than Sam Bankman-Fried',
  'fact': 'Bankman-Fried gave interviews freely — and quickly rose to public prominence in the industry. Though FTX hadn’t been in the business as long as competing exchanges such as Coinbase, Kraken, or Gemini, Bankman-Fried positioned himself as an important, boyish face for crypto. (At one point, Bankman-Fried told a colleague at FTX that “I honestly think it’s negative EV [this may mean “expected value,” as in poker] for me to cut my hair. I think it’s important for people to think I look crazy.”)\n\nBecause he was so successful at this kind of public relations, his fall from grace was another mark against an industry that was already roiled by bankruptcies and scandals. Some additional trouble for the crypto industry is likely to come from one crucial element of the fraud trial — the part where the government must prove intent.\n\nThe first part of proving the government’s case is pretty simple and a little boring: prose

**Testing**

In [277]:
import re

def normalize_text(text):
    # Remove all symbols, keep only letters, numbers, and spaces
    normalized = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    normalized = normalized.lower()
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    return normalized

def calculate_metrics(retrieval_docs, gold_docs):
    # Normalize facts
    retrieval_facts = [normalize_text(doc['title']) for doc in retrieval_docs]
    gold_facts = [normalize_text(doc['title']) for doc in gold_docs]


    # Calculate True Positives
    TP = 0
    for gold_fact in gold_facts:
        for retrieval_fact in retrieval_facts:
            if gold_fact in retrieval_fact:
                TP += 1
                break

    # Calculate False Positives
    FP = 0
    for retrieval_fact in retrieval_facts:
        if not any(gold_fact in retrieval_fact for gold_fact in gold_facts):
            FP += 1

    # Calculate False Negatives
    FN = 0
    for gold_fact in gold_facts:
        if not any(gold_fact in retrieval_fact for retrieval_fact in retrieval_facts):
            FN += 1

    # Calculate Precision, Recall, F1
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1



In [278]:
print(len(pred_evidences))
print(len(gold_evidences))

50
50


In [279]:
total_precision = 0
total_recall = 0
total_f1 = 0
for idx in range(50):
    precision, recall, f1 = calculate_metrics(pred_evidences[idx], gold_evidences[idx])
    total_precision += precision
    total_recall += recall
    total_f1 += f1

print("Mean Precision:", total_precision / 50)
print("Mean Recall:", total_recall / 50)
print("Mean F1:", total_f1 / 50)

Mean Precision: 0.2141990120302813
Mean Recall: 0.66
Mean F1: 0.30100128554833094
