# Multihop Retrieval System

This notebook implements a multi-hop retrieval system that:
1. Uses a pre-populated vector database (FAISS)
2. Decomposes complex questions into sub-questions using an LLM
3. Retrieves relevant documents for each sub-question
4. Answers each sub-question based on retrieved documents
5. Combines the sub-answers into a final comprehensive answer to the original question

## Imports

In [44]:
import json
import os
import re

from dotenv import load_dotenv
from langchain.docstore import InMemoryDocstore
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

## Environment Setup

In [45]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# embedding and chat models below - confirm against the project's .env.
load_dotenv()

True

## Vector Database Functions

In [None]:
def create_vector_database(dataset_path='dataset/corpus.json', save_path="faiss_index_with_metadata"):
    """Build a FAISS index from a JSON corpus and save it to disk.

    The corpus is iterated as a sequence of dict-like items; each item's
    ``body`` becomes the document text and its ``title`` is stored as
    metadata. Documents are split into overlapping chunks before indexing.

    Args:
        dataset_path: Path to the JSON corpus file (read as UTF-8).
        save_path: Location passed to ``save_local`` for the built index.

    Returns:
        The newly created FAISS vector store.

    Raises:
        ValueError: If the corpus produces no text chunks to index.
    """
    # Initialize embeddings
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

    # Read the corpus as UTF-8 explicitly; relying on the platform default
    # encoding breaks on non-ASCII titles/bodies on some systems.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Create document objects, falling back to placeholders for missing keys
    docs = [
        Document(
            page_content=item.get('body', ''),
            metadata={"title": item.get('title', 'Unknown Title')},
        )
        for item in data
    ]

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)
    split_docs = text_splitter.split_documents(docs)

    # Fail with a clear message instead of handing an empty list to the index builder
    if not split_docs:
        raise ValueError(
            f"No text chunks were produced from {dataset_path}; nothing to index."
        )

    # Store the embeddings
    vector_store = FAISS.from_documents(split_docs, embeddings)
    vector_store.save_local(save_path)

    print(f"FAISS index with metadata stored successfully at {save_path}!")
    return vector_store

In [46]:
def load_vector_database(load_path="faiss_index_with_metadata", preview=False):
    """Load a previously saved FAISS index, optionally printing a short preview.

    Args:
        load_path: Location of the saved index.
        preview: When true, print the title and the first 300 characters of
            up to five stored documents.

    Returns:
        The loaded FAISS vector store.
    """
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized data from disk; only point this at index files you created.
    vector_store = FAISS.load_local(
        load_path,
        OpenAIEmbeddings(model="text-embedding-3-large"),
        allow_dangerous_deserialization=True,
    )

    if not preview:
        return vector_store

    print("Displaying first 5 indexed documents:\n")
    # Reads the docstore's private mapping directly to walk the stored documents
    stored_documents = vector_store.docstore._dict.values()
    for position, document in enumerate(stored_documents, start=1):
        if position > 5:
            break
        title = document.metadata.get('title', 'No Title')
        snippet = document.page_content[:300]
        print(f"Index {position}:")
        print(f"Title: {title}")
        print(f"Content: {snippet}...")
        print("-" * 50)

    return vector_store

## Question Decomposition

In [47]:
def decompose_question(question, model_name="gpt-4o", temperature=0):
    """Ask an LLM to break a question into sub-questions.

    The model's reply is read line by line; a leading list marker
    ("1.", "2)", "-", "*") is removed from each line and blank lines are
    dropped. If nothing usable comes back, the original question is used as
    the only sub-question.

    Args:
        question: The question to decompose.
        model_name: Chat model name passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.

    Returns:
        A tuple ``(sub_questions, llm)``: the list of sub-question strings and
        the chat model instance, so callers can reuse it.
    """
    # Create a prompt template
    decompose_prompt = PromptTemplate(
        template="You are given a question. If the question can be broken into multiple smaller phrases, "
                 "return sub phrases and the question. However, if the question is simple and cannot be "
                 "broken into smaller phrases, just return the original question. A sub phrase is just a "
                 "relevant part of the question, not a whole new question. \nQuestion: {question}\nSub-questions:",
        input_variables=["question"]
    )

    # Initialize the model
    llm = ChatOpenAI(model_name=model_name, temperature=temperature)
    decompose_chain = decompose_prompt | llm

    # Run the chain to get sub-questions
    response = decompose_chain.invoke({"question": question})

    # Remove only the list marker itself. Stripping every leading digit/period
    # would also eat real content such as "2023 ..." or "3.5 billion ...";
    # the (?!\d) lookahead keeps decimals like "3.5" intact.
    list_marker = re.compile(r"^(?:\d+[.)](?!\d)\s*|[-*\u2022]\s+)")

    sub_questions = []
    for raw_line in response.content.splitlines():
        cleaned = list_marker.sub("", raw_line.strip()).strip()
        if cleaned:  # skip blank lines and lines that were only a marker
            sub_questions.append(cleaned)

    # An empty reply would leave downstream steps with nothing to retrieve for
    if not sub_questions:
        sub_questions = [question]

    print(f"Decomposed into {len(sub_questions)} sub-questions:")
    for i, sq in enumerate(sub_questions, 1):
        print(f"{i}. {sq}")

    return sub_questions, llm

## Document Retrieval and Answer Extraction

In [48]:
def retrieve_and_answer_subquestions(sub_questions, vector_store, k=3):
    """Retrieve documents for each sub-question and extract a short answer.

    For every sub-question the top ``k`` documents are fetched with
    ``vector_store.similarity_search``. Only the highest-ranked document is
    used: its first sentence becomes the answer.

    Args:
        sub_questions: Iterable of sub-question strings.
        vector_store: Object exposing ``similarity_search(query, k=...)`` that
            returns documents with ``page_content`` and ``metadata``.
        k: Number of documents to request per sub-question.

    Returns:
        Dict mapping each sub-question to its extracted answer, or to
        "No relevant information found." when nothing was retrieved.
    """
    # A period ends the sentence only when followed by whitespace or the end
    # of the text, so decimals such as "3.5%" are not cut in half.
    sentence_end = re.compile(r"\.(?:\s|$)")

    sub_answers = {}

    for i, sub_q in enumerate(sub_questions, start=1):
        print(f"\nProcessing sub-question {i}: {sub_q}")

        # Retrieve top k relevant documents
        docs_result = vector_store.similarity_search(sub_q, k=k)

        if not docs_result:
            print("No relevant documents found for this sub-question.")
            sub_answers[sub_q] = "No relevant information found."
            continue

        top_doc = docs_result[0]
        content = top_doc.page_content

        print(f"Top document title: {top_doc.metadata.get('title', 'Unknown')}")
        print(f"Top document snippet: {content[:200]}...")

        # Extract the first sentence (without its trailing period) as a simple answer
        answer_piece = sentence_end.split(content, maxsplit=1)[0].strip()
        sub_answers[sub_q] = answer_piece

        print(f"Extracted answer: {answer_piece}")

    return sub_answers

## Final Answer Generation

In [49]:
def generate_final_answer(sub_answers, llm, original_question):
    """Combine the sub-answers into one short final answer via the LLM.

    Args:
        sub_answers: Mapping of sub-question -> extracted answer.
        llm: Chat model exposing ``invoke(prompt)`` whose result has ``content``.
        original_question: The question the final answer must address.

    Returns:
        The ``content`` of the model's response.
    """
    # One "Sub-question / Answer" pair per entry, in the mapping's order
    context_entries = []
    for sub_question, sub_answer in sub_answers.items():
        context_entries.append(f"Sub-question: {sub_question}\nAnswer: {sub_answer}")
    formatted_answers = "\n".join(context_entries)

    # Prompt asks the model for an answer of at most ten words
    final_prompt_template = PromptTemplate(
        input_variables=["original_question", "formatted_answers"],
        template=(
            "You are answering a complex question based on retrieved sub-answers. \n\n"
            "Original question: {original_question}\n\n"
            "Below are the sub-questions and their extracted answers that provide context:\n\n"
            "{formatted_answers}\n\n"
            "Using all the sub-answers as context, answer the original question in EXACTLY 10 words or less."
        ),
    )

    prompt_text = final_prompt_template.format(
        original_question=original_question,
        formatted_answers=formatted_answers,
    )

    return llm.invoke(prompt_text).content

## Main Multi-hop Retrieval Function

In [50]:
def multihop_retrieval(question, create_new_db=False):
    """Run the full pipeline: index, decompose, retrieve, and answer.

    Args:
        question: The complex question to answer.
        create_new_db: When true, rebuild the vector database with
            ``create_vector_database``; otherwise load the saved one.

    Returns:
        Dict with keys ``original_question``, ``sub_questions``,
        ``sub_answers`` and ``final_answer``.
    """
    # Pick the index source first; both helpers are called with their defaults
    get_store = create_vector_database if create_new_db else load_vector_database
    vector_store = get_store()

    # The decomposition step also hands back the chat model for reuse below
    sub_questions, llm = decompose_question(question)

    # One extracted answer per sub-question
    sub_answers = retrieve_and_answer_subquestions(sub_questions, vector_store)

    # Merge the sub-answers into the final response
    final_answer = generate_final_answer(sub_answers, llm, question)

    banner = "=" * 50
    print("\n" + banner)
    print("FINAL ANSWER:")
    print(banner)
    print(final_answer)

    outcome = {
        "original_question": question,
        "sub_questions": sub_questions,
        "sub_answers": sub_answers,
        "final_answer": final_answer,
    }
    return outcome

## Example Usage

In [51]:
# Example complex question: a single long query that bundles several facts
# (industry, trial charges, reporting outlets, alleged motive) about one person.
complex_question = "Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?"

# Run the multi-hop retrieval system. create_new_db is left at its default
# (False), so the saved FAISS index is loaded rather than rebuilt. `result` is
# the returned dict with keys original_question, sub_questions, sub_answers
# and final_answer.
result = multihop_retrieval(complex_question)

Decomposed into 5 sub-questions:
1. Who is the individual associated with the cryptocurrency industry?
2. Who is facing a criminal trial on fraud and conspiracy charges?
3. Which publications reported on this individual?
4. What is the individual accused of by prosecutors?
5. What is the alleged motive behind the fraud?

Processing sub-question 1: Who is the individual associated with the cryptocurrency industry?
Top document title: Is Sam Bankman-Fried a bad ‘man’ or a good ‘boy’? Lawyers swap opening statements before first witnesses take the stand
Top document snippet: The first was a victim: Marc-Antoine Julliard, a Paris-born cocoa trader who lives in London. In 2021, Julliard, who had coiffed hair and spoke with a strong French accent, decided to invest in crypto...
Extracted answer: The first was a victim: Marc-Antoine Julliard, a Paris-born cocoa trader who lives in London

Processing sub-question 2: Who is facing a criminal trial on fraud and conspiracy charges?
Top document t