## Step 1: Preprocessing data

In [2]:
import pandas as pd
import os
import re
import textwrap

In [3]:
import getpass
import os

if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

**Load Multi-hop Corpus**

In [4]:
# The encoding is explicit because the corpus contains non-ASCII punctuation
# (e.g. curly apostrophes in article titles) and the platform default encoding
# is not guaranteed to decode it.
with open('VDT2025_Multihop_RAG/multihoprag_corpus.txt', 'r', encoding='utf-8') as f:
    # Read the entire file content
    corpus = f.read()

# Corpus text with the <endofpassage> tags removed. The parsing below works on
# the raw `corpus`, not on this copy, so the tags are still present in the
# passages collected in `data`.
cleaned_corpus = corpus.replace('<endofpassage>', '')

# Split by Title: every article starts with a 'Title:' marker, so each chunk
# holds one article (the text before the first marker is empty and skipped).
entries = re.split(r'Title:', corpus)
data = []
for entry in entries:
    if not entry.strip():
        continue
    # First line is the title; the rest, starting at 'Passage:' and still
    # containing '<endofpassage>', is the raw passage text.
    title_match = re.match(r'([^\n]+)\n(Passage:.*?)(?=Title:|$)', entry, re.DOTALL)
    if title_match:
        title = title_match.group(1).strip()
        passage = title_match.group(2).strip()
        data.append([title, passage])


In [5]:
# One row per parsed article: its title and its raw 'Passage: ...' text
df = pd.DataFrame(data, columns=['title_name', 'content'])
def extract_passages(corpus_text):
    """
    Collect the body of every 'Passage: ... <endofpassage>' section in the text.

    The 'Passage:' prefix and the '<endofpassage>' marker are both dropped, each
    body is stripped of surrounding whitespace, and the bodies are joined with a
    single space.
    Returns one string (empty when no complete section is found).
    """
    # Lazy match so that each section stops at its own closing marker
    section_re = re.compile(r'Passage:(.*?<endofpassage>)', re.DOTALL)
    bodies = []
    for section in section_re.finditer(corpus_text):
        # The captured text still carries the closing marker; remove it here
        body = section.group(1).replace('<endofpassage>', '')
        bodies.append(body.strip())
    return " ".join(bodies)

# Apply to the content of the DataFrame: each raw 'Passage: ...<endofpassage>'
# value is replaced by the cleaned text returned by extract_passages
df['content'] = df['content'].apply(extract_passages)
print("Number of titles:", len(df))
df.head()

Number of titles: 609


Unnamed: 0,title_name,content
0,200+ of the best deals from Amazon's Cyber Mon...,"Table of Contents Table of Contents Echo, Fire..."
1,ASX set to drop as Wall Street’s September slu...,"ETF provider Betashares, which manages $30 bil..."
2,Amazon sellers sound off on the FTC's 'long-ov...,A worker sorts out parcels in the outbound doc...
3,"Christmas Day preview: 49ers, Ravens square of...","Christmas Day isn't just for the NBA, as the N..."
4,"Raiders vs. Lions live score, updates, highlig...",The Lions just needed to get themselves back i...


**Load Multi-Hop RAG queries**

In [6]:
import pandas as pd
import json

# Load the JSON file. The encoding is explicit because the query set contains
# non-ASCII characters (e.g. curly apostrophes in article titles) and the
# platform default encoding is not guaranteed to decode them.
with open("VDT2025_Multihop_RAG/MultiHopRAG.json", "r", encoding="utf-8") as f:
    query_data = json.load(f)

# Convert to DataFrame and drop the question_type column
query_df = pd.DataFrame(query_data)
query_df = query_df.drop(columns='question_type')
print("Number of samples to query:", len(query_df))
print(query_df.head())

Number of samples to query: 2556
                                               query              answer  \
0  Who is the individual associated with the cryp...   Sam Bankman-Fried   
1  Which individual is implicated in both inflati...        Donald Trump   
2  Who is the figure associated with generative A...          Sam Altman   
3  Do the TechCrunch article on software companie...                 Yes   
4  Which online betting platform provides a welco...  Caesars Sportsbook   

                                       evidence_list  
0  [{'title': 'The FTX trial is bigger than Sam B...  
1  [{'title': 'Donald Trump defrauded banks with ...  
2  [{'title': 'OpenAI's ex-chairman accuses board...  
3  [{'title': 'Here’s how Rainforest, a budding S...  
4  [{'title': '2023 Kentucky online sports bettin...  


## Step 2: Indexing

In [7]:
#### INDEXING ####

# Load the corpus DataFrame as LangChain documents, using the `content` column
# as the page text.
# NOTE(review): the other column (`title_name`) presumably ends up in each
# document's metadata - confirm against DataFrameLoader.
from langchain_community.document_loaders import DataFrameLoader
loader = DataFrameLoader(df, page_content_column="content")
blog_docs = loader.load()

# Split: chunk_size=300 with chunk_overlap=50, built via the tiktoken-based
# constructor (NOTE(review): sizes are presumably counted in tokens - confirm)
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

# Make splits
documents = text_splitter.split_documents(blog_docs)

# Index: build a Chroma vector store from the chunks using OpenAI embeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=documents, 
                                    embedding=OpenAIEmbeddings())


In [38]:
# Retriever over the Chroma store, configured with k=3 results per query.
# NOTE: the graph-retrieval section further down defines a function that is also
# named `retriever`, which rebinds this name. Re-run this cell before re-running
# the decomposition loop, which calls `retriever.invoke(...)`.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

## Step 3: Decomposition

In [39]:
from langchain.prompts import ChatPromptTemplate

# Decomposition prompt: asks the model for 3 search queries that break the input
# question into sub-questions. The text below is the prompt itself, so any edit
# to it (including the explicit newline escapes) changes what the model receives.
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [40]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# LLM
llm = ChatOpenAI(temperature=0)


def _split_sub_questions(text):
    """Split the model's reply into one sub-question per non-blank line.

    A plain split on the newline character keeps empty strings whenever the
    reply contains blank lines (e.g. between numbered items or at the end).
    Those empty strings would then be printed, retrieved on and answered as if
    they were real sub-questions, so they are dropped here. Non-blank lines are
    returned unchanged (numbering and surrounding spaces included).
    """
    return [line for line in text.split("\n") if line.strip()]


# Chain: prompt -> LLM -> plain string -> list of sub-questions
generate_queries_decomposition = (
    prompt_decomposition | llm | StrOutputParser() | _split_sub_questions
)

In [41]:
import textwrap

# Imports carried over from the original cell (only `hub` is needed below). They
# are hoisted out of the loop so they run once instead of once per query.
from langchain import hub
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# Half-open range of rows of query_df to process
start = 4
end = 5
pred_responses = []

# `retriever` must be the Chroma retriever created right after indexing. The
# graph section further down defines a function with the same name; if that cell
# has already run, fail early with a clear message instead of an AttributeError
# in the middle of the loop.
if not hasattr(retriever, "invoke"):
    raise RuntimeError(
        "`retriever` is not a LangChain retriever (it was probably rebound by the "
        "graph-retrieval cell); re-run the cell that calls vectorstore.as_retriever()."
    )

# RAG prompt. Pulled once: it does not depend on the query, and pulling it inside
# the loop repeats the same network request for every query.
prompt_rag = hub.pull("rlm/rag-prompt")


def retrieve_and_rag(question, prompt_rag, sub_question_generator_chain, sub_questions=None):
    """Retrieve documents for every sub-question of `question`.

    Despite the name, no answer is generated here: the function only retrieves.

    Args:
        question: The original multi-hop question.
        prompt_rag: RAG prompt; accepted for compatibility, not used.
        sub_question_generator_chain: Chain that maps {"question": ...} to a list
            of sub-questions. Only invoked when `sub_questions` is None.
        sub_questions: Sub-questions that were already generated. Passing them
            avoids a second LLM call and guarantees that retrieval uses exactly
            the sub-questions that were printed and stored in the metadata.

    Returns:
        The list of sub-questions that were used for retrieval.

    Side effects:
        Appends one list of documents per sub-question to the module-level
        `metadata["retrieved_docs"]` and every retrieved document to the
        module-level `total_retrieved_docs`; both are reset per query by the
        loop below.
    """
    if sub_questions is None:
        sub_questions = sub_question_generator_chain.invoke({"question": question})

    for sub_question in sub_questions:
        # Retrieve documents for each sub-question
        sub_docs = list(retriever.invoke(sub_question))
        total_retrieved_docs.extend(sub_docs)
        metadata["retrieved_docs"].append(sub_docs)
    return sub_questions


for idx in range(start, end):
    # Run the decomposition once per query
    question = query_df['query'][idx]
    questions = generate_queries_decomposition.invoke({"question": question})

    print(f"### START EXAMPLE {idx + 1} ###")
    print(textwrap.fill(f"Question: {question}", width=100))
    print()
    print("DECOMPOSITION:")

    # Per-query bookkeeping: the sub-questions and, aligned with them, the
    # documents retrieved for each one
    metadata = {
        "sub_question": list(questions),
        "retrieved_docs": []
    }
    for q in questions:
        print(textwrap.fill(q, width=100))
        print("-" * 40)

    # Retrieve for each sub-question individually, reusing the sub-questions
    # generated above instead of asking the LLM to decompose the query again
    total_retrieved_docs = []
    questions = retrieve_and_rag(
        question,
        prompt_rag,
        generate_queries_decomposition,
        sub_questions=questions,
    )

    print(f"### END EXAMPLE {idx + 1} ###")
    print()

    pred_response = {
        "query": question,
        "metadata": metadata,
        "evidence_list": total_retrieved_docs # a list of Document
    }

    pred_responses.append(pred_response)

### START EXAMPLE 5 ###
Question: Which online betting platform provides a welcome bonus of up to $1000 in bonus bets for
new customers' first losses, runs NBA betting promotions, and is anticipated to extend the same
sign-up offer to new users in Vermont, as reported by both CBSSports.com and Sporting News?

DECOMPOSITION:
1. What online betting platforms offer a welcome bonus of up to $1000 in bonus bets for new
customers' first losses?
----------------------------------------
2. Which online betting platforms run NBA betting promotions?
----------------------------------------
3. Are there any online betting platforms expected to extend the same sign-up offer to new users in
Vermont, as reported by CBSSports.com and Sporting News?
----------------------------------------
### END EXAMPLE 5 ###



## Step 4: Building Knowledge Graph from Retrieved Documents

In [42]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Tuple, List, Optional
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_community.graphs import Neo4jGraph
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_core.runnables import ConfigurableField, RunnableParallel, RunnablePassthrough

# Optional Colab support: enable the custom widget manager when the notebook runs
# on Google Colab. Outside Colab the import fails and the step is skipped.
# Only ImportError is caught so that interrupts and unrelated errors are not
# silently swallowed.
try:
  import google.colab
  from google.colab import output
  output.enable_custom_widget_manager()
except ImportError:
  pass

In [43]:
from langchain_neo4j import Neo4jGraph

# Connection settings for Neo4j. Values already present in the environment are
# kept, so the notebook can be pointed at another instance without editing it;
# the local defaults are only used as a fallback.
os.environ.setdefault("NEO4J_URI", "bolt://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")

# Same pattern as the OpenAI key above: the secret is read from the environment
# or asked for interactively instead of being hardcoded in the notebook.
# Export NEO4J_PASSWORD beforehand to run without a prompt.
if "NEO4J_PASSWORD" not in os.environ:
    os.environ["NEO4J_PASSWORD"] = getpass.getpass("Neo4j password:")

# No connection arguments are passed: Neo4jGraph is expected to pick up the
# NEO4J_* variables set above.
graph = Neo4jGraph(refresh_schema=False)

In [44]:
# One row per processed query, with columns query / metadata / evidence_list
responses_df = pd.DataFrame(pred_responses)
print(responses_df.head())
# evidence_list holds every document retrieved for the query's sub-questions
print("Number of evidence documents:", len(responses_df['evidence_list'][0]))

                                               query  \
0  Which online betting platform provides a welco...   

                                            metadata  \
0  {'sub_question': ['1. What online betting plat...   

                                       evidence_list  
0  [page_content='Caesars covers wide range of ma...  
Number of evidence documents: 9


In [45]:
# Documents retrieved for the first processed query
extracted_docs = responses_df['evidence_list'][0]

# NOTE: this rebinds the module-level `llm` that the decomposition section
# created with ChatOpenAI(temperature=0); cells below that reference `llm`
# get this gpt-4o instance.
llm=ChatOpenAI(temperature=0, model_name="gpt-4o")
llm_transformer = LLMGraphTransformer(llm=llm)

# Convert the retrieved documents to graph documents and write them to Neo4j.
# NOTE(review): the same chunk can appear several times in extracted_docs when
# more than one sub-question retrieves it - consider de-duplicating first to
# avoid repeated LLM calls.
graph_documents = llm_transformer.convert_to_graph_documents(extracted_docs)
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

In [46]:
# directly show the graph resulting from the given Cypher query
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"

def showGraph(cypher: str = default_cypher):
    """Run a Cypher query and return its result as a graph widget.

    Args:
        cypher: Query whose returned nodes and relationships are displayed.
            Defaults to `default_cypher`.

    Returns:
        A GraphWidget showing the query result, with nodes labelled by their
        `id` property. Display it by making it the last expression of a cell.

    The driver and session are opened only for the duration of the query and
    are closed before returning, so repeated calls do not leak connections.
    """
    with GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"])) as driver:
        with driver.session() as session:
            # .graph() reads the whole result while the session is still open;
            # the returned graph object is then used after the session closes
            result_graph = session.run(cypher).graph()
    widget = GraphWidget(graph = result_graph)
    widget.node_label_mapping = 'id'
    return widget

showGraph()

GraphWidget(layout=Layout(height='800px', width='100%'))

### **Query Inference**

In [47]:
# Vector index built on nodes that already exist in Neo4j: nodes labelled
# `Document`, using their `text` property as input and `embedding` as the
# property that holds the OpenAI embedding.
# NOTE(review): search_type="hybrid" presumably combines vector and keyword
# search, and the connection presumably comes from the NEO4J_* environment
# variables since none is passed here - confirm both.
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)

In [48]:
# Retriever

# Full-text index named `entity` over the `id` property of `__Entity__` nodes;
# IF NOT EXISTS makes the statement safe to re-run.
graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

# Extract entities from text
# NOTE(review): the class docstring and the field description are presumably
# forwarded to the model as part of the output schema - treat them as prompt text.
class Entities(BaseModel):
    """Identifying information about entities."""

    # Entity names found in the input text
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

# Prompt for entity extraction; the text to analyse is passed in as {question}
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

# Maps {"question": ...} to an Entities instance. The chain is built from the
# `prompt` and `llm` objects bound at this point, so later rebinding of those
# names does not affect it.
entity_chain = prompt | llm.with_structured_output(Entities)



In [49]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    The input is cleaned with remove_lucene_chars and split on whitespace. Each
    word gets a fuzzy-match suffix (~2, i.e. up to two changed characters) and
    the words are combined with the AND operator. Useful for mapping entities
    from user questions to database values while allowing some misspellings.

    Args:
        input: Free text, typically an entity name.

    Returns:
        The query string, e.g. "Caesars~2 AND Sportsbook~2". An empty string is
        returned when no searchable word is left (empty input or only special
        characters); callers should not send an empty query to the index.
    """
    # split() without arguments already discards empty fragments
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)

# Fulltext index query
def structured_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question

    Entities are extracted from the question with entity_chain. For each entity,
    the `entity` full-text index is queried (at most 2 matching nodes) and the
    non-MENTIONS relationships of the matches are returned in both directions,
    capped at 50 rows per entity.

    Args:
        question: The user question.

    Returns:
        One "source - TYPE -> target" line per relationship, newline-separated
        across all entities. Empty string when nothing is found.
    """
    lines = []
    entities = entity_chain.invoke({"question": question})
    for entity in entities.names:
        # A name with no searchable word left would produce an empty full-text
        # query, so it is skipped.
        if not remove_lucene_chars(entity).split():
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        # Collect rows in one list so that the last row of one entity and the
        # first row of the next are separated by a newline instead of fused.
        lines.extend(el['output'] for el in response)
    return "\n".join(lines)

In [50]:
def retriever(question: str):
    """Build the context string for a question from the graph and the vector index.

    Prints the search query, then combines the relationship lines returned by
    structured_retriever with the page contents returned by
    vector_index.similarity_search. Document texts are joined with the literal
    separator "#Document " (so the first document carries no marker).

    NOTE: this definition rebinds the module-level name `retriever`, which the
    indexing section assigned to the Chroma retriever. The decomposition loop
    calls `retriever.invoke(...)` and will not work once this cell has run,
    unless the Chroma retriever cell is re-run first.
    """
    print(f"Search query: {question}")
    graph_facts = structured_retriever(question)
    passages = []
    for hit in vector_index.similarity_search(question):
        passages.append(hit.page_content)
    joined_passages = "#Document ".join(passages)
    # Same layout as a triple-quoted block: two labelled sections, followed by a
    # trailing newline and four spaces
    return (
        "Structured data:\n"
        + f"{graph_facts}\n"
        + "Unstructured data:\n"
        + f"{joined_passages}\n"
        + "    "
    )

In [51]:
# Condense a chat history and follow-up question into a standalone question.
# The template text is sent to the model as-is; it expects the variables
# {chat_history} and {question}.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  # noqa: E501
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    """Turn (human, ai) text pairs into an alternating list of message objects.

    Each pair contributes a HumanMessage followed by an AIMessage, in the order
    the pairs are given. An empty history yields an empty list.
    """
    messages = []
    for human_turn, ai_turn in chat_history:
        messages.extend(
            (HumanMessage(content=human_turn), AIMessage(content=ai_turn))
        )
    return messages

# Produces the text used for retrieval from an input dict with a "question" key
# and an optional "chat_history" key.
_search_query = RunnableBranch(
    # If input includes a non-empty chat_history, we condense it with the follow-up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    ),
    # Else, we have no chat history (missing or empty), so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

In [52]:
# Answering prompt. NOTE: `template` and `prompt` are rebound here; they
# previously held the decomposition template and the entity-extraction prompt.
# entity_chain was built before this point and keeps its own prompt object.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
For Yes/No question, only respond <Yes> or <No>.
For question which you think not possible to determine the answer, respond: <Insufficient information.>
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# Answering chain: the context is built by condensing the input into a search
# query and passing it to retriever(); the result fills the prompt, which goes to
# `llm`.
# NOTE(review): RunnablePassthrough() forwards the chain input unchanged. The
# cells below invoke this chain with a dict ({"question": ..., "chat_history": ...}),
# so the {question} slot presumably receives that whole dict, chat history
# included - confirm this is intended before changing it.
chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [53]:
# Sub-questions stored by the decomposition step for the first processed query
sub_questions = responses_df['metadata'][0]['sub_question']

# (sub-question, answer) pairs, accumulated for the final call on the full query
chat_history = []
for sub_question in sub_questions:
    # Drop a leading "1. "-style numbering
    sub_question = re.sub(r'^\d+\.\s*', '', sub_question)
    print(f"Sub-question: {sub_question}")
    # An empty chat history is passed on purpose here: each sub-question is
    # answered without the previous answers
    response = chain.invoke({"question": sub_question, "chat_history": []})
    print(f"Response: {response}")
    print("-" * 40)
    chat_history.append((sub_question, response))

Sub-question: What online betting platforms offer a welcome bonus of up to $1000 in bonus bets for new customers' first losses?
Search query: What online betting platforms offer a welcome bonus of up to $1000 in bonus bets for new customers' first losses?




Response: Caesars Sportsbook offers a welcome bonus of up to $1,000 in bonus bets for new customers' first losses.
----------------------------------------
Sub-question: Which online betting platforms run NBA betting promotions?
Search query: Which online betting platforms run NBA betting promotions?




Response: BetMGM and BetRivers run NBA betting promotions.
----------------------------------------
Sub-question: Are there any online betting platforms expected to extend the same sign-up offer to new users in Vermont, as reported by CBSSports.com and Sporting News?
Search query: Are there any online betting platforms expected to extend the same sign-up offer to new users in Vermont, as reported by CBSSports.com and Sporting News?




Response: <Insufficient information.>
----------------------------------------


In [54]:
# Final call: the original query of the first processed example, with the
# accumulated (sub-question, answer) pairs supplied as chat history
chain.invoke(
    {
    "question": responses_df['query'][0],
    "chat_history": chat_history,
    }
)

Search query: Which online betting platform is expected to extend the same sign-up offer to new users in Vermont, as reported by CBSSports.com and Sporting News?




"Caesars Sportsbook provides a welcome bonus of up to $1,000 in bonus bets for new customers' first losses, runs NBA betting promotions, and is anticipated to extend the same sign-up offer to new users in Vermont."