This is a starter implementation.

Next steps:

Test the code. Compare the KG_as_RAG LLM with an LLM chat model that does not use the KG, and take the difference between their results.

Generalize the subject (not only probability, but based on what is sent in through user_input/KG/LLM_extraction).

Consider removing the use of KG memory and the session ID. They are not necessary here and increase the computational cost of the current code. However, they are necessary if the interaction is a conversation or includes follow-up questions, which is common in probability tests.

Research the most recent RAG approaches.

In [None]:
import os
from uuid import uuid4

import pandas as pd
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent
from langchain.chains import RetrievalQA
from langchain.schema import StrOutputParser
from langchain.tools import Tool
from langchain_community.chat_message_histories import Neo4jChatMessageHistory
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [None]:
# SETUP
# Path of the Excel file containing the questions for the accuracy test,
# and the names of the question / true-answer columns inside that file.
path = ''
questions_col_name = ''
true_answers_col_name = ''

# OpenAI API key: read from the OPENAI_API_KEY environment variable when it is
# set; otherwise fall back to the placeholder below (replace it before running).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "sk-...")

# Connect to the Neo4j graph database (my current neo4j instance).
# NOTE(review): these connection details are committed in source. Prefer
# setting NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD in the environment, and
# rotate the password below; the literals remain only as fallbacks.
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI", "neo4j+s://1e69f7f9.databases.neo4j.io"),
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "0qkjXy8Xu3IV0bvwNKR_uPkunv7SbZf92X28jJuLiBY"),
)

# Chat prompt that only returns the answer, so accuracy can be calculated directly
chat_prompt_answer = "You are a probability expert. You answer questions about probability concepts, methods, applications, and calculations. You should only return the answer. For example, If the answer is A, then you just output A."
# Chat prompt for a cheat sheet (the closing quote was missing, which made this cell a SyntaxError)
chat_prompt_cheatsheet = "You are a probability expert. You summarize the important concepts, methods, formulas in the provided text."

# Load the agent prompt from LangChain Hub
agent_prompt = hub.pull("hwchase17/react-chat") # a common agent prompt; consider creating one for this specific task

In [None]:
# Load the question sheet and write a CSV copy of it to the working directory.
df = pd.read_excel(path)
df.to_csv("csv_file.csv", index=False)

# Pull out the question column and the reference-answer column by their configured names.
questions, true_answers = (
    df[column] for column in (questions_col_name, true_answers_col_name)
)

In [None]:
# Combine LLM and KG, use KG as RAG
def KG_as_RAG(OPENAI_API_KEY=OPENAI_API_KEY, graph=graph, chat_prompt=chat_prompt_answer, agent_prompt=agent_prompt):
    """Build a chat agent that answers questions using the knowledge graph as RAG.

    The agent is a ReAct agent with a single tool. The tool first queries a
    Neo4j vector index through a RetrievalQA chain, then feeds the user query
    together with the retrieved knowledge to a plain chat chain built from
    ``chat_prompt``. Chat history is stored in Neo4j, keyed by session id.

    Args:
        OPENAI_API_KEY: API key passed to both the chat model and the
            embedding model.
        graph: Neo4j graph used for the vector index and for chat history.
        chat_prompt: System message for the answering chat chain.
        agent_prompt: Prompt handed to ``create_react_agent``.

    Returns:
        A ``RunnableWithMessageHistory`` wrapping the agent executor. Invoke it
        with ``{"input": ...}`` and a config carrying
        ``{"configurable": {"session_id": ...}}``.
    """
    # Informational only: this ID is printed but not used to key the history;
    # the session id used by get_memory is the one supplied at invoke time.
    SESSION_ID = str(uuid4())
    print(f"Session ID: {SESSION_ID}")

    llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY)
    embedding_provider = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", chat_prompt),
            ("human", "{input}"),
        ]
    )

    prob_chat = prompt | llm | StrOutputParser()

    # Initialize a vector retriever using Neo4jVector for semantic search
    prob_vector = Neo4jVector.from_existing_index(
        embedding_provider,
        graph=graph,
        index_name="probabilityConcepts",
        embedding_node_property="conceptEmbedding",
        text_node_property="conceptDescription",
    )

    prob_retriever = RetrievalQA.from_llm(
        llm=llm,
        retriever=prob_vector.as_retriever(),
    )

    # Using KG as RAG
    def combined_prob_tool(input_text):
        """Answer ``input_text`` with knowledge retrieved from the graph."""
        retrieval_output = prob_retriever.invoke(input_text)
        # RetrievalQA returns a dict (query + result); use only the answer text
        # so the prompt is not polluted with the dict repr and a repeated query.
        if isinstance(retrieval_output, dict):
            retrieved_knowledge = retrieval_output.get("result", retrieval_output)
        else:
            retrieved_knowledge = retrieval_output
        combined_input = f"User Query: {input_text}\n\nRelevant Knowledge: {retrieved_knowledge}"
        # Fill the prompt's "{input}" variable explicitly.
        return prob_chat.invoke({"input": combined_input})

    # Memory
    def get_memory(session_id):
        """Return the Neo4j-backed message history for ``session_id``."""
        return Neo4jChatMessageHistory(session_id=session_id, graph=graph)

    tools = [
        Tool.from_function(
            name="Probability Expert",
            description=(
                "Use this tool to answer probability-related questions. It first searches for relevant knowledge in the knowledge graph, then provides a response using the retrieved information and the user's query."
            ),
            func=combined_prob_tool,
        ),
    ]

    agent = create_react_agent(llm, tools, agent_prompt)
    agent_executor = AgentExecutor(agent=agent, tools=tools)

    chat_agent = RunnableWithMessageHistory(
        agent_executor,
        get_memory,
        input_messages_key="input",
        history_messages_key="chat_history",
    )

    return chat_agent

In [None]:
# Build the KG-backed agent once and reuse it for every question.
chat_agent = KG_as_RAG()

# Ask each question under a fresh session id (so no chat history is shared
# between questions) and keep only the agent's final output text.
responses = [
    chat_agent.invoke(
        {"input": question},
        {"configurable": {"session_id": str(uuid4())}},
    )["output"]
    for question in df[questions_col_name]
]

df['responses'] = responses

In [None]:
# Evaluate accuracy
# Compare against the configured true-answer column rather than a hard-coded
# 'true_answers' name, which raises KeyError for any other column name.
# Both sides are compared as strings with surrounding whitespace stripped, so
# non-string cells read from Excel and stray spaces/newlines in the model
# output are not counted as mismatches.
predicted = df['responses'].astype(str).str.strip()
expected = df[true_answers_col_name].astype(str).str.strip()
df['is_correct'] = predicted == expected
accuracy = df['is_correct'].mean()

print(f"Accuracy: {accuracy * 100:.2f}%")