### Notebook to run and test RAG with chat history (v2)

### Step 1: Initialize notebook

In [1]:
import os 
import bs4
import getpass 
import numpy as np
import faiss

from langchain_openai import ChatOpenAI, OpenAIEmbeddings 
from langchain_core.tools import tool
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import SystemMessage
from langgraph.prebuilt import ToolNode, tools_condition
from langgraph.graph import MessagesState, StateGraph
from langgraph.graph import END

from langgraph.checkpoint.memory import MemorySaver

from dotenv import load_dotenv, find_dotenv

from utils import *
load_dotenv(find_dotenv())

USER_AGENT environment variable not set, consider setting it to identify your requests.


True

In [2]:
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

### Step 2: Initialize model objects

In [3]:
llm = ChatOpenAI(model="gpt-4o-mini")

In [4]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

### Single document extract 

In [5]:
# Load and chunk contents of the blog
loader = WebBaseLoader(
    web_paths=("https://petguide.dk/hundefoder-maerker/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("entry-content single-page", "entry-title", "entry-meta uppercase is-xsmall")
        )
    ),
)
docs = loader.load()

In [6]:
# initiate the text splitter 
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)

total_documents = len(docs)
third = total_documents // 3

# Split the documents into chunks
all_splits = []
for doc in docs:
    splits = text_splitter.split_documents([doc])
    num_splits = len(splits)
    third = num_splits // 3
    
    for i, split in enumerate(splits):
        split.metadata["source"] = doc.metadata.get("source", "Unknown")
        if i < third:
            split.metadata["section"] = "beginning"
        elif i < 2 * third:
            split.metadata["section"] = "middle"
        else:
            split.metadata["section"] = "end"
    all_splits.extend(splits)
    
print(f"Split blog post into {len(all_splits)} sub-documents.")

Split blog post into 31 sub-documents.


In [7]:
# Create embeddings for each chunk
embeddings_list = [embeddings.embed_query(doc.page_content) for doc in all_splits]
embeddings_array = np.array(embeddings_list)

# Create a FAISS index
dimension = embeddings_array.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_array)

### Multiple documents extract

In [None]:
main_url = "https://petguide.dk/bloggen/"
print(f'Getting article links from {main_url}...')
article_links = get_article_links(main_url)
print(f'Found {len(article_links)} article links...')
all_splits, index = load_and_chunk_documents(article_links[:5])

### Initialize RAG

In [184]:
custom_rag_prompt = PromptTemplate.from_template(prompt_template)

In [185]:
graph_builder = StateGraph(MessagesState)

In [186]:
memory = MemorySaver()

In [187]:
@tool(response_format="content_and_artifact")
def retrieve(query: str):
    """Retrieve information related to a query."""
    # Create embeddings for the query
    query_embedding = embeddings.embed_query(query)
    query_embedding = np.array([query_embedding])

    distances, indices = index.search(query_embedding, k=3)
    retrieved_docs = []
    for distance, idx in zip(distances[0], indices[0]):
        print(distance)
        if distance < 1.0:
            retrieved_docs.append(all_splits[idx])
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\n" f"Content: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

# Executre the retrieval tool
tools = ToolNode([retrieve])

In [188]:
# Generate an AIMessage that may include a tool-call to be sent.
def query_or_respond(state: MessagesState):
    """Generate tool call for retrieval or respond."""
    llm_with_tools = llm.bind_tools([retrieve])
    response = llm_with_tools.invoke(state["messages"])
    # MessagesState appends messages to state instead of overwriting
    return {"messages": [response]}

In [189]:
#Generate a response using the retrieved content.
def generate(state: MessagesState):
    """Generate answer."""
    # Get generated ToolMessages
    recent_tool_messages = []
    for message in reversed(state["messages"]):
        if message.type == "tool":
            recent_tool_messages.append(message)
        else:
            break
    tool_messages = recent_tool_messages[::-1]
    print(type(tool_messages))
    display(tool_messages[::-1])
    
    # Format into prompt
    docs_content = "\n\n".join(doc.content for doc in tool_messages)
    system_message_content = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        f"{docs_content}"
    )
    conversation_messages = [
        message
        for message in state["messages"]
        if message.type in ("human", "system")
        or (message.type == "ai" and not message.tool_calls)
    ]
    prompt = [SystemMessage(system_message_content)] + conversation_messages

    # Run
    response = llm.invoke(prompt)
    return {"messages": [response]}

In [190]:
# build the graph
graph_builder.add_node(query_or_respond)
graph_builder.add_node(tools)
graph_builder.add_node(generate)

# set the entry point
graph_builder.set_entry_point("query_or_respond")
# add conditions for passing from one node to another
graph_builder.add_conditional_edges(
    "query_or_respond",
    tools_condition,
    {
    END: END, # if the condition indicates the end of the process, terminate
    "tools": "tools" # else, proceed to the tools node
     },
)
# add edges to the graph that connect the nodes
graph_builder.add_edge("tools", "generate")
graph_builder.add_edge("generate", END)
# compile the graph
graph = graph_builder.compile()
# initiate the memory saver to save the state of the graph
graph = graph_builder.compile(checkpointer=memory)

# Specify an ID for the thread
config = {"configurable": {"thread_id": "abc123"}}


In [191]:
input_message = "Hvilke fodermærker kommer fra Canada?"

query = {"messages": [{"role": "user", "content": input_message}]}

output = graph.stream(query, stream_mode="values", config=config)

result = [step["messages"] for step in output][-1]

0.99249
0.9999153
1.091716
<class 'list'>


[ToolMessage(content="Source: {'source': 'https://petguide.dk/hundefoder-maerker/', 'start_index': 2207, 'section': 'beginning'}\nContent: 1st Choice\n1st Choice er kvalitetsfoder lavet i Canada, af gode lokale råvarer skabt med det formål, at sikre at hunde får den rigtige ernæring gennem hele deres liv. Med special udviklede formler, sørger de for at hundens behov bliver opfyldt gennem hundens forskellige livsfaser. Velsmag er i højsæde, men der er samtidigt gjort ekstra ud af at sikre at hunden får ekstra energi, kontrolleret sin vægt, sund hud og pels. 1st Choices vigtigste formål er, at sikre at din bedste ven får det bedste af det bedste. Derudover får du også meget foder for pengene. Med foder fra 1st Choice får du kun kvalitets ingredienser, ingen animalske biprodukter, ingen majs, hvede eller soja, kun naturlige konserveringsmidler og velsmagende smagsvarianter.\nACANA\n\nSource: {'source': 'https://petguide.dk/hundefoder-maerker/', 'start_index': 12376, 'section': 'middle'}\n

In [166]:
result

[HumanMessage(content='Hvilke fodermærker kommer fra Canada?', additional_kwargs={}, response_metadata={}, id='67ef6b8a-a283-43cd-80ab-6cbb00de950d'),
 AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_ZQBN1LSrWIXRtdf4DUM4Eldn', 'function': {'arguments': '{"query":"fodermærker fra Canada"}', 'name': 'retrieve'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 54, 'total_tokens': 73, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_d02d531b47', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-594d62e6-d4c4-4adc-a9ce-9365febe0c11-0', tool_calls=[{'name': 'retrieve', 'args': {'query': 'fodermærker fra Canada'}, 'id': 'call_ZQBN1LSrWIXRtdf4DUM4Eldn', 'type': 'tool_call'}], us

In [193]:
from langchain_core.messages import ToolMessage

last_tool_message = next(
        (
            tool_msg for tool_msg in reversed(result)
            if isinstance(tool_msg, ToolMessage)
        )
    )

"\n\n".join(doc.content for doc in [last_tool_message])

"Source: {'source': 'https://petguide.dk/hundefoder-maerker/', 'start_index': 2207, 'section': 'beginning'}\nContent: 1st Choice\n1st Choice er kvalitetsfoder lavet i Canada, af gode lokale råvarer skabt med det formål, at sikre at hunde får den rigtige ernæring gennem hele deres liv. Med special udviklede formler, sørger de for at hundens behov bliver opfyldt gennem hundens forskellige livsfaser. Velsmag er i højsæde, men der er samtidigt gjort ekstra ud af at sikre at hunden får ekstra energi, kontrolleret sin vægt, sund hud og pels. 1st Choices vigtigste formål er, at sikre at din bedste ven får det bedste af det bedste. Derudover får du også meget foder for pengene. Med foder fra 1st Choice får du kun kvalitets ingredienser, ingen animalske biprodukter, ingen majs, hvede eller soja, kun naturlige konserveringsmidler og velsmagende smagsvarianter.\nACANA\n\nSource: {'source': 'https://petguide.dk/hundefoder-maerker/', 'start_index': 12376, 'section': 'middle'}\nContent: Pronature Ho

In [167]:
input_message_2 = "Hvad spiste Adolf Hitler under krigen?"

query_2 = {"messages": [{"role": "user", "content": input_message_2}]}

output_2 = graph.stream(query_2, stream_mode="values", config=config)

result_2 = [step["messages"] for step in output_2][-1]

1.3796352
1.4567018
1.4695898


In [120]:
# get all 
graph.get_state(config=config).values['messages'] #.messages

[HumanMessage(content='Hvilke fodermærker kommer fra Canada?', additional_kwargs={}, response_metadata={}, id='9cb7ca1a-62e2-42af-964b-6baf4b15baba'),
 AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_9p8jbI9xN3ihZi7bPO9A3cvq', 'function': {'arguments': '{"query":"fodermærker fra Canada"}', 'name': 'retrieve'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 54, 'total_tokens': 73, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_d02d531b47', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-b206ef32-ccbb-4967-997b-96a942739033-0', tool_calls=[{'name': 'retrieve', 'args': {'query': 'fodermærker fra Canada'}, 'id': 'call_9p8jbI9xN3ihZi7bPO9A3cvq', 'type': 'tool_call'}], us

In [None]:
response = res[-1]

final_response = res[-1][-1]

In [34]:
from langchain_core.messages import ToolMessage

answer = final_response.content
# Extract sources from ToolMessages
sources = []
for step in res:
    for message in step:
        if isinstance(message, ToolMessage):
            sources.extend([doc.metadata["source"] for doc in message.artifact])
sources = set(sources)

In [47]:
res[-1][2].artifact[0].metadata

{'source': 'https://petguide.dk/hundefoder-maerker/',
 'start_index': 2355,
 'section': 'beginning'}

In [36]:
sources = [doc.metadata["source"] for doc in final_response.tool_calls[0].artifact]


IndexError: list index out of range

In [16]:
input_message = "Hvilke fodermærker kommer fra Canada?"

for step in graph.stream(
    {"messages": [{"role": "user", "content": input_message}]},
    stream_mode="values",
    config=config,
):
    step["messages"][-1].pretty_print()


Hvilke fodermærker kommer fra Canada?
Tool Calls:
  retrieve (call_tTYrxjL4JmHyUV1txNlE90de)
 Call ID: call_tTYrxjL4JmHyUV1txNlE90de
  Args:
    query: fodermærker fra Canada
Name: retrieve

Source: {'source': 'https://petguide.dk/hundefoder-maerker/', 'start_index': 2355, 'section': 'beginning'}
Content: 1st Choice
1st Choice er kvalitetsfoder lavet i Canada, af gode lokale råvarer skabt med det formål, at sikre at hunde får den rigtige ernæring gennem hele deres liv. Med special udviklede formler, sørger de for at hundens behov bliver opfyldt gennem hundens forskellige livsfaser. Velsmag er i højsæde, men der er samtidigt gjort ekstra ud af at sikre at hunden får ekstra energi, kontrolleret sin vægt, sund hud og pels. 1st Choices vigtigste formål er, at sikre at din bedste ven får det bedste af det bedste. Derudover får du også meget foder for pengene. Med foder fra 1st Choice får du kun kvalitets ingredienser, ingen animalske biprodukter, ingen majs, hvede eller soja, kun naturlige