In [1]:
from dotenv import load_dotenv
load_dotenv()

# 1. Preprocess documents

# retriever
# first we index 3 blog posts

from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=50
)
doc_splits = text_splitter.split_documents(docs_list)

# 2. Create a retriever tool

# Add to vectorDB
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=OpenAIEmbeddings()
)
retriever = vectorstore.as_retriever()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# then we create a retrieval tool
from langchain.tools.retriever import create_retriever_tool

retriever_tool = create_retriever_tool(
    retriever=retriever,
    name="retrieve_blog_posts",
    description="Search and return information about Lilian Weng blog posts on LLM agents, prompt engineering, and adversarial attacks on LLMs.",
)

tools = [retriever_tool]

In [3]:
# test the tool
retriever_tool.invoke({"query":"types of reward hacking"})

'combination_1 composes prefix injection, refusal suppression, and the Base64 attack\ncombination_2 adds style injection\ncombination_3 adds generating website content and formatting constraints\n\n\n\nTypes of jailbreak tricks and their success rate at attacking the models. Check the papers for detailed explanation of each attack config. (Image source: Wei et al. 2023)\n\nAttack\nType\nDescription\n\n\n\n\nToken manipulation\nBlack-box\nAlter a small fraction of tokens in the text input such that it triggers model failure but still remain its original semantic meanings.\n\n\nGradient based attack\nWhite-box\nRely on gradient signals to learn an effective attack.\n\n\nJailbreak prompting\nBlack-box\nOften heuristic based prompting to “jailbreak” built-in model safety.\n\nToken Manipulation\n\nGradient based Attacks\n\nJailbreak Prompting\n\nHumans in the Loop Red-teaming\n\nModel Red-teaming\n\n\nPeek into Mitigation\n\nSaddle Point Problem\n\nSome work on LLM Robustness\n\n\nCitation\

In [4]:
# 3.  Generate query

from langgraph.graph import MessagesState
from langchain.chat_models import init_chat_model

response_model = init_chat_model("openai:gpt-4.1", temperature=0)

def generate_query_or_respond(state: MessagesState):
    """Call the model to generate a response based on the current state. Given
    the question, it will decide to retrieve using the retriever tool, or simply respond to the user.
    """
    response = (
        response_model.bind_tools([retriever_tool]).invoke(state["messages"])
    )
    return {"messages": [response]}

In [5]:
# try it on a random input
input = {"messages": [{"role": "user", "content": "hello!"}]}
generate_query_or_respond(input)["messages"][-1].pretty_print()


Hello! How can I help you today?


In [7]:
# Ask a question that requires semantic search:
input = {
    "messages": [
        {
            "role": "user",
            "content": "What does Lilian Weng say about types of reward hacking?",
        }
    ]
}
generate_query_or_respond(input)["messages"][-1].pretty_print()

Tool Calls:
  retrieve_blog_posts (call_Bg69X2PtdVoyIrS9kpMtDXOm)
 Call ID: call_Bg69X2PtdVoyIrS9kpMtDXOm
  Args:
    query: types of reward hacking
