In [1]:
from langchain_community.utilities import WikipediaAPIWrapper, ArxivAPIWrapper
from langchain_community.tools import ArxivQueryRun
from langchain_community.document_loaders import (
    WebBaseLoader, BraveSearchLoader, WikipediaLoader,
    PyPDFLoader, Docx2txtLoader, TextLoader, CSVLoader, BSHTMLLoader, OnlinePDFLoader
)
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool
from langchain.tools import StructuredTool
from langchain import hub
from langchain.agents import create_openai_tools_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
from dotenv import load_dotenv
import os
import re

In [3]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# OpenAIEmbeddings() and ChatOpenAI() are constructed without an explicit key in
# this notebook, so the key has to be present in the environment. Writing
# os.getenv(...) back into os.environ fails with an opaque
# "TypeError: str expected, not NoneType" when the key is missing, so check
# explicitly and fail with an actionable message instead.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

# May legitimately be None here: web_rag_search checks it on every call and
# returns a message instead of raising when it is missing.
BRAVE_API_KEY = os.getenv("BRAVE_SEARCH_API_KEY")            # required for web RAG

In [4]:
# Build the LangSmith documentation retriever tool.
# Steps: load the page at the URL below, split it (chunk_size=1000,
# chunk_overlap=200), embed the chunks into a FAISS vector store, and wrap the
# store's default retriever as an agent tool named "langsmith_search".
# NOTE(review): this runs at cell execution time and presumably needs network
# access plus a valid OpenAI key for the embedding call — confirm before
# running offline.
loader = WebBaseLoader('https://docs.smith.langchain.com/')
docs = loader.load()
documents = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_documents(docs)
vectordb = FAISS.from_documents(documents, OpenAIEmbeddings())
retriever = vectordb.as_retriever()
# Positional arguments are: retriever, tool name, tool description.
retriever_tool = create_retriever_tool(
    retriever,
    "langsmith_search",
    "FOR LANGSMITH ONLY. If the query mentions LangSmith or docs.smith.langchain.com, "
    "you MUST use this tool and no other."
)

In [5]:
# Restrict arXiv lookups to a single result and cap the returned text at 800
# characters so the tool output stays small in the agent's context.
arxiv_wrapper = ArxivAPIWrapper(top_k_results=1, doc_content_chars_max=800)
# The wrapper must be passed under the tool's field name, `api_wrapper`.
# Any other keyword does not configure the tool, which would then run with a
# default ArxivAPIWrapper and ignore the limits set above.
arxiv = ArxivQueryRun(api_wrapper=arxiv_wrapper)
# Override the stock description so it matches the routing policy used by the agent.
arxiv.description = "ArXiv tool. Use when the query references arXiv or includes an arXiv ID"

In [6]:
def wikipedia_rag(query: str, top_docs: int = 3) -> str:
    """Answer ``query`` with excerpts retrieved from freshly loaded Wikipedia pages.

    Loads up to ``top_docs`` pages with WikipediaLoader, splits them into
    chunks (chunk_size=1000, chunk_overlap=200), embeds the chunks into a
    temporary FAISS store and retrieves up to four chunks for the query.

    Args:
        query: Search phrase; used both for the Wikipedia lookup and for the
            retrieval over the loaded pages.
        top_docs: Maximum number of Wikipedia pages to load.

    Returns:
        On success, a text block with up to four bullet excerpts (each cut to
        600 characters and labelled with its page title and URL) followed by
        the distinct page titles. Loading, indexing and retrieval failures are
        returned as a short message rather than raised, so the calling agent
        can continue with another tool.
    """
    try:
        wiki_docs = WikipediaLoader(query=query, load_max_docs=top_docs).load()
    except Exception as e:
        return f"Failed to load from Wikipedia: {e}"
    if not wiki_docs:
        return "No Wikipedia pages found."

    # Embedding and indexing can fail (API error, empty chunk list); report it
    # as text, consistent with the load-failure handling above.
    try:
        chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_documents(wiki_docs)
        vdb = FAISS.from_documents(chunks, OpenAIEmbeddings())
        # `invoke` is the supported retriever entry point; the older
        # `get_relevant_documents` method is deprecated.
        rel_docs = vdb.as_retriever(search_kwargs={"k": 4}).invoke(query)
    except Exception as e:
        return f"Failed to index or search the Wikipedia pages: {e}"
    if not rel_docs:
        return "No relevant passages were found in the fetched Wikipedia pages."

    excerpts = []
    titles = {}  # dict keys keep first-seen order while de-duplicating
    for doc in rel_docs:
        title = doc.metadata.get("title") or "Wikipedia"
        titles.setdefault(title, None)
        # Flatten newlines so each excerpt renders as one bullet line.
        text = doc.page_content.strip().replace("\n", " ")
        if len(text) > 600:
            text = text[:600] + " ..."
        url = doc.metadata.get("source") or doc.metadata.get("wikipedia_url") or ""
        src = f"{title} — {url}" if url else title
        excerpts.append(f"- Excerpt: {text}\n  Source: {src}")

    return (
        "Wikipedia RAG answer (supporting snippets):\n"
        + "\n".join(excerpts[:4])
        + "\n\nPages:\n"
        + "\n".join(f"* {t}" for t in list(titles)[:5])
    )

In [7]:
# Wrap wikipedia_rag as a structured tool so it can be placed in the agent's
# tool list; `name` is the identifier that appears in tool invocations and
# `description` carries the usage guidance and argument summary.
wikipedia_rag_tool = StructuredTool.from_function(
    func=wikipedia_rag,
    name="wikipedia_rag",
    description=(
        "Wikipedia RAG. Use for definitions, people, events, places, timelines, comparisons, or when quotations are needed. "
        "Args: query:str, top_docs:int=3"
    ),
)

In [8]:
def web_rag_search(query: str, count: int = 5) -> str:
    """Answer ``query`` from web pages found through Brave Search.

    Pipeline: Brave search -> de-duplicated result links -> load each page
    (PDF loader for links whose path ends in ".pdf", HTML loader otherwise)
    -> split (chunk_size=1200, chunk_overlap=200) -> embed into a temporary
    FAISS store -> retrieve up to four chunks for the query.

    Args:
        query: Search phrase; used for the web search and for retrieval.
        count: Number of search results to request, clamped to 1..10. Values
            that cannot be converted to an integer fall back to 5.

    Returns:
        On success, a text block with up to four bullet excerpts (each cut to
        600 characters) and the distinct source URLs. Pages that could not be
        loaded are listed in a trailing "Skipped" section. A missing API key,
        search failures and indexing failures are returned as a short message
        rather than raised.
    """
    from urllib.parse import urlparse  # stdlib; only needed inside this function

    if not BRAVE_API_KEY:
        return "BRAVE_SEARCH_API_KEY is not set."
    try:
        count = max(1, min(int(count), 10))
    except (TypeError, ValueError):
        count = 5

    try:
        serp_docs = BraveSearchLoader(query=query, api_key=BRAVE_API_KEY, search_kwargs={"count": count}).load()
    except Exception as e:
        return f"Brave search failed: {e}"

    # dict.fromkeys de-duplicates while keeping the search ranking order.
    links = list(dict.fromkeys(
        link for link in (d.metadata.get("link") for d in serp_docs) if link
    ))
    if not links:
        return "No web results found."

    web_docs, failures = [], []
    for url in links:
        try:
            # Inspect only the URL path so "paper.pdf?download=1" or
            # "paper.pdf#page=2" is still recognised as a PDF.
            is_pdf = urlparse(url).path.lower().endswith(".pdf")
            page_docs = OnlinePDFLoader(url).load() if is_pdf else WebBaseLoader(url).load()
        except Exception as e:
            # Best effort: one bad page must not abort the search, but the
            # failure is recorded so it can be reported instead of vanishing.
            failures.append(f"{url} ({type(e).__name__}: {str(e)[:200]})")
            continue
        for page in page_docs:
            # Normalise the source to the search-result URL for citation.
            page.metadata["source"] = url
        web_docs.extend(page_docs)

    skipped = ""
    if failures:
        skipped = "\n\nSkipped (failed to load):\n" + "\n".join(f"* {f}" for f in failures)
    if not web_docs:
        return "Failed to load content from the top web results." + skipped

    try:
        chunks = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=200).split_documents(web_docs)
        vdb = FAISS.from_documents(chunks, OpenAIEmbeddings())
        # `invoke` is the supported retriever entry point; the older
        # `get_relevant_documents` method is deprecated.
        rel_docs = vdb.as_retriever(search_kwargs={"k": 4}).invoke(query)
    except Exception as e:
        return f"Failed to index or search the crawled pages: {e}" + skipped
    if not rel_docs:
        return "No relevant passages were found in the crawled pages." + skipped

    excerpts = []
    sources = {}  # dict keys keep first-seen order while de-duplicating
    for doc in rel_docs:
        src = doc.metadata.get("source") or doc.metadata.get("link") or "unknown"
        sources.setdefault(src, None)
        # Flatten newlines so each excerpt renders as one bullet line.
        text = doc.page_content.strip().replace("\n", " ")
        if len(text) > 600:
            text = text[:600] + " ..."
        excerpts.append(f"- Excerpt: {text}\n  Source: {src}")

    return (
        "Web RAG answer (supporting snippets):\n"
        + "\n".join(excerpts[:4])
        + "\n\nTop sources:\n"
        + "\n".join(f"* {s}" for s in list(sources)[:5])
        + skipped
    )

In [9]:
# Wrap web_rag_search as a structured tool for the agent's tool list.
# NOTE(review): the description advertises "top 5 links", but web_rag_search
# accepts a `count` argument that it clamps to 1..10, so 5 is only the default.
web_rag_tool = StructuredTool.from_function(
    func=web_rag_search,
    name="web_rag_search",
    description=(
        "General web RAG (Brave → top 5 links → scrape → chunk → embed → retrieve). "
        "Default choice when no user documents apply and it’s not an ArXiv/Wikipedia/LangSmith query. "
        "Args: query:str, count:int=5"
    ),
)

In [28]:
from langchain_community.document_loaders import TextLoader

# Session-wide state for the user-document index. doc_index writes these;
# doc_rag_search reads DOC_VDB and answer reads DOC_INDEXED.
DOC_VDB = None          # FAISS store with every indexed chunk; None until the first successful doc_index call
DOC_CHUNK_COUNT = 0     # number of vectors in DOC_VDB after the most recent successful doc_index call
DOC_INDEXED = False     # becomes True once at least one file has been indexed
DOC_SOURCES = set()     # absolute paths of files already indexed; used to skip duplicates


def doc_index(files_csv: str = "", chunk_size: int = 800, chunk_overlap: int = 100) -> str:
    """Load local .txt files, chunk them and add them to the session index.

    Args:
        files_csv: Comma-separated paths to .txt files. Blank entries are
            ignored; any path with a different extension rejects the call.
        chunk_size: Chunk size handed to RecursiveCharacterTextSplitter.
        chunk_overlap: Overlap between consecutive chunks; must not exceed
            ``chunk_size``.

    Returns:
        "OK: Indexed <n> chunks from <m> .txt file(s)." on success, or a
        message starting with "ERROR:" describing the problem. Paths already
        present in DOC_SOURCES are skipped so that re-running the same call
        does not add duplicate chunks; if every path was already indexed the
        result is an "OK: Indexed 0 chunks ..." message.

    Side effects:
        Updates the module-level DOC_VDB, DOC_CHUNK_COUNT, DOC_INDEXED and
        DOC_SOURCES. Nothing is modified when an error is returned.
    """
    global DOC_VDB, DOC_INDEXED, DOC_CHUNK_COUNT

    paths = [p.strip() for p in (files_csv or "").split(",") if p.strip()]
    if not paths:
        return "ERROR: Provide .txt file path(s) via files_csv."
    if any(not p.lower().endswith(".txt") for p in paths):
        return "ERROR: Only .txt files are supported."

    # Validate splitter settings up front so a bad value comes back as an
    # "ERROR:" string like every other failure, instead of as an exception.
    try:
        chunk_size, chunk_overlap = int(chunk_size), int(chunk_overlap)
    except (TypeError, ValueError):
        return "ERROR: chunk_size and chunk_overlap must be integers."
    if chunk_size <= 0 or chunk_overlap < 0 or chunk_overlap > chunk_size:
        return "ERROR: Require chunk_size > 0 and 0 <= chunk_overlap <= chunk_size."

    # Drop paths indexed by an earlier call and repeats within this call.
    pending, pending_keys = [], set()
    for p in paths:
        key = os.path.abspath(p)
        if key in DOC_SOURCES or key in pending_keys:
            continue
        pending_keys.add(key)
        pending.append(p)
    if not pending:
        return (
            "OK: Indexed 0 chunks from 0 .txt file(s); "
            f"all {len(paths)} path(s) were already indexed."
        )

    docs = []
    for p in pending:
        try:
            # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM,
            # so a single attempt covers both cases.
            docs.extend(TextLoader(p, encoding="utf-8-sig").load())
        except Exception as e:
            return f"ERROR: Failed to load {p}: {e}"

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(docs)
    if not chunks:
        # Building a FAISS store from zero documents fails; report it instead.
        return "ERROR: The provided file(s) contain no text to index."

    try:
        if DOC_VDB is None:
            DOC_VDB = FAISS.from_documents(chunks, OpenAIEmbeddings())
        else:
            DOC_VDB.add_documents(chunks)
    except Exception as e:
        return f"ERROR: Failed to embed/index documents: {e}"

    DOC_SOURCES.update(pending_keys)
    DOC_CHUNK_COUNT = DOC_VDB.index.ntotal if hasattr(DOC_VDB, "index") else len(chunks)
    DOC_INDEXED = True
    return f"OK: Indexed {len(chunks)} chunks from {len(pending)} .txt file(s)."


In [None]:
# Chat model used by the functions in this cell: doc_rag_search uses it to
# write the answer, and answer passes it to the fallback agent.
llm = ChatOpenAI(model="gpt-4o", temperature=0.3)
def doc_rag_search(query: str, k: int = 4) -> str:
    """Answer ``query`` using only the documents indexed by doc_index.

    Retrieves up to ``k`` chunks from DOC_VDB, labels them [1], [2], ... with
    their source, and asks the chat model to answer strictly from those
    excerpts with bracketed citations.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve. Values below 1 are raised to 1 and
            values that cannot be converted to an integer fall back to 4.

    Returns:
        The model's answer text, or one of two fixed messages:
        "No indexed documents. ..." when nothing has been indexed, and
        "No relevant passages found ..." when retrieval returns nothing.
        ``answer`` matches on the start of these messages, so keep them stable.
    """
    if DOC_VDB is None:
        return "No indexed documents. Run doc_index first with files_csv."

    try:
        top_k = max(1, int(k))
    except (TypeError, ValueError):
        top_k = 4

    # `invoke` is the supported retriever entry point; the older
    # `get_relevant_documents` method is deprecated.
    rel = DOC_VDB.as_retriever(search_kwargs={"k": top_k}).invoke(query)
    if not rel:
        return "No relevant passages found in the indexed documents."

    labeled = []
    for i, d in enumerate(rel, start=1):
        # Flatten newlines and cap each excerpt to keep the prompt compact.
        txt = d.page_content.strip().replace("\n", " ")
        if len(txt) > 1200:
            txt = txt[:1200] + " ..."
        src = d.metadata.get("source") or d.metadata.get("file_path") or "uploaded_document"
        labeled.append(f"[{i}] {txt}\n(Source: {src})")
    context = "\n\n".join(labeled)

    # Use the notebook-level `llm` when it exists; otherwise fall back to a
    # locally constructed model so the function still works on its own.
    try:
        _llm = llm
    except NameError:
        _llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

    prompt_text = (
        "Answer ONLY from the provided document excerpts. "
        "If the answer isn't there, say you couldn't find it in the uploaded document.\n\n"
        f"Question: {query}\n\nExcerpts:\n{context}\n\n"
        "Answer concisely. Include bracketed citations like [1], [2] matching the excerpts you used."
    )
    return _llm.invoke(prompt_text).content

def answer(query: str):
    """Answer ``query`` from indexed documents first, then fall back to the agent.

    When documents have been indexed (DOC_INDEXED is truthy), doc_rag_search is
    tried first. Its result is returned unless it signals that the documents
    could not answer the question, in which case the query goes to a
    tool-calling agent built from the web, arXiv, Wikipedia and LangSmith
    tools (or from ``pick_tools(query)`` when such a function is defined).

    Args:
        query: The user's question.

    Returns:
        A ``str`` when the document path answers the question. Otherwise the
        value returned by ``AgentExecutor.invoke`` (in this notebook's recorded
        outputs, a dict with "input" and "output" keys). Callers must handle
        both shapes.
    """
    # Force Doc-RAG first if we have an index
    if globals().get("DOC_INDEXED"):
        ans = doc_rag_search(query)
        # Fixed messages doc_rag_search returns when there is nothing to search.
        no_index_or_hits = ans.startswith(("No indexed documents", "No relevant passages"))
        # doc_rag_search's prompt tells the model to say it "couldn't find" the
        # answer when the excerpts do not contain it. Treat that reply as a
        # miss as well, otherwise the fallback below would never run for
        # questions the documents cannot answer.
        normalized = ans.lower().replace("\u2019", "'")
        model_reported_miss = any(
            marker in normalized for marker in ("couldn't find", "could not find")
        )
        if not (no_index_or_hits or model_reported_miss):
            return ans

    # Fallback to the agent (web/Wiki/ArXiv/LangSmith) if docs fail or not indexed
    if 'pick_tools' in globals():
        selected = pick_tools(query)
    else:
        selected = [web_rag_tool, arxiv, wikipedia_rag_tool, retriever_tool]
    ag = create_openai_tools_agent(llm, selected, prompt)
    ex = AgentExecutor(agent=ag, tools=selected, verbose=True)
    return ex.invoke({"input": query})


In [30]:
# Wrap doc_index as a structured tool so the agent can ingest .txt files itself
# when the user supplies paths; the description documents the three arguments.
doc_index_tool = StructuredTool.from_function(
    func=doc_index,
    name="doc_index",
    description=("Ingest user documents into a session vector index. "
                 "Call this when the user provides local .txt files. "
                 "Args: files_csv:str (comma-separated .txt paths), "
                 "chunk_size:int=800, chunk_overlap:int=100"),
)

In [31]:
# Wrap doc_rag_search as a structured tool for querying the indexed documents.
doc_rag_tool = StructuredTool.from_function(
    func=doc_rag_search,
    name="doc_rag_search",
    description=("Query the user's indexed documents (from doc_index). "
                 "Prefer this when documents are indexed and the question could be answered from them. "
                 "Args: query:str, k:int=4"),
)

In [32]:
# Full tool list handed to the agent: Wikipedia RAG, arXiv, LangSmith docs
# retriever, web RAG, and the two document tools (index + query).
tools = [wikipedia_rag_tool, arxiv, retriever_tool, web_rag_tool, doc_index_tool, doc_rag_tool]

In [33]:
# Chat model that drives the tool-calling agent.
llm = ChatOpenAI(model="gpt-4o", temperature=0.3)
# Base agent prompt fetched from the LangChain Hub by its identifier.
# NOTE(review): presumably requires network access at execution time — confirm.
base_prompt = hub.pull("hwchase17/openai-functions-agent")



In [34]:
# Routing policy text that is added to the agent prompt as a system message.
# It ranks the tools (documents, LangSmith, arXiv, Wikipedia, web) and states
# the fall-through rules. The text is sent to the model verbatim, so any edit
# to it changes agent behaviour.
ROUTING_GUIDE = """
Tool routing policy (follow strictly, in priority order):
1) DOCS (TXT-only): If any documents are uploaded, prefer doc_rag_search for queries. Try to strictly answer from the document.
2) LANGSMITH: If the query mentions 'LangSmith' or 'docs.smith.langchain.com' → use langsmith_search ONLY.
3) ARXIV / PAPERS: If the query references arXiv (contains 'arxiv'/'arXiv' or an arXiv ID like 1605.08386) or asks 
about a research paper/preprint → use arxiv.
4) WIKIPEDIA RAG: Use for definitions, people/biographies, historical events, places, timelines, comparisons,
terminology explanations, or when quotations from Wikipedia are useful.
5) WEB RAG (Brave): Default for general or dynamic/time-sensitive queries ('latest', 'today', policies, products, news,
releases, prices). Search top 5 links → scrape → chunk → embed → retrieve.

Hard rules:
- Do NOT answer from prior model knowledge when a tool applies.
- If multiple categories match, pick the highest-priority applicable tool above.
- If the chosen tool yields no relevant results, fall through to the next tool in priority order.
- Documents are TXT-only for document RAG.
"""

In [35]:
# Combine the hub prompt with the routing policy as an additional system message.
# NOTE(review): `+` appends, so the routing message comes after all of
# base_prompt's messages — confirm that placing it last (not first) is intended.
prompt = base_prompt + ChatPromptTemplate.from_messages([("system", ROUTING_GUIDE)])

In [36]:
# Display the combined prompt to inspect its input variables and messages.
prompt

ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], optional_variables=['chat_history'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag(tag='HumanMessageChunk')], typing.Annotated[langchain_core.messages.chat.ChatMessageChunk, Tag(tag='ChatMessageChunk')], typing.Annotated[langchain_core.messages.system.SystemMessageChunk, Tag(tag='

In [37]:
# Bind the model, the full tool list and the routing prompt into a tool-calling
# agent, then wrap it in an executor. verbose=True prints the
# "Entering new AgentExecutor chain" trace seen in the recorded outputs.
agent = create_openai_tools_agent(llm, tools, prompt)
executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

In [38]:
# Smoke test with a definitional question; in the recorded output the agent
# calls the wikipedia_rag tool for it.
response1=executor.invoke({"input":"What is Logistic Regression"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `wikipedia_rag` with `{'query': 'Logistic Regression'}`


[0m[36;1m[1;3mWikipedia RAG answer (supporting snippets):
- Excerpt: In statistics, multinomial logistic regression is a classification method that generalizes logistic regression to multiclass problems, i.e. with more than two possible discrete outcomes.  That is, it is a model that is used to predict the probabilities of the different possible outcomes of a categorically distributed dependent variable, given a set of independent variables (which may be real-valued, binary-valued, categorical-valued, etc.). Multinomial logistic regression is known by a variety of other names, including polytomous LR, multiclass LR, softmax regression, multinomial logit (mlogit ...
  Source: Multinomial logistic regression — https://en.wikipedia.org/wiki/Multinomial_logistic_regression
- Excerpt: The parameters of a logistic regression are most commonly estimated by maxim

In [25]:
# Smoke test with a research-paper question; in the recorded output the agent
# calls the arxiv tool for it.
response3=executor.invoke({"input":"What is most recent research paper on AI agents?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `arxiv` with `{'query': 'AI agents'}`


[0m[33;1m[1;3mPublished: 2025-07-02
Title: AI Agents and Agentic AI-Navigating a Plethora of Concepts for Future Manufacturing
Authors: Yinwang Ren, Yangyang Liu, Tang Ji, Xun Xu
Summary: AI agents are autonomous systems designed to perceive, reason, and act within
dynamic environments. With the rapid advancements in generative AI (GenAI),
large language models (LLMs) and multimodal large language models (MLLMs) have
significantly improved AI agents' capabilities in semantic comprehension,
complex reasoning, and autonomous decision-making. At the same time, the rise
of Agentic AI highlights adaptability and goal-directed autonomy in dynamic and
complex environments. LLMs-based AI Agents (LLM-Agents), MLLMs-based AI Agents
(MLLM-Agents), and Agentic AI contribute to expanding AI's capabilities in
information processing, environmental perception, and autonomous
decision-maki

In [26]:
# Document-RAG check through the agent: index one local .txt file, then ask a
# question that should be answered from that file alone.
print(doc_index(files_csv="deep-learning-in-python-prerequisites.txt"))

query = "From the uploaded document only, what are the key prerequisites for deep learning in Python?"

# Reuse the executor from the earlier cell when it exists; otherwise build it
# from the llm/tools/prompt already defined in the notebook.
if "executor" not in globals():
    agent = create_openai_tools_agent(llm, tools, prompt)
    executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

response = executor.invoke({"input": query})
response


OK: Indexed 51 chunks from 1 .txt file(s).


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `doc_rag_search` with `{'query': 'key prerequisites for deep learning in Python'}`


[0m[38;5;200m[1;3mKey prerequisites for deep learning in Python include mastering data science and machine learning techniques such as linear regression and logistic regression, understanding gradient descent, and having undergraduate-level math and programming skills. Familiarity with the Python programming language and the numerical computing library Numpy is also necessary [1], [2], [3].[0m[32;1m[1;3mThe key prerequisites for deep learning in Python, as outlined in the uploaded document, include:

1. Mastering data science and machine learning techniques such as linear regression and logistic regression.
2. Understanding gradient descent.
3. Having undergraduate-level math and programming skills.
4. Familiarity with the Python programming language.
5. Knowledge of the numerical co

{'input': 'From the uploaded document only, what are the key prerequisites for deep learning in Python?',
 'output': 'The key prerequisites for deep learning in Python, as outlined in the uploaded document, include:\n\n1. Mastering data science and machine learning techniques such as linear regression and logistic regression.\n2. Understanding gradient descent.\n3. Having undergraduate-level math and programming skills.\n4. Familiarity with the Python programming language.\n5. Knowledge of the numerical computing library Numpy.'}

In [39]:
# Re-run of the definitional smoke test; in the recorded output the agent
# again calls the wikipedia_rag tool.
response1 = executor.invoke({"input":"What is Logistic Regression"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `wikipedia_rag` with `{'query': 'Logistic Regression'}`


[0m[36;1m[1;3mWikipedia RAG answer (supporting snippets):
- Excerpt: In statistics, multinomial logistic regression is a classification method that generalizes logistic regression to multiclass problems, i.e. with more than two possible discrete outcomes.  That is, it is a model that is used to predict the probabilities of the different possible outcomes of a categorically distributed dependent variable, given a set of independent variables (which may be real-valued, binary-valued, categorical-valued, etc.). Multinomial logistic regression is known by a variety of other names, including polytomous LR, multiclass LR, softmax regression, multinomial logit (mlogit ...
  Source: Multinomial logistic regression — https://en.wikipedia.org/wiki/Multinomial_logistic_regression
- Excerpt: The parameters of a logistic regression are most commonly estimated by maxim