In [1]:
from langchain_community.tools import WikipediaQueryRun, BraveSearch
from langchain_community.utilities import WikipediaAPIWrapper, ArxivAPIWrapper
from langchain_community.tools import ArxivQueryRun
from langchain_community.document_loaders import WebBaseLoader, BraveSearchLoader, WikipediaLoader, PyPDFLoader, Docx2txtLoader, TextLoader, CSVLoader, BSHTMLLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool
from langchain.tools import StructuredTool
from langchain import hub
from langchain.agents import create_openai_tools_agent, AgentExecutor

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
from dotenv import load_dotenv
import os
import re

## Environment

In [3]:
load_dotenv()
os.environ['OPENAI_API_KEY']=os.getenv('OPENAI_API_KEY')
BRAVE_API_KEY = os.getenv("BRAVE_SEARCH_API_KEY")  # required for web search

## Wikipedia (Quick search)

In [4]:
api_wrapper=WikipediaAPIWrapper(top_k_results=1,doc_content_chars_max=1000)
wiki=WikipediaQueryRun(api_wrapper=api_wrapper)

In [5]:
wiki.description = (
    "Wikipedia quick lookup. Use for short factual questions that need a brief summary. "
    "Do NOT use for detailed/ambiguous multi-part queries or when quotes/timelines are needed; "
    "prefer wikipedia_rag in those cases."
)

## LangSmith site Retriever tool

In [6]:
loader=WebBaseLoader('https://docs.smith.langchain.com/')
docs=loader.load()
documents=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200).split_documents(docs)

In [7]:
## creating vector DB
vectordb=FAISS.from_documents(documents, OpenAIEmbeddings())
retriever=vectordb.as_retriever()
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x127e77b60>, search_kwargs={})

In [8]:
retriever_tool = create_retriever_tool(
    retriever,
    "langsmith_search",
    "FOR LANGSMITH ONLY. If the query mentions LangSmith or docs.smith.langchain.com, "
    "you MUST use this tool and no other."
)

In [123]:
retriever_tool.name

'langsmith_search'

## Arxiv Tool

In [9]:
arxiv_wrapper=ArxivAPIWrapper(top_k_results=1, doc_content_chars_max=200)
arxiv=ArxivQueryRun(arxiv_wrapper=arxiv_wrapper)
arxiv.name

'arxiv'

In [10]:
arxiv.description = (
    "ArXiv tool. Use when the query references arXiv or includes an arXiv ID."
)

## Wikipedia RAG (deeper context)

In [11]:
def wikipedia_rag(query: str, top_docs: int = 3) -> str:
    try:
        wiki_docs = WikipediaLoader(query=query, load_max_docs=top_docs).load()
    except Exception as e:
        return f"Failed to load from Wikipedia: {e}"

    if not wiki_docs:
        return "No Wikipedia pages found."

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(wiki_docs)

    vdb = FAISS.from_documents(chunks, OpenAIEmbeddings())
    rel_docs = vdb.as_retriever(search_kwargs={"k": 4}).get_relevant_documents(query)

    if not rel_docs:
        return "No relevant passages were found in the fetched Wikipedia pages."

    out, seen = [], []
    for d in rel_docs:
        title = d.metadata.get("title") or "Wikipedia"
        if title not in seen:
            seen.append(title)
        text = d.page_content.strip().replace("\n", " ")
        if len(text) > 600:
            text = text[:600] + " ..."
        url = d.metadata.get("source") or d.metadata.get("wikipedia_url") or ""
        src = f"{title} — {url}" if url else title
        out.append(f"- Excerpt: {text}\n  Source: {src}")

    return (
        "Wikipedia RAG answer (supporting snippets):\n"
        + "\n".join(out[:4])
        + "\n\nPages:\n"
        + "\n".join(f"* {t}" for t in seen[:5])
    )

In [12]:
wikipedia_rag_tool = StructuredTool.from_function(
    func=wikipedia_rag,
    name="wikipedia_rag",
    description=(
        "Deep Wikipedia RAG. Use when the question needs detailed context, comparisons, timelines, or quotations "
        "from Wikipedia beyond a short summary. Args: query:str, top_docs:int=3"
    ),
)

## Web RAG via Brave

In [13]:
brave_search_tool = BraveSearch.from_api_key(api_key=BRAVE_API_KEY, search_kwargs={"count": 5})

In [14]:
def web_rag_search(query: str, count: int = 5) -> str:
    if not BRAVE_API_KEY:
        return "BRAVE_SEARCH_API_KEY is not set."
    count = max(1, min(int(count), 10))

    # 1) Search
    serp_docs = BraveSearchLoader(query=query, api_key=BRAVE_API_KEY, search_kwargs={"count": count}).load()

    # 2) Links
    links = []
    for d in serp_docs:
        link = d.metadata.get("link")
        if link and link not in links:
            links.append(link)
    if not links:
        return "No web results found."

    # 3) Scrape
    web_docs = []
    for url in links:
        try:
            for pd in WebBaseLoader(url).load():
                pd.metadata["source"] = url
                web_docs.append(pd)
        except Exception:
            pass
    if not web_docs:
        return "Failed to load content from the top web results."

    # 4) Chunk → 5) Embed → 6) Similarity search
    chunks = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=200).split_documents(web_docs)
    vdb = FAISS.from_documents(chunks, OpenAIEmbeddings())
    rel_docs = vdb.as_retriever(search_kwargs={"k": 4}).get_relevant_documents(query)

    if not rel_docs:
        return "No relevant passages were found in the crawled pages."

    # 7) Compose short answer with sources
    out, seen = [], []
    for d in rel_docs:
        src = d.metadata.get("source") or d.metadata.get("link") or "unknown"
        if src not in seen:
            seen.append(src)
        text = d.page_content.strip().replace("\n", " ")
        if len(text) > 600:
            text = text[:600] + " ..."
        out.append(f"- Excerpt: {text}\n  Source: {src}")

    return (
        "Web RAG answer (supporting snippets):\n"
        + "\n".join(out[:4])
        + "\n\nTop sources:\n"
        + "\n".join(f"* {s}" for s in seen[:5])
    )

In [15]:
web_rag_tool = StructuredTool.from_function(
    func=web_rag_search,
    name="web_rag_search",
    description="General web RAG. Use when the question is NOT about Wikipedia, Arxiv, or LangSmith. Args: query:str, count:int=5",
)

## Document RAG

In [16]:
DOC_VDB = None

def doc_index(files_csv: str = "./NehaGaonkarDataScientist.pdf", urls_csv: str = "", chunk_size: int = 1200, chunk_overlap: int = 200) -> str:
    """
    Ingest user-provided documents into an in-memory FAISS index.
    - files_csv: comma-separated local file paths (pdf, docx, txt, md, csv, html)
    - urls_csv: comma-separated URLs to fetch and index
    """
    paths = [p.strip() for p in files_csv.split(",") if p.strip()]
    urls  = [u.strip() for u in urls_csv.split(",")  if u.strip()]

    docs = []

    # Load local files
    for p in paths:
        try:
            lo = None
            pl = p.lower()
            if pl.endswith(".pdf"):
                lo = PyPDFLoader(p)
            elif pl.endswith(".docx") or pl.endswith(".doc"):
                lo = Docx2txtLoader(p)
            elif pl.endswith(".csv"):
                lo = CSVLoader(p)
            elif pl.endswith(".html") or pl.endswith(".htm"):
                lo = BSHTMLLoader(p)
            elif pl.endswith(".txt") or pl.endswith(".md"):
                lo = TextLoader(p, encoding="utf-8")
            else:
                # skip unknown types silently to avoid heavy 'unstructured' dependency
                continue
            docs.extend(lo.load())
        except Exception:
            continue

    # Load URLs
    for url in urls:
        try:
            page_docs = WebBaseLoader(url).load()
            for pd in page_docs:
                pd.metadata["source"] = url
            docs.extend(page_docs)
        except Exception:
            continue

    if not docs:
        return "No documents loaded. Provide valid paths in files_csv or URLs in urls_csv."

    # Chunk and (create or update) vectorstore
    splitter = RecursiveCharacterTextSplitter(chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap))
    chunks = splitter.split_documents(docs)

    global DOC_VDB, DOC_CHUNK_COUNT, DOC_INDEXED
    if DOC_VDB is None:
        DOC_VDB = FAISS.from_documents(chunks, OpenAIEmbeddings())
    else:
        DOC_VDB.add_documents(chunks)

    DOC_CHUNK_COUNT += len(chunks)
    DOC_INDEXED = DOC_VDB is not None and DOC_CHUNK_COUNT > 0

    return f"Indexed {len(chunks)} chunks from {len(paths)} file(s) and {len(urls)} URL(s)."

In [17]:
def doc_rag_search(query: str, k: int = 4) -> str:
    """
    Query the user-provided document corpus previously indexed with doc_index.
    """
    if DOC_VDB is None:
        return "No indexed documents. Run doc_index first with files_csv and/or urls_csv."
    rel = DOC_VDB.as_retriever(search_kwargs={"k": int(k)}).get_relevant_documents(query)
    if not rel:
        return "No relevant passages found in the indexed documents."

    out, seen = [], []
    for d in rel:
        src = d.metadata.get("source") or d.metadata.get("file_path") or d.metadata.get("source_path") or "uploaded_document"
        if src not in seen: seen.append(src)
        text = d.page_content.strip().replace("\n", " ")
        if len(text) > 700: text = text[:700] + " ..."
        out.append(f"- Excerpt: {text}\n  Source: {src}")
    return "Document RAG answer (supporting snippets):\n" + "\n".join(out[:4]) + "\n\nSources:\n" + "\n".join(f"* {s}" for s in seen[:8])

In [18]:
doc_index_tool = StructuredTool.from_function(
    func=doc_index,
    name="doc_index",
    description=("Ingest user documents into a persistent (session) vector index. "
                 "Use when the user uploads or provides links. "
                 "Args: files_csv:str (comma-separated file paths), urls_csv:str (comma-separated URLs), "
                 "chunk_size:int=1200, chunk_overlap:int=200"),
)
doc_rag_tool = StructuredTool.from_function(
    func=doc_rag_search,
    name="doc_rag_search",
    description=("Query the user's indexed documents (from doc_index). "
                 "Use for questions that refer to the uploaded/added documents. "
                 "Args: query:str, k:int=4"),
)

## Combine All Tools

In [19]:
# --- Assemble tools ---
tools = [wiki, wikipedia_rag_tool, arxiv, retriever_tool, web_rag_tool, doc_index_tool, doc_rag_tool]

In [20]:
tools

[WikipediaQueryRun(description='Wikipedia quick lookup. Use for short factual questions that need a brief summary. Do NOT use for detailed/ambiguous multi-part queries or when quotes/timelines are needed; prefer wikipedia_rag in those cases.', api_wrapper=WikipediaAPIWrapper(wiki_client=<module 'wikipedia' from '/Users/tarunkashyap/Desktop/neha_g_git/rag_agent_multiple_sources/nenv/lib/python3.13/site-packages/wikipedia/__init__.py'>, top_k_results=1, lang='en', load_all_available_meta=False, doc_content_chars_max=1000)),
 StructuredTool(name='wikipedia_rag', description='Deep Wikipedia RAG. Use when the question needs detailed context, comparisons, timelines, or quotations from Wikipedia beyond a short summary. Args: query:str, top_docs:int=3', args_schema=<class 'langchain_core.utils.pydantic.wikipedia_rag'>, func=<function wikipedia_rag at 0x13067c400>),
 ArxivQueryRun(description='ArXiv tool. Use when the query references arXiv or includes an arXiv ID.', api_wrapper=ArxivAPIWrapper

## Agents

In [21]:
llm = ChatOpenAI(model="gpt-4o", temperature=0)

In [22]:
prompt = hub.pull("hwchase17/openai-functions-agent")
prompt.messages



[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are a helpful assistant'), additional_kwargs={}),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={}),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [23]:
agent=create_openai_tools_agent(llm,tools,prompt)

## Agent Executor

In [24]:
agent_executor=AgentExecutor(agent=agent,tools=tools,verbose=True)
agent_executor

AgentExecutor(verbose=True, agent=RunnableMultiActionAgent(runnable=RunnableAssign(mapper={
  agent_scratchpad: RunnableLambda(lambda x: format_to_openai_tool_messages(x['intermediate_steps']))
})
| ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], optional_variables=['chat_history'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag

In [150]:
# def pick_tools(q: str):
#     ql = q.lower()
#     if "langsmith" in ql or "docs.smith.langchain.com" in ql:
#         return [retriever_tool]
#     if "arxiv" in ql or re.search(r"\b\d{4}\.\d{5}\b", ql):
#         return [arxiv]
#     # quick wiki signals
#     if any(w in ql for w in ["who is", "who was", "when was", "where is", "capital of", "definition of"]):
#         return [wiki, wikipedia_rag_tool]
#     # default to web
#     return tools

In [25]:
response1=agent_executor.invoke({"input":"List experience of Neha"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `doc_rag_search` with `{'query': "Neha's experience"}`


[0m[36;1m[1;3mNo indexed documents. Run doc_index first with files_csv and/or urls_csv.[0m[32;1m[1;3mIt seems that I don't have any documents indexed that contain information about Neha's experience. If you have a document or a resume for Neha, you can upload it, and I can help extract the relevant information.[0m

[1m> Finished chain.[0m


In [26]:
response2=agent_executor.invoke({"input":"What's the paper 1605.08386 about?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `arxiv` with `{'query': '1605.08386'}`


[0m[38;5;200m[1;3mPublished: 2016-05-26
Title: Heat-bath random walks with Markov bases
Authors: Caprice Stanley, Tobias Windisch
Summary: Graphs on lattice points are studied whose edges come from a finite set of
allowed moves of arbitrary length. We show that the diameter of these graphs on
fibers of a fixed integer matrix can be bounded from above by a constant. We
then study the mixing behaviour of heat-bath random walks on these graphs. We
also state explicit conditions on the set of moves so that the heat-bath random
walk, a generalization of the Glauber dynamics, is an expander in fixed
dimension.[0m[32;1m[1;3mThe paper titled "Heat-bath random walks with Markov bases" by Caprice Stanley and Tobias Windisch, published on May 26, 2016, studies graphs on lattice points where the edges are defined by a finite set of allowed moves of arbitrary length. The authors de

In [27]:
response3=agent_executor.invoke({"input":"Who was the british king who established the church of England and in what year?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `wikipedia` with `{'query': 'British king who established the Church of England'}`


[0m[36;1m[1;3mPage: Charles II of England
Summary: Charles II (29 May 1630 – 6 February 1685) was King of Scotland from 1649 until 1651 and King of England, Scotland, and Ireland from the 1660 Restoration of the monarchy until his death in 1685.
Charles II was the eldest surviving child of Charles I of England, Scotland and Ireland and Henrietta Maria of France. After Charles I's execution at Whitehall on 30 January 1649, at the climax of the English Civil War, the Parliament of Scotland proclaimed Charles II king on 5 February 1649. However, England entered the period known as the English Interregnum or the English Commonwealth with a republican government eventually led by Oliver Cromwell. Cromwell defeated Charles II at the Battle of Worcester on 3 September 1651, and Charles fled to mainland Europe. Cromwell became Lord Prot

In [28]:
response4=agent_executor.invoke({"input":"Who was the british king who established the church of England and what was the role of his marriage in his decisiopn?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `wikipedia` with `{'query': 'British king who established the Church of England'}`


[0m[36;1m[1;3mPage: Charles II of England
Summary: Charles II (29 May 1630 – 6 February 1685) was King of Scotland from 1649 until 1651 and King of England, Scotland, and Ireland from the 1660 Restoration of the monarchy until his death in 1685.
Charles II was the eldest surviving child of Charles I of England, Scotland and Ireland and Henrietta Maria of France. After Charles I's execution at Whitehall on 30 January 1649, at the climax of the English Civil War, the Parliament of Scotland proclaimed Charles II king on 5 February 1649. However, England entered the period known as the English Interregnum or the English Commonwealth with a republican government eventually led by Oliver Cromwell. Cromwell defeated Charles II at the Battle of Worcester on 3 September 1651, and Charles fled to mainland Europe. Cromwell became Lord Prot

  rel_docs = vdb.as_retriever(search_kwargs={"k": 4}).get_relevant_documents(query)


[33;1m[1;3mWikipedia RAG answer (supporting snippets):
- Excerpt: In 1534, the Church of England renounced the authority of the Papacy under the direction of King Henry VIII, beginning the English Reformation. The guiding theologian that shaped Anglican doctrine was the Reformer Thomas Cranmer, who developed the Church of England's liturgical text, the Book of Common Prayer. Papal authority was briefly restored under Mary I, before her successor Elizabeth I renewed the breach. The Elizabethan Settlement (implemented 1559–1563) concluded the English Reformation, charting a course for the English church to describe itself as a via media between two branches of ...
  Source: Church of England — https://en.wikipedia.org/wiki/Church_of_England
- Excerpt: The Church of England (C of E) is the established Christian church in England and the Crown Dependencies. It was the initial church of the Anglican tradition. The Church traces its history to the Christian hierarchy recorded as existing i

In [29]:
response5 = agent_executor.invoke({"input":"Who is the person identified as the shooter of Charlie Kirk?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `web_rag_search` with `{'query': 'Charlie Kirk shooter identified', 'count': 5}`


[0m[33;1m[1;3mWeb RAG answer (supporting snippets):
- Excerpt: Charlie Kirk shooting suspect 'was living with transgender partner who handed over texts about assassination plot to the police'By CAROLINE GRAHAM  Published:  23:58, 13 September 2025   |  Updated:  11:49, 14 September 2025                   e-mail        915 shares     3.9k View  comments
  Source: https://www.dailymail.co.uk/news/article-15096005/Charlie-Kirk-shooting-suspect-living-transgender-partner-texts-assassination-plot-police.html
- Excerpt: Charlie Kirk Shooting Updates: Family Member Alerted Officials to Suspect - The New York Times
  Source: https://www.nytimes.com/live/2025/09/12/us/charlie-kirk-news-suspect
- Excerpt: Charlie Kirk shooting suspect 'was living with transgender partner who handed over texts about assassination plot to the police' | Daily 

In [30]:
response6 = agent_executor.invoke({"input":"How much tariff has Trump levied on India?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `web_rag_search` with `{'query': 'Trump tariff on India 2023', 'count': 5}`


[0m[33;1m[1;3mWeb RAG answer (supporting snippets):
- Excerpt: Kin Cheung/WPA/Pool/Getty Images     Trump’s Tariffs and India’s Economic Future  Aug 25, 2025   Ajay Shah     US President Donald Trump’s new tariffs on India will likely cause some immediate economic damage. But the greater risk is that they will prompt the Indian government to turn inward, when it would be better off seeking new international partners and reaffirming its commitment to the globalized economy that has served it so well.
  Source: https://www.project-syndicate.org/onpoint/what-trump-s-tariffs-mean-for-india
- Excerpt: SportHomeNewsSportBusinessInnovationCultureArtsTravelEarthAudioVideoLiveWeatherNewslettersTrump's 50% tariff on India kicks in as Modi urges self-reliance27 August 2025ShareSaveNikhil InamdarBBC News, MumbaiShareSaveBloomberg via Getty ImagesM

In [31]:
response7 = agent_executor.invoke({"input":"Tell me a programming joke"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mWhy do programmers prefer dark mode?

Because light attracts bugs![0m

[1m> Finished chain.[0m


In [32]:
response9 = agent_executor.invoke({"input":"What are the latest updates from NASA on the possibility of life on Mars?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `web_rag_search` with `{'query': 'latest updates from NASA on the possibility of life on Mars', 'count': 5}`


[0m[33;1m[1;3mWeb RAG answer (supporting snippets):
- Excerpt: NASA Rover Findings ‘Could Be the Clearest Sign of Life’ on Mars - The New York Times
  Source: https://www.nytimes.com/2025/09/10/science/mars-rock-nasa-perserverance.html
- Excerpt: See all topics            Facebook      Tweet      Email        Link                 Link Copied!                                              Follow                                  Scientists believe intriguing leopard spots on a rock sampled by the Perseverance rover on Mars last year may have potentially been made by ancient life, NASA announced Wednesday. The team has also published a peer-reviewed paper in the journal Nature about the new analysis, though they say further study is needed.                   “After a year of review, they have come back and 

In [36]:
print(response9["output"])

Recent updates from NASA suggest that the Perseverance rover has made intriguing discoveries on Mars that could be the clearest signs of ancient life so far. Scientists have found leopard spots on a rock sampled by the rover, which may have been made by ancient life. This finding has been published in a peer-reviewed paper in the journal Nature, although further study is needed to confirm these results. The analysis of the specimen collected by Perseverance raises the possibility of Martian microbes, but NASA scientists are not yet claiming to have found fossils of such life forms. 

For more detailed information, you can read the full articles on [The New York Times](https://www.nytimes.com/2025/09/10/science/mars-rock-nasa-perserverance.html) and [CNN](https://www.cnn.com/2025/09/10/science/nasa-mars-sapphire-falls-rock-sample).
