In [115]:
from langchain_community.tools import WikipediaQueryRun, BraveSearch
from langchain_community.utilities import WikipediaAPIWrapper, ArxivAPIWrapper
from langchain_community.tools import ArxivQueryRun
from langchain_community.document_loaders import WebBaseLoader, BraveSearchLoader, WikipediaLoader, PyPDFLoader, Docx2txtLoader, TextLoader, CSVLoader, BSHTMLLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool
from langchain.tools import StructuredTool
from langchain import hub
from langchain.agents import create_openai_tools_agent, AgentExecutor

In [116]:
from dotenv import load_dotenv
import os
import re

## Environment

In [117]:
load_dotenv()
os.environ['OPENAI_API_KEY']=os.getenv('OPENAI_API_KEY')
BRAVE_API_KEY = os.getenv("BRAVE_SEARCH_API_KEY")  # required for web search

## Wikipedia (Quick search)

In [118]:
api_wrapper=WikipediaAPIWrapper(top_k_results=1,doc_content_chars_max=1000)
wiki=WikipediaQueryRun(api_wrapper=api_wrapper)

In [119]:
wiki.description = (
    "Wikipedia quick lookup. Use for short factual questions that need a brief summary. "
    "Do NOT use for detailed/ambiguous multi-part queries or when quotes/timelines are needed; "
    "prefer wikipedia_rag in those cases."
)

## LangSmith site Retriever tool

In [120]:
loader=WebBaseLoader('https://docs.smith.langchain.com/')
docs=loader.load()
documents=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200).split_documents(docs)

In [121]:
## creating vector DB
vectordb=FAISS.from_documents(documents, OpenAIEmbeddings())
retriever=vectordb.as_retriever()
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x11855c440>, search_kwargs={})

In [122]:
retriever_tool = create_retriever_tool(
    retriever,
    "langsmith_search",
    "FOR LANGSMITH ONLY. If the query mentions LangSmith or docs.smith.langchain.com, "
    "you MUST use this tool and no other."
)

In [123]:
retriever_tool.name

'langsmith_search'

## Arxiv Tool

In [124]:
arxiv_wrapper=ArxivAPIWrapper(top_k_results=1, doc_content_chars_max=200)
arxiv=ArxivQueryRun(arxiv_wrapper=arxiv_wrapper)
arxiv.name

'arxiv'

In [125]:
arxiv.description = (
    "ArXiv tool. Use when the query references arXiv or includes an arXiv ID."
)

## Wikipedia RAG (deeper context)

In [126]:
def wikipedia_rag(query: str, top_docs: int = 3) -> str:
    try:
        wiki_docs = WikipediaLoader(query=query, load_max_docs=top_docs).load()
    except Exception as e:
        return f"Failed to load from Wikipedia: {e}"

    if not wiki_docs:
        return "No Wikipedia pages found."

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(wiki_docs)

    vdb = FAISS.from_documents(chunks, OpenAIEmbeddings())
    rel_docs = vdb.as_retriever(search_kwargs={"k": 4}).get_relevant_documents(query)

    if not rel_docs:
        return "No relevant passages were found in the fetched Wikipedia pages."

    out, seen = [], []
    for d in rel_docs:
        title = d.metadata.get("title") or "Wikipedia"
        if title not in seen:
            seen.append(title)
        text = d.page_content.strip().replace("\n", " ")
        if len(text) > 600:
            text = text[:600] + " ..."
        url = d.metadata.get("source") or d.metadata.get("wikipedia_url") or ""
        src = f"{title} — {url}" if url else title
        out.append(f"- Excerpt: {text}\n  Source: {src}")

    return (
        "Wikipedia RAG answer (supporting snippets):\n"
        + "\n".join(out[:4])
        + "\n\nPages:\n"
        + "\n".join(f"* {t}" for t in seen[:5])
    )

In [127]:
wikipedia_rag_tool = StructuredTool.from_function(
    func=wikipedia_rag,
    name="wikipedia_rag",
    description=(
        "Deep Wikipedia RAG. Use when the question needs detailed context, comparisons, timelines, or quotations "
        "from Wikipedia beyond a short summary. Args: query:str, top_docs:int=3"
    ),
)

## Web RAG via Brave

In [128]:
brave_search_tool = BraveSearch.from_api_key(api_key=BRAVE_API_KEY, search_kwargs={"count": 5})

In [129]:
def web_rag_search(query: str, count: int = 5) -> str:
    if not BRAVE_API_KEY:
        return "BRAVE_SEARCH_API_KEY is not set."
    count = max(1, min(int(count), 10))

    # 1) Search
    serp_docs = BraveSearchLoader(query=query, api_key=BRAVE_API_KEY, search_kwargs={"count": count}).load()

    # 2) Links
    links = []
    for d in serp_docs:
        link = d.metadata.get("link")
        if link and link not in links:
            links.append(link)
    if not links:
        return "No web results found."

    # 3) Scrape
    web_docs = []
    for url in links:
        try:
            for pd in WebBaseLoader(url).load():
                pd.metadata["source"] = url
                web_docs.append(pd)
        except Exception:
            pass
    if not web_docs:
        return "Failed to load content from the top web results."

    # 4) Chunk → 5) Embed → 6) Similarity search
    chunks = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=200).split_documents(web_docs)
    vdb = FAISS.from_documents(chunks, OpenAIEmbeddings())
    rel_docs = vdb.as_retriever(search_kwargs={"k": 4}).get_relevant_documents(query)

    if not rel_docs:
        return "No relevant passages were found in the crawled pages."

    # 7) Compose short answer with sources
    out, seen = [], []
    for d in rel_docs:
        src = d.metadata.get("source") or d.metadata.get("link") or "unknown"
        if src not in seen:
            seen.append(src)
        text = d.page_content.strip().replace("\n", " ")
        if len(text) > 600:
            text = text[:600] + " ..."
        out.append(f"- Excerpt: {text}\n  Source: {src}")

    return (
        "Web RAG answer (supporting snippets):\n"
        + "\n".join(out[:4])
        + "\n\nTop sources:\n"
        + "\n".join(f"* {s}" for s in seen[:5])
    )

In [130]:
web_rag_tool = StructuredTool.from_function(
    func=web_rag_search,
    name="web_rag_search",
    description="General web RAG. Use when the question is NOT about Wikipedia, Arxiv, or LangSmith. Args: query:str, count:int=5",
)

## Document RAG

In [141]:
DOC_VDB = None

def doc_index(files_csv: str = "./NehaGaonkarDataScientist.pdf", urls_csv: str = "", chunk_size: int = 1200, chunk_overlap: int = 200) -> str:
    """
    Ingest user-provided documents into an in-memory FAISS index.
    - files_csv: comma-separated local file paths (pdf, docx, txt, md, csv, html)
    - urls_csv: comma-separated URLs to fetch and index
    """
    paths = [p.strip() for p in files_csv.split(",") if p.strip()]
    urls  = [u.strip() for u in urls_csv.split(",")  if u.strip()]

    docs = []

    # Load local files
    for p in paths:
        try:
            lo = None
            pl = p.lower()
            if pl.endswith(".pdf"):
                lo = PyPDFLoader(p)
            elif pl.endswith(".docx") or pl.endswith(".doc"):
                lo = Docx2txtLoader(p)
            elif pl.endswith(".csv"):
                lo = CSVLoader(p)
            elif pl.endswith(".html") or pl.endswith(".htm"):
                lo = BSHTMLLoader(p)
            elif pl.endswith(".txt") or pl.endswith(".md"):
                lo = TextLoader(p, encoding="utf-8")
            else:
                # skip unknown types silently to avoid heavy 'unstructured' dependency
                continue
            docs.extend(lo.load())
        except Exception:
            continue

    # Load URLs
    for url in urls:
        try:
            page_docs = WebBaseLoader(url).load()
            for pd in page_docs:
                pd.metadata["source"] = url
            docs.extend(page_docs)
        except Exception:
            continue

    if not docs:
        return "No documents loaded. Provide valid paths in files_csv or URLs in urls_csv."

    # Chunk and (create or update) vectorstore
    splitter = RecursiveCharacterTextSplitter(chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap))
    chunks = splitter.split_documents(docs)

    global DOC_VDB, DOC_CHUNK_COUNT, DOC_INDEXED
    if DOC_VDB is None:
        DOC_VDB = FAISS.from_documents(chunks, OpenAIEmbeddings())
    else:
        DOC_VDB.add_documents(chunks)

    DOC_CHUNK_COUNT += len(chunks)
    DOC_INDEXED = DOC_VDB is not None and DOC_CHUNK_COUNT > 0

    return f"Indexed {len(chunks)} chunks from {len(paths)} file(s) and {len(urls)} URL(s)."

In [142]:
def doc_rag_search(query: str, k: int = 4) -> str:
    """
    Query the user-provided document corpus previously indexed with doc_index.
    """
    if DOC_VDB is None:
        return "No indexed documents. Run doc_index first with files_csv and/or urls_csv."
    rel = DOC_VDB.as_retriever(search_kwargs={"k": int(k)}).get_relevant_documents(query)
    if not rel:
        return "No relevant passages found in the indexed documents."

    out, seen = [], []
    for d in rel:
        src = d.metadata.get("source") or d.metadata.get("file_path") or d.metadata.get("source_path") or "uploaded_document"
        if src not in seen: seen.append(src)
        text = d.page_content.strip().replace("\n", " ")
        if len(text) > 700: text = text[:700] + " ..."
        out.append(f"- Excerpt: {text}\n  Source: {src}")
    return "Document RAG answer (supporting snippets):\n" + "\n".join(out[:4]) + "\n\nSources:\n" + "\n".join(f"* {s}" for s in seen[:8])

In [143]:
doc_index_tool = StructuredTool.from_function(
    func=doc_index,
    name="doc_index",
    description=("Ingest user documents into a persistent (session) vector index. "
                 "Use when the user uploads or provides links. "
                 "Args: files_csv:str (comma-separated file paths), urls_csv:str (comma-separated URLs), "
                 "chunk_size:int=1200, chunk_overlap:int=200"),
)
doc_rag_tool = StructuredTool.from_function(
    func=doc_rag_search,
    name="doc_rag_search",
    description=("Query the user's indexed documents (from doc_index). "
                 "Use for questions that refer to the uploaded/added documents. "
                 "Args: query:str, k:int=4"),
)

## Combine All Tools

In [144]:
# --- Assemble tools ---
tools = [wiki, wikipedia_rag_tool, arxiv, retriever_tool, web_rag_tool, doc_index_tool, doc_rag_tool]

In [145]:
tools

[WikipediaQueryRun(description='Wikipedia quick lookup. Use for short factual questions that need a brief summary. Do NOT use for detailed/ambiguous multi-part queries or when quotes/timelines are needed; prefer wikipedia_rag in those cases.', api_wrapper=WikipediaAPIWrapper(wiki_client=<module 'wikipedia' from '/Users/nehagaonkar/Desktop/RAG agent/ragagent/lib/python3.12/site-packages/wikipedia/__init__.py'>, top_k_results=1, lang='en', load_all_available_meta=False, doc_content_chars_max=1000)),
 StructuredTool(name='wikipedia_rag', description='Deep Wikipedia RAG. Use when the question needs detailed context, comparisons, timelines, or quotations from Wikipedia beyond a short summary. Args: query:str, top_docs:int=3', args_schema=<class 'langchain_core.utils.pydantic.wikipedia_rag'>, func=<function wikipedia_rag at 0x118260900>),
 ArxivQueryRun(description='ArXiv tool. Use when the query references arXiv or includes an arXiv ID.', api_wrapper=ArxivAPIWrapper(arxiv_search=<class 'arx

## Agents

In [146]:
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

In [147]:
prompt = hub.pull("hwchase17/openai-functions-agent")
prompt.messages



[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are a helpful assistant'), additional_kwargs={}),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={}),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [148]:
agent=create_openai_tools_agent(llm,tools,prompt)

## Agent Executor

In [149]:
agent_executor=AgentExecutor(agent=agent,tools=tools,verbose=True)
agent_executor

AgentExecutor(verbose=True, agent=RunnableMultiActionAgent(runnable=RunnableAssign(mapper={
  agent_scratchpad: RunnableLambda(lambda x: format_to_openai_tool_messages(x['intermediate_steps']))
})
| ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], optional_variables=['chat_history'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag

In [150]:
# def pick_tools(q: str):
#     ql = q.lower()
#     if "langsmith" in ql or "docs.smith.langchain.com" in ql:
#         return [retriever_tool]
#     if "arxiv" in ql or re.search(r"\b\d{4}\.\d{5}\b", ql):
#         return [arxiv]
#     # quick wiki signals
#     if any(w in ql for w in ["who is", "who was", "when was", "where is", "capital of", "definition of"]):
#         return [wiki, wikipedia_rag_tool]
#     # default to web
#     return tools

In [151]:
response1=agent_executor.invoke({"input":"List experience of Neha"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `web_rag_search` with `{'query': 'Experience of Neha'}`


[0m[33;1m[1;3mWeb RAG answer (supporting snippets):
- Excerpt: Neha Kumar                                                          0                                     Investor, entrepreneur, business coach, faculty, founder, mom.    Meet Neha Kumar                           Neha Kumar brings over 15 years of experience in startups and a proven track record as a changemaker and strategic leader.Before co-founding Full Glass Wine, Neha served as COO and CFO of Create & Cultivate, where she successfully led the company through a private equity acquisition in 2021. Following the exit, she launched New Money Ventures, a fund to invest in early-stage startups ...
  Source: https://www.nehatkumar.com/
- Excerpt: At Full Glass Wine, Neha plays a pivotal role in identifying acquisition opportunities that align with the Full Glass mission, while also spearheading

In [91]:
response2=agent_executor.invoke({"input":"What's the paper 1605.08386 about?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `arxiv` with `{'query': '1605.08386'}`


[0m[38;5;200m[1;3mPublished: 2016-05-26
Title: Heat-bath random walks with Markov bases
Authors: Caprice Stanley, Tobias Windisch
Summary: Graphs on lattice points are studied whose edges come from a finite set of
allowed moves of arbitrary length. We show that the diameter of these graphs on
fibers of a fixed integer matrix can be bounded from above by a constant. We
then study the mixing behaviour of heat-bath random walks on these graphs. We
also state explicit conditions on the set of moves so that the heat-bath random
walk, a generalization of the Glauber dynamics, is an expander in fixed
dimension.[0m[32;1m[1;3mThe paper with the identifier 1605.08386 is titled "Heat-bath random walks with Markov bases" by Caprice Stanley and Tobias Windisch. The paper discusses the study of graphs on lattice points where the edges are derived from a finite set of allowed moves o

In [92]:
response3=agent_executor.invoke({"input":"Who was the british king who established the church of England and in what year?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `wikipedia` with `{'query': 'British king who established the Church of England'}`


[0m[36;1m[1;3mPage: Charles II of England
Summary: Charles II (29 May 1630 – 6 February 1685) was King of Scotland from 1649 until 1651 and King of England, Scotland, and Ireland from the 1660 Restoration of the monarchy until his death in 1685.
Charles II was the eldest surviving child of Charles I of England, Scotland and Ireland and Henrietta Maria of France. After Charles I's execution at Whitehall on 30 January 1649, at the climax of the English Civil War, the Parliament of Scotland proclaimed Charles II king on 5 February 1649. However, England entered the period known as the English Interregnum or the English Commonwealth with a republican government eventually led by Oliver Cromwell. Cromwell defeated Charles II at the Battle of Worcester on 3 September 1651, and Charles fled to mainland Europe. Cromwell became Lord Prot

In [93]:
response4=agent_executor.invoke({"input":"Who was the british king who established the church of England and what was the role of his marriage in his decisiopn?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `wikipedia` with `{'query': 'Church of England establishment by British king and role of marriage'}`


[0m[36;1m[1;3mPage: Charles I of England
Summary: Charles I (19 November 1600 – 30 January 1649) was King of England, Scotland, and Ireland from 27 March 1625 until his execution in 1649.
Charles was born into the House of Stuart as the second son of King James VI of Scotland, but after his father inherited the English throne in 1603, he moved to England, where he spent much of the rest of his life. He became heir apparent to the kingdoms of England, Scotland, and Ireland in 1612 upon the death of his elder brother, Henry Frederick, Prince of Wales. An unsuccessful and unpopular attempt to marry him to Infanta Maria Anna of Spain culminated in an eight-month visit to Spain in 1623 that demonstrated the futility of the marriage negotiation. Two years later, shortly after his accession, he married Henrietta Maria

In [94]:
response5 = agent_executor.invoke({"input":"Who is the person identified as the shooter of Charlie Kirk?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `web_rag_search` with `{'query': 'shooter of Charlie Kirk'}`


[0m[33;1m[1;3mWeb RAG answer (supporting snippets):
- Excerpt: America.WorldFri, Sep 124:13'No idea the fire that you have ignited': Charlie Kirk's widow speaksThe conservative provocateur was speaking at an event on a college campus when he was killed.Crime and JusticeSaturday 2:35pm4:13Like the Robinsons, Schwiermann is a member of The Church of Jesus Christ of Latter-day Saints. She said they belonged to the same congregation, though she said the family hadn’t been active in the church in at least eight years."I want to make sure that people know that we don’t have any ill feelings towards their family or him," Schwiermann said. "He made the wrong choi ...
  Source: https://www.1news.co.nz/2025/09/14/students-reckon-with-trauma-after-witnessing-charlie-kirk-assassination/
- Excerpt: Questions linger about alleged shooter’s motivation for killing C

In [95]:
response6 = agent_executor.invoke({"input":"How much tariff has Trump levied on India?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `web_rag_search` with `{'query': 'Trump tariff on India'}`


[0m[33;1m[1;3mWeb RAG answer (supporting snippets):
- Excerpt: SportHomeNewsSportBusinessInnovationCultureArtsTravelEarthAudioVideoLiveWeatherNewslettersTrump's 50% tariff on India kicks in as Modi urges self-reliance27 August 2025ShareSaveNikhil InamdarBBC News, MumbaiShareSaveBloomberg via Getty ImagesModi has urged small shop owners and businesses to put up "Made in India" boards outside their storesUS tariffs of 50% on goods from India took effect on Wednesday as Donald Trump sought to punish Delhi for buying Russian oil and weapons.The tariffs – among the highest in the world – include a 25% penalty for transactions with Russia that are a key source  ...
  Source: https://www.bbc.com/news/articles/c5ykznn158qo
- Excerpt: Donald Trump's 50% tariff on India kicks in as PM Modi urges self-relianceSkip to contentBritish Broadcasting CorporationHomeNew

In [96]:
response7 = agent_executor.invoke({"input":"Tell me a programming joke"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mSure, here's a programming joke for you:

Why do programmers prefer dark mode?
Because the light attracts bugs![0m

[1m> Finished chain.[0m


In [97]:
response9 = agent_executor.invoke({"input":"What are the latest updates from NASA on the possibility of life on Mars?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `web_rag_search` with `{'query': 'latest updates from NASA on the possibility of life on Mars'}`


[0m[33;1m[1;3mWeb RAG answer (supporting snippets):
- Excerpt: See all topics            Facebook      Tweet      Email        Link                 Link Copied!                                              Follow                                  Scientists believe intriguing leopard spots on a rock sampled by the Perseverance rover on Mars last year may have potentially been made by ancient life, NASA announced Wednesday. The team has also published a peer-reviewed paper in the journal Nature about the new analysis, though they say further study is needed.                   “After a year of review, they have come back and they said, listen, we can’t find  ...
  Source: https://www.cnn.com/2025/09/10/science/nasa-mars-sapphire-falls-rock-sample
- Excerpt: Nasa rover finds rocks on Mars with potential signs of past l