In [None]:
import getpass
import os
import re
import tempfile
from datetime import datetime

from agno.agent import Agent
from agno.models.google import Gemini
from agno.models.ollama import Ollama
from agno.tools.duckduckgo import DuckDuckGoTools
from agno.vectordb.chroma import ChromaDb
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [None]:
# Credentials must never be stored in the notebook. Any key that was previously
# committed in this cell should be treated as leaked and revoked/rotated.
# Prefer a key already exported in the environment; otherwise prompt for it
# without echoing, so the secret never lands in the saved file or its outputs.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

In [None]:
# Name of the Chroma collection that the setup, ingestion and retrieval code all use.
COLLECTION_NAME = "deepseek_rag"
# Directory handed to ChromaDb as its `path` argument in init_chroma().
CHROMA_PATH = "./chroma_db"
# Embeddings object handed to ChromaDb as its `embedder` argument.
# NOTE(review): this is a LangChain embeddings class passed to agno's ChromaDb,
# while the ingestion cells write through the raw Chroma collection directly —
# confirm this embedder is compatible and actually used for stored vectors.
EMBEDDING_MODEL = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# Model id given to the Ollama wrapper in get_rag_agent().
MODEL_VERSION = "deepseek-r1:1.5b"

In [None]:
# --- Initialize ChromaDB ---
def init_chroma():
    """Build the ChromaDb wrapper and make sure its collection exists.

    The wrapper is configured from ``COLLECTION_NAME``, ``CHROMA_PATH`` and
    ``EMBEDDING_MODEL`` with ``persistent_client=True``.

    Returns:
        The configured ``ChromaDb`` instance.
    """
    settings = {
        "collection": COLLECTION_NAME,
        "path": CHROMA_PATH,
        "embedder": EMBEDDING_MODEL,
        "persistent_client": True,
    }
    vector_db = ChromaDb(**settings)
    try:
        # Probe for the collection; any failure is taken to mean it is missing.
        vector_db.client.get_collection(name=COLLECTION_NAME)
    except Exception:
        vector_db.create()
    return vector_db


In [None]:
# --- Split documents into chunks ---
def split_texts(documents):
    """Chunk documents and drop chunks that contain only whitespace.

    Args:
        documents: Documents accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        A list of new ``Document`` objects, one per non-blank chunk, carrying
        the chunk's text and metadata. Splitting uses a chunk size of 1000
        with an overlap of 200.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    kept = []
    for piece in text_splitter.split_documents(documents):
        if not piece.page_content.strip():
            # Whitespace-only chunks are not worth indexing.
            continue
        kept.append(Document(page_content=piece.page_content, metadata=piece.metadata))
    return kept

In [None]:
# --- Process PDF Files ---
def process_pdf(file_path, file_name):
    """Load a PDF, tag what the loader returns, and split it into chunks.

    Args:
        file_path: Path given to ``PyPDFLoader``.
        file_name: Name recorded in each document's metadata.

    Returns:
        The result of ``split_texts`` on the tagged documents.
    """
    pages = PyPDFLoader(file_path).load()
    for page in pages:
        # Each document gets its own timestamp, taken at tagging time.
        page.metadata.update(
            source_type="pdf",
            file_name=file_name,
            timestamp=datetime.now().isoformat(),
        )
    return split_texts(pages)

In [None]:
# --- Process Web URL ---
def process_web(url):
    """Load a web page, tag its documents, and split them into chunks.

    Args:
        url: Address given to ``WebBaseLoader``.

    Returns:
        The result of ``split_texts`` on the tagged documents.
    """
    loader = WebBaseLoader(url)
    documents = loader.load()
    for doc in documents:
        doc.metadata.update({
            # Mirrors the "source_type" key set by process_pdf so stored chunks
            # can be told apart by origin.
            "source_type": "web",
            "source": url,
            "timestamp": datetime.now().isoformat()
        })
    return split_texts(documents)

In [None]:
# --- Retrieve Documents ---
def retrieve_documents(prompt, chroma_client, threshold=0.7, n_results=5):
    """Query the collection for passages related to ``prompt``.

    Args:
        prompt: Query text sent to the collection.
        chroma_client: Object whose ``client.get_collection`` returns the
            collection named ``COLLECTION_NAME``.
        threshold: Currently unused; kept so existing callers keep working.
        n_results: Maximum number of passages requested from the collection.

    Returns:
        A ``(docs, found)`` tuple. ``docs`` is the ``documents`` entry of the
        query result, a list with one inner list of passages per query text
        (``[]`` when the entry is missing or ``None``). ``found`` is True only
        when at least one inner list contains a passage.
    """
    collection = chroma_client.client.get_collection(name=COLLECTION_NAME)
    results = collection.query(query_texts=[prompt], n_results=n_results)
    docs = results.get('documents') or []
    # The outer list has one entry per query even when nothing matched, so its
    # length says nothing about hits; look inside the inner lists instead.
    found = any(hits for hits in docs)
    return docs, found

In [None]:
# --- Filter <think> tags ---
def filter_think_tags(response):
    """Remove ``<think>...</think>`` blocks from a model response.

    Args:
        response: Raw response text. ``None`` or an empty string yields ``""``
            instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The text with every think block removed and surrounding whitespace
        trimmed.
    """
    if not response:
        return ""
    # DOTALL lets "." span newlines; the non-greedy ".*?" stops at the first
    # closing tag so text between two separate blocks is preserved.
    visible = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    # Removing a leading block leaves its trailing blank lines behind.
    return visible.strip()

In [None]:

# --- Agents ---
def get_rag_agent():
    """Create the agent that writes the final answer.

    Returns:
        An ``Agent`` backed by the Ollama model whose id is ``MODEL_VERSION``,
        with markdown output enabled.
    """
    answer_model = Ollama(id=MODEL_VERSION)
    agent_options = dict(
        name="DeepSeek RAG Agent",
        model=answer_model,
        instructions="Answer using the most relevant available information.",
        markdown=True,
    )
    return Agent(**agent_options)

def get_web_search_agent():
    """Create the agent used for the web-search fallback.

    Returns:
        An ``Agent`` backed by the ``gemini-2.0-flash-exp`` Gemini model with
        a single ``DuckDuckGoTools`` tool and markdown output enabled.
    """
    search_model = Gemini(id="gemini-2.0-flash-exp")
    search_tools = [DuckDuckGoTools()]
    agent_options = dict(
        name="Web Search Agent",
        model=search_model,
        tools=search_tools,
        instructions="Search the web using DuckDuckGo and summarize key points.",
        markdown=True,
    )
    return Agent(**agent_options)

In [None]:
def answer_question(prompt, chroma_client, use_rag=True, use_web_search=False, force_web_search=False):
    """Answer ``prompt`` using retrieved passages and/or web search as context.

    Args:
        prompt: The user's question.
        chroma_client: Vector-store handle passed to ``retrieve_documents``.
        use_rag: When True (and web search is not forced), look up passages in
            the collection first.
        use_web_search: When True, fall back to the web search agent if
            retrieval produced no context.
        force_web_search: When True, skip retrieval and always run the web
            search agent, regardless of ``use_web_search``.

    Returns:
        The answer agent's response with ``<think>`` blocks removed.
    """
    context = ""

    if use_rag and not force_web_search:
        docs, found = retrieve_documents(prompt, chroma_client)
        if found:
            # docs holds one inner list per query; flatten it and skip
            # missing/empty passages so the join cannot fail on None.
            passages = [p for hits in docs for p in (hits or []) if p]
            context = "\n\n".join(passages)

    # Forcing web search must actually search: previously it skipped retrieval
    # and, with use_web_search left False, also skipped the search, so the
    # model answered with an empty context.
    if force_web_search or (use_web_search and not context):
        print("Running web search...")
        web_agent = get_web_search_agent()
        web_results = web_agent.run(prompt).content
        if web_results:
            context = f"Web Search Results:\n{web_results}"

    print("Generating response...")
    agent = get_rag_agent()
    response = agent.run(f"Context: {context}\n\nQuestion: {prompt}").content
    # A run may produce no content; hand the filter a string in that case.
    return filter_think_tags(response or "")



In [None]:
# Build the vector-store handle once; the ingestion and query cells below reuse it.
chroma_client = init_chroma()

In [None]:
# Index a local PDF into the collection; skipped when the file is missing.
pdf_path = "sample.pdf"
if os.path.exists(pdf_path):
    pdf_chunks = process_pdf(pdf_path, "sample.pdf")
    # A PDF without extractable text yields no chunks; do not send an empty
    # batch to the collection.
    if pdf_chunks:
        ids = [str(i) for i in range(len(pdf_chunks))]
        texts = [doc.page_content for doc in pdf_chunks]
        metadatas = [doc.metadata for doc in pdf_chunks]
        # upsert instead of add: re-running this cell overwrites the same ids
        # rather than depending on how duplicates are handled.
        chroma_client.client.get_collection(name=COLLECTION_NAME).upsert(
            ids=ids, documents=texts, metadatas=metadatas
        )
    else:
        print(f"No text chunks produced from {pdf_path}; nothing indexed.")
else:
    print(f"{pdf_path} not found; skipping PDF ingestion.")

In [None]:
# Fetch a reference web page and index its chunks into the collection.
web_url = "https://en.wikipedia.org/wiki/Artificial_intelligence"
web_chunks = process_web(web_url)
# A page that yields no text produces no chunks; do not send an empty batch.
if web_chunks:
    # The 10000 offset keeps these ids apart from the 0-based ids used for the
    # PDF chunks (as long as the PDF has fewer than 10000 chunks).
    ids = [str(i + 10000) for i in range(len(web_chunks))]
    texts = [doc.page_content for doc in web_chunks]
    metadatas = [doc.metadata for doc in web_chunks]
    # upsert instead of add: re-running this cell overwrites the same ids
    # rather than depending on how duplicates are handled.
    chroma_client.client.get_collection(name=COLLECTION_NAME).upsert(
        ids=ids, documents=texts, metadatas=metadatas
    )
else:
    print(f"No text chunks produced from {web_url}; nothing indexed.")

In [None]:
# Ask a sample question: retrieval is tried first, and web search is enabled
# as the fallback when retrieval yields no context.
query = "What is Artificial Intelligence?"
answer = answer_question(query, chroma_client, use_rag=True, use_web_search=True)
# answer has already had its <think> blocks removed by answer_question.
print(f"\n🧠 Answer:\n{answer}")