In [1]:
!pip install -q langchain_groq
!pip install -q qdrant-client
!pip install -q langchain_community
!pip install -q pypdf
!pip install -q sentence_transformers
!pip install -q wikipedia
!pip install -q arxiv
!pip install -q PyPDF2
!pip install -q gradio
!pip install -U langchain-huggingface

Collecting sentence-transformers>=2.6.0 (from langchain-huggingface)
  Using cached sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Using cached sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
Installing collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 2.2.2
    Uninstalling sentence-transformers-2.2.2:
      Successfully uninstalled sentence-transformers-2.2.2
Successfully installed sentence-transformers-3.2.0


In [2]:
import os
import time
import uuid
import warnings

# Register the filter BEFORE the third-party imports below: the warning is
# emitted at import time (by a `from tqdm.autonotebook import ...` statement in
# one of those libraries), so a filter installed afterwards cannot suppress it.
from tqdm import TqdmExperimentalWarning
warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)

from langchain_groq import ChatGroq
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient,models
from qdrant_client.http.models import PointStruct

from langchain_core.prompts import ChatPromptTemplate
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from PyPDF2 import PdfReader
from langchain.docstore.document import Document

  from tqdm.autonotebook import tqdm, trange


In [3]:
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment. Keep that file out of version control.
load_dotenv()

# Secrets are read from the environment instead of being written into the
# notebook. Any key that was ever committed as a literal in this cell should be
# treated as leaked and rotated with the provider.
_required_env = ("GROQ_API_KEY", "QDRANT_URL", "QDRANT_API_KEY")
_missing_env = [name for name in _required_env if not os.environ.get(name)]
if _missing_env:
    # Fail here with a clear message rather than later with an opaque auth error.
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env)
    )

groq_api_key = os.environ["GROQ_API_KEY"]
qdrant_url = os.environ["QDRANT_URL"]
qdrant_api_key = os.environ["QDRANT_API_KEY"]

In [5]:
# Function to load text from a PDF file
def load_pdf(file_path):
    """Return the extracted text of every page of the PDF at ``file_path``.

    The per-page texts are concatenated in page order, with no separator
    inserted between pages.
    """
    pdf = PdfReader(file_path)
    return "".join(page.extract_text() for page in pdf.pages)

# Function to load all PDFs from a folder
def load_pdfs_from_folder(folder_path):
    """Load every PDF file directly inside ``folder_path`` as a ``Document``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of ``Document`` objects, one per PDF, ordered by file name. Each
        document holds the full text from ``load_pdf`` and the file name under
        the ``"source"`` metadata key. Empty list when no PDF is found.
    """
    docs = []
    # os.listdir returns entries in arbitrary order; sort so the document (and
    # therefore chunk) order is reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        # Match the extension case-insensitively so "Policy.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory whose name ends in ".pdf" must not be handed to the reader.
        if not os.path.isfile(file_path):
            continue
        text = load_pdf(file_path)  # Load text from each PDF
        docs.append(Document(page_content=text, metadata={"source": filename}))
    return docs

# Specify the folder path where the PDFs are stored
folder_path = "./Data"  # You have set your path folder

# Load all PDFs from the folder
docs = load_pdfs_from_folder(folder_path)
if not docs:
    # Stop here with a clear message instead of failing later while building
    # the vector store from an empty document list.
    raise FileNotFoundError(f"No PDF files found in {folder_path!r}")

# Initialize RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Split the documents into chunks
all_splits = text_splitter.split_documents(docs)

# Summarise the result and preview a few chunks rather than dumping every
# chunk into the cell output.
print(f"Loaded {len(docs)} document(s) and produced {len(all_splits)} chunk(s)")
all_splits[:3]



In [7]:
def get_embedding(text_chunks, model_name='all-MiniLM-L6-v2'):
    """Embed document chunks and package them as point dictionaries.

    Args:
        text_chunks: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict), e.g. the output of a text splitter.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        A list with one dict per chunk, in input order, of the form
        ``{"id": <uuid4 str>, "vector": <embedding>,
        "payload": {"text": ..., "source": ...}}``. ``source`` is ``None``
        when the chunk's metadata has no ``"source"`` key.
    """
    # Materialise once: the chunks are walked twice (texts, then points), which
    # would silently produce no points for a one-shot iterator.
    text_chunks = list(text_chunks)
    if not text_chunks:
        # Nothing to embed; skip loading the embedding model.
        return []

    embeddings_model = HuggingFaceEmbeddings(model_name=model_name)

    # Generate embeddings for each text chunk
    vectors = embeddings_model.embed_documents(
        [chunk.page_content for chunk in text_chunks]
    )

    return [
        {
            "id": str(uuid.uuid4()),
            "vector": vector,
            "payload": {
                "text": chunk.page_content,
                # .get: chunks without a "source" entry should not abort the batch.
                "source": chunk.metadata.get("source"),
            },
        }
        for chunk, vector in zip(text_chunks, vectors)
    ]

# Embed every chunk produced by the splitter and keep the resulting point dicts
points = get_embedding(text_chunks=all_splits)

In [8]:

# Target collection in the Qdrant cluster
collection_name = "Chatbot-HR"

# Embed the chunks and upload them to Qdrant in one call.
# NOTE(review): force_recreate=True presumably rebuilds the collection on every
# run — confirm this is intended before pointing at a shared cluster.
embedding_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
vectorstore = Qdrant.from_documents(
    all_splits,
    embedding_model,
    url=qdrant_url,
    api_key=qdrant_api_key,
    collection_name=collection_name,
    force_recreate=True,
)

print("Embeddings have been stored in Qdrant successfully!")

Embeddings have been stored in Qdrant successfully!


In [9]:
# Expose the vector store through the retriever interface. No arguments are
# passed, so whatever retrieval defaults the store defines are used.
retriever = vectorstore.as_retriever()

In [10]:
from langchain.tools.retriever import create_retriever_tool

# Wrap the retriever as an agent tool. The description is the text the agent is
# given to decide when this tool applies.
pdf_tool_description = "Search for information about Company policy, HR, and leave"
pdf_tool = create_retriever_tool(retriever, "pdf_search", pdf_tool_description)

In [11]:
# Chat model served by Groq that drives the agent
groq_model_name = "llama3-8b-8192"
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name=groq_model_name,
)

In [20]:
# Prompt for the tools agent.
#
# * `agent_scratchpad` is declared as a messages placeholder: the agent fills it
#   with a list of tool-call / tool-result messages, which must be spliced into
#   the conversation as messages rather than interpolated into a string.
# * `context` is given an empty default via `.partial(...)`. The agent itself
#   never supplies it (retrieved text arrives through the tools), so leaving it
#   as a required variable makes every invocation fail on a missing key. Callers
#   that do pass `context` still override the default.
# * The context block is closed with `</context>`.
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        """Answer the question as detailed as possible from the provided context and from the results returned by your tools, make sure to provide all the details, if the answer is not in
the provided context or tool results just say, "answer is not available in the context", don't provide the wrong answer

<context>
{context}
</context>""",
    ),
    ("human", "{input}"),
    ("placeholder", "{agent_scratchpad}"),
]).partial(context="")

In [13]:
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

# Wikipedia lookup tool; the wrapper is configured with top_k_results=1 and
# doc_content_chars_max=10000.
wiki_wrapper = WikipediaAPIWrapper(
    doc_content_chars_max=10000,
    top_k_results=1,
)
wiki = WikipediaQueryRun(api_wrapper=wiki_wrapper)

In [14]:
from langchain_community.utilities import ArxivAPIWrapper
from langchain_community.tools import ArxivQueryRun

# arXiv lookup tool; the wrapper is configured with top_k_results=1 and
# doc_content_chars_max=10000.
arxiv_wrapper = ArxivAPIWrapper(
    doc_content_chars_max=10000,
    top_k_results=1,
)
arxiv = ArxivQueryRun(api_wrapper=arxiv_wrapper)

In [15]:
# Tools handed to the agent: Wikipedia lookup, arXiv lookup, and the retriever
# tool built over the PDF vector store.
tools = [wiki, arxiv, pdf_tool]

In [21]:
from langchain.agents import create_openai_tools_agent

# Combine the chat model, the tool list and the prompt into a tools agent
agent = create_openai_tools_agent(llm, tools, prompt)

In [22]:
from langchain.agents import AgentExecutor

# Runtime wrapper that runs the agent with its tools; verbose is switched off
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=False,
)