In [12]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

def process_pdf(file_path: str):
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    print(docs)
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)
    print("Number of chunks:", len(chunks))
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i}: {chunk.page_content[:100]}...")  # Print first 100 characters of each chunk
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vectordb = FAISS.from_documents(chunks, embeddings)
    return vectordb


In [13]:
#run the function with the path to your PDF file
if __name__ == "__main__":
    file_path = "Take_home_assignment_AdvoraAI.pdf"
    vectordb = process_pdf(file_path)
    print("Vector database created successfully.")
    # You can now use vectordb for further processing or querying.

[Document(metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-05-01T14:06:20+00:00', 'moddate': '2025-05-01T14:06:20+00:00', 'source': 'Take_home_assignment_AdvoraAI.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Take-Home Assignment: PDF Document Query Application \nWe ask you to complete a take-home assignment to help us evaluate your technical skills and creativity. Your task is to build a simple \nweb application that allows users to upload PDF documents and query their content via a chat interface, storing up to 5 chat \nmessages in memory. This aligns with Advora.ai’s focus on AI-driven document processing and knowledge retrieval. \nAssignment Requirements \n1. Functionalities: \n○ PDF Upload: Users can upload a PDF file through the web interface. \n○ Chat Interface: Users can ask questions about the PDF’s content, and the app responds with relevant answers \nextracted from the document. \n○ In-Memory Chat Stora

In [14]:
vectordb.search("What is the main topic of the document?", k=3, search_type="similarity")

[Document(id='88095155-b281-4d0d-bbd9-61c8f213def2', metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-05-01T14:06:20+00:00', 'moddate': '2025-05-01T14:06:20+00:00', 'source': 'Take_home_assignment_AdvoraAI.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content='Take-Home Assignment: PDF Document Query Application \nWe ask you to complete a take-home assignment to help us evaluate your technical skills and creativity. Your task is to build a simple \nweb application that allows users to upload PDF documents and query their content via a chat interface, storing up to 5 chat \nmessages in memory. This aligns with Advora.ai’s focus on AI-driven document processing and knowledge retrieval. \nAssignment Requirements \n1. Functionalities:'),
 Document(id='c9d92568-27c0-4a91-aa91-5c418cb9f0a3', metadata={'producer': 'www.ilovepdf.com', 'creator': 'Microsoft® Word 2016', 'creationdate': '2025-05-01T14:06:20+00:00', 'moddate': '2025-

In [16]:
import os 
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate

load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")

def create_qa_chain(vectorstore):
    llm = init_chat_model("llama3-8b-8192", model_provider="groq")  # Replace with your preferred LLM
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3}),
        chain_type="stuff"
    )
    return chain



In [None]:
# Example usage
if __name__ == "__main__":
    file_path = "Take_home_assignment_AdvoraAI.pdf"
    vectordb = process_pdf(file_path)
    qa_chain = create_qa_chain(vectordb)
    
    # Example query
    query = "What is the main topic of the document?"
    response = qa_chain.invoke(query)
    print("Response:", response)

In [18]:
qa_chain.invoke("When is the deadline for the assignment?")

{'query': 'When is the deadline for the assignment?',
 'result': 'The deadline for the assignment is 7 days from receiving this assignment.'}