<a href="https://colab.research.google.com/github/LRManamperi/Machine-Learning/blob/main/pdfchatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import required libraries
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings  # Free, open-source embeddings
import os

# Provide the Groq API key without storing it in the notebook.
# SECURITY: a real key used to be hard-coded on this line. It must be treated
# as compromised -- revoke it in the Groq console and issue a new one.
# The key is taken from the GROQ_API_KEY environment variable when it is
# already set; otherwise the user is prompted and the input is not echoed.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass

    try:
        os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")
    except EOFError:
        # No interactive input is available (e.g. a headless run). Leave the
        # variable unset and say so, rather than storing an empty key.
        print("GROQ_API_KEY is not set and no key could be read from input.")

class PDFChatbot:
    """Answer questions about one PDF using Chroma retrieval and a Groq chat model.

    Call order: ``load_and_process_pdf()`` -> ``initialize_qa_chain()`` ->
    ``ask_question()`` (as many times as needed).
    """

    def __init__(self, pdf_path):
        """Store the PDF path; nothing is loaded until the other methods run.

        Args:
            pdf_path: Path of the PDF file passed to ``PyPDFLoader``.
        """
        self.pdf_path = pdf_path
        self.vector_store = None  # populated by load_and_process_pdf()
        self.qa_chain = None  # populated by initialize_qa_chain()

    def load_and_process_pdf(self):
        """Load PDF, split into chunks, and embed using open-source model.

        Sets ``self.vector_store`` to a Chroma store built from the chunks and
        persisted under ``./chroma_db``.
        """
        # Step 1: Load PDF
        loader = PyPDFLoader(self.pdf_path)
        documents = loader.load()

        # Step 2: Split into chunks (1000 characters, 200 overlapping)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_documents(documents)

        # Step 3: Use open-source embeddings (e.g., all-MiniLM-L6-v2)
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # Step 4: Store in Chroma
        # NOTE(review): persist_directory is reused on every run; presumably
        # re-running this method adds the same chunks again -- confirm, and
        # clear ./chroma_db between runs if duplicates show up in the sources.
        self.vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory="./chroma_db"
        )

    def initialize_qa_chain(self, model_name="mixtral-8x7b-32768"):
        """Initialize Groq-powered QA chain.

        Args:
            model_name: Groq model identifier passed to ``ChatGroq``.
                NOTE(review): a later cell of this notebook switched to a
                different model; confirm this default is still served by Groq.

        Raises:
            ValueError: If ``load_and_process_pdf()`` has not been called yet.
        """
        if self.vector_store is None:
            raise ValueError("Load and process the PDF first!")

        # RetrievalQA is not among this cell's top-level imports, so the
        # original code failed with NameError here; import it locally.
        from langchain.chains import RetrievalQA

        llm = ChatGroq(
            model_name=model_name,
            temperature=0.3  # Slightly more creative than 0
        )

        self.qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            # Retrieve the 3 most similar chunks for each question.
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            return_source_documents=True
        )

    def ask_question(self, question):
        """Get answer from Groq with sources.

        Args:
            question: The question text sent to the chain under the "query" key.

        Returns:
            dict with "answer" (the chain's "result") and "sources"
            (the chain's "source_documents").

        Raises:
            ValueError: If ``initialize_qa_chain()`` has not been called yet.
        """
        if self.qa_chain is None:
            raise ValueError("Initialize QA chain first!")

        # .invoke() replaces the deprecated chain(...) call form.
        result = self.qa_chain.invoke({"query": question})
        return {
            "answer": result["result"],
            "sources": result["source_documents"]
        }

# Example Usage
# Builds the chatbot for one PDF, then answers questions typed by the user
# until they enter 'quit'.
if __name__ == "__main__":
    chatbot = PDFChatbot("HM2450 - Introduction to Psychology CA-I (2025).pdf")  # Replace with your PDF
    chatbot.load_and_process_pdf()
    chatbot.initialize_qa_chain()

    while True:
        # Strip so that input such as "quit " still ends the session.
        question = input("\nAsk a question (or 'quit'): ").strip()
        if question.lower() == 'quit':
            break
        response = chatbot.ask_question(question)
        print("\nAnswer:", response["answer"])
        print("\nSources:")
        for doc in response["sources"]:
            # The later cell of this notebook prints page + 1 for the same
            # metadata; do the same here so both cells report the same page.
            # A missing or non-integer page is shown as "?" instead of raising.
            page = doc.metadata.get('page')
            page_label = page + 1 if isinstance(page, int) else "?"
            print(f"- Page {page_label}: {doc.page_content[:100]}...")

ModuleNotFoundError: Module langchain_community.document_loaders not found. Please install langchain-community to access this module. You can install it using `pip install -U langchain-community`

In [2]:
# Install the third-party packages this notebook imports.
# langchain-community is included because the langchain.document_loaders
# import fails with ModuleNotFoundError without it.
# %pip installs into the running kernel's environment; -q keeps the log short.
%pip install -q langchain-community langchain-groq sentence-transformers chromadb pypdf

Collecting langchain-groq
  Downloading langchain_groq-0.3.2-py3-none-any.whl.metadata (2.6 kB)
Collecting chromadb
  Downloading chromadb-1.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting groq<1,>=0.4.1 (from langchain-groq)
  Downloading groq-0.22.0-py3-none-any.whl.metadata (15 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.23.0-py2.py3-no

In [5]:
# Install/upgrade langchain-community, which the langchain.* loader,
# vector-store and embedding imports in this notebook require.
# %pip installs into the running kernel's environment; -q keeps the log short.
%pip install -q -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-no

In [None]:
# Import required libraries
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from sentence_transformers import SentenceTransformer
import os

# Provide the Groq API key without storing it in the notebook.
# SECURITY: a real key used to be hard-coded on this line. It must be treated
# as compromised -- revoke it in the Groq console and issue a new one.
# The key is taken from the GROQ_API_KEY environment variable when it is
# already set; otherwise the user is prompted and the input is not echoed.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass

    try:
        os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")
    except EOFError:
        # No interactive input is available (e.g. a headless run). Leave the
        # variable unset and say so, rather than storing an empty key.
        print("GROQ_API_KEY is not set and no key could be read from input.")

class PDFChatbot:
    """Answer questions about one PDF using Chroma retrieval and a Groq chat model.

    Call order: ``load_and_process_pdf()`` -> ``initialize_qa_chain()`` ->
    ``ask_question()`` (as many times as needed).
    """

    def __init__(self, pdf_path):
        """Store the PDF path; nothing is loaded until the other methods run.

        Args:
            pdf_path: Path of the PDF file passed to ``PyPDFLoader``.
        """
        self.pdf_path = pdf_path
        self.vector_store = None  # populated by load_and_process_pdf()
        self.qa_chain = None  # populated by initialize_qa_chain()

    def load_and_process_pdf(self):
        """Load PDF, split into chunks, and embed using all-MiniLM-L6-v2.

        Sets ``self.vector_store`` to a Chroma store built from the chunks and
        persisted under ``./chroma_db``.
        """
        # Step 1: Load PDF
        loader = PyPDFLoader(self.pdf_path)
        documents = loader.load()

        # Step 2: Split into chunks (1000 characters, 200 overlapping)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_documents(documents)

        # Step 3: Initialize all-MiniLM-L6-v2 embeddings.
        # The wrapper below is the only embedding object the vector store
        # uses; the separate SentenceTransformer instance that used to be
        # created here was never referenced, so it has been dropped.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': False}
        )

        # Step 4: Store in Chroma
        # NOTE(review): persist_directory is reused on every run; presumably
        # re-running this method adds the same chunks again -- confirm, and
        # clear ./chroma_db between runs if duplicates show up in the sources.
        self.vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory="./chroma_db"
        )

    def initialize_qa_chain(self, model_name="Llama3-8b-8192"):
        """Initialize Groq-powered QA chain with current model.

        Args:
            model_name: Groq model identifier passed to ``ChatGroq``. The
                default is the model this method previously hard-coded.

        Raises:
            ValueError: If ``load_and_process_pdf()`` has not been called yet.
        """
        if self.vector_store is None:
            raise ValueError("PDF not loaded! Call load_and_process_pdf() first.")

        llm = ChatGroq(
            model_name=model_name,
            temperature=0.3,
            max_tokens=2048
        )

        # Enhanced prompt template
        # NOTE(review): guideline 2 asks for page numbers, but only {context}
        # is passed in; confirm the page metadata actually reaches the model,
        # otherwise cited page numbers may be invented.
        prompt = ChatPromptTemplate.from_template("""
        You are an expert at analyzing PDF documents. Answer the question based only on the following context:

        {context}

        Question: {input}

        Guidelines:
        1. Provide a detailed, accurate answer
        2. Cite specific page numbers when referencing the document
        3. If the answer isn't in the document, say so
        4. Keep technical explanations clear and concise
        """)

        document_chain = create_stuff_documents_chain(llm, prompt)
        # MMR search: return 4 chunks chosen out of 10 fetched candidates.
        retriever = self.vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={"k": 4, "fetch_k": 10}
        )
        self.qa_chain = create_retrieval_chain(retriever, document_chain)

    def ask_question(self, question):
        """Get answer with sources.

        Args:
            question: The question text sent to the chain under the "input" key.

        Returns:
            dict with "answer" (the chain's "answer") and "sources"
            (the chain's "context", i.e. the retrieved documents).

        Raises:
            ValueError: If ``initialize_qa_chain()`` has not been called yet.
        """
        if self.qa_chain is None:
            raise ValueError("QA chain not initialized!")

        result = self.qa_chain.invoke({"input": question})
        return {
            "answer": result["answer"],
            "sources": result["context"]
        }

# Main execution
# Builds the chatbot for one PDF, then answers questions typed by the user
# until they enter 'quit' or 'exit'.
if __name__ == "__main__":
    try:
        chatbot = PDFChatbot("HM2450 - Introduction to Psychology CA-I (2025).pdf")

        print("🔍 Loading and processing PDF...")
        chatbot.load_and_process_pdf()

        print("🧠 Initializing QA system with all-MiniLM-L6-v2 embeddings...")
        chatbot.initialize_qa_chain()

        print("✅ Ready! Ask questions about your PDF (type 'quit' to exit)\n")

        while True:
            try:
                question = input("💬 Your question: ").strip()
            except (EOFError, KeyboardInterrupt):
                # End of input or an interrupt at the prompt ends the session;
                # it is not an initialization failure.
                break

            # Do not spend an LLM call on an empty line.
            if not question:
                continue
            if question.lower() in ['quit', 'exit']:
                break

            try:
                response = chatbot.ask_question(question)

                print("\n📝 Answer:")
                print(response["answer"])

                print("\n📚 Sources:")
                for i, doc in enumerate(response["sources"], 1):
                    # Displayed as page + 1, as before; a missing or
                    # non-integer page is shown as "?" instead of raising.
                    page = doc.metadata.get('page')
                    page_label = page + 1 if isinstance(page, int) else "?"
                    print(f"\n📄 Source {i} (Page {page_label}):")
                    print(doc.page_content[:400] + ("..." if len(doc.page_content) > 400 else ""))
                print("\n" + "="*50 + "\n")

            except Exception as e:
                # Keep the session alive after a failed question.
                print(f"\n❌ Error: {str(e)}")
                print("Please try rephrasing your question or check your PDF content.\n")

    except Exception as e:
        print(f"🚨 Initialization failed: {str(e)}")

🔍 Loading and processing PDF...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  embeddings = HuggingFaceEmbeddings(


🧠 Initializing QA system with all-MiniLM-L6-v2 embeddings...
✅ Ready! Ask questions about your PDF (type 'quit' to exit)

💬 Your question: what are my childhood experiences

📝 Answer:
Based on the provided context, I can answer the question:

What are my childhood experiences?

Unfortunately, the provided context does not contain any information about the student's childhood experiences. The context only includes a question paper with a specific question about self-evaluation, strengths, weaknesses, and self-improvement. There is no mention of childhood experiences. Therefore, I must say that the answer is not present in the document.

📚 Sources:

📄 Source 1 (Page 2):
Citation  and  referencing  (10  marks)    
 
 
 
BACHELOR  OF  THE  SCIENCE  OF  ENGINEERING  HONOURS    HM2450:   INTRODUCTION  TO  PSYCHOLOGY                           
Answer  the  following  question:  
 
-  Have  you  ever  conducted  a  self-evaluation?  Based  on  your  understanding,  identify  two  of  
your
 
k