<a href="https://colab.research.google.com/github/LINGESH-0511/PDF-RAG-Academic-Chatbot/blob/main/PDF%20RAG%20Academic%20Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet gradio faiss-cpu sentence-transformers pypdf requests


In [2]:
import gradio as gr
import faiss
import requests
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import os
import re
# Sentence-embedding model used both to index PDF chunks and to embed questions.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
# Groq chat-completions configuration.
# The key is read from the GROQ_API_KEY environment variable so the secret does
# not have to be stored in the notebook; the original placeholder is kept as
# the fallback value when the variable is not set.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "use your api key")
GROQ_ENDPOINT = "https://api.groq.com/openai/v1/chat/completions"
GROQ_MODEL = "llama-3.1-8b-instant"



In [4]:
def process_multiple_pdfs(pdf_files, chunk_size=800):
    """Split the text of several PDFs into chunks and build a FAISS index over them.

    Args:
        pdf_files: Iterable of uploaded files. Each item is either an object
            exposing a ``.name`` path attribute or a plain path string.
            ``None`` is treated as an empty upload.
        chunk_size: Number of characters per chunk (default 800).

    Returns:
        A ``(chunks, metadata, index)`` tuple: the list of text chunks, a
        parallel list of ``{"pdf": <file name>, "page": <1-based page>}``
        dicts, and a ``faiss.IndexFlatL2`` holding the chunk embeddings.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if no text could be
            extracted from any of the given PDFs.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    metadata = []

    for pdf in pdf_files or []:
        # Accept both file-like objects with `.name` and plain path strings.
        pdf_path = getattr(pdf, "name", pdf)
        source_name = os.path.basename(pdf_path)

        reader = PdfReader(pdf_path)
        for page_no, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if not text:
                # Pages without extractable text contribute nothing.
                continue

            for start in range(0, len(text), chunk_size):
                chunks.append(text[start:start + chunk_size])
                metadata.append({
                    "pdf": source_name,
                    "page": page_no
                })

    if not chunks:
        # Fail early with a clear message instead of trying to build an index
        # from zero chunks.
        raise ValueError("No extractable text was found in the uploaded PDFs.")

    embeddings = embedding_model.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    return chunks, metadata, index


In [5]:
def detect_marks_and_type(question):
    """Infer the mark weighting and the question category from the question text.

    Marks: a recognised value (2, 3, 5, 10, 15 or 20) written next to the word
    "mark"/"marks" takes priority, so that unrelated numbers in the question
    (e.g. "the 5 layers ... for 10 marks") are not mistaken for the marks.
    If no such phrase exists, the first standalone recognised number is used;
    otherwise marks is "general".

    Type: decided by keyword, checked in this order: "define" -> "definition",
    "advantages"/"disadvantages" -> "pros_cons", "compare" -> "comparison",
    "summary"/"summari..." -> "summary", else "general".

    Args:
        question: The user's question as a string.

    Returns:
        A ``(marks, qtype)`` tuple of strings.
    """
    allowed = r"(2|3|5|10|15|20)"

    # Prefer a number explicitly tied to "mark(s)", e.g. "10 marks", "5-mark".
    explicit = re.search(rf"\b{allowed}\s*-?\s*marks?\b", question, flags=re.IGNORECASE)
    if explicit:
        marks = explicit.group(1)
    else:
        # Fallback: first standalone recognised number anywhere in the question.
        bare = re.search(rf"\b{allowed}\b", question)
        marks = bare.group(1) if bare else "general"

    q = question.lower()
    if "define" in q:
        qtype = "definition"
    elif "advantages" in q or "disadvantages" in q:
        qtype = "pros_cons"
    elif "compare" in q:
        qtype = "comparison"
    elif "summary" in q or "summari" in q:
        # "summari" also covers "summarize" / "summarise" / "summaries".
        qtype = "summary"
    else:
        qtype = "general"

    return marks, qtype


In [6]:
def call_groq(prompt, timeout=60):
    """Send a prompt to the Groq chat-completions endpoint and return the reply text.

    Args:
        prompt: Text sent as the user message.
        timeout: Seconds to wait for the HTTP request before giving up
            (default 60). Without a timeout a stalled connection would block
            the caller indefinitely.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        RuntimeError: If the endpoint answers with a status code other than 200.
        Exceptions raised by ``requests.post`` itself (for example on timeout)
        propagate unchanged.
    """
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": GROQ_MODEL,
        "messages": [
            {"role": "system", "content": "You are an academic expert assistant."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.3,
        "max_tokens": 900
    }

    response = requests.post(
        GROQ_ENDPOINT, headers=headers, json=payload, timeout=timeout
    )

    if response.status_code != 200:
        # Include the status code so the failure is identifiable from the message.
        raise RuntimeError(f"Groq API error {response.status_code}: {response.text}")

    return response.json()["choices"][0]["message"]["content"]


In [7]:
def rag_answer(question, chunks, metadata, index, history, k=4):
    """Answer a question from the k nearest indexed chunks via the Groq model.

    Args:
        question: The user's question.
        chunks: List of text chunks; positions correspond to rows of ``index``.
        metadata: List parallel to ``chunks`` of dicts with "pdf" and "page" keys.
        index: FAISS index searched with the question embedding.
        history: Previous conversation as text, inserted into the prompt.
        k: Number of nearest chunks to retrieve (default 4).

    Returns:
        A ``(answer, citations)`` tuple: the model's reply and a list of unique
        "<pdf> (Page <n>)" strings in retrieval order.
    """
    q_embedding = embedding_model.encode([question])
    _, indices = index.search(q_embedding, k)

    context = ""
    citations = []

    for i in indices[0]:
        if i < 0:
            # FAISS pads the result with -1 when fewer than k vectors exist;
            # using it as a list index would silently select the last chunk.
            continue
        context += chunks[i] + "\n\n"
        meta = metadata[i]
        citations.append(f"{meta['pdf']} (Page {meta['page']})")

    marks, qtype = detect_marks_and_type(question)

    # Only the first 3000 characters of the retrieved context are sent.
    prompt = f"""
Answer STRICTLY using the context.

Question Type: {qtype}
Marks: {marks}

Formatting Rules:
- 2 marks: short bullets
- 10+ marks: headings + explanation
- Summary: structured summary

Previous Conversation:
{history}

Context:
{context[:3000]}

Question:
{question}

Answer:
"""

    answer = call_groq(prompt)
    # dict.fromkeys removes duplicates while keeping a stable (retrieval) order.
    return answer, list(dict.fromkeys(citations))


In [8]:
# Module-level state shared by the Gradio callbacks: the three stores are
# filled by upload_pdfs and read by chat; chat_history_text accumulates
# "Q: ... / A: ..." entries for the prompt.
chunks_store = metadata_store = index_store = None
chat_history_text = []

def upload_pdfs(pdfs):
    """Index the uploaded PDFs and reset the conversation history.

    Args:
        pdfs: The uploaded files passed on to ``process_multiple_pdfs``, or
            ``None``/empty when the upload was cleared.

    Returns:
        A status string for display: a success message, a request to upload
        files when nothing was provided, or an error message if indexing failed.
    """
    global chunks_store, metadata_store, index_store, chat_history_text
    chat_history_text = []

    if not pdfs:
        # Nothing uploaded (or upload cleared): drop any previous index instead
        # of passing None on to the indexing step.
        chunks_store = metadata_store = index_store = None
        return "Please upload at least one PDF."

    try:
        chunks_store, metadata_store, index_store = process_multiple_pdfs(pdfs)
    except Exception as e:
        # Report the failure in the status box and avoid answering from a
        # stale index that belongs to earlier uploads.
        chunks_store = metadata_store = index_store = None
        return f"⚠️ Error: {str(e)}"

    return "✅ PDFs indexed with source tracking"
def chat(question, history, max_history_turns=6):
    """Gradio callback: answer a question and append the exchange to the chat.

    Args:
        question: Text entered by the user.
        history: List of ``(question, answer)`` tuples shown in the chatbot;
            ``None`` is treated as an empty list.
        max_history_turns: How many of the most recent Q/A entries are passed
            to the model as previous conversation (default 6). Bounding this
            keeps the prompt from growing with every turn.

    Returns:
        A ``(history, "")`` tuple: the updated chat history and an empty string
        used to clear the input textbox.
    """
    global chat_history_text

    if history is None:
        history = []

    if not question or not question.strip():
        history.append((question, "Please ask a valid question."))
        return history, ""

    if index_store is None:
        # No PDFs have been indexed yet; say so instead of surfacing an
        # internal error from the retrieval step.
        history.append((question, "Please upload at least one PDF before asking a question."))
        return history, ""

    try:
        if max_history_turns > 0:
            recent_turns = chat_history_text[-max_history_turns:]
        else:
            recent_turns = []

        answer, citations = rag_answer(
            question,
            chunks_store,
            metadata_store,
            index_store,
            "\n".join(recent_turns)
        )

        formatted = answer + "\n\n📚 Sources:\n"
        for c in citations:
            formatted += f"- {c}\n"

        history.append((question, formatted))
        chat_history_text.append(f"Q: {question}\nA: {answer}")

        return history, ""

    except Exception as e:
        # Show the failure in the chat rather than breaking the UI callback.
        history.append((question, f"⚠️ Error: {str(e)}"))
        return history, ""


In [9]:
# Build the Gradio interface: PDF upload + status box, chat window, question box.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 📄 PDF RAG Academic Chatbot")
    gr.Markdown("Academic-grade PDF Question Answering with citations and adaptive answers.")

    pdf_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs")
    status = gr.Textbox(interactive=False)

    # Re-index whenever the set of uploaded files changes.
    pdf_input.change(upload_pdfs, inputs=pdf_input, outputs=status)

    # NOTE(review): `bubble_full_width=False` was dropped because the recorded
    # cell output shows Gradio emitting warnings for that Chatbot call — confirm
    # against the installed Gradio version.
    chatbot = gr.Chatbot(height=450)

    msg = gr.Textbox(placeholder="Ask your academic question…")

    # On submit, update the chat window and clear the question box.
    msg.submit(chat, inputs=[msg, chatbot], outputs=[chatbot, msg])

demo.launch(share=True)


  with gr.Blocks(theme=gr.themes.Soft()) as demo:
  chatbot = gr.Chatbot(height=450, bubble_full_width=False)
  chatbot = gr.Chatbot(height=450, bubble_full_width=False)
  chatbot = gr.Chatbot(height=450, bubble_full_width=False)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8c352d52b565fc5f60.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


