<a href="https://colab.research.google.com/github/JahanviGupta17/ResearchBot-RAG-based-QA-system/blob/main/researchBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade pip setuptools wheel
!pip install streamlit pyngrok python-dotenv pdfplumber PyPDF2 \
langchain langchain-community sentence-transformers "transformers==4.28.1"
!pip install faiss-cpu

Collecting transformers==4.28.1
  Using cached transformers-4.28.1-py3-none-any.whl.metadata (109 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.1)
  Using cached tokenizers-0.13.3.tar.gz (314 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
INFO: pip is looking at multiple versions of sentence-transformers to determine which version is compatible with other requirements. This could take a while.
Collecting sentence-transformers
  Using cached sentence_transformers-5.2.2-py3-none-any.whl.metadata (16 kB)
  Using cached sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
  Using cached sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
  Using cached sentence_trans

In [2]:
!pip install huggingface-hub




In [3]:
import os
import streamlit as st
from dotenv import load_dotenv
import pdfplumber
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

from huggingface_hub import InferenceClient
from langchain_community.embeddings import HuggingFaceEmbeddings
# Load environment variables from a local .env file, if any.
# (The recorded output `False` below suggests none was found in this run.)
load_dotenv()

False

In [4]:

# 1️⃣ Load HF API Token (Colab)
# Resolve the Hugging Face API token.
# Order of precedence: Colab Secrets, then the HF_TOKEN environment variable
# (which load_dotenv() may have populated from a .env file).
try:
    from google.colab import userdata
except ImportError:
    # Not running inside a Colab kernel (e.g. under `streamlit run`).
    userdata = None

HF_TOKEN = None
if userdata is not None:
    try:
        HF_TOKEN = userdata.get("HF_TOKEN")
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        # userdata.get raises (it does not return None) when the secret is
        # missing or the notebook has not been granted access to it.
        HF_TOKEN = None

# Treat an empty secret the same as a missing one and fall back to the environment.
HF_TOKEN = HF_TOKEN or os.getenv("HF_TOKEN")

if not HF_TOKEN:
    raise ValueError(" HF_TOKEN not found in Colab Secrets or the HF_TOKEN environment variable")

print("HF_TOKEN loaded successfully")


HF_TOKEN loaded successfully


In [5]:

# Embedding model

# Hugging Face model id of the sentence encoder; the resulting `embeddings`
# object is what the FAISS helpers below use to build and reload the index.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
print(" Embeddings model loaded")


  embeddings = HuggingFaceEmbeddings(


 Embeddings model loaded


In [6]:
# LLM via the Hugging Face API

# Model id handed to the inference client; requests are authenticated with HF_TOKEN.
LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

llm = InferenceClient(model=LLM_MODEL_ID, token=HF_TOKEN)
print("LLM client ready")


LLM client ready


In [7]:

# PDF Extraction

def extract_pdf_text(pdf_files):
    """Return the text of every page of every PDF as one string.

    Args:
        pdf_files: Iterable of objects accepted by ``pdfplumber.open``.

    Returns:
        The extracted page texts in document/page order, each followed by a
        newline. Pages for which ``extract_text()`` returns nothing are
        skipped. An empty string is returned when no text is found.
    """
    page_texts = []
    for source in pdf_files:
        with pdfplumber.open(source) as document:
            for page in document.pages:
                extracted = page.extract_text()
                if extracted:
                    page_texts.append(extracted)
    # Terminate every page with a newline, exactly like incremental concatenation would.
    return "".join(piece + "\n" for piece in page_texts)


In [8]:

# Text Chunking
def split_text(text, chunk_size=800, chunk_overlap=150):
    """Split ``text`` into overlapping chunks for embedding.

    Args:
        text: The full document text.
        chunk_size: Maximum chunk length passed to the splitter (default 800).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter (default 150).

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)


In [9]:

# FAISS Index
def create_faiss_index(chunks):
    """Embed ``chunks`` and persist the resulting FAISS index to ``faiss_index``.

    Args:
        chunks: Iterable of text chunks to index.

    Raises:
        ValueError: If ``chunks`` is empty. This happens, for example, when the
            uploaded PDFs contain no extractable text. Failing here gives a
            clear message instead of an opaque error from the vector store.
    """
    chunks = list(chunks)
    if not chunks:
        raise ValueError("No text chunks to index; the documents contain no extractable text.")

    db = FAISS.from_texts(chunks, embeddings)
    db.save_local("faiss_index")
    print("FAISS index created")


def load_faiss_index():
    """Reload the FAISS index stored under ``faiss_index``.

    Returns:
        The vector store returned by ``FAISS.load_local``, bound to the
        module-level ``embeddings`` object.
    """
    # NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based
    # loading. Only load index directories this app wrote itself.
    index = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    return index


In [10]:

# Smart Context Summarizer

def summarize_chunk(text, source_id):
    """Ask the LLM for a short, cited summary of one retrieved chunk.

    Args:
        text: The chunk content to summarize.
        source_id: Number used in the ``[Source N]`` citation the model is
            asked to include.

    Returns:
        The model's response with surrounding whitespace stripped.
    """
    # The sentence range uses a real en dash; the mis-encoded byte sequence that
    # was previously embedded here would have been sent to the model verbatim.
    prompt = f"""
Summarize the following content in 1–2 clear sentences.
Keep only important facts.
Cite as [Source {source_id}].

Text:
{text}

Summary:
"""
    response = llm.text_generation(
        prompt,
        max_new_tokens=120,
        temperature=0.2
    )
    return response.strip()


def build_context(docs):
    """Summarize each retrieved document and concatenate the summaries.

    Args:
        docs: Sequence of objects exposing ``page_content``.

    Returns:
        The summaries in order, each followed by a single space. Sources are
        numbered from 1. Returns an empty string for an empty ``docs``.
    """
    return "".join(
        summarize_chunk(document.page_content, number) + " "
        for number, document in enumerate(docs, start=1)
    )


In [11]:
# Final Answer Generator
def answer_question(question):
    """Answer ``question`` from the indexed documents.

    Loads the saved FAISS index, retrieves the 5 most similar chunks, condenses
    them with ``build_context`` and asks the LLM to answer using that context.

    Args:
        question: The user's question.

    Returns:
        The stripped model answer, or ``"Not enough information found."`` when
        the similarity search returns nothing.
    """
    retrieved = load_faiss_index().similarity_search(question, k=5)
    if not retrieved:
        return "Not enough information found."

    context = build_context(retrieved)

    # Assembled line by line; the resulting text matches the original template exactly.
    prompt = (
        "\nYou are a research assistant.\n"
        "\n"
        "Rules:\n"
        "- Answer clearly and concisely\n"
        "- Use ONLY the context\n"
        "- Cite sources like [Source X]\n"
        "- If missing info, say so\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        "Question:\n"
        f"{question}\n"
        "\n"
        "Answer:\n"
    )

    return llm.text_generation(prompt, max_new_tokens=200, temperature=0.3).strip()


In [12]:
# Streamlit UI
import os

import streamlit as st

# ----------------------------
# Page config
# ----------------------------
st.set_page_config(
    page_title="📚 ResearchBot",
    page_icon="📖",
    layout="wide",
    initial_sidebar_state="expanded"
)

# ----------------------------
# Dark theme styles + animations
# ----------------------------
st.markdown(
    """
    <style>
    /* Dark background */
    body, .stApp {
        background-color: #0F111A;
        color: #FFFFFF;
    }
    /* Sidebar dark */
    [data-testid="stSidebar"] {
        background-color: #1B1C2A;
    }
    /* Buttons animation */
    div.stButton > button:hover {
        background: linear-gradient(90deg, #ff7eb3, #ff758c);
        transform: scale(1.05);
        transition: all 0.3s ease;
    }
    /* Text input animation */
    input[type=text]:focus {
        border: 2px solid #ff7eb3;
        transition: 0.3s;
    }
    /* Expander dark style */
    div[role="button"] {
        color: #ff7eb3;
    }
    hr {
        border: 0.5px solid #444;
    }
    </style>
    """, unsafe_allow_html=True
)

# ----------------------------
# Title
# ----------------------------
st.title("📚 ResearchBot – Multi-PDF Research Assistant")

# ----------------------------
# Sidebar - PDF Upload & Index
# ----------------------------
with st.sidebar:
    st.header("📁 Document Ingestion")
    pdfs = st.file_uploader(
        "Upload PDF documents",
        type="pdf",
        accept_multiple_files=True
    )

    if st.button("📌 Index Documents"):
        if not pdfs:
            st.warning("Please upload at least one PDF")
        else:
            with st.spinner("Indexing PDFs... ⏳"):
                text = extract_pdf_text(pdfs)
                chunks = split_text(text)
                # PDFs without a text layer (e.g. scans) yield no chunks;
                # skip indexing instead of failing inside the vector store.
                if chunks:
                    create_faiss_index(chunks)
            if chunks:
                st.success("✅ PDFs Indexed Successfully!")
            else:
                st.error("No extractable text was found in the uploaded PDFs.")

# ----------------------------
# Question input & answer
# ----------------------------
question = st.text_input("❓ Enter your research question:")

if question:
    # "faiss_index" is the directory the index helpers save to and load from.
    if not os.path.isdir("faiss_index"):
        st.info("No index found yet. Upload PDFs in the sidebar and index them first.")
    else:
        try:
            with st.spinner("Generating answer... 🤖"):
                answer = answer_question(question)
            st.subheader("🤖 Answer")
            st.write(answer)

            with st.expander("📄 Context Summaries"):
                # NOTE(review): this repeats the retrieval and per-chunk
                # summarization that answer_question already performed, so every
                # question runs the summarizer twice. The summaries shown may also
                # differ from the ones used for the answer, because generation is
                # sampled (temperature > 0).
                db = load_faiss_index()
                docs = db.similarity_search(question, k=5)
                context = build_context(docs)
                st.write(context)
        except Exception as exc:
            # Show index or inference API failures in the page instead of a raw traceback.
            st.error(f"Could not generate an answer: {exc}")

# ----------------------------
# Footer
# ----------------------------
st.markdown(
    """
    <hr>
    <div style="text-align:center; color:#888;">
    ResearchBot • Retrieval-Augmented Generation using FAISS + Hugging Face API
    </div>
    """, unsafe_allow_html=True
)


2026-01-29 20:12:17.970 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2026-01-29 20:12:18.002 Session state does not function when running a script without `streamlit run`


DeltaGenerator()

In [13]:
!pip install streamlit pyngrok




In [14]:
%%writefile app.py

# NOTE(review): as shown, only the import below is written to app.py, so
# `streamlit run app.py` would serve an empty page. TODO confirm that the full
# app source (helpers + UI) is meant to be written here.
import streamlit as st

Overwriting app.py


In [16]:
# Install the localtunnel CLI globally via npm; a later cell runs it as `lt`.
!npm install -g localtunnel


[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0K‚†π[1G[0K
added 22 packages in 2s
[1G[0K‚†π[1G[0K
[1G[0K‚†π[1G[0K3 packages are looking for funding
[1G[0K‚†π[1G[0K  run `npm fund` for details
[1G[0K‚†π[1G[0K

In [None]:
# Start the Streamlit server for app.py; the trailing & asks the shell to run it
# in the background. The recorded output shows it listening on port 8501.
!streamlit run app.py &



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.127.231:8501[0m
[0m


In [None]:
# Open a localtunnel to port 8501, the port the Streamlit server reported.
!lt --port 8501
