# GIKI Prospectus Q&A Chatbot using Retrieval-Augmented Generation (RAG)
This notebook implements a system to upload GIKI-related documents, extract and chunk their content, embed the chunks with a Sentence-Transformers model, run vector similarity search with FAISS, and answer questions using Google's Flan-T5 model. The interface is built with Gradio.


# Environment Setup

In [1]:
!pip install gradio sentence-transformers faiss-cpu pdfplumber python-docx transformers torch

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 M

In [2]:
# Check CUDA availability for the inference
import torch

def get_device():
    """Return the torch device to run on: CUDA when it is available, otherwise CPU."""
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(backend)


# Resolve the device once and report it so the reader can see which backend is active.
device = get_device()
print(f"Using device: {device}")

Using device: cuda


# Import modules and initialize models
This cell imports necessary modules and loads the embedding and generation models.

In [3]:
import os, io, pickle, faiss, pdfplumber
from docx import Document
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Embedding side: the SentenceTransformer model produces the chunk/query vectors, and a
# matching Hugging Face tokenizer is loaded from the same checkpoint so that
# chunk_by_tokens() can measure and split text in that model's token units.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBED_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_NAME)


# Generation side: Flan-T5 tokenizer and seq2seq model used by flan_generate().
# NOTE(review): the model is not moved to `device` anywhere in the visible code, so
# generation appears to run on the CPU even when CUDA is reported — confirm intent.
GEN_MODEL_NAME = "google/flan-t5-base"
flan_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
flan_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Define text extraction functions
These functions extract text from PDF, DOCX, and TXT files, returning a list of dictionaries containing the text and metadata.


In [4]:
def extract_text_from_pdf(file_bytes, filename):
    """Extract the text of a PDF, one record per non-blank page.

    Args:
        file_bytes: Raw bytes of the PDF file.
        filename: Name stored as the ``source`` in each record's metadata.

    Returns:
        A list of ``{"text": ..., "metadata": {"source": ..., "page": ...}}`` dicts,
        with 1-based page numbers. Pages whose extracted text is empty or
        whitespace-only are skipped.
    """
    records = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # extract_text() may yield None for a page; treat that as empty text.
            page_text = page.extract_text() or ""
            if not page_text.strip():
                continue
            records.append(
                {"text": page_text, "metadata": {"source": filename, "page": page_number}}
            )
    return records


def extract_text_from_docx(file_bytes, filename):
    """Extract the paragraph text of a DOCX file as a single record.

    Args:
        file_bytes: Raw bytes of the DOCX file.
        filename: Name stored as the ``source`` in the record's metadata.

    Returns:
        A one-element list ``[{"text": ..., "metadata": {"source": ...}}]`` with the
        paragraphs joined by newlines, or an empty list when the text is blank.
    """
    document = Document(io.BytesIO(file_bytes))
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    text = "\n".join(paragraph_texts)
    if not text.strip():
        return []
    return [{"text": text, "metadata": {"source": filename}}]


def extract_text_from_txt(file_bytes, filename):
    """Decode a plain-text file as UTF-8 and return it as a single record.

    Args:
        file_bytes: Raw bytes of the text file.
        filename: Name stored as the ``source`` in the record's metadata.

    Returns:
        A one-element list ``[{"text": ..., "metadata": {"source": ...}}]``, or an
        empty list when the decoded text is blank. Bytes that are not valid UTF-8
        are dropped rather than raising.
    """
    raw = io.BytesIO(file_bytes).getvalue()
    text = raw.decode("utf-8", errors="ignore")
    if not text.strip():
        return []
    record = {"text": text, "metadata": {"source": filename}}
    return [record]


def load_and_extract(files):
    """Extract text records from a collection of uploaded files.

    Args:
        files: Iterable of ``(name, data)`` pairs, where ``name`` is the file name
            and ``data`` is the file's raw bytes.

    Returns:
        A flat list of ``{"text": ..., "metadata": {...}}`` records produced by the
        extractor matching each file's extension. Files with an unsupported
        extension contribute nothing.
    """
    all_docs = []
    for name, data in files:
        # Compare extensions case-insensitively so "REPORT.PDF" or "Notes.Docx"
        # are not silently ignored; the original name is kept for the metadata.
        lowered = name.lower()
        if lowered.endswith(".pdf"):
            all_docs.extend(extract_text_from_pdf(data, name))
        elif lowered.endswith(".docx"):
            all_docs.extend(extract_text_from_docx(data, name))
        elif lowered.endswith(".txt"):
            all_docs.extend(extract_text_from_txt(data, name))
    return all_docs


# Define chunking functions
Splitting text into smaller chunks based on token counts with some overlap to maintain context.


In [5]:
def chunk_by_tokens(text, max_tokens=400, overlap=50):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of trailing tokens of one chunk repeated at the start of
            the next; must satisfy ``0 <= overlap < max_tokens``.

    Returns:
        A list of chunk strings obtained by decoding consecutive token windows.
        Empty when ``text`` produces no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive, or ``overlap`` is negative
            or not smaller than ``max_tokens``.
    """
    # Validate first: with overlap >= max_tokens the window would never advance
    # (an endless loop), and a negative overlap would silently skip tokens.
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if overlap < 0 or overlap >= max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    # NOTE(review): the all-MiniLM-L6-v2 model card reports that inputs longer than
    # 256 word pieces are truncated, so 400-token chunks may be only partly embedded
    # — confirm and consider a smaller max_tokens at the call site.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(token_ids)
    step = max_tokens - overlap

    chunks = []
    for start in range(0, total, step):
        end = min(start + max_tokens, total)
        chunks.append(tokenizer.decode(token_ids[start:end], skip_special_tokens=True))
        # Stop once a window reaches the end so no redundant tail chunk is emitted.
        if end == total:
            break
    return chunks


def make_chunks(docs, max_tokens=400, overlap=50):
    """Chunk every document and pair each chunk with its document's metadata.

    Args:
        docs: Iterable of records with ``"text"`` and ``"metadata"`` keys.
        max_tokens: Maximum tokens per chunk, forwarded to ``chunk_by_tokens``.
        overlap: Token overlap between chunks, forwarded to ``chunk_by_tokens``.

    Returns:
        A ``(chunks, metas)`` tuple of equal-length lists; ``metas[i]`` is the
        metadata object of the document that ``chunks[i]`` came from.
    """
    chunk_texts = []
    chunk_metas = []
    for doc in docs:
        pieces = chunk_by_tokens(doc["text"], max_tokens=max_tokens, overlap=overlap)
        chunk_texts.extend(pieces)
        # Every chunk of a document shares that document's metadata object.
        chunk_metas.extend(doc["metadata"] for _ in pieces)
    return chunk_texts, chunk_metas


# Embeddings and FAISS index building
Create vector embeddings for chunks and build a FAISS index for efficient similarity search.


In [6]:
def embed_texts(texts):
    """Encode ``texts`` with the sentence embedder.

    Returns the embedder's output as a NumPy array with normalized embeddings
    (both requested through the ``encode`` options below).
    """
    encode_options = {"convert_to_numpy": True, "normalize_embeddings": True}
    return embedder.encode(texts, **encode_options)


def build_faiss(chunks, metas):
    """Embed ``chunks`` and load them into a FAISS inner-product index.

    Args:
        chunks: List of chunk strings to embed; must not be empty.
        metas: Metadata list parallel to ``chunks``. It is not stored in the
            index; it is only checked for a matching length.

    Returns:
        A ``faiss.IndexFlatIP`` containing one float32 vector per chunk, added in
        the same order as ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty or ``metas`` has a different length.
    """
    # Fail early with a clear message: without chunks there is no embedding
    # dimension to size the index with.
    if not chunks:
        raise ValueError("Cannot build an index: no text chunks were provided.")
    if len(metas) != len(chunks):
        raise ValueError("chunks and metas must have the same length.")

    # FAISS works on float32 vectors; convert once and reuse.
    embs = embed_texts(chunks).astype(np.float32)
    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs)
    return index


# Answer generation and retrieval
Retrieve the chunks most relevant to the query and generate an answer from them with the Flan-T5 model.

In [7]:
# Instruction preamble placed at the start of every generation prompt.
SYSTEM_PROMPT = "".join(
    [
        "You are a helpful assistant for GIKI students. ",
        "Answer ONLY using the provided context. ",
        "If the answer is not in the context, say: ",
        "'I could not find this in the provided documents.'",
    ]
)


def flan_generate(prompt):
    """Generate a text completion for ``prompt`` with the Flan-T5 model.

    Args:
        prompt: The full prompt string. It is tokenized with truncation to at
            most 1024 tokens.

    Returns:
        The decoded first generated sequence (up to 256 new tokens), with special
        tokens removed.
    """
    inputs = flan_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    # Place the input tensors on whatever device the model weights live on, so
    # generation keeps working if the model is moved to a GPU.
    inputs = inputs.to(flan_model.device)
    outputs = flan_model.generate(**inputs, max_new_tokens=256)
    return flan_tokenizer.decode(outputs[0], skip_special_tokens=True)


def retrieve(query, index, chunks, metas, top_k=3):
    """Return the chunks most similar to ``query`` according to the FAISS index.

    Args:
        query: The question text to embed and search with.
        index: FAISS index whose vector positions correspond to ``chunks``.
        chunks: List of chunk strings, in the order they were added to ``index``.
        metas: Metadata list parallel to ``chunks``.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``{"score": float, "chunk": str, "meta": ...}`` dicts in the
        order returned by the index search. It holds fewer than ``top_k`` items
        when the index contains fewer vectors.
    """
    q_emb = embed_texts([query]).astype(np.float32)
    scores, idxs = index.search(q_emb, top_k)
    results = []
    for i, s in zip(idxs[0], scores[0]):
        position = int(i)
        # FAISS pads missing neighbours with label -1; indexing a list with -1
        # would wrongly return the last chunk, so skip anything out of range.
        if position < 0 or position >= len(chunks):
            continue
        results.append({"score": float(s), "chunk": chunks[position], "meta": metas[position]})
    return results


# Orchestration functions
Functions coordinating document processing and answering user questions.


In [8]:
def build_store(files):
    """Run the full ingestion pipeline: extract text, chunk it, and index it.

    Args:
        files: Iterable of ``(name, data)`` pairs as accepted by
            ``load_and_extract``.

    Returns:
        An ``(index, chunks, metas)`` tuple: the FAISS index plus the parallel
        lists of chunk strings and their metadata.
    """
    extracted_docs = load_and_extract(files)
    chunk_texts, chunk_metas = make_chunks(extracted_docs)
    return build_faiss(chunk_texts, chunk_metas), chunk_texts, chunk_metas


def answer_question(query, index, chunks, metas):
    """Answer ``query`` from the indexed documents.

    Args:
        query: The user's question.
        index: FAISS index, or ``None`` when no knowledge base has been built.
        chunks: Chunk strings parallel to the index vectors.
        metas: Metadata parallel to ``chunks``.

    Returns:
        An ``(answer, retrieved)`` tuple, where ``retrieved`` is the list of
        result dicts from ``retrieve``. When ``index`` is ``None`` a warning
        message and an empty list are returned.
    """
    if index is None:
        return "⚠️ Please upload and build first.", []
    retrieved = retrieve(query, index, chunks, metas)
    if not retrieved:
        # Nothing to ground an answer on; reply with the fallback sentence the
        # system prompt prescribes instead of generating from an empty context.
        return "I could not find this in the provided documents.", []
    ctx = "\n\n".join(r["chunk"] for r in retrieved)
    # The question goes BEFORE the context: flan_generate() truncates the prompt
    # to 1024 tokens, and three 400-token chunks can exceed that. With the context
    # last, truncation can only drop trailing context, never the question.
    prompt = f"{SYSTEM_PROMPT}\n\nQuestion: {query}\n\nContext:\n{ctx}\n\nAnswer:"
    ans = flan_generate(prompt)
    return ans, retrieved


# UI callback functions
Global variables and UI functions for building the knowledge base and answering queries.


In [9]:
# Knowledge-base state shared by the UI callbacks; all None until ui_build() succeeds.
index, chunks, metas = None, None, None

def ui_build(files):
    """Gradio callback: build the knowledge base from the uploaded files.

    Args:
        files: List of file paths supplied by the upload component, or
            ``None``/empty when nothing has been uploaded.

    Returns:
        A status message for the UI. On success the module-level ``index``,
        ``chunks`` and ``metas`` are replaced with the newly built store.
    """
    global index, chunks, metas
    # Clicking "Build" with no upload passes None (or an empty list); report it
    # instead of failing while iterating or building an empty index.
    if not files:
        return "⚠️ Please upload at least one document first."
    file_pairs = []
    for f in files:
        with open(f, "rb") as fh:   # Use file path
            file_pairs.append((os.path.basename(f), fh.read()))
    index, chunks, metas = build_store(file_pairs)
    return "✅ Knowledge base built!"


def ui_ask(q):
    """Gradio callback: answer a question and list the supporting excerpts.

    Args:
        q: The question text from the UI.

    Returns:
        An ``(answer, rows)`` tuple. Each row is
        ``[source, page or "-", score rounded to 3 decimals, first 120 characters
        of the chunk]`` for one retrieved chunk.
    """
    # A blank question has nothing to retrieve for; ask the user for input
    # rather than embedding an empty string and generating an answer from it.
    if not q or not q.strip():
        return "⚠️ Please enter a question.", []
    ans, retrieved = answer_question(q, index, chunks, metas)
    rows = [
        [
            r["meta"].get("source"),
            r["meta"].get("page", "-"),
            round(r["score"], 3),
            r["chunk"][:120],
        ]
        for r in retrieved
    ]
    return ans, rows


# Gradio interface setup
This cell builds the web UI using Gradio components and connects the callbacks.


In [10]:
# Declare the Gradio interface; components are created in the order written here.
with gr.Blocks() as demo:
    gr.Markdown("# 📘 GIKI RAG Chatbot (Gradio)")

    # Ingestion section: upload documents, trigger the build, show its status.
    files = gr.File(file_types=[".pdf",".docx",".txt"], file_count="multiple", label="Upload Docs")
    build_btn = gr.Button("🔧 Build Knowledge Base")
    build_status = gr.Textbox(label="Status")

    # Question section: the query box, the answer, and a table of retrieved excerpts
    # whose columns match the rows produced by ui_ask().
    query = gr.Textbox(label="Ask a Question")
    ask_btn = gr.Button("🤖 Ask")
    answer = gr.Textbox(label="Answer")
    sources = gr.Dataframe(headers=["Source","Page","Score","Excerpt"], row_count=3)

    # Wire the buttons to their callbacks.
    build_btn.click(ui_build, inputs=[files], outputs=[build_status])
    ask_btn.click(ui_ask, inputs=[query], outputs=[answer, sources])


# Launch the app
Run this last cell to launch the interactive Gradio app.


In [11]:
# Start the Gradio app. Per the recorded output, on a hosted notebook Gradio sets
# share=True automatically, and debug=True can be passed to surface errors in Colab.
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3192e284130807b715.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


