<a href="https://colab.research.google.com/github/Kaviyarasi-Sasiperumal/AI_-Based_-Document-_Search_-and-_Knowledge-_Retrieval_-with-_Conversational_Interface/blob/main/Milestone_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install -q sentence-transformers faiss-cpu transformers gradio pypdf

In [16]:
import warnings
warnings.filterwarnings("ignore")

In [17]:
from datetime import datetime
import numpy as np
import faiss
import gradio as gr

from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline

In [18]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

llm = pipeline(
    "text-generation",
    model="google/flan-t5-base",
    max_new_tokens=150
)

Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'ApertusForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'BltForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'DogeForCausalLM', 'Dots1ForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'Ernie4_5ForCausalLM', 'Ernie4_5_MoeForCausalLM', 'Exaone4ForCausalLM', 'FalconForCausalLM', 'FalconH1ForCausalLM', 'FalconMambaForCausalLM', 'FlexOlmoF

In [19]:
system_logs = []
uploaded_docs = []

documents = []
embeddings = []

DIM = 384
index = faiss.IndexFlatL2(DIM)

In [20]:
def log(msg):
    t = datetime.now().strftime("%H:%M:%S")
    system_logs.append(f"[{t}] {msg}")
    return "\n".join(system_logs)

In [21]:
def extract_text(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        if page.extract_text():
            text += page.extract_text() + "\n"
    return text

In [22]:
def chunk_text(text, size=1000):
    return [text[i:i+size] for i in range(0, len(text), size)]

In [23]:
def ingest_document(file):
    uploaded_docs.append(file.name)
    log(f"Ingesting {file.name}")

    text = extract_text(file.name)
    log("Text extracted")

    chunks = chunk_text(text)
    log(f"{len(chunks)} chunks created")

    vecs = embedding_model.encode(chunks)
    log("Embeddings generated")

    for c, v in zip(chunks, vecs):
        documents.append(c)
        embeddings.append(v)

    index.add(np.array(embeddings).astype("float32"))
    log("Stored in vector index")

    return "\n".join(system_logs), "\n".join(uploaded_docs)

In [24]:
def retrieve(query, k=6):
    q_vec = embedding_model.encode([query]).astype("float32")
    _, ids = index.search(q_vec, k)
    return [documents[i] for i in ids[0] if i < len(documents)]

In [25]:
def chatbot(query):
    chunks = retrieve(query, k=6)

    if not chunks:
        return "No answer found in documents.", ""

    context = "\n".join(chunks)

    prompt = f"""
You are a study assistant.
Read the entire context carefully and explain the concept completely.

Context:
{context}

Question:
{query}

Give a clear, complete, and meaningful explanation:
"""
    result = llm(prompt)[0]["generated_text"]

    return result.strip(), "\n\n".join(chunks)

In [27]:
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# ðŸ“„ Document Chatbot â€“ Milestone 1")
    gr.Markdown("Document ingestion, indexing, and basic chatbot")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload PDF")
            upload_btn = gr.Button("Upload & Index")

        with gr.Column():
            status_box = gr.Textbox(
                label="System Status",
                lines=10,
                interactive=False
            )

    doc_list = gr.Textbox(
        label="Uploaded Documents",
        lines=4,
        interactive=False
    )

    upload_btn.click(
        ingest_document,
        inputs=file_input,
        outputs=[status_box, doc_list]
    )

    gr.Markdown("---")

    question = gr.Textbox(label="Ask a Question")
    sources = gr.Textbox(label="Retrieved Answer", lines=4)

    ask_btn = gr.Button("Ask")
    ask_btn.click(chatbot, inputs=question, outputs=[answer, sources])

app.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ab1c8ddbd2a6b9b670.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


