In [1]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [2]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.2


In [None]:
from sentence_transformers import SentenceTransformer, util
import fitz  # PyMuPDF for PDFs
import numpy as np
import re
import requests

import os  # kept next to the credential lookup so this cell stays self-contained

# Initialize Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# NVIDIA API details
NVIDIA_API_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
# Never embed a real key in the notebook: it ends up in version control and in
# every shared copy. The key is read from the NVIDIA_API_KEY environment
# variable instead; export it before starting the kernel. If the variable is
# unset, API_KEY is an empty string.
# NOTE(review): the key that used to be hardcoded here has been exposed and
# should be revoked.
API_KEY = os.environ.get("NVIDIA_API_KEY", "")

# Function to load and parse Q&A from PDF
def extract_qa_from_pdf(pdf_path):
    """Extract question/answer pairs from the text of a PDF.

    The text of every page is joined with single spaces and scanned for
    fragments of the form ``"question": "...", "answer": "..."``.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts in the order the
        matches occur, with surrounding whitespace stripped. The list is empty
        when nothing matches.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction raises (previously the document was never closed).
    with fitz.open(pdf_path) as doc:
        text = " ".join(page.get_text() for page in doc)

    # Extract structured Q&A pairs using regex.
    # The answer capture is non-greedy and stops at the first double quote, so
    # an answer that itself contains a double quote is cut short.
    qa_pattern = r'\"question\":\s*\"(.*?)\",\s*\"answer\":\s*\"(.*?)\"'
    return [
        {"question": question.strip(), "answer": answer.strip()}
        for question, answer in re.findall(qa_pattern, text, re.DOTALL)
    ]

# Retrieve best-matching answer and expand it contextually
def get_best_answer(user_question, qa_pairs, threshold=0.5):
    """Return the stored answer whose question is most similar to the user's.

    Questions are embedded with the module-level ``model`` and compared by
    cosine similarity.

    Args:
        user_question: Question text entered by the user.
        qa_pairs: List of dicts with "question" and "answer" keys.
        threshold: Similarity the best match must exceed to be accepted.

    Returns:
        The matching answer, or None when ``qa_pairs`` is empty or the best
        similarity does not exceed ``threshold``.
    """
    # With nothing to compare against, argmax over an empty score row would
    # raise; treat it as "no match" instead.
    if not qa_pairs:
        return None

    questions = [qa["question"] for qa in qa_pairs]
    question_embeddings = model.encode(questions, convert_to_tensor=True)
    user_embedding = model.encode(user_question, convert_to_tensor=True)

    # Compute similarity scores; row 0 holds the scores for the single user question.
    scores = util.cos_sim(user_embedding, question_embeddings).cpu().numpy()[0]
    best_match_idx = int(np.argmax(scores))

    # Only accept the match when it clears the similarity threshold.
    if scores[best_match_idx] > threshold:
        return qa_pairs[best_match_idx]["answer"]
    return None  # No sufficiently similar question found

# Generate elaborated answer by providing the original answer as context with logical reasoning
def generate_elaborated_answer(question, short_answer, timeout=60):
    """Ask the chat-completions endpoint to expand a short answer.

    Args:
        question: The user's question, inserted into the prompt.
        short_answer: The retrieved answer, inserted into the prompt as context.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The stripped text of the first choice on success. On any failure
        (network error, timeout, non-200 status, non-JSON body, or a response
        without usable choices) a fixed apology message is returned instead of
        raising.
    """
    fallback = "I'm unable to find an answer. Please try again later with a different question."
    headers = {"Authorization": f"Bearer {API_KEY}"}

    # Modify the prompt to provide a detailed, case study-based response with logical reasoning
    prompt = f"""
    The question below involves a case study or logical reasoning. Provide a detailed and thoughtful response that addresses the question comprehensively, including real-world examples or theoretical scenarios. The answer should explain the reasoning behind the conclusion and end with a clear, actionable insight. Do not mention stages or steps explicitly.

    Question: {question}
    Answer: {short_answer}
    """

    payload = {
        "model": "microsoft/phi-3-mini-128k-instruct",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "top_p": 0.9,
        "max_tokens": 1500,  # Increased token limit for longer responses
    }

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.post(NVIDIA_API_URL, json=payload, headers=headers, timeout=timeout)
        # Error responses are not guaranteed to carry a JSON body.
        response_data = response.json()
    except (requests.RequestException, ValueError):
        return fallback

    if response.status_code != 200 or not isinstance(response_data, dict):
        return fallback

    choices = response_data.get("choices")
    if not choices:
        return fallback

    try:
        return choices[0]["message"]["content"].strip()
    except (KeyError, IndexError, TypeError, AttributeError):
        # Unexpected response shape; degrade to the same user-facing message.
        return fallback

# Main function with a limit of 5 questions
def main(pdf_path, max_questions=5):
    """Run an interactive console Q&A session over the pairs found in a PDF.

    Args:
        pdf_path: Path of the PDF handed to ``extract_qa_from_pdf``.
        max_questions: Maximum number of questions accepted before the
            session ends.

    The user may end the session early by typing "exit" (case-insensitive,
    surrounding whitespace ignored). Every other entry counts towards the
    limit, whether or not a matching answer is found.
    """
    qa_pairs = extract_qa_from_pdf(pdf_path)

    for asked in range(max_questions):
        user_question = input(f"({asked + 1}/{max_questions}) Enter your question (or type 'exit' to quit): ")
        if user_question.strip().lower() == "exit":
            print("\n👋 Exiting...")
            break

        answer = get_best_answer(user_question, qa_pairs)

        if answer:
            # Generate a detailed and expanded version of the answer with case study and logical reasoning
            elaborated_answer = generate_elaborated_answer(user_question, answer)

            # Ensure the response ends properly
            if elaborated_answer and not elaborated_answer.endswith(('.', '!', '?')):
                elaborated_answer += "."

            print(f"\n✅ Elaborated Answer: {elaborated_answer}\n")
        else:
            # No fallback generation happens here, so the message must not claim it does.
            print("\n❌ Exact match not found.")
            print("\n🤖 No answer available for this question.\n")
    else:
        # Reached only when the loop was not left via "exit".
        print(f"\n🎯 You have reached the limit of {max_questions} questions. Exiting...")

# Location of the Q&A PDF. NOTE(review): the "/content" prefix looks like a
# Colab upload directory - confirm the file is present before running.
pdf_path = "/content/Campus_Pal Content.pdf"
# Start the interactive session; this cell blocks on input() until it ends.
main(pdf_path)

(1/5) Enter your question (or type 'exit' to quit): If a student skilled in Python, Java, and data analysis is looking to  specialize, which programs at Amrita Vishwa Vidyapeetham would you  recommend, and why? 

✅ Elaborated Answer: Given the student's proficiency in Python, Java, and data analysis, I would recommend the following programs at Amrita Vishwa Vidyapeetham:

1. Master of Science in Data Science: This program is designed to provide students with a strong foundation in data science, including data mining, machine learning, and big data analytics. The student's existing skills in Python and data analysis would be a significant advantage in this program, as Python is a widely used language in data science, and the program covers various data analysis techniques.

2. Master of Science in Artificial Intelligence: This program focuses on the development of intelligent systems and algorithms, including machine learning, deep learning, and natural language processing. The student'

In [6]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.13.2-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.6.0 (from gradio)
  Downloading gradio_client-1.6.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.9.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [10]:
import gradio as gr
from sentence_transformers import SentenceTransformer, util
import fitz  # PyMuPDF for PDFs
import numpy as np
import re
import requests

import os  # kept next to the credential lookup so this cell stays self-contained

# Initialize Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# NVIDIA API details
NVIDIA_API_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
# Never embed a real key in the notebook: it ends up in version control and in
# every shared copy. The key is read from the NVIDIA_API_KEY environment
# variable instead; export it before starting the kernel. If the variable is
# unset, API_KEY is an empty string.
# NOTE(review): the key that used to be hardcoded here has been exposed and
# should be revoked.
API_KEY = os.environ.get("NVIDIA_API_KEY", "")

# Function to load and parse Q&A from PDF
def extract_qa_from_pdf(pdf_path):
    """Extract question/answer pairs from the text of a PDF.

    The text of every page is joined with single spaces and scanned for
    fragments of the form ``"question": "...", "answer": "..."``.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts in the order the
        matches occur, with surrounding whitespace stripped. The list is empty
        when nothing matches.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction raises (previously the document was never closed).
    with fitz.open(pdf_path) as doc:
        text = " ".join(page.get_text() for page in doc)

    # Extract structured Q&A pairs using regex.
    # The answer capture is non-greedy and stops at the first double quote, so
    # an answer that itself contains a double quote is cut short.
    qa_pattern = r'\"question\":\s*\"(.*?)\",\s*\"answer\":\s*\"(.*?)\"'
    return [
        {"question": question.strip(), "answer": answer.strip()}
        for question, answer in re.findall(qa_pattern, text, re.DOTALL)
    ]

# Retrieve best-matching answer and expand it contextually
def get_best_answer(user_question, qa_pairs, threshold=0.5):
    """Return the stored answer whose question is most similar to the user's.

    Questions are embedded with the module-level ``model`` and compared by
    cosine similarity.

    Args:
        user_question: Question text entered by the user.
        qa_pairs: List of dicts with "question" and "answer" keys.
        threshold: Similarity the best match must exceed to be accepted.

    Returns:
        The matching answer, or None when ``qa_pairs`` is empty or the best
        similarity does not exceed ``threshold``.
    """
    # With nothing to compare against, argmax over an empty score row would
    # raise; treat it as "no match" instead.
    if not qa_pairs:
        return None

    questions = [qa["question"] for qa in qa_pairs]
    question_embeddings = model.encode(questions, convert_to_tensor=True)
    user_embedding = model.encode(user_question, convert_to_tensor=True)

    # Compute similarity scores; row 0 holds the scores for the single user question.
    scores = util.cos_sim(user_embedding, question_embeddings).cpu().numpy()[0]
    best_match_idx = int(np.argmax(scores))

    # Only accept the match when it clears the similarity threshold.
    if scores[best_match_idx] > threshold:
        return qa_pairs[best_match_idx]["answer"]
    return None  # No sufficiently similar question found

# Generate elaborated answer by providing the original answer as context with logical reasoning
def generate_elaborated_answer(question, short_answer, timeout=60):
    """Ask the chat-completions endpoint to expand a short answer.

    Args:
        question: The user's question, inserted into the prompt.
        short_answer: The retrieved answer, inserted into the prompt as context.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The stripped text of the first choice on success. On any failure
        (network error, timeout, non-200 status, non-JSON body, or a response
        without usable choices) a fixed apology message is returned instead of
        raising.
    """
    fallback = "I'm unable to find an answer. Please try again later with a different question."
    headers = {"Authorization": f"Bearer {API_KEY}"}

    # Modify the prompt to provide a detailed, case study-based response with logical reasoning
    prompt = f"""
    The question below involves a case study or logical reasoning. Provide a detailed and thoughtful response that addresses the question comprehensively, including real-world examples or theoretical scenarios. The answer should explain the reasoning behind the conclusion and end with a clear, actionable insight. Do not mention stages or steps explicitly.

    Question: {question}
    Answer: {short_answer}
    """

    payload = {
        "model": "microsoft/phi-3-mini-128k-instruct",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "top_p": 0.9,
        "max_tokens": 1500,  # Increased token limit for longer responses
    }

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.post(NVIDIA_API_URL, json=payload, headers=headers, timeout=timeout)
        # Error responses are not guaranteed to carry a JSON body.
        response_data = response.json()
    except (requests.RequestException, ValueError):
        return fallback

    if response.status_code != 200 or not isinstance(response_data, dict):
        return fallback

    choices = response_data.get("choices")
    if not choices:
        return fallback

    try:
        return choices[0]["message"]["content"].strip()
    except (KeyError, IndexError, TypeError, AttributeError):
        # Unexpected response shape; degrade to the same user-facing message.
        return fallback

# Gradio interface
def qa_interface(pdf, question):
    """Answer one question against an uploaded PDF; used as the Submit callback.

    Args:
        pdf: The uploaded file as delivered by the file component: either a
            path string or an object whose ``name`` attribute is the path.
            None when nothing has been uploaded.
        question: The text typed into the question box.

    Returns:
        A display string: the elaborated stored answer when a similar question
        exists in the PDF, otherwise a model-generated answer. A short prompt
        is returned instead when the PDF or the question is missing.
    """
    # Submit can be clicked before anything is uploaded or typed; respond with
    # guidance rather than failing on `pdf.name` or sending an empty prompt.
    if pdf is None:
        return "⚠️ Please upload a PDF before asking a question."
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    # Accept both a plain path string and a file-like object exposing `.name`.
    pdf_path = getattr(pdf, "name", pdf)
    qa_pairs = extract_qa_from_pdf(pdf_path)
    answer = get_best_answer(question, qa_pairs)

    if answer:
        elaborated_answer = generate_elaborated_answer(question, answer)
        # Ensure the response ends with sentence punctuation.
        if elaborated_answer and not elaborated_answer.endswith(('.', '!', '?')):
            elaborated_answer += "."
        return f"✅ Elaborated Answer: {elaborated_answer}"

    # No stored answer was similar enough: let the model answer from the question alone.
    ai_answer = generate_elaborated_answer(question, "No exact match found")
    return f"🤖 AI-Generated Answer: {ai_answer}"

# Create Gradio interface
# Components are created in this order inside the Blocks context: title,
# PDF upload, question box, answer box, submit button.
with gr.Blocks() as demo:
    gr.Markdown("## Amrita College: Your Interactive Guide to Campus Insights")
    # Upload widget restricted to files with a .pdf extension.
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    question_input = gr.Textbox(label="Enter your question", placeholder="Type your question here...")
    # Output box; interactive=False so the user cannot edit the answer.
    answer_output = gr.Textbox(label="Answer", lines=5, interactive=False)

    submit_button = gr.Button("Submit")

    # Clicking Submit calls qa_interface(pdf, question) and writes its return
    # value into the answer box.
    submit_button.click(qa_interface, inputs=[pdf_input, question_input], outputs=[answer_output])

# Launch the Gradio app
# share=True asks for a public share link (the recorded output shows a
# *.gradio.live URL), so the app is reachable by anyone who has that link.
demo.launch(share=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://be993c51f010b01638.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


