1. METEOR for Answer Accuracy

In [None]:
import pandas as pd
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK data packages needed by the tokenizer and METEOR scorer below
for _resource in ("punkt_tab", "punkt", "wordnet"):
    nltk.download(_resource)

# Load the dataset
df = pd.read_csv("/content/questions_with_answers3.csv")

# Both answer columns are cast to str so that every row can be tokenized
reference_answers = df["Answer"].astype(str)  # Ground truth answers
generated_answers = df["ChatbotAnswer"].astype(str)  # Model's answers

# One METEOR score per row: tokenized reference vs. tokenized chatbot answer.
# meteor_score takes a list of references, hence the single-element list.
df["METEOR Score"] = [
    meteor_score([ref_tokens], gen_tokens)
    for ref_tokens, gen_tokens in zip(
        reference_answers.map(word_tokenize),
        generated_answers.map(word_tokenize),
    )
]

# Mean over all rows
average_meteor = df["METEOR Score"].mean()

# Persist the per-row scores next to the original columns
df.to_csv("BMSCE_FAQ_with_meteor_scores.csv", index=False)

print(f"Average METEOR Score: {average_meteor}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Average METEOR Score: 0.2869676738988281


2. BERTScore for Response Relevance

In [None]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
import pandas as pd
from bert_score import score

# Read the table that already contains the per-row METEOR scores
df = pd.read_csv("/content/BMSCE_FAQ_with_meteor_scores.csv")

# Cast to str so both columns can be handed to the scorer as plain text
reference_answers = df["Answer"].astype(str)  # Ground truth answers
generated_answers = df["ChatbotAnswer"].astype(str)  # Model-generated answers

# score() takes the candidates first, then the references
P, R, F1 = score(
    generated_answers.tolist(),
    reference_answers.tolist(),
    lang="en",
    verbose=True,
)

# Keep only the per-row F1 values in the table
df["BERTScore F1"] = F1.tolist()

# Mean F1 over all rows
average_bert_f1 = df["BERTScore F1"].mean()

# Persist the table with the additional column
df.to_csv("BMSCE_FAQ_with_bertscore.csv", index=False)

print(f"Average BERTScore F1: {average_bert_f1}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 5.93 seconds, 15.17 sentences/sec
Average BERTScore F1: 0.8573147800233629


3. RAGAS Answer Completeness for Query Coverage

In [None]:
from ragas.metrics import answer_completeness

def evaluate_completeness(questions, contexts, answers, references):
    """Forward the evaluation inputs to ``answer_completeness.compute``.

    Args:
        questions: Questions that were asked.
        contexts: Retrieved contexts, one entry per question.
        answers: Chatbot answers, one entry per question.
        references: Ground-truth answers, one entry per question.

    Returns:
        Whatever ``answer_completeness.compute`` returns, unchanged.

    NOTE(review): ``answer_completeness`` and its ``compute(...)`` keyword
    signature are used exactly as imported in this cell - confirm that the
    installed ragas version actually provides them.
    """
    return answer_completeness.compute(
        questions=questions,
        answers=answers,
        references=references,
        contexts=contexts,
    )

# Example usage
# NOTE(review): `chatbot`, `test_queries` and `ground_truth` are not defined in
# the visible cells of this notebook - they must exist before this cell is run.
questions = test_queries
answers = [chatbot.get_response(query) for query in test_queries]
references = [ground_truth[idx] for idx in range(len(test_queries))]
contexts = [chatbot.get_retrieved_context(query) for query in test_queries]  # If available

completeness_scores = evaluate_completeness(
    questions,
    contexts,
    answers,
    references,
)

4. p90 Response Time

In [None]:
import time
import numpy as np

def measure_response_times(chatbot, test_queries, n_runs=3):
    """Measure per-query chatbot latency and summarise its distribution.

    Every query is sent ``n_runs`` times via ``chatbot.get_response``; the
    median of those runs is used as that query's latency so a single slow
    run does not dominate. The summary statistics are then computed over
    the per-query medians.

    Args:
        chatbot: Object exposing ``get_response(query)``.
        test_queries: Iterable of queries to time.
        n_runs: Number of timed calls per query (default 3).

    Returns:
        dict with keys ``'p50'``, ``'p90'``, ``'p99'``, ``'mean'`` and
        ``'std'``, all in seconds.
    """
    timing_data = []

    for query in test_queries:
        query_times = []
        for _ in range(n_runs):  # Multiple runs for stability
            # perf_counter is monotonic and high-resolution; time.time() is a
            # wall clock that can jump (e.g. NTP adjustments) mid-measurement.
            start = time.perf_counter()
            chatbot.get_response(query)
            query_times.append(time.perf_counter() - start)

        # Use median of runs for this query
        timing_data.append(np.median(query_times))

    p50, p90, p99 = np.percentile(timing_data, [50, 90, 99])

    return {
        'p50': p50,
        'p90': p90,
        'p99': p99,
        'mean': np.mean(timing_data),
        'std': np.std(timing_data)
    }

In [None]:
!pip install -qU langchain-community tavily-python faiss-cpu
%pip install --upgrade --quiet  langchain-huggingface text-generation transformers google-search-results numexpr langchainhub sentencepiece jinja2 bitsandbytes
!pip install -qU langchain_community beautifulsoup4
!pip install opencv-python
!pip install --upgrade transformers
%pip install -qU langchain_community pymupdf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.6/91.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tavily import TavilyClient
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import WebBaseLoader,PyMuPDFLoader
import os
import re
from urllib.parse import urljoin, unquote

# Set API Keys
# Secrets must not be committed in the notebook: take them from the environment
# and only prompt (without echoing) when they are missing. Any key that was
# previously stored in this file should be revoked and re-issued.
from getpass import getpass  # only needed here, to prompt for missing keys

for _secret_name in ("TAVILY_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")

# 1. Tavily Web Search Retriever
def search_web(query):
    """Search the web with Tavily and keep only BMSCE result URLs.

    The Tavily API key is read from the ``TAVILY_API_KEY`` environment
    variable rather than being embedded in the source.

    Args:
        query: Search query string.

    Returns:
        List of result URLs containing ``"bmsce.ac.in"``, with every ``+``
        replaced by ``%20``.

    Raises:
        KeyError: If ``TAVILY_API_KEY`` is not set.
    """
    client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])
    response = client.search(query=query, search_depth="advanced", max_results=5)

    # Extract only BMSCE links
    source_links = [
        result["url"].replace("+", "%20")
        for result in response["results"]
        if "bmsce.ac.in" in result["url"]
    ]

    print(f"Filtered BMSCE Sources: {source_links}")  # Debugging output
    return source_links

# 2. Extract Syllabus PDF Links Using BeautifulSoup

def extract_years_from_url(url):
    """Return the largest starting year found in a syllabus URL.

    The URL is percent-decoded first, then scanned for ``YYYY-YY`` patterns
    (e.g. '2022-25' gives 2022). Returns 0 when no such pattern is present.
    """
    start_years = [
        int(year)
        for year in re.findall(r'(\d{4})-\d{2}', unquote(url))
    ]
    if not start_years:
        return 0
    return max(start_years)

def get_syllabus_links(urls):
    """Return the most recent UG and PG syllabus links reachable from ``urls``.

    Each URL is fetched. A URL ending in ``.pdf`` is itself treated as a
    candidate and filed under UG or PG depending on which of those strings
    its upper-cased URL contains. Any other page is parsed as HTML: inside
    every element whose class is ``"toggle active"`` and whose ``<label>``
    mentions "UG Syllabus" or "PG Syllabus", all ``<a href>`` targets become
    candidates. Candidates are ranked by the starting year parsed from the
    link by ``extract_years_from_url``.

    URLs that cannot be fetched (network error, timeout, non-200 status) are
    reported and skipped so that one bad source does not abort the scan.

    Args:
        urls: Iterable of page or PDF URLs.

    Returns:
        A list holding the newest UG link and/or the newest PG link (UG
        first), or ``None`` when no candidate was found.
    """
    syllabus_links = {"UG": [], "PG": []}  # (link, start_year) tuples per category

    for url in urls:
        try:
            # The timeout keeps an unresponsive server from hanging the run
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to load {url}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to load {url}")
            continue

        # If the URL directly points to a PDF, categorize as UG or PG
        if url.endswith(".pdf"):
            year = extract_years_from_url(url)
            if "UG" in url.upper():
                syllabus_links["UG"].append((url, year))
            elif "PG" in url.upper():
                syllabus_links["PG"].append((url, year))
            continue

        # Parse the HTML page for syllabus links
        soup = BeautifulSoup(response.text, 'html.parser')
        sections = soup.find_all(class_="toggle active")

        for section in sections:
            label = section.find('label')
            if not label:
                continue

            label_text = label.text.strip()
            if "UG Syllabus" in label_text:
                category = "UG"
            elif "PG Syllabus" in label_text:
                category = "PG"
            else:
                continue

            for link in section.find_all('a', href=True):
                # Resolve relative hrefs and escape spaces so the link is usable
                absolute_link = urljoin(url, link['href']).replace(" ", "%20")
                year = extract_years_from_url(absolute_link)
                syllabus_links[category].append((absolute_link, year))

    # Pick the candidate with the highest starting year in each category
    latest_ug = max(syllabus_links["UG"], key=lambda x: x[1], default=None)
    latest_pg = max(syllabus_links["PG"], key=lambda x: x[1], default=None)

    # Return only the latest UG and PG syllabus links
    final_links = []
    if latest_ug:
        final_links.append(latest_ug[0])
    if latest_pg:
        final_links.append(latest_pg[0])

    return final_links if final_links else None



# 2. Extract Content from ALL Web Pages
def get_web_content(urls):
    """Load the content of every URL as documents.

    URLs ending in ``.pdf`` are downloaded to a uniquely named temporary
    file, read with ``PyMuPDFLoader`` and the file is removed again even if
    loading fails. All other URLs are read with ``WebBaseLoader``. A failure
    on one URL is printed and does not stop the remaining URLs.

    Args:
        urls: List of URLs; ``None`` or an empty list yields ``[]``.

    Returns:
        List with the documents returned by the loaders, in URL order.
    """
    import tempfile  # local import: only this function needs it

    if not urls:
        return []

    docs = []
    for url in urls:
        print(f"\nExtracting content from: {url}")  # Debugging output

        try:
            # Check if URL is a PDF
            if url.endswith(".pdf"):
                print(f"Detected PDF: {url}, using PDF loader.")

                # The timeout keeps an unresponsive server from hanging the run
                response = requests.get(url, timeout=30)
                if response.status_code != 200:
                    print(f"Failed to download PDF: {url} (Status Code: {response.status_code})")
                    continue

                # A unique temp file avoids clobbering a file in the working
                # directory; delete=False lets the loader reopen it by path.
                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                    tmp.write(response.content)
                    temp_pdf_path = tmp.name

                try:
                    # Use PyMuPDFLoader to extract text
                    loader = PyMuPDFLoader(temp_pdf_path)
                    docs.extend(loader.load())
                finally:
                    # Clean up the temporary PDF even when loading raised
                    os.remove(temp_pdf_path)

            else:
                # Use WebBaseLoader for normal web pages
                loader = WebBaseLoader(url)
                docs.extend(loader.load())

        except Exception as e:
            print(f"Failed to load {url}: {e}")  # Handle errors gracefully

    return docs

# 3. Process Text (Split & Embed)
def process_documents(documents):
    """Chunk the documents and index the chunks in a FAISS vector store.

    Documents are split into chunks of 500 characters with an overlap of 50,
    embedded with the "BAAI/bge-small-en" model and stored in FAISS.

    Args:
        documents: Documents to split and index.

    Returns:
        The FAISS vector store built from the chunks.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(documents)

    embedder = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
    return FAISS.from_documents(chunks, embedder)

# 4. Set Up Hugging Face LLM
# Generation settings for the hosted model, collected in one place
_endpoint_kwargs = {
    "repo_id": "HuggingFaceH4/zephyr-7b-beta",
    "task": "text-generation",
    "max_new_tokens": 512,
    "do_sample": False,
    "repetition_penalty": 1.03,
}
llm = HuggingFaceEndpoint(**_endpoint_kwargs)

# Chat wrapper around the endpoint; this is what the rest of the pipeline calls
chat_model = ChatHuggingFace(llm=llm)

# 5. Retrieve & Query LLM with Context
def answer_query(query, vector_db):
    """Answer ``query`` using the three most similar chunks in ``vector_db``.

    The retrieved chunks are printed (first 500 characters each) for
    debugging. When nothing is retrieved, the chat model is asked directly
    and the text of its reply is returned, so that both paths hand back the
    answer text rather than a message object in one of them.

    Args:
        query: Question to answer.
        vector_db: Vector store exposing ``as_retriever``.

    Returns:
        The answer produced by the QA chain, or the chat model's reply text
        when no context was retrieved.
    """
    retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    retrieved_docs = retriever.get_relevant_documents(query)

    print("\nRetrieved Context for Query:")
    for i, doc in enumerate(retrieved_docs, 1):
        print(f"\n--- Document {i} ---\n{doc.page_content[:500]}...\n")  # Show first 500 characters

    if not retrieved_docs:
        reply = chat_model.invoke(query)
        # Unwrap the message text; fall back to the reply itself if it has none
        return getattr(reply, "content", reply)

    qa_chain = RetrievalQA.from_chain_type(llm=chat_model, retriever=retriever)
    return qa_chain.run(query)

# 6. Full RAG Pipeline Execution
def rag_chatbot(question):
    """Answer ``question`` from BMSCE web sources.

    Flow:
      1. Search the web for source URLs.
      2. If the question mentions "syllabus", return the syllabus links found
         on those sources instead of generating an answer.
      3. Otherwise load the sources, index them and answer from the index.
         When no content could be loaded, the chat model is asked directly
         and the text of its reply is returned, so callers get the answer
         text on this path too instead of a message object.

    Args:
        question: User question.

    Returns:
        The answer, or a message containing / reporting on syllabus links.
    """
    urls = search_web(question)
    print(f"Sources: {urls}")

    # If "syllabus" is in the query, extract the PDF link instead of running the RAG pipeline
    if "syllabus" in question.lower():
        syllabus_links = get_syllabus_links(urls)
        if syllabus_links:
            return f"This is the PDF link to the syllabus:\n{syllabus_links}"
        return "No syllabus links found."

    # Normal RAG pipeline if it's not a syllabus query
    documents = get_web_content(urls)
    if not documents:
        reply = chat_model.invoke(question)
        # Unwrap the message text; fall back to the reply itself if it has none
        return getattr(reply, "content", reply)

    vector_db = process_documents(documents)
    return answer_query(question, vector_db)

# Example Usage
query = "Are there any workshops conducted on resume building?"
# Scope the question to the college unless it already mentions it. The leading
# space matters: without it the suffix is glued onto the end of the question.
query = query if "bmsce" in query.lower() else query + " at BMSCE"
response = rag_chatbot(query)
print("\nChatbot Response:", response)

Filtered BMSCE Sources: ['https://bmsce.ac.in/assets/files/MonthlyReports/Monthly%20Report-October%202024.pdf', 'https://bmsce.ac.in/home/Computer-Applications-MCA-Student-Affinity-Groups']
Sources: ['https://bmsce.ac.in/assets/files/MonthlyReports/Monthly%20Report-October%202024.pdf', 'https://bmsce.ac.in/home/Computer-Applications-MCA-Student-Affinity-Groups']

Extracting content from: https://bmsce.ac.in/assets/files/MonthlyReports/Monthly%20Report-October%202024.pdf
Detected PDF: https://bmsce.ac.in/assets/files/MonthlyReports/Monthly%20Report-October%202024.pdf, using PDF loader.

Extracting content from: https://bmsce.ac.in/home/Computer-Applications-MCA-Student-Affinity-Groups

Retrieved Context for Query:

--- Document 1 ---
chapter and thirty female students 
participated in the training. 
 
 
 The Department of Computer Science & 
Engineering 
in 
association 
with 
<CodeIO/> conducted a workshop on 
‘Resume Building’ for UG students. The 
event 
was 
designed 
to 
provide 
p

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tavily import TavilyClient
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import WebBaseLoader,PyMuPDFLoader
import os
import re
from urllib.parse import urljoin, unquote
import pandas as pd
import pandas as pd
import time

# Set API Keys
# Secrets must not be committed in the notebook: take them from the environment
# and only prompt (without echoing) when they are missing. Any key that was
# previously stored in this file should be revoked and re-issued.
from getpass import getpass  # only needed here, to prompt for missing keys

for _secret_name in ("TAVILY_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")

# 1. Tavily Web Search Retriever
def search_web(query):
    """Search the web with Tavily and keep only BMSCE result URLs.

    The Tavily API key is read from the ``TAVILY_API_KEY`` environment
    variable rather than being embedded in the source.

    Args:
        query: Search query string.

    Returns:
        List of result URLs containing ``"bmsce.ac.in"``, with every ``+``
        replaced by ``%20``.

    Raises:
        KeyError: If ``TAVILY_API_KEY`` is not set.
    """
    client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])
    response = client.search(query=query, search_depth="advanced", max_results=5)

    # Extract only BMSCE links
    source_links = [
        result["url"].replace("+", "%20")
        for result in response["results"]
        if "bmsce.ac.in" in result["url"]
    ]

    print(f"Filtered BMSCE Sources: {source_links}")  # Debugging output
    return source_links

# 2. Extract Syllabus PDF Links Using BeautifulSoup

def extract_years_from_url(url):
    """Return the largest starting year found in a syllabus URL.

    The URL is percent-decoded first, then scanned for ``YYYY-YY`` patterns
    (e.g. '2022-25' gives 2022). Returns 0 when no such pattern is present.
    """
    start_years = [
        int(year)
        for year in re.findall(r'(\d{4})-\d{2}', unquote(url))
    ]
    if not start_years:
        return 0
    return max(start_years)

def get_syllabus_links(urls):
    """Return the most recent UG and PG syllabus links reachable from ``urls``.

    Each URL is fetched. A URL ending in ``.pdf`` is itself treated as a
    candidate and filed under UG or PG depending on which of those strings
    its upper-cased URL contains. Any other page is parsed as HTML: inside
    every element whose class is ``"toggle active"`` and whose ``<label>``
    mentions "UG Syllabus" or "PG Syllabus", all ``<a href>`` targets become
    candidates. Candidates are ranked by the starting year parsed from the
    link by ``extract_years_from_url``.

    URLs that cannot be fetched (network error, timeout, non-200 status) are
    reported and skipped so that one bad source does not abort the scan.

    Args:
        urls: Iterable of page or PDF URLs.

    Returns:
        A list holding the newest UG link and/or the newest PG link (UG
        first), or ``None`` when no candidate was found.
    """
    syllabus_links = {"UG": [], "PG": []}  # (link, start_year) tuples per category

    for url in urls:
        try:
            # The timeout keeps an unresponsive server from hanging the run
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to load {url}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to load {url}")
            continue

        # If the URL directly points to a PDF, categorize as UG or PG
        if url.endswith(".pdf"):
            year = extract_years_from_url(url)
            if "UG" in url.upper():
                syllabus_links["UG"].append((url, year))
            elif "PG" in url.upper():
                syllabus_links["PG"].append((url, year))
            continue

        # Parse the HTML page for syllabus links
        soup = BeautifulSoup(response.text, 'html.parser')
        sections = soup.find_all(class_="toggle active")

        for section in sections:
            label = section.find('label')
            if not label:
                continue

            label_text = label.text.strip()
            if "UG Syllabus" in label_text:
                category = "UG"
            elif "PG Syllabus" in label_text:
                category = "PG"
            else:
                continue

            for link in section.find_all('a', href=True):
                # Resolve relative hrefs and escape spaces so the link is usable
                absolute_link = urljoin(url, link['href']).replace(" ", "%20")
                year = extract_years_from_url(absolute_link)
                syllabus_links[category].append((absolute_link, year))

    # Pick the candidate with the highest starting year in each category
    latest_ug = max(syllabus_links["UG"], key=lambda x: x[1], default=None)
    latest_pg = max(syllabus_links["PG"], key=lambda x: x[1], default=None)

    # Return only the latest UG and PG syllabus links
    final_links = []
    if latest_ug:
        final_links.append(latest_ug[0])
    if latest_pg:
        final_links.append(latest_pg[0])

    return final_links if final_links else None



# 2. Extract Content from ALL Web Pages
def get_web_content(urls):
    """Load the content of every URL as documents.

    URLs ending in ``.pdf`` are downloaded to a uniquely named temporary
    file, read with ``PyMuPDFLoader`` and the file is removed again even if
    loading fails. All other URLs are read with ``WebBaseLoader``. A failure
    on one URL is printed and does not stop the remaining URLs.

    Args:
        urls: List of URLs; ``None`` or an empty list yields ``[]``.

    Returns:
        List with the documents returned by the loaders, in URL order.
    """
    import tempfile  # local import: only this function needs it

    if not urls:
        return []

    docs = []
    for url in urls:
        print(f"\nExtracting content from: {url}")  # Debugging output

        try:
            # Check if URL is a PDF
            if url.endswith(".pdf"):
                print(f"Detected PDF: {url}, using PDF loader.")

                # The timeout keeps an unresponsive server from hanging the run
                response = requests.get(url, timeout=30)
                if response.status_code != 200:
                    print(f"Failed to download PDF: {url} (Status Code: {response.status_code})")
                    continue

                # A unique temp file avoids clobbering a file in the working
                # directory; delete=False lets the loader reopen it by path.
                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                    tmp.write(response.content)
                    temp_pdf_path = tmp.name

                try:
                    # Use PyMuPDFLoader to extract text
                    loader = PyMuPDFLoader(temp_pdf_path)
                    docs.extend(loader.load())
                finally:
                    # Clean up the temporary PDF even when loading raised
                    os.remove(temp_pdf_path)

            else:
                # Use WebBaseLoader for normal web pages
                loader = WebBaseLoader(url)
                docs.extend(loader.load())

        except Exception as e:
            print(f"Failed to load {url}: {e}")  # Handle errors gracefully

    return docs

# 3. Process Text (Split & Embed)
def process_documents(documents):
    """Chunk the documents and index the chunks in a FAISS vector store.

    Documents are split into chunks of 500 characters with an overlap of 50,
    embedded with the "BAAI/bge-small-en" model and stored in FAISS.

    Args:
        documents: Documents to split and index.

    Returns:
        The FAISS vector store built from the chunks.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(documents)

    embedder = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
    return FAISS.from_documents(chunks, embedder)

# 4. Set Up Hugging Face LLM
# Generation settings for the hosted model, collected in one place
_endpoint_kwargs = {
    "repo_id": "HuggingFaceH4/zephyr-7b-beta",
    "task": "text-generation",
    "max_new_tokens": 512,
    "do_sample": False,
    "repetition_penalty": 1.03,
}
llm = HuggingFaceEndpoint(**_endpoint_kwargs)

# Chat wrapper around the endpoint; this is what the rest of the pipeline calls
chat_model = ChatHuggingFace(llm=llm)

# 5. Retrieve & Query LLM with Context
def answer_query(query, vector_db):
    """Answer ``query`` using the three most similar chunks in ``vector_db``.

    When nothing is retrieved, the chat model is asked directly and the text
    of its reply is returned, so that both paths hand back the answer text
    rather than a message object in one of them.

    Args:
        query: Question to answer.
        vector_db: Vector store exposing ``as_retriever``.

    Returns:
        The answer produced by the QA chain, or the chat model's reply text
        when no context was retrieved.
    """
    retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    retrieved_docs = retriever.get_relevant_documents(query)
    if not retrieved_docs:
        reply = chat_model.invoke(query)
        # Unwrap the message text; fall back to the reply itself if it has none
        return getattr(reply, "content", reply)

    qa_chain = RetrievalQA.from_chain_type(llm=chat_model, retriever=retriever)
    return qa_chain.run(query)

# 6. Full RAG Pipeline Execution
def rag_chatbot(question):
    """Answer ``question`` from BMSCE web sources.

    Flow:
      1. Append " at BMSCE" unless the question already mentions "bmsce".
      2. Search the web for source URLs.
      3. If the question mentions "syllabus", return the syllabus links found
         on those sources instead of generating an answer.
      4. Otherwise load the sources, index them and answer from the index.
         When no content could be loaded, the chat model is asked directly
         and the text of its reply is returned, so callers (and the CSV they
         write) get the answer text on this path too instead of a message
         object.

    Args:
        question: User question.

    Returns:
        The answer, or a message containing / reporting on syllabus links.
    """
    question = question if "bmsce" in question.lower() else question + " at BMSCE"
    urls = search_web(question)
    print(f"Sources: {urls}")

    # If "syllabus" is in the query, extract the PDF link instead of running the RAG pipeline
    if "syllabus" in question.lower():
        syllabus_links = get_syllabus_links(urls)
        if syllabus_links:
            return f"This is the PDF link to the syllabus:\n{syllabus_links}"
        return "No syllabus links found."

    # Normal RAG pipeline if it's not a syllabus query
    documents = get_web_content(urls)
    if not documents:
        reply = chat_model.invoke(question)
        # Unwrap the message text; fall back to the reply itself if it has none
        return getattr(reply, "content", reply)

    vector_db = process_documents(documents)
    return answer_query(question, vector_db)

# Load CSV file
csv_file = "/content/questions_with_answers2.csv"  # Update with your actual file path
# Single destination for incremental progress, the error-path save and the
# final result, so the file named in the closing message is the one written.
output_file = "questions_with_answers3.csv"
df = pd.read_csv(csv_file)

# Ensure there's a column named 'Question'
if "Question" not in df.columns:
    raise ValueError("CSV file must have a 'Question' column.")

# Add a 'ChatbotAnswer' column if it doesn't exist
if "ChatbotAnswer" not in df.columns:
    df["ChatbotAnswer"] = None

# An all-empty column is read back as float; make it object so text fits
df["ChatbotAnswer"] = df["ChatbotAnswer"].astype(object)

# Process each question and update the 'ChatbotAnswer' column incrementally
for i, row in df.iterrows():
    if pd.notna(row["ChatbotAnswer"]):  # Skip if already answered
        continue

    query = row["Question"]
    query = query if "bmsce" in query.lower() else query + " at BMSCE"

    try:
        # Generate answer using rag_chatbot function
        answer = rag_chatbot(query)

        # Update the 'ChatbotAnswer' column
        df.at[i, "ChatbotAnswer"] = answer

        # Save progress after each query
        df.to_csv(output_file, index=False)
        print(f"Processed {i+1}/{len(df)}: {query} ✅")

        # Optional: Add a small delay to avoid rate limiting
        time.sleep(1)

    except Exception as e:
        print(f"Error processing {query}: {e}")
        # Save progress even if an error occurs
        df.to_csv(output_file, index=False)
        break  # Stop on error (remove this line if you want to continue on errors)

print(f"Updated CSV saved as: {output_file}")

Filtered BMSCE Sources: ['https://bmsce.ac.in/home/Information-Science-and-Engineering-About', 'https://bmsce.ac.in/home/About-R-and-D', 'https://bmsce.ac.in/home/About-CIIE']
Sources: ['https://bmsce.ac.in/home/Information-Science-and-Engineering-About', 'https://bmsce.ac.in/home/About-R-and-D', 'https://bmsce.ac.in/home/About-CIIE']

Extracting content from: https://bmsce.ac.in/home/Information-Science-and-Engineering-About

Extracting content from: https://bmsce.ac.in/home/About-R-and-D

Extracting content from: https://bmsce.ac.in/home/About-CIIE
Processed 60/90: How does the ISE department at BMSCE support student involvement in research and innovation? ✅
Filtered BMSCE Sources: ['https://www.bmsce.ac.in/']
Sources: ['https://www.bmsce.ac.in/']

Extracting content from: https://www.bmsce.ac.in/
Processed 61/90: What undergraduate programs does BMSCE offer? ✅
Filtered BMSCE Sources: ['https://www.bmsce.ac.in/', 'https://bmsce.ac.in/home/About-BMSCE']
Sources: ['https://www.bmsce.ac