## **1. Imports**

In [1]:
import os
import json
import nltk
import spacy
import pandas as pd
import gradio as gr
from rank_bm25 import BM25Okapi
from transformers import pipeline
from ranx import Qrels, Run, evaluate
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

nlp = spacy.load("en_core_web_sm")

### **2. Paths & Data Loading**

In [2]:
base_path = r"D:\OneDrive\Desktop\Uni\Year 3\Semester 6\Advanced Deep Learning Gen AI\Final Project"
data_path = os.path.join(base_path, "Datasets")
ai_file = os.path.join(data_path, "top_ai_questions.json")
ds_file = os.path.join(data_path, "top_datascience_questions.json")

with open(ai_file, "r") as f:
    ai_data = json.load(f)
with open(ds_file, "r") as f:
    ds_data = json.load(f)

### **3. Build DataFrame**

In [3]:
def to_df(data, domain):
    return pd.DataFrame(
        [
            {
                "id": item["question_id"],
                "title": item["title"],
                "body": item["body"],
                "link": item["link"],
                "domain": domain,
            }
            for item in data
        ]
    )


df_ai = to_df(ai_data, "AI")
df_ds = to_df(ds_data, "DataScience")
df = pd.concat([df_ai, df_ds], ignore_index=True)
df["full_text"] = df["title"] + " " + df["body"]

### **4. BM25 Search Setup**

In [4]:
def spacy_tokenize(text):
    return [token.text.lower() for token in nlp(text)]


tokenized_corpus = [spacy_tokenize(doc) for doc in df["full_text"]]
bm25 = BM25Okapi(tokenized_corpus)

In [5]:
def search_bm25(query, top_k=10):
    tok_q = spacy_tokenize(query)
    scores = bm25.get_scores(tok_q)
    indices = scores.argsort()[-top_k:][::-1]
    return df.iloc[indices][["id", "title", "link", "domain"]]

### **5. Semantic Search Setup**

In [6]:
model = SentenceTransformer("all-MiniLM-L6-v2")
corpus = df["full_text"].tolist()
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

In [7]:
def search_semantic(query, top_k=10):
    q_emb = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(q_emb, corpus_embeddings, top_k=top_k)[0]
    ids = [hit["corpus_id"] for hit in hits]
    return df.iloc[ids][["id", "title", "link", "domain"]]

### **6. Hybrid (RRF Search)**

In [8]:
def hybrid_search(query, top_k=10, rrf_k=60):
    bm25_df = search_bm25(query, top_k)
    sem_df = search_semantic(query, top_k)

    bm25_ids = bm25_df["id"].apply(lambda x: df[df["id"] == x].index[0]).tolist()
    sem_ids = sem_df["id"].apply(lambda x: df[df["id"] == x].index[0]).tolist()

    scores = {}
    for rank, idx in enumerate(bm25_ids):
        scores[idx] = scores.get(idx, 0) + 1 / (rrf_k + rank + 1)
    for rank, idx in enumerate(sem_ids):
        scores[idx] = scores.get(idx, 0) + 1 / (rrf_k + rank + 1)

    sorted_ids = sorted(scores, key=lambda i: scores[i], reverse=True)[:top_k]
    return df.iloc[sorted_ids][["id", "title", "link", "domain"]]

### **7. Build Ground Truth & Run Files**

In [9]:
ground_truth = {
    "What is the difference between AI and machine learning?": [35],
    "What is a convolutional neural network?": [5546],
    "How is overfitting handled in ML?": [61],
    "What are the main applications of reinforcement learning?": [3502],
    "What are class weights in Keras and why use them?": [13490],
}

ground_truth_str = {
    q: {str(doc_id): 1 for doc_id in docs} for q, docs in ground_truth.items()
}

sample_queries_for_eval = list(ground_truth.keys())

bm25_results_dict = {q: search_bm25(q, top_k=10) for q in sample_queries_for_eval}
semantic_results_dict = {
    q: search_semantic(q, top_k=10) for q in sample_queries_for_eval
}
hybrid_results_dict = {q: hybrid_search(q, top_k=10) for q in sample_queries_for_eval}

In [10]:
def create_run_file(results_dict):
    run_file = {}
    for query, result_df in results_dict.items():
        run_file[query] = {
            str(result_df.iloc[i]["id"]): 1 / (i + 1) for i in range(len(result_df))
        }
    return run_file

In [11]:
bm25_run_file = create_run_file(bm25_results_dict)
semantic_run_file = create_run_file(semantic_results_dict)
hybrid_run_file = create_run_file(hybrid_results_dict)

### **8. Evaluation**

In [12]:
qrels = Qrels(ground_truth_str)
bm25_run = Run(bm25_run_file)
semantic_run = Run(semantic_run_file)
hybrid_run = Run(hybrid_run_file)

bm25_metrics = evaluate(qrels, bm25_run, metrics=["map@10", "mrr@10", "ndcg@10"])
hybrid_metrics = evaluate(qrels, hybrid_run, metrics=["map@10", "mrr@10", "ndcg@10"])
semantic_metrics = evaluate(
    qrels, semantic_run, metrics=["map@10", "mrr@10", "ndcg@10"]
)

print(f"{'Metric':<12} {'BM25':>10} {'Semantic':>10} {'Hybrid':>10}")
print("-" * 46)
for metric in ["map@10", "mrr@10", "ndcg@10"]:
    bm25_val = bm25_metrics[metric]
    semantic_val = semantic_metrics[metric]
    hybrid_val = hybrid_metrics[metric]
    print(f"{metric:<12} {bm25_val:>10.4f} {semantic_val:>10.4f} {hybrid_val:>10.4f}")

Metric             BM25   Semantic     Hybrid
----------------------------------------------
map@10           0.2722     0.7333     0.7500
mrr@10           0.2722     0.7333     0.7500
ndcg@10          0.3987     0.7974     0.8123


**Interpretation**:
- **BM25** performs poorly on its own, likely due to strict lexical matching limitations.
- **Semantic search** significantly improves MAP and MRR by capturing semantic similarity via dense embeddings.
- **Hybrid search (BM25 + Semantic via RRF)** outperforms both individual methods, showing the benefit of combining lexical and semantic signals.

These results validate the use of hybrid retrieval in our QA system, especially for generating accurate and contextually grounded answers.

### **9. RAG with HuggingFace T5**

In [13]:
qa_model = pipeline("text2text-generation", model="google/flan-t5-base")

Device set to use cuda:0


In [14]:
def generate_answer_rag_hf(query, top_k=5):
    hits = hybrid_search(query, top_k=top_k)
    passages = []
    for _, row in hits.iterrows():
        title = row["title"]
        body = df[df["id"] == row["id"]]["body"].values[0]
        passages.append(f"{title}. {body}")

    numbered = "\n\n".join(f"[{i+1}] {p}" for i, p in enumerate(passages))

    prompt = (
        "Use the following passages (with bracketed labels) to answer the question. "
        "Include inline citations like [1], [2], etc.\n\n"
        f"{numbered}\n\n"
        f"Question: {query}\nAnswer:"
    )

    output = qa_model(prompt, max_new_tokens=256)[0]["generated_text"]
    return output, hits["link"].tolist()

In [15]:
ans, cites = generate_answer_rag_hf(
    "What is the difference between deep learning and AI?"
)
print(ans)
print(cites)

p>Can someone explain to me the difference between machine learning and deep learning? Is it possible to learn deep learning without knowing machine learning?/p> [2] What is the difference between self-supervised and unsupervised learning?. p> What is the difference between a href="https://ai.stackexchange.com/questions/10623/what-is-self-supervised-learning-in-machine-learning?noredirect=1&amp;lq=1">self-supervised/a> and unsupervised learning? The terms logically overlap (and maybe self-supervised learning is a subset of unsupervised learning?), but I cannot pinpoint exactly what that difference is./p> [3] Difference between machine learning and artificial intelligence. p>Is there any difference between machine learning and artificial intelligence? Or do these terms refer to the same thing?/p> [4] What does AI software look like, and how is it different from other software?. p> What does AI software look like? What is the major difference between AI software and other software?/
['ht

### **10. Gradio Interface for Live Q&A**

In [16]:
def hybrid_qa(query):
    answer, cites = generate_answer_rag_hf(query, top_k=5)
    df_cites = pd.DataFrame({"source_link": cites})
    return answer, df_cites

In [17]:
demo = gr.Interface(
    fn=hybrid_qa,
    inputs=gr.Textbox(
        lines=2,
        placeholder="Ask any AI or data science question...",
        label="Your Question",
    ),
    outputs=[
        gr.Textbox(label="Generated Answer", lines=6),
        gr.Dataframe(label="Top Retrieved Sources (Title, Link, Domain)"),
    ],
    title="💡 Hybrid Q&A with BM25 + MiniLM + FLAN-T5",
    description=(
        "This tool retrieves relevant questions from a combined AI and Data Science dataset using "
        "hybrid search (BM25 + MiniLM embeddings), then generates a cited answer using FLAN-T5. "
        "Inline citations like [1], [2] refer to the listed sources below."
    ),
    allow_flagging="never",
)

demo.launch(quiet=True)





### **11. Q&A Generation Pipeline**

In [18]:
def generate_answer_hf(query, top_k=5):
    hits = hybrid_search(query, top_k)
    context_passages = []
    for _, row in hits.iterrows():
        body = df[df["id"] == row["id"]]["body"].values[0]
        context_passages.append(f"{row['title']}. {body}")

    context = "\n\n".join(context_passages)
    prompt = (
        f"Use the context below to answer the question as accurately as possible.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\nAnswer:"
    )

    ans = qa_model(prompt, max_new_tokens=256, do_sample=False)[0]["generated_text"]
    return ans.strip(), hits["link"].tolist()

In [19]:
answer, cites = generate_answer_hf(
    "What is the difference between deep learning and AI?"
)
print("Answer:\n", answer)
print("\nSources:\n" + "\n".join(f"- {u}" for u in cites))

Answer:
 Deep learning is a software that builds a system for the other.

Sources:
- https://ai.stackexchange.com/questions/1742/what-is-the-difference-between-machine-learning-and-deep-learning
- https://ai.stackexchange.com/questions/40341/what-is-the-difference-between-self-supervised-and-unsupervised-learning
- https://datascience.stackexchange.com/questions/19077/difference-between-machine-learning-and-artificial-intelligence
- https://ai.stackexchange.com/questions/16448/what-does-ai-software-look-like-and-how-is-it-different-from-other-software
- https://ai.stackexchange.com/questions/35/what-is-the-difference-between-artificial-intelligence-and-machine-learning
