In [None]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from collections import Counter, defaultdict
import math
import os

In [None]:
# =========================
# Step 1: Load dataset
# =========================
# The dataset folder can be overridden with the IR_DATASET_DIR environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset the original location is used unchanged.
data_dir = os.environ.get("IR_DATASET_DIR", r"C:\Users\Kusum Kunwar\Desktop\IR_Dataset")
file_path = os.path.join(data_dir, "Reviews.csv")
df = pd.read_csv(file_path)

In [None]:
# Report the dataset dimensions. The multiplication sign is written as the
# escape \u00d7 so it cannot be mangled again when the file is re-encoded
# (the previous literal was the cp1252 mis-decoding of that character).
print(f"\n{df.shape[0]} rows \u00d7 {df.shape[1]} columns")

In [None]:
# Print pandas' summary of the loaded frame (columns, dtypes, non-null counts)
# as a quick sanity check before sampling.
df.info()

In [None]:
# Sample for faster processing
SAMPLE_SIZE = 10000  # upper bound on the number of reviews kept
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so clamp the request to the frame length.
# The fixed random_state keeps the sample reproducible; reset_index makes the
# row labels run 0..n-1 so they coincide with row positions.
df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)
print("Total sampled reviews:", df.shape[0])


In [None]:
# =========================
# Step 2: Preprocessing
# =========================
# English stopwords from NLTK, kept in a set because preprocess() tests
# membership once per token.
# NOTE(review): this needs the NLTK "stopwords" corpus to be installed locally;
# no nltk.download(...) call is visible in this notebook — confirm the setup step.
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance reused by preprocess() for every token.
lemmatizer = WordNetLemmatizer()
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every preprocess() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Lowercase, remove punctuation/numbers, tokenize, remove stopwords, lemmatize.

    Args:
        text: Raw text. Non-string values are converted with ``str``; missing
            values (``None`` or a float NaN, as pandas uses for empty cells)
            are treated as empty text.

    Returns:
        list[str]: Lemmatized tokens in their original order, with stopwords
        removed. Empty list for missing input.
    """
    # Without this guard a missing value would be stringified and indexed as
    # the literal token "none" / "nan".
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []

    text = str(text).lower()
    # Punctuation is deleted (not replaced by a space), so "well-made" becomes
    # "wellmade" and "don't" becomes "dont".
    text = text.translate(_PUNCTUATION_TABLE)
    # str.isdigit also matches non-ASCII digits, so this is kept as a
    # character filter rather than folded into the ASCII table above.
    text = ''.join([c for c in text if not c.isdigit()])
    tokens = word_tokenize(text)
    # Stopwords are filtered on the surface form, before lemmatization.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens
    
# Tokenize every review in row order: docs_tokenized[i] holds the tokens
# produced by preprocess() for the i-th row of df['Text'].
docs_tokenized = [preprocess(review_text) for review_text in df['Text']]

print("Preprocessing complete!")

In [None]:
# =========================
# Step 3: BM25 Indexing
# =========================

# Build the BM25 (Okapi variant) index over the tokenized reviews.
# The position of a document in docs_tokenized is what the later cells use
# as its document ID (they index docs_tokenized and df with argsort results).
bm25 = BM25Okapi(docs_tokenized)

print("BM25 Indexing complete!")

In [None]:
# =========================
# Step 4: RM3-style Query Expansion
# =========================
def expand_query_rm3(query, top_docs=5, expansion_terms=5):
    """
    Expand a query using RM3-style pseudo-relevance feedback.

    Procedure:
    - Preprocess the query and score every document with BM25
    - Take the top_docs highest-scoring documents as feedback documents,
      ignoring documents whose score is not positive (they share nothing
      with the query, so their vocabulary would only add noise)
    - Count term occurrences in those documents (excluding stopwords and
      terms already in the query)
    - Append the expansion_terms most frequent terms to the query

    Terms are appended unweighted; no interpolation between the original
    query model and the feedback model is performed.

    Args:
        query: Raw query string.
        top_docs: Maximum number of feedback documents to use.
        expansion_terms: Maximum number of terms to add.

    Returns:
        list[str]: Unique tokens, original query tokens first followed by the
        expansion terms. Empty list when the query has no usable tokens.
    """
    original_tokens = preprocess(query)
    if not original_tokens:
        # Nothing to score against: every document would tie at 0 and the
        # "top" documents would be arbitrary.
        return []

    scores = bm25.get_scores(original_tokens)
    top_indices = np.argsort(scores)[::-1][:top_docs]

    original_set = set(original_tokens)  # O(1) membership in the inner loop
    candidate_terms = Counter()
    for idx in top_indices:
        if scores[idx] <= 0:
            # Indices are in descending score order, so every remaining
            # document is no better than this one.
            break
        for term in docs_tokenized[idx]:
            if term not in stop_words and term not in original_set:
                candidate_terms[term] += 1

    expansion_tokens = [term for term, _ in candidate_terms.most_common(expansion_terms)]
    expanded_query = original_tokens + expansion_tokens
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(expanded_query))


In [None]:
# =========================
# Step 5: Load queries from file
# =========================
# The folder can be overridden with the IR_DATASET_DIR environment variable;
# when it is unset the original location is used unchanged.
queries_dir = os.environ.get("IR_DATASET_DIR", r"C:\Users\Kusum Kunwar\Desktop\IR_Dataset")
queries_path = os.path.join(queries_dir, "Queries.txt")
# utf-8-sig decodes plain UTF-8 identically but also drops a leading byte-order
# mark, which would otherwise stay glued to the first query's first word.
with open(queries_path, 'r', encoding='utf-8-sig') as f:
    # One query per line; blank lines are skipped.
    queries = [line.strip() for line in f if line.strip() != ""]

print(f"Loaded {len(queries)} queries from file.")

In [None]:
# =========================
# Step 6: Search function
# =========================
def search_bm25(query, top_n=5, top_docs=5, expansion_terms=5):
    """Run an expanded-query BM25 search and display the top results.

    The query is expanded with expand_query_rm3, every document is scored
    against the expanded token list, and the top_n highest-scoring documents
    are shown as a table (rank, document position, score, text preview).

    Args:
        query: Raw query string.
        top_n: Number of results to display.
        top_docs: Feedback documents passed to expand_query_rm3.
        expansion_terms: Expansion terms passed to expand_query_rm3.

    Returns:
        None. The original and expanded queries are printed and the results
        table is rendered with ``display``.
    """
    expanded_query = expand_query_rm3(query, top_docs=top_docs, expansion_terms=expansion_terms)

    print("Original query:", query)
    print("Expanded query terms:", expanded_query)

    scores = bm25.get_scores(expanded_query)
    top_indices = np.argsort(scores)[::-1][:top_n]

    results = []
    for rank, idx in enumerate(top_indices, start=1):
        # idx is a position in the BM25 corpus, so look the row up by position
        # (iloc) rather than by label; str() mirrors preprocess(), which also
        # tolerates non-string cells such as NaN.
        text_preview = str(df['Text'].iloc[idx])
        # Truncate text for display
        if len(text_preview) > 100:
            text_preview = text_preview[:100] + "..."
        results.append({
            "Rank": rank,
            "Doc ID": idx,
            "BM25 Score": round(scores[idx], 4),
            "Text": text_preview
        })

    results_df = pd.DataFrame(results)
    display(results_df)


In [None]:
# =========================
# Run retrieval for all queries
# =========================
# For each loaded query, print a separator and its 1-based number, then show
# the top-10 results of the expanded BM25 search.
for i, q in enumerate(queries, start=1):
    print("\n==============================", f"Query {i}", sep="\n")
    search_bm25(q, top_n=10)


In [None]:
# ---------------------------
# Metrics
# ---------------------------
def precision_at_k(retrieved, relevant, k):
    """Precision@k: share of the first k retrieved documents that are relevant.

    Args:
        retrieved: Ranked sequence of document IDs, best first.
        relevant: Container of relevant document IDs (supports ``in``).
        k: Rank cutoff. The denominator is always k, even when fewer than k
            documents were retrieved.

    Returns:
        float: Value in [0, 1]; 0.0 when k is not positive.
    """
    if k <= 0:
        # Avoids ZeroDivisionError for k == 0 and a meaningless negative
        # value for k < 0 (where the slice would drop items from the end).
        return 0.0
    retrieved_k = retrieved[:k]
    return sum(1 for d in retrieved_k if d in relevant) / k

def recall_at_k(retrieved, relevant, k):
    """Recall@k: share of all relevant documents found in the first k results.

    Returns 0 when there are no relevant documents at all.
    """
    top_k = retrieved[:k]
    total_relevant = len(relevant)
    if total_relevant == 0:
        return 0
    found = len([doc_id for doc_id in top_k if doc_id in relevant])
    return found / total_relevant

def average_precision(retrieved, relevant):
    """Average precision of a ranked list.

    Sums precision at every rank where a relevant document appears and divides
    by the total number of relevant documents (not only those retrieved).
    Returns 0 when there are no relevant documents.
    """
    n_relevant = len(relevant)
    if n_relevant == 0:
        return 0

    precision_sum = 0.0
    found = 0
    for position, doc_id in enumerate(retrieved, start=1):
        if doc_id not in relevant:
            continue
        found += 1
        # precision at this rank = relevant seen so far / rank
        precision_sum += found / position
    return precision_sum / n_relevant

def dcg_at_k(retrieved, relevant, k):
    """Discounted cumulative gain over the first k results with binary gains.

    A relevant document at 1-based rank i contributes 1 / log2(i + 1);
    non-relevant documents contribute nothing.
    """
    gain_total = 0.0
    for position, doc_id in enumerate(retrieved[:k], start=1):
        if doc_id in relevant:
            gain_total += 1 / math.log2(position + 1)
    return gain_total

def ndcg_at_k(retrieved, relevant, k):
    """Normalized DCG@k with binary relevance.

    The ideal ranking places min(len(relevant), k) relevant documents at the
    top; the DCG of the actual ranking is divided by the DCG of that ideal.
    Returns 0 when the ideal DCG is 0 (no relevant documents, or k <= 0).
    """
    n_ideal_hits = min(len(relevant), k)
    ideal_dcg = sum(1 / math.log2(position + 1) for position in range(1, n_ideal_hits + 1))
    if ideal_dcg == 0:
        return 0
    actual_dcg = dcg_at_k(retrieved, relevant, k)
    return actual_dcg / ideal_dcg


In [None]:
# ---------------------------
# Relevance (simple heuristic)
# ---------------------------
def get_relevant_docs(query):
    """Heuristic relevance judgments: documents containing every query token.

    Args:
        query: Raw query string; it is normalized with preprocess().

    Returns:
        set[int]: Positions in docs_tokenized of the documents that contain
        all preprocessed query tokens. Empty set when the query yields no
        tokens (e.g. it consists only of stopwords).
    """
    q_tokens = set(preprocess(query))
    if not q_tokens:
        # all() over an empty token list is vacuously True, which would mark
        # every document in the corpus as relevant.
        return set()
    return {i for i, doc in enumerate(docs_tokenized) if q_tokens.issubset(doc)}

In [None]:
# ---------------------------
# Run evaluation for each query
# ---------------------------
K = 10                # rank cutoff for precision / recall / nDCG
RETRIEVAL_DEPTH = 50  # ranked documents kept per run; AP is computed over these


def _rank_documents(query_tokens, depth=RETRIEVAL_DEPTH):
    """Return the positions of the `depth` highest-scoring documents, best first."""
    token_scores = bm25.get_scores(query_tokens)
    return list(np.argsort(token_scores)[::-1][:depth])


def _ranking_metrics(retrieved, relevant, k, suffix=""):
    """Compute P@k, R@k, AP and nDCG@k for one ranking; `suffix` tags the column names."""
    return {
        f"precision@{k}{suffix}": precision_at_k(retrieved, relevant, k),
        f"recall@{k}{suffix}": recall_at_k(retrieved, relevant, k),
        f"AP{suffix}": average_precision(retrieved, relevant),
        f"nDCG@{k}{suffix}": ndcg_at_k(retrieved, relevant, k),
    }


results = []

for q in queries:
    # Heuristic relevance judgments shared by both runs
    relevant = get_relevant_docs(q)

    # Baseline: BM25 retrieval with the plain query
    q_tokens = preprocess(q)
    retrieved = _rank_documents(q_tokens)

    # RM3 expansion, then BM25 retrieval with the expanded query
    expanded = expand_query_rm3(q)
    retrieved_exp = _rank_documents(expanded)

    # One row per query: baseline metrics first, then the "_expanded" ones
    row = {"query": q}
    row.update(_ranking_metrics(retrieved, relevant, K))
    row.update(_ranking_metrics(retrieved_exp, relevant, K, suffix="_expanded"))
    results.append(row)

results_df = pd.DataFrame(results)
results_df = results_df.round(5)

pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 200)
pd.set_option('display.colheader_justify', 'center')
print("\n==================== Evaluation Results Table ====================\n")
if results_df.empty:
    # With no queries the frame has no columns, so the summary below would
    # fail with KeyError.
    print("No queries to evaluate.")
else:
    print(results_df.to_string(index=False))

    print("\n======================== Summary Metrics ========================\n")
    # Labels are derived from K so they cannot drift from the cutoff actually
    # used; they are left-padded to a common width to keep values aligned.
    # The means are taken over the rounded per-query values in results_df.
    summary_rows = [
        ("MAP:", "AP"),
        ("MAP Expanded (RM3):", "AP_expanded"),
        (f"Mean nDCG@{K}:", f"nDCG@{K}"),
        (f"Mean nDCG@{K} Expanded:", f"nDCG@{K}_expanded"),
    ]
    for label, column in summary_rows:
        print(f"{label:<26}{results_df[column].mean():.4f}")
