In [231]:
import pandas as pd
import re
import math
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import json
import numpy as np

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
FEEDBACK_FILE2 = "user_feedback2.json"


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mutua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [232]:

def normalize_query(query: str) -> str:
    # Lowercase, remove stopwords, stem
    tokens = [stemmer.stem(t) for t in query.lower().split() if t not in stop_words]
    return " ".join(tokens)

def preprocess_fuzzy(text):
    text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text.lower())
    tokens = [stemmer.stem(w) for w in text.split() if w not in stop_words]
    return tokens


In [233]:

# ----------------- BM25 Class -----------------
class BM25Fuzzy:
    def __init__(self, docs, k1=1.5, b=0.75):
        self.k1 = k1
        self.b = b
        self.docs = [preprocess_fuzzy(doc) for doc in docs]
        self.N = len(docs)
        self.avgdl = sum(len(d) for d in self.docs) / self.N
        self.doc_len = [len(d) for d in self.docs]
        self.index = defaultdict(dict)
        self.build_index()

        # Build vocab and document frequencies
        self.vocab = set()
        self.doc_freqs = defaultdict(int)
        for term, postings in self.index.items():
            self.vocab.add(term)
            self.doc_freqs[term] = len(postings)
    
    def build_index(self):
        for doc_id, doc in enumerate(self.docs):
            freqs = Counter(doc)
            for term, f in freqs.items():
                self.index[term][doc_id] = f

    def score_fuzzy(self, query_tokens, doc_id):
        score = 0.0
        doc_len = len(self.docs[doc_id])

        for q in query_tokens:
            # Fuzzy match: prefix overlap (or extend to Levenshtein later)
            matching_terms = [t for t in self.vocab if t.startswith(q[:3])]
            for term in matching_terms:
                f = self.index.get(term, {}).get(doc_id, 0)
                if f == 0:
                    continue
                df = self.doc_freqs[term]
                idf = math.log((self.N - df + 0.5) / (df + 0.5) + 1)

                numerator = f * (self.k1 + 1)
                denominator = f + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
                score += idf * numerator / denominator
        return score


    def search_fuzzy(self, query, top_k=10):
        # normalize query same as when indexing
        query_tokens = preprocess_fuzzy(query)
        scores = defaultdict(float)

        for token in query_tokens:
            if token not in self.index:
                continue
            idf = self.idf(token)
            postings = self.index[token]
            for doc_id, freq in postings.items():
                dl = self.doc_len[doc_id]
                score = idf * freq * (self.k1 + 1) / (freq + self.k1 * (1 - self.b + self.b * dl / self.avgdl))
                scores[doc_id] += score

        # Return top_k
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return ranked[:top_k]

    def idf(self, term):
        df = self.doc_freqs.get(term, 0)
        return np.log((self.N - df + 0.5) / (df + 0.5) + 1)


In [234]:

def get_user_feedback(results, df):
    relevant_docs = []
    print("Mark relevant documents by typing their number (space-separated, e.g., 1 3 5):")
    selected = input("Relevant docs: ").strip()
    if selected:
        try:
            indices = [int(x)-1 for x in selected.split()]
            for i in indices:
                relevant_docs.append(results[i][0])
        except:
            print("Invalid input, no docs marked as relevant.")
    return relevant_docs


In [235]:

def fuzzy_search(bm25, query, top_k=10):
    """
    Fuzzy search using startswith matching for each query token.
    Returns top_k documents with scores.
    """
    query_tokens = preprocess_fuzzy(query)
    scores = []

    for doc_id, doc_tokens in enumerate(bm25.docs):
        score = 0.0
        for qtok in query_tokens:
            # match doc terms that start with query token
            for dtok in doc_tokens:
                if dtok.startswith(qtok):
                    score += bm25.idf(dtok)
        scores.append((doc_id, score))

    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

def fuzzy_search_display(bm25, query, df, top_k=10):
    results = fuzzy_search(bm25, query, top_k=top_k)
    for rank, (doc_id, score) in enumerate(results, start=1):
        print(f"{rank}. Doc ID: {doc_id}, Score: {score:.4f}")
        snippet = " ".join(df.loc[doc_id, 'abstract_text'].split()[:30])
        print(f"   {snippet}...\n")
    return results


In [236]:

def pseudo_relevance_feedback_iterative_fuzzy(bm25, query, relevant_doc_ids, n_iterations=3, expansion_terms=5):
    current_query = query
    all_expanded_terms = []
    for i in range(n_iterations):
        term_counter = Counter()
        for doc_id in relevant_doc_ids:
            term_counter.update(bm25.docs[doc_id])

        original_tokens = preprocess_fuzzy(current_query)
        for t in original_tokens:
            if t in term_counter:
                del term_counter[t]

        top_terms = [t for t, _ in term_counter.most_common(expansion_terms)]
        if not top_terms:
            break

        all_expanded_terms.extend(top_terms)
        current_query = current_query + " " + " ".join(top_terms)

        # 🔹 Use fuzzy search here
        print("=== Index Tokens ===")
        print(list(bm25.index.keys()))
        search_results = bm25.search_fuzzy(current_query, top_k=10)
        relevant_doc_ids = [doc_id for doc_id, _ in search_results[:5]]

    return current_query, all_expanded_terms, relevant_doc_ids


In [237]:

def save_feedback_fuzzy(query, expanded_terms, relevant_docs):
    norm_query = normalize_query(query)
    try:
        with open(FEEDBACK_FILE2, "r") as f:
            feedback_data = json.load(f)
    except FileNotFoundError:
        feedback_data = {}

    # Overwrite instead of merging
    feedback_data[norm_query] = {
        "expanded_terms": expanded_terms,
        "relevant_docs": relevant_docs
    }

    with open(FEEDBACK_FILE2, "w") as f:
        json.dump(feedback_data, f, indent=2)

def query_feedback_fuzzy(query):
    """
    Returns learned expanded terms and relevant document IDs for a given query.
    Does NOT print anything; intended for programmatic access.
    """
    norm_query = normalize_query(query)  # normalize before lookup

    try:
        with open(FEEDBACK_FILE2, "r") as f:
            feedback_data = json.load(f)
    except FileNotFoundError:
        return None  # No feedback data available

    if norm_query not in feedback_data:
        return None  # No feedback for this query

    expanded_terms = feedback_data[norm_query].get("expanded_terms", [])
    relevant_docs = feedback_data[norm_query].get("relevant_docs", [])

    return {"expanded_terms": expanded_terms, "relevant_docs": relevant_docs}

In [238]:

def augment_query_with_feedback_fuzzy(query):
    try:
        with open(FEEDBACK_FILE2, "r") as f:
            feedback_data = json.load(f)
    except FileNotFoundError:
        return query
    if query in feedback_data:
        expanded_terms = feedback_data[query]["expanded_terms"]
        return query + " " + " ".join(expanded_terms)
    return query



In [239]:
# ----------------- Evaluation -----------------
def precision_at_k(relevant_docs, retrieved_docs, k=10):
    retrieved_k = [doc for doc, _ in retrieved_docs[:k]]
    return len(set(retrieved_k) & set(relevant_docs)) / k

def recall_at_k(relevant_docs, retrieved_docs, k=10):
    retrieved_k = [doc for doc, _ in retrieved_docs[:k]]
    return len(set(retrieved_k) & set(relevant_docs)) / len(relevant_docs)

def average_precision(relevant_docs, retrieved_docs, k=10):
    retrieved_k = [doc for doc, _ in retrieved_docs[:k]]
    hits = 0
    sum_prec = 0
    for i, doc_id in enumerate(retrieved_k, start=1):
        if doc_id in relevant_docs:
            hits += 1
            sum_prec += hits / i
    if hits == 0:
        return 0
    return sum_prec / hits


def evaluate_all_queries_fuzzy(test_queries, bm25, df, top_k=10):
    """
    Evaluate all queries in test_queries and compute aggregate metrics.
    
    Returns a dictionary with per-query metrics and overall mean metrics.
    """
    all_precisions = []
    all_recalls = []
    all_average_precisions = []

    per_query_metrics = {}

    for query_text, relevant_docs in test_queries.items():
        # Augment query with previous feedback if available
        query_aug = augment_query_with_feedback_fuzzy(query_text)

        # Retrieve top_k results
        results = bm25.search_fuzzy(query_aug, top_k=top_k)

        # Compute metrics for this query
        prec = precision_at_k(relevant_docs, results, k=top_k)
        rec = recall_at_k(relevant_docs, results, k=top_k)
        ap = average_precision(relevant_docs, results, k=top_k)

        per_query_metrics[query_text] = {
            'Precision@{}'.format(top_k): prec,
            'Recall@{}'.format(top_k): rec,
            'AveragePrecision': ap
        }

        all_precisions.append(prec)
        all_recalls.append(rec)
        all_average_precisions.append(ap)

    # Compute mean metrics
    mean_metrics = {
        'MeanPrecision@{}'.format(top_k): sum(all_precisions)/len(all_precisions),
        'MeanRecall@{}'.format(top_k): sum(all_recalls)/len(all_recalls),
        'MAP': sum(all_average_precisions)/len(all_average_precisions)
    }

    return per_query_metrics, mean_metrics



In [240]:
# ----------------- Main Iterative Feedback Workflow -----------------
def run_test_queries_fuzzy(df, bm25, test_queries, n_iterations=3, top_k=10, expansion_terms=5):
    for query_text, relevant_docs in test_queries.items():
        print(f"\n=== Processing Query (FUZZY): '{query_text}' ===")

        query_aug = augment_query_with_feedback_fuzzy(query_text)
        print(f"Augmented query: {query_aug}")

        results = fuzzy_search_display(bm25, query_aug, df, top_k=top_k)

        if not relevant_docs:
            relevant_docs = get_user_feedback(results, df)

        if not relevant_docs:
            print("No relevant documents marked. Skipping PRF.\n")
            continue

        final_query, learned_terms, final_relevant_docs = pseudo_relevance_feedback_iterative_fuzzy(
            bm25,
            query_aug,
            relevant_doc_ids=relevant_docs,
            n_iterations=n_iterations,
            expansion_terms=expansion_terms
        )
        print(f"Final query after {n_iterations} iterations: {final_query}")

        new_results = fuzzy_search_display(bm25, final_query, df, top_k=top_k)

        relevant_docs_filtered = [doc_id for doc_id, score in results if score > 0]

        save_feedback_fuzzy(query_text, learned_terms, relevant_docs_filtered)

    per_query_metrics, mean_metrics = evaluate_all_queries_fuzzy(test_queries, bm25, df, top_k)
    print("\n=== Evaluation Metrics for All Fuzzy Queries ===")
    print(per_query_metrics)
    print(mean_metrics)



In [241]:

def show_feedback_comparison_fuzzy(query, df, bm25, top_k=10):
    """
    Shows titles from both the original dataset (raw retrieval) and
    the learned feedback (JSON) for a given query.
    """
    feedback_info = query_feedback_fuzzy(query)
    inferred_doc_ids = feedback_info["relevant_docs"] if feedback_info else []

    # Original BM25 retrieval
    raw_results = bm25.search_fuzzy(query, top_k=top_k)
    raw_doc_ids = [doc_id for doc_id, _ in raw_results]

    print(f"\nQuery: {query}\n")

    print("Titles from JSON-inferred feedback:")
    if inferred_doc_ids:
        for doc_id in inferred_doc_ids:
            if doc_id < len(df):
                print(f"- {df.loc[doc_id, 'title']}")
    else:
        print("- None")

    print("\nTitles from original BM25 retrieval:")
    if raw_doc_ids:
        for doc_id in raw_doc_ids:
            if doc_id < len(df):
                print(f"- {df.loc[doc_id, 'title']}")
    else:
        print("- None")


In [242]:

# Sample DataFrame
data = {
    "title": [
        "AI in Healthcare",
        "Deep Learning Basics",
        "Machine Learning for AI",
        "Neural Networks in Practice",
        "Healthcare and AI Applications"
    ],
    "abstract_text": [
        "This paper discusses applications of AI in healthcare settings.",
        "Introduction to deep learning concepts and algorithms.",
        "A practical guide to machine learning techniques for AI.",
        "Detailed exploration of neural networks applied in practice.",
        "Overview of AI applications in healthcare and medicine."
    ]
}

df = pd.DataFrame(data).fillna('').reset_index(drop=True)


In [243]:

# BM25 search corpus
abstracts = df['abstract_text'].tolist()

bm25_fuzzy = BM25Fuzzy(abstracts)


In [244]:

# Sample test queries with expected doc IDs
test_queries = {
    "AI in healthcare": [0, 4],   # Docs 0 and 4 should match
    "Deep learning": [1],         # Only doc 1
    "Neural networks": [3],       # Only doc 3
    "AI for healthcare": [0, 4],  # Should still match docs 0 and 4 despite 'for' vs 'in'
}



In [245]:
run_test_queries_fuzzy(df, bm25_fuzzy, test_queries, n_iterations=3, top_k=10, expansion_terms=5)



=== Processing Query (FUZZY): 'AI in healthcare' ===
Augmented query: AI in healthcare
1. Doc ID: 0, Score: 1.4145
   This paper discusses applications of AI in healthcare settings....

2. Doc ID: 4, Score: 1.4145
   Overview of AI applications in healthcare and medicine....

3. Doc ID: 2, Score: 0.5390
   A practical guide to machine learning techniques for AI....

4. Doc ID: 1, Score: 0.0000
   Introduction to deep learning concepts and algorithms....

5. Doc ID: 3, Score: 0.0000
   Detailed exploration of neural networks applied in practice....

=== Index Tokens ===
['paper', 'discuss', 'applic', 'ai', 'healthcar', 'set', 'introduct', 'deep', 'learn', 'concept', 'algorithm', 'practic', 'guid', 'machin', 'techniqu', 'detail', 'explor', 'neural', 'network', 'appli', 'overview', 'medicin']
=== Index Tokens ===
['paper', 'discuss', 'applic', 'ai', 'healthcar', 'set', 'introduct', 'deep', 'learn', 'concept', 'algorithm', 'practic', 'guid', 'machin', 'techniqu', 'detail', 'explor', 'neur

In [None]:

show_feedback_comparison_fuzzy("Deep Learning", df, bm25_fuzzy)


Query: Learning

Titles from JSON-inferred feedback:
- None

Titles from original BM25 retrieval:
- Deep Learning Basics
- Machine Learning for AI


In [247]:
q = "AI in healthcare"
print("Query tokens:", preprocess_fuzzy(q))

Query tokens: ['ai', 'healthcar']


In [248]:
query = "Deep Learning"
feedback = query_feedback_fuzzy(query)
print("Feedback for query:", feedback)

Feedback for query: {'expanded_terms': ['introduct', 'concept', 'algorithm', 'practic', 'guid', 'machin', 'techniqu', 'ai', 'applic', 'healthcar', 'detail', 'explor', 'neural'], 'relevant_docs': [1, 2]}


In [249]:
query = "AI in Healthcare"
relevant_docs = [0, 4]  # only truly relevant docs
expanded_terms = ["ai", "healthcar"]
save_feedback_fuzzy(query, expanded_terms, relevant_docs)
feedback = query_feedback_fuzzy(query)
print(feedback)

{'expanded_terms': ['ai', 'healthcar'], 'relevant_docs': [0, 4]}
