In [None]:
import pandas as pd
import re
import math
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import json


# Make sure the NLTK stopword corpus is available before it is loaded below.
nltk.download('stopwords')
# English stopwords as a set, used for membership tests during tokenization.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance shared by normalize_query() and preprocess().
stemmer = PorterStemmer()
# JSON file holding the learned expansion terms and relevant doc IDs per query.
FEEDBACK_FILE = "user_feedback10.json"
# NOTE(review): no function in the visible code reads this module-level dict
# (run_test_queries binds a local with the same name) - confirm it is needed.
feedback_dict = {}
###############



# ----------------- Preprocessing -----------------
def normalize_query(query: str) -> str:
    """Lower-case *query*, drop English stopwords and stem the remaining words.

    Words are obtained by whitespace splitting only, so punctuation stays
    attached to its word. The stems are returned joined by single spaces.
    """
    stems = []
    for word in query.lower().split():
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return " ".join(stems)

def preprocess(text):
    """Turn raw *text* into a list of stemmed, stopword-free tokens.

    The text is lower-cased and every character other than an ASCII letter,
    a digit or a space is replaced by a space before whitespace splitting.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9 ]', ' ', text.lower())
    stems = []
    for word in cleaned.split():
        # Stopwords are matched on the surface form, i.e. before stemming.
        if word not in stop_words:
            stems.append(stemmer.stem(word))
    return stems

def load_feedback():
    """Return the parsed contents of FEEDBACK_FILE, or ``{}`` if it is missing."""
    try:
        handle = open(FEEDBACK_FILE, "r")
    except FileNotFoundError:
        # No feedback has been stored yet.
        return {}
    with handle:
        return json.load(handle)
######################


# ----------------- BM25 Class -----------------
class BM25:
    """BM25 ranking over a small in-memory corpus.

    Documents are tokenized with ``preprocess`` at construction time and an
    inverted index ``term -> {doc_id: term frequency}`` is built once. Doc IDs
    are the positions of the documents in the order they were supplied.

    Args:
        docs: Iterable of raw document strings (does not need to be sized).
        k1: Term-frequency saturation parameter of the scoring formula.
        b: Weight of the document-length normalisation in the formula.
    """

    def __init__(self, docs, k1=1.5, b=0.75):
        self.k1 = k1
        self.b = b
        self.docs = [preprocess(doc) for doc in docs]
        # Count the tokenized documents so any iterable is accepted.
        self.N = len(self.docs)
        self.doc_len = [len(d) for d in self.docs]
        # An empty corpus has no average length; avoid dividing by zero.
        self.avgdl = sum(self.doc_len) / self.N if self.N else 0.0
        self.index = defaultdict(dict)
        self.build_index()

    def build_index(self):
        """Fill ``self.index`` with per-document term frequencies."""
        for doc_id, doc in enumerate(self.docs):
            freqs = Counter(doc)
            for term, f in freqs.items():
                self.index[term][doc_id] = f

    def idf(self, term):
        """Return ``log((N - n_t + 0.5) / (n_t + 0.5) + 1)`` for *term*.

        ``n_t`` is the number of documents containing the term; unknown terms
        have ``n_t == 0``.
        """
        # .get() is used so that looking up an unknown term does not insert it
        # into the defaultdict.
        n_t = len(self.index.get(term, {}))
        return math.log((self.N - n_t + 0.5) / (n_t + 0.5) + 1)

    def score(self, query_tokens, doc_id):
        """Return the BM25 score of document *doc_id* for *query_tokens*.

        A token that occurs several times in *query_tokens* contributes once
        per occurrence. Raises ``IndexError`` for an unknown *doc_id*.
        """
        doc_length = self.doc_len[doc_id]
        if doc_length == 0:
            # An empty document matches no term. Returning here also keeps the
            # division below safe when every document is empty (avgdl == 0).
            return 0.0
        # The length normalisation depends only on the document, not the term.
        length_norm = self.k1 * (1 - self.b + self.b * doc_length / self.avgdl)
        score = 0.0
        for term in query_tokens:
            f = self.index.get(term, {}).get(doc_id, 0)
            if f == 0:
                continue
            score += self.idf(term) * (f * (self.k1 + 1)) / (f + length_norm)
        return score

    def search(self, query, top_k=10):
        """Return the *top_k* ``(doc_id, score)`` pairs for *query*, best first.

        Every document is scored, so documents with score 0.0 can appear in
        the result. The sort is stable: ties keep ascending doc ID order.
        """
        query_tokens = preprocess(query)
        scores = [(doc_id, self.score(query_tokens, doc_id)) for doc_id in range(self.N)]
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]
    ##########################

# ----------------- Search & Feedback -----------------
def search_display(bm25, query, df, top_k=10):
    """Search *bm25* for *query*, print the ranked hits and return them.

    For each hit the rank, doc ID and score are printed, followed by the first
    30 whitespace-separated words of that row's ``abstract_text`` in *df*.
    The return value is the ``(doc_id, score)`` list from ``bm25.search``.
    """
    hits = bm25.search(query, top_k=top_k)
    for position, hit in enumerate(hits, start=1):
        doc_id, score = hit
        print(f"{position}. Doc ID: {doc_id}, Score: {score:.4f}")
        words = df.loc[doc_id, 'abstract_text'].split()
        snippet = " ".join(words[:30])
        print(f"   {snippet}...\n")
    return hits

def get_user_feedback(results, df, selection=None):
    """Return the doc IDs of the results the user marks as relevant.

    Args:
        results: Ranked ``(doc_id, score)`` pairs as displayed to the user.
        df: Not used by this function; kept so existing callers keep working.
        selection: Optional space-separated 1-based ranks, e.g. ``"1 3 5"``.
            When ``None`` (the default) the user is prompted on stdin.

    Returns:
        The selected doc IDs in the order given, without duplicates. An empty
        list is returned when nothing was selected or when any part of the
        selection is invalid (not an integer, or not a displayed rank).
    """
    if selection is None:
        print("Mark relevant documents by typing their number (space-separated, e.g., 1 3 5):")
        selection = input("Relevant docs: ")
    selection = selection.strip()
    if not selection:
        return []

    try:
        ranks = [int(token) for token in selection.split()]
    except ValueError:
        print("Invalid input, no docs marked as relevant.")
        return []

    # Validate every rank before selecting anything, so a bad entry never
    # leaves a partial selection. Ranks below 1 are rejected explicitly
    # because they would otherwise wrap around through negative indexing.
    if any(rank < 1 or rank > len(results) for rank in ranks):
        print("Invalid input, no docs marked as relevant.")
        return []

    # dict.fromkeys drops repeated picks while keeping the order entered.
    return list(dict.fromkeys(results[rank - 1][0] for rank in ranks))
############################
def pseudo_relevance_feedback_iterative(bm25, query, relevant_doc_ids, n_iterations=3,
                                        expansion_terms=5, feedback_docs=5):
    """Expand *query* with frequent terms from feedback documents, iteratively.

    Each iteration counts the tokens of the current feedback documents, drops
    terms that are already part of the query, appends the ``expansion_terms``
    most frequent remaining terms to the query, and then replaces the feedback
    set with the top ``feedback_docs`` results for the expanded query.

    Args:
        bm25: Object exposing ``docs`` (token lists indexed by doc ID) and
            ``search(query, top_k=...)`` returning ``(doc_id, score)`` pairs.
        query: Starting query string.
        relevant_doc_ids: Doc IDs used as the feedback set in the first round.
        n_iterations: Maximum number of expansion rounds.
        expansion_terms: Number of terms appended per round.
        feedback_docs: Number of top-ranked documents used as the feedback
            set for the following round.

    Returns:
        ``(final_query, all_expanded_terms, feedback_doc_ids)``. The doc IDs
        are the feedback set after the last completed round; they equal the
        input *relevant_doc_ids* only if no round added any term.
    """
    current_query = query
    all_expanded_terms = []

    # Terms that must not be proposed (again). Index terms are stems, and
    # stemming an already-stemmed term is not guaranteed to give the same
    # token back, so the query's raw tokens and every added expansion term
    # are tracked verbatim instead of re-deriving them from the query string.
    excluded = set(query.lower().split())
    excluded.update(preprocess(query))

    for _ in range(n_iterations):
        term_counter = Counter()
        for doc_id in relevant_doc_ids:
            term_counter.update(bm25.docs[doc_id])
        for term in excluded:
            term_counter.pop(term, None)

        top_terms = [term for term, _count in term_counter.most_common(expansion_terms)]
        if not top_terms:
            break

        all_expanded_terms.extend(top_terms)
        excluded.update(top_terms)
        current_query = current_query + " " + " ".join(top_terms)

        # The top-ranked documents become the (pseudo-relevant) feedback set
        # for the next round.
        search_results = bm25.search(current_query, top_k=feedback_docs)
        relevant_doc_ids = [doc_id for doc_id, _score in search_results]

    return current_query, all_expanded_terms, relevant_doc_ids

####
def save_feedback(query, expanded_terms, relevant_docs):
    """Merge feedback for *query* into FEEDBACK_FILE and rewrite the file.

    New expansion terms and relevant doc IDs are appended to whatever is
    already stored for *query*. Duplicates are removed and first-seen order
    is kept, so the file content is the same from run to run.

    Args:
        query: Query string used as the key in the feedback file.
        expanded_terms: Expansion terms learned for the query.
        relevant_docs: Doc IDs considered relevant for the query.
    """
    feedback_data = load_feedback()
    entry = feedback_data.setdefault(query, {})
    # dict.fromkeys dedupes while preserving insertion order, unlike set().
    entry["expanded_terms"] = list(
        dict.fromkeys(entry.get("expanded_terms", []) + list(expanded_terms))
    )
    entry["relevant_docs"] = list(
        dict.fromkeys(entry.get("relevant_docs", []) + list(relevant_docs))
    )
    with open(FEEDBACK_FILE, "w") as f:
        json.dump(feedback_data, f, indent=2)

def augment_query_with_feedback(query):
    """Return *query* followed by the expansion terms stored for it, if any.

    The lookup in the feedback file is an exact match on the query string.
    When the file is missing, the query has no entry, or the entry has no
    expansion terms, *query* is returned unchanged.
    """
    entry = load_feedback().get(query)
    if not entry:
        return query
    # Tolerate entries without the key, as query_feedback() does.
    expanded_terms = entry.get("expanded_terms", [])
    if not expanded_terms:
        return query
    return query + " " + " ".join(expanded_terms)
########

# ----------------- Evaluation -----------------
def precision_at_k(relevant_docs, retrieved_docs, k=10):
    """Precision@k: relevant doc IDs among the first *k* results, divided by *k*.

    *retrieved_docs* is a ranked list of ``(doc_id, score)`` pairs.
    """
    relevant = set(relevant_docs)
    top_ids = {doc_id for doc_id, _score in retrieved_docs[:k]}
    return len(top_ids & relevant) / k

def recall_at_k(relevant_docs, retrieved_docs, k=10):
    """Recall@k: share of the relevant documents found in the first *k* results.

    Args:
        relevant_docs: Doc IDs judged relevant; duplicates are counted once.
        retrieved_docs: Ranked list of ``(doc_id, score)`` pairs.
        k: Rank cutoff.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when there are no relevant documents,
        since recall is undefined in that case and nothing can be found.
    """
    relevant = set(relevant_docs)
    if not relevant:
        return 0.0
    top_ids = {doc_id for doc_id, _score in retrieved_docs[:k]}
    return len(top_ids & relevant) / len(relevant)

def average_precision(relevant_docs, retrieved_docs, k=10):
    """Average precision of a ranking, truncated at rank *k* (AP@k).

    The precision at each rank holding a relevant document is summed and
    divided by ``min(number of relevant documents, k)``, i.e. by the number
    of relevant documents that could have appeared in the top *k*. Relevant
    documents that were not retrieved therefore lower the score instead of
    being ignored.

    Args:
        relevant_docs: Doc IDs judged relevant.
        retrieved_docs: Ranked list of ``(doc_id, score)`` pairs.
        k: Rank cutoff.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when there is no relevant document,
        no hit in the top *k*, or *k* is not positive.
    """
    relevant = set(relevant_docs)
    if not relevant or k <= 0:
        return 0.0

    hits = 0
    precision_sum = 0.0
    for rank, (doc_id, _score) in enumerate(retrieved_docs[:k], start=1):
        if doc_id in relevant:
            hits += 1
            precision_sum += hits / rank

    if hits == 0:
        return 0.0
    return precision_sum / min(len(relevant), k)

def evaluate_all_queries(test_queries, bm25, df, top_k=10):
    """Evaluate every query in *test_queries* and aggregate the metrics.

    Each query is first augmented with any stored feedback, then searched
    with *bm25*, and scored against its relevant doc IDs.

    Args:
        test_queries: Mapping of query string to list of relevant doc IDs.
        bm25: Ranker exposing ``search(query, top_k=...)``.
        df: Not used by this function; kept for interface compatibility.
        top_k: Rank cutoff used for retrieval and for all metrics.

    Returns:
        ``(per_query_metrics, mean_metrics)``. The first maps each query to
        its ``Precision@k``, ``Recall@k`` and ``AveragePrecision``; the second
        holds ``MeanPrecision@k``, ``MeanRecall@k`` and ``MAP``. With no
        queries the means are reported as ``0.0``.
    """
    precision_key = f'Precision@{top_k}'
    recall_key = f'Recall@{top_k}'
    ap_key = 'AveragePrecision'

    per_query_metrics = {}
    for query_text, relevant_docs in test_queries.items():
        # Augment query with previous feedback if available
        query_aug = augment_query_with_feedback(query_text)
        results = bm25.search(query_aug, top_k=top_k)

        per_query_metrics[query_text] = {
            precision_key: precision_at_k(relevant_docs, results, k=top_k),
            recall_key: recall_at_k(relevant_docs, results, k=top_k),
            ap_key: average_precision(relevant_docs, results, k=top_k),
        }

    def _mean(metric_name):
        # Guard the empty case so an empty query set does not divide by zero.
        if not per_query_metrics:
            return 0.0
        values = [metrics[metric_name] for metrics in per_query_metrics.values()]
        return sum(values) / len(values)

    mean_metrics = {
        f'MeanPrecision@{top_k}': _mean(precision_key),
        f'MeanRecall@{top_k}': _mean(recall_key),
        'MAP': _mean(ap_key),
    }

    return per_query_metrics, mean_metrics

# ----------------- Main Iterative Feedback Workflow -----------------
def run_test_queries(df, bm25, test_queries, n_iterations=3, top_k=10, expansion_terms=5):
    """Run the search / feedback / expansion workflow for every test query.

    For each query: augment it with stored feedback, display the results,
    obtain relevant doc IDs (from *test_queries*, or interactively when the
    list is empty), run iterative query expansion, display the results for
    the expanded query and record what was learned in memory. Afterwards the
    collected feedback is written to FEEDBACK_FILE once and the evaluation
    metrics for all queries are printed.
    """
    collected_feedback = load_feedback()  # existing feedback is read once

    for query_text, known_relevant in test_queries.items():
        print(f"\n=== Processing Query: '{query_text}' ===")
        query_aug = augment_query_with_feedback(query_text)
        print(f"Augmented query: {query_aug}")

        initial_hits = search_display(bm25, query_aug, df, top_k=top_k)

        # Ask the user only when the test set supplies no relevant docs.
        relevant_ids = known_relevant or get_user_feedback(initial_hits, df)
        if not relevant_ids:
            print("No relevant documents marked. Skipping PRF.\n")
            continue

        prf_outcome = pseudo_relevance_feedback_iterative(
            bm25,
            query_aug,
            relevant_doc_ids=relevant_ids,
            n_iterations=n_iterations,
            expansion_terms=expansion_terms,
        )
        final_query, learned_terms, final_relevant_docs = prf_outcome
        print(f"Final query after {n_iterations} iterations: {final_query}")

        # Called for its printed output; the returned hits are not needed.
        search_display(bm25, final_query, df, top_k=top_k)

        # Feedback is accumulated in memory and persisted after the loop.
        update_feedback(collected_feedback, query_text, learned_terms, final_relevant_docs)

    with open(FEEDBACK_FILE, "w") as out_file:
        json.dump(collected_feedback, out_file, indent=2)

    per_query_metrics, mean_metrics = evaluate_all_queries(test_queries, bm25, df, top_k)
    report = (
        "\n=== Evaluation Metrics for All Queries ===",
        per_query_metrics,
        "\nMean metrics:",
        mean_metrics,
    )
    for item in report:
        print(item)

def update_feedback(feedback_dict, query, expanded_terms, relevant_docs):
    """Merge feedback for *query* into *feedback_dict* in place.

    New expansion terms and relevant doc IDs are appended to what is already
    stored for *query*. Duplicates are removed and first-seen order is kept,
    for new entries as well as for merges, so repeated runs produce the same
    content. The stored lists are copies; later changes to the caller's lists
    do not affect *feedback_dict*.

    Args:
        feedback_dict: Mapping of query string to an entry with the keys
            ``"expanded_terms"`` and ``"relevant_docs"``.
        query: Query string used as the key.
        expanded_terms: Expansion terms learned for the query.
        relevant_docs: Doc IDs considered relevant for the query.
    """
    entry = feedback_dict.setdefault(query, {})
    # dict.fromkeys dedupes while preserving insertion order, unlike set().
    entry["expanded_terms"] = list(
        dict.fromkeys(entry.get("expanded_terms", []) + list(expanded_terms))
    )
    entry["relevant_docs"] = list(
        dict.fromkeys(entry.get("relevant_docs", []) + list(relevant_docs))
    )

########

# Load data and build BM25
# Missing values become empty strings (the abstract text is split into words
# later) and the index is reset so rows are labelled 0..n-1; the code above
# uses BM25 doc IDs, which are list positions, as labels in df.loc.
df = pd.read_csv("openalex_papers.csv").fillna('').reset_index(drop=True)
########



# The abstracts are tokenized and indexed once, when BM25 is constructed.
abstracts = df['abstract_text'].tolist()
bm25 = BM25(abstracts)
#####

# Example test_queries dict
# Maps each query string to the doc IDs treated as relevant for it; the IDs
# are used both as feedback documents and as ground truth for evaluation.
# NOTE(review): the IDs are hard-coded row positions - confirm they still
# match the rows of openalex_papers.csv whenever that file changes.
test_queries = {
    "artificial intelligence textbook": [0, 1, 2, 3],
    "ai in medicine": [5, 7, 10],
    "distributed multiagent systems": [8],
    "ai in healthcare": [7, 9, 44, 45],
    "deep convolutional neural networks": [10, 12],
    "neural network fundamentals": [11, 14],
    "prevent overfitting in neural nets": [13],
    "dimensionality reduction with neural networks": [16],
    "overview of deep learning": [17],
    "knowledge distillation in neural networks": [18],
    "sequence to sequence learning": [19],
    "semi-supervised learning": [20, 21, 27, 28, 29],
    "fast supervised learning algorithms": [22, 23],
    "unsupervised learning methods": [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
    "deep clustering visual features": [36],
    "lstm video representations": [37],
    "slow feature analysis": [38],
    "generative ai cybersecurity": [39],
    "ai ethics and privacy": [40, 41, 42, 44, 45],
    "federated learning privacy": [43],
    "chatbots impact on users": [44],
    "image classification using deep learning": [10, 12],
    "convolutional deep belief networks": [35],
    "linear neural network learning": [36],
    "jigsaw puzzles visual representations": [31],
    "unsupervised depth ego-motion": [34],
    "ai privacy challenges healthcare": [44],
    "scalable unsupervised learning": [35],
    "ai in cyber-physical systems": [43],
    "privacy preserving AI": [43],
    "lite bert language models": [25],
    "supervised learning comparison": [23],
    "gaussian fields semi-supervised learning": [26],
    "virtual adversarial training": [30],
    "equality of opportunity supervised learning": [26],
    "introduction to semi-supervised learning": [27],
    "deep learning neural networks overview": [17],
    "reducing data dimensionality neural nets": [16],
    "sequence models neural networks": [19],
    "semi-supervised literature survey": [21],
    "unsupervised visual representations": [31],
    "slow features unsupervised learning": [38],
    "chatgpt generative ai impact": [39],
    "ai privacy research": [40, 41, 42, 44],
    "federated learning AI": [43],
    "ethical challenges AI healthcare": [44],
    "ai technologies privacy security": [45],
    "dropout neural networks": [13],
    "pattern recognition neural networks": [14],
    "physical systems emergent computation": [15],
    "scaled conjugate gradient learning": [22],
    "empirical comparison supervised algorithms": [23]
}
###############


# Run the workflow
# Executed at module level: prints the results for every test query, rewrites
# FEEDBACK_FILE once at the end and then prints the evaluation metrics.
run_test_queries(df, bm25, test_queries, n_iterations=3, top_k=10, expansion_terms=5)
###########

def query_feedback(query):
    """Look up the stored feedback for *query* without printing anything.

    Returns a dict with the keys ``"expanded_terms"`` and ``"relevant_docs"``
    (each defaulting to an empty list), or ``None`` when the feedback file
    does not exist or holds no entry for *query*.
    """
    try:
        handle = open(FEEDBACK_FILE, "r")
    except FileNotFoundError:
        # No feedback data available
        return None
    with handle:
        feedback_data = json.load(handle)

    if query in feedback_data:
        entry = feedback_data[query]
        return {key: entry.get(key, []) for key in ("expanded_terms", "relevant_docs")}

    # No feedback for this query
    return None
#####

def show_feedback_comparison(query, df, bm25, top_k=10):
    """Print two title lists for *query*: stored feedback vs. raw BM25 results.

    The first list holds the titles of the doc IDs stored as relevant in the
    feedback file for this exact query; the second the titles of the top
    *top_k* BM25 hits for the unmodified query. Doc IDs that are not smaller
    than ``len(df)`` are skipped, and an empty list is printed as ``- None``.
    """
    def _print_titles(doc_ids):
        if not doc_ids:
            print("- None")
            return
        for doc_id in doc_ids:
            if doc_id < len(df):
                print(f"- {df.loc[doc_id, 'title']}")

    stored = query_feedback(query)
    inferred_doc_ids = stored["relevant_docs"] if stored else []

    # Raw retrieval with the query exactly as given (no augmentation).
    raw_doc_ids = [doc_id for doc_id, _score in bm25.search(query, top_k=top_k)]

    print(f"\nQuery: {query}\n")

    print("Titles from JSON-inferred feedback:")
    _print_titles(inferred_doc_ids)

    print("\nTitles from original BM25 retrieval:")
    _print_titles(raw_doc_ids)
#########


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mutua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



=== Processing Query: 'artificial intelligence textbook' ===
Augmented query: artificial intelligence textbook logic learn practic agent reason ai healthcar data patient system technolog privaci well comprehens applic
1. Doc ID: 0, Score: 30.8354
   The long-anticipated revision of this #1 selling book offers the most comprehensive, state of the art introduction to the theory and practice of artificial intelligence for modern applications. Intelligent Agents....

2. Doc ID: 46, Score: 25.3276
   Integrating Artificial Intelligence (AI) in healthcare represents a transformative shift with substantial potential for enhancing patient care. This paper critically examines this integration, confronting significant ethical, legal, and technological challenges,...

3. Doc ID: 9, Score: 19.5062
   Artificial intelligence (AI) aims to mimic human cognitive functions. It is bringing a paradigm shift to healthcare, powered by increasing availability of healthcare data and rapid progress of analyt

In [361]:

# Compare stored feedback with raw BM25 results for a query string that is
# not one of the test_queries keys (feedback lookup is an exact match).
show_feedback_comparison("healthcare", df, bm25)
####



Query: healthcare

Titles from JSON-inferred feedback:
- None

Titles from original BM25 retrieval:
- Balancing Privacy and Progress: A Review of Privacy Challenges, Systemic Oversight, and Patient Perceptions in AI-Driven Healthcare
- Artificial intelligence in healthcare: past, present and future
- Artificial intelligence: a modern approach
- Artificial intelligence: A modern approach
- Lecture Notes in Artificial Intelligence
- Principles of Artificial Intelligence
- Proceedings of the 19th International Joint Conference on Artificial Intelligence
- High-performance medicine: the convergence of human and artificial intelligence
- Future paths for integer programming and links to artificial intelligence
- Artificial intelligence in radiology
