## Phase 2: Query Expansion with WordNet

### Start ElasticSearch manually before running the notebook:
On Windows:
- Make sure you have at least JDK 17
- Open a terminal and execute this (or run it as a Windows service):
```bash
C:\path\to\elasticsearch-8.17.2\bin\elasticsearch.bat
```
- No Greek characters should be present in the path.
- Leave that terminal window open.

- If no password was autogenerated, execute this to get one:
```bash
.\bin\elasticsearch-reset-password.bat -u elastic
```

In [None]:
# %pip install -r "..\\requirements.txt"

In [None]:
from collections import Counter
import jsonlines
import json
import csv
import pandas as pd
from tqdm import tqdm
import pytrec_eval
from IPython.display import display

In [None]:
from dotenv import load_dotenv
import os

# Load the secrets file that lives in ../secrets (relative to this notebook).
# The path is assembled with os.path.join so it resolves on Windows, Linux and
# macOS alike; a hard-coded backslash path only works on Windows.
load_dotenv(os.path.join("..", "secrets", "secrets.env"))

# Access environment variables
es_host = os.getenv("ES_HOST")
es_user = os.getenv("ES_USERNAME")
es_pass = os.getenv("ES_PASSWORD")

# Fail here, with the names of the missing settings, instead of further down
# with an opaque connection/authentication error.
missing_env = [
    name
    for name, value in (
        ("ES_HOST", es_host),
        ("ES_USERNAME", es_user),
        ("ES_PASSWORD", es_pass),
    )
    if not value
]
if missing_env:
    raise RuntimeError(
        "Missing Elasticsearch settings: " + ", ".join(missing_env)
        + " (expected in ../secrets/secrets.env or the process environment)"
    )

- Connect to ElasticSearch

In [None]:
from elasticsearch import Elasticsearch

# Client for the configured host, authenticated with HTTP basic auth.
es = Elasticsearch(es_host, basic_auth=(es_user, es_pass))

# Report whether the cluster answered the ping.
print("✅ Connected to ElasticSearch" if es.ping() else "❌ Connection failed")

✅ Connected to ElasticSearch


- Download the NLTK resources (tokenizer, POS tagger, stop words, WordNet)

In [None]:
import nltk
# Resources needed by the tokenizer, the POS tagger, the stop-word filter and
# the WordNet lookups used in the cells below.
NLTK_RESOURCES = (
    'averaged_perceptron_tagger',      # tagger model name used by older NLTK releases
    'averaged_perceptron_tagger_eng',  # tagger model name loaded by pos_tag on NLTK > 3.8.2
    'punkt_tab',                       # instead of punkt: NLTK > 3.8.2 !
    'stopwords',
    'wordnet',
    'omw-1.4',
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
input_dir = '../data/trec-covid/'

# Documents: one JSON object per line, materialised as a list.
with jsonlines.open(input_dir + 'corpus.jsonl') as corpus_reader:
    corpus = list(corpus_reader)

# Queries: same JSONL layout as the corpus file.
with jsonlines.open(input_dir + 'queries.jsonl') as queries_reader:
    queries = list(queries_reader)

# Relevance judgements (tab-separated) for the test split.
test_df = pd.read_csv(input_dir + 'qrels/' + 'test.tsv', sep='\t')

In [None]:
from nltk.corpus import wordnet as wn
def get_wordnet_synonyms(word, max_synonyms=3):
    """Return up to ``max_synonyms`` single-word WordNet synonyms of ``word``.

    Candidates are the lemma names of every noun or adjective synset of
    ``word`` (lower-cased), excluding the word itself, multi-word expressions
    and names containing non-alphabetic characters.

    Parameters:
    - word: the term to look up; matching is case-insensitive.
    - max_synonyms: maximum number of synonyms to return; values <= 0 yield [].

    Returns:
    - list of synonyms, most frequently used first (by WordNet lemma counts);
      candidates with equal counts are ordered alphabetically so the result is
      identical on every run.
    """
    if max_synonyms <= 0:
        return []

    word = word.lower()

    # WordNet labels adjective synsets either 'a' (head adjective) or
    # 's' (satellite adjective); both are required to cover adjectives.
    allowed_pos = ('n', 'a', 's')

    synonyms = set()
    for syn in wn.synsets(word):
        if syn.pos() not in allowed_pos:
            continue
        for lemma in syn.lemmas():
            name = lemma.name().replace("_", " ").lower()

            if name == word:  # Skip the original word
                continue
            if len(name.split()) > 1:  # Skip multi-word phrases
                continue
            if not name.isalpha():  # Skip non-alphabetic words
                continue

            synonyms.add(name)

    def usage_count(candidate):
        # Total corpus frequency of the candidate across all of its own synsets.
        return sum(
            lemma.count()
            for syn in wn.synsets(candidate)
            for lemma in syn.lemmas()
            if lemma.name().lower() == candidate
        )

    # Rank by frequency (most-used synonyms first). The name is the secondary
    # key: without it, ties would fall back to set iteration order, which
    # changes between interpreter sessions because of string hash randomisation.
    ranked_synonyms = sorted(synonyms, key=lambda s: (-usage_count(s), s))

    return ranked_synonyms[:max_synonyms]

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag

# English stop-word list held as a set; used for the `word not in stop_words`
# membership test when deciding which query tokens to expand.
stop_words = set(stopwords.words('english'))

def expand_query_with_synonyms(query_text, max_synonyms=3):
    """Append WordNet synonyms of the query's nouns and adjectives to the query.

    The lower-cased query is tokenised and POS-tagged; every alphabetic,
    non-stop-word token tagged NN* or JJ* is looked up with
    get_wordnet_synonyms.

    Parameters:
    - query_text: the query string to expand.
    - max_synonyms: maximum number of synonyms requested per token.

    Returns:
    - ``query_text`` followed by the new terms, space-separated. A term is
      added only once and only if it is not already a token of the query.
      When nothing is added, ``query_text`` is returned unchanged (no trailing
      space).
    """
    tokens = word_tokenize(query_text.lower())

    # Terms that must not be appended again: the query's own tokens at first,
    # then every synonym as it gets added.
    seen_terms = set(tokens)
    expanded_terms = []

    for word, pos in pos_tag(tokens):
        # Only expand nouns and adjectives, skip stopwords and non-alphabetic tokens
        if not word.isalpha() or word in stop_words or not pos.startswith(('NN', 'JJ')):
            continue
        for synonym in get_wordnet_synonyms(word, max_synonyms):
            if synonym not in seen_terms:
                seen_terms.add(synonym)
                expanded_terms.append(synonym)

    if not expanded_terms:
        return query_text
    return query_text + " " + " ".join(expanded_terms)

In [None]:
# Build a copy of every query record that additionally carries the
# WordNet-expanded text; the records loaded earlier are left untouched.
expanded_queries = [
    {**query, "expanded_text": expand_query_with_synonyms(query["text"])}
    for query in tqdm(queries)
]

100%|██████████| 50/50 [00:00<00:00, 284.24it/s]


In [None]:
with jsonlines.open("../data/trec-covid/queries_expanded_wordnet.jsonl", mode='w') as writer:
    # One record per query: the id plus the expanded text stored under "text".
    writer.write_all(
        {"_id": q["_id"], "text": q["expanded_text"]}
        for q in expanded_queries
    )
    print("✅ Expanded queries saved to queries_expanded_wordnet.jsonl")

✅ Expanded queries saved to queries_expanded_wordnet.jsonl


In [None]:
def process_queries_phase_2(queries_path):
    """Run the Phase 2 (WordNet-expanded) retrieval and save one run per cutoff.

    Every query is sent to Elasticsearch as a ``bool``/``should`` of two
    ``match`` clauses on the ``text`` field: the text read from the file
    (boost 3) and its expansion produced by expand_query_with_synonyms.

    Parameters:
    - queries_path: JSONL file with one object per line holding "_id" and
      "text". The text is expanded inside this function, so the file should
      contain the original queries; text that is already expanded would be
      boosted as if it were the original and expanded a second time.

    Returns:
    - dict mapping "run_<k>" (k in 20, 30, 50) to
      {query_id: {doc_id: score}}.

    Side effects:
    - creates ../results/phase_2 if needed and writes
      retrieval_top_<k>.json there for every k.
    """
    INDEX_NAME = "ir2025-index"
    k_values = [20, 30, 50]
    output_dir = "../results/phase_2"

    # Load queries; blank lines (e.g. a trailing newline) are skipped.
    with open(queries_path, 'r', encoding='utf-8') as f:
        queries = [json.loads(line) for line in f if line.strip()]

    # The expansion depends only on the query text, so compute it once per
    # query instead of once per (query, k) pair.
    prepared_queries = [
        (query["_id"], query["text"], expand_query_with_synonyms(query["text"]))
        for query in queries
    ]

    os.makedirs(output_dir, exist_ok=True)

    runs = {}
    for k in k_values:
        run = {}
        for qid, original_text, query_text in tqdm(prepared_queries, desc=f"Processing Queries for run with k = {k}"):
            # This tells ElasticSearch:
            # - "Give higher importance to the original query terms (boost)"
            # - "But also consider the synonyms (with normal weight)"
            # => This prevents the expanded terms from drowning out the original intent.
            response = es.search(
                index=INDEX_NAME,
                query={
                    "bool": {
                        "should": [
                            { "match": { "text": { "query": original_text, "boost": 3 } }},
                            { "match": { "text": query_text }}
                        ]
                    }
                },
                size=k
            )

            run[qid] = {hit["_id"]: hit["_score"] for hit in response["hits"]["hits"]}

        runs[f"run_{k}"] = run

        # Save each run
        with open(os.path.join(output_dir, f'retrieval_top_{k}.json'), 'w', encoding='utf-8') as f:
            json.dump(run, f, ensure_ascii=False, indent=4)
        print(f"✅ Results saved to: ../results/phase_2/retrieval_top_{k}.json")

    return runs

In [None]:
# Pass the ORIGINAL queries file: process_queries_phase_2 expands each query
# itself and boosts the text it reads as the "original" terms. Feeding it
# queries_expanded_wordnet.jsonl would boost the synonyms as well and expand
# the already expanded text a second time.
runs = process_queries_phase_2("../data/trec-covid/queries.jsonl")

Processing Queries for run with k = 20: 100%|██████████| 50/50 [00:01<00:00, 35.58it/s]


✅ Results saved to: ../results/phase_2/retrieval_top_20.json


Processing Queries for run with k = 30: 100%|██████████| 50/50 [00:01<00:00, 35.56it/s]


✅ Results saved to: ../results/phase_2/retrieval_top_30.json


Processing Queries for run with k = 50: 100%|██████████| 50/50 [00:01<00:00, 32.54it/s]

✅ Results saved to: ../results/phase_2/retrieval_top_50.json





In [None]:
# NOTE(review): load_qrels is not defined or imported in any cell visible in
# this notebook, so on a fresh kernel this line raises NameError unless the
# helper is provided elsewhere — TODO confirm where it lives (e.g. a shared
# module) and import it in the imports cell.
qrels = load_qrels()

Average number of relevant documents per query: 493


In [None]:
# NOTE(review): compute_metrics is not defined or imported in any cell visible
# in this notebook — TODO confirm its source and import it explicitly so that
# Restart & Run All works. The third argument is presumably the results
# sub-folder name; verify against the helper's definition.
compute_metrics(qrels, runs, 'phase_2')

Computing metrics for run with k = 20


✅ Per-query metrics saved to: ../results\phase_2\per_query_metrics_top_20.json
✅ Average metrics saved to: ../results\phase_2\average_metrics_top_20.json

Computing metrics for run with k = 30
✅ Per-query metrics saved to: ../results\phase_2\per_query_metrics_top_30.json
✅ Average metrics saved to: ../results\phase_2\average_metrics_top_30.json

Computing metrics for run with k = 50
✅ Per-query metrics saved to: ../results\phase_2\per_query_metrics_top_50.json
✅ Average metrics saved to: ../results\phase_2\average_metrics_top_50.json



### Let's compare the (AVG) results:

In [None]:
def compare_phases(phases, k_values=(20, 30, 50), metrics=('map', 'P_5', 'P_10', 'P_15', 'P_20')):
    """
    Display and optionally compare retrieval metrics for 1 to 4 phases.
    Parameters:
    - phases: dict mapping phase names to base file paths, e.g.
        {
            "Phase 1": "../results/phase_1/average_metrics_top_{}.json",
            "Phase 2": "../results/phase_2/average_metrics_top_{}.json",
            ...
        }
      The "{}" placeholder is filled with each value of k.
    - k_values: iterable of cutoff values to compare ([20, 30, 50])
    - metrics: iterable of TREC metric keys (['map', 'P_5', 'P_10']), in any
      order. 'map' is shown as "<phase> MAP", 'P_<n>' as "<phase> avgPre@<n>",
      any other key as "<phase> <key>". Only the listed metrics are read.

    Returns:
    - pandas DataFrame indexed by k (ascending) with one column per
      (phase, metric); phases whose file is missing for a given k are reported
      with a warning and left empty for that row.

    Raises:
    - KeyError if a metrics file lacks one of the requested metric keys.
    """
    def column_label(phase_name, metric):
        # Human-readable column header for one (phase, metric) pair.
        if metric == "map":
            return f"{phase_name} MAP"
        if metric.startswith("P_"):
            return f"{phase_name} avgPre@{metric[2:]}"
        return f"{phase_name} {metric}"

    comparison = []

    for k in k_values:
        row = {"k": k}
        for phase_name, base_path in phases.items():
            metrics_path = base_path.format(k)
            try:
                with open(metrics_path, "r", encoding="utf-8") as f:
                    phase_metrics = json.load(f)
            except FileNotFoundError:
                print(f"⚠️ File not found: {metrics_path}")
                continue
            for metric in metrics:
                row[column_label(phase_name, metric)] = phase_metrics[metric]
        comparison.append(row)

    # With no k values there are no rows and therefore no "k" column to sort
    # on; start from an empty frame that still has it.
    df = pd.DataFrame(comparison) if comparison else pd.DataFrame(columns=["k"])

    # Sort by k and use it as the index for visualization purposes.
    df = df.sort_values("k").set_index("k")
    display(df)
    return df

In [None]:
# Path template of the averaged-metrics file for each phase; the "{}" is left
# in place as the placeholder for the cutoff k.
# Extend the tuple with 3 and 4 once those phases have produced results.
phases = {
    f"Phase {phase_no}": f"../results/phase_{phase_no}/average_metrics_top_{{}}.json"
    for phase_no in (1, 2)
}

In [None]:
# compare_phases already renders the table through display(); binding the
# returned DataFrame to "_" keeps the cell from echoing it a second time.
_ = compare_phases(phases)

Unnamed: 0_level_0,Phase 1 MAP,Phase 1 avgPre@5,Phase 1 avgPre@10,Phase 1 avgPre@15,Phase 1 avgPre@20,Phase 2 MAP,Phase 2 avgPre@5,Phase 2 avgPre@10,Phase 2 avgPre@15,Phase 2 avgPre@20
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20,0.01988,0.632,0.59,0.556,0.545,0.014134,0.48,0.474,0.458667,0.447
30,0.026913,0.632,0.59,0.556,0.545,0.01945,0.48,0.474,0.458667,0.447
50,0.03879,0.632,0.59,0.556,0.545,0.029628,0.48,0.474,0.458667,0.447
