## Phase 2: Query Expansion with Wordnet

### Start ElasticSearch manually before running the notebook:
On Windows:
- Make sure you have at least JDK 17
- Open a terminal and execute this (or run it as a Windows service):
```bash
C:\path\to\elasticsearch-8.17.2\bin\elasticsearch.bat
```
- No Greek characters should be present in the path.
- Leave that terminal window open.

- If no password was auto-generated on first start, run this from the Elasticsearch installation directory to generate one:
```bash
.\bin\elasticsearch-reset-password.bat -u elastic
```

In [99]:
# %pip install -r "..\\requirements.txt"

In [100]:
from collections import Counter
import jsonlines
import json
import csv
import pandas as pd
from tqdm import tqdm
import pytrec_eval
from IPython.display import display

In [101]:
from dotenv import load_dotenv
import os

# Load the credentials file stored in ../secrets (relative to this notebook).
# Forward slashes work on Windows as well as Linux/macOS, unlike the
# backslash-separated form, and match the data/results paths used below.
load_dotenv("../secrets/secrets.env")

# Elasticsearch connection settings read from the environment
es_host = os.getenv("ES_HOST")
es_user = os.getenv("ES_USERNAME")
es_pass = os.getenv("ES_PASSWORD")

- Connect to ElasticSearch

In [102]:
from elasticsearch import Elasticsearch

es = Elasticsearch(es_host, basic_auth=(es_user, es_pass))

# Report whether the cluster answered the ping
print("✅ Connected to ElasticSearch" if es.ping() else "❌ Connection failed")

✅ Connected to ElasticSearch


- Load Index

In [103]:
INDEX_NAME = "ir2025-index"

# Create the index only when it is missing; an existing index is left untouched (nothing is deleted)
if es.indices.exists(index=INDEX_NAME):
    print(f"✅ Index '{INDEX_NAME}' already exists.")

else:
    # Define the settings and mappings for the index
    settings = {
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                },
                "english_stemmer": {
                    "type": "kstem"
                }
            },
            "analyzer": {
                "custom_english": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase", # Converts all terms to lowercase
                        "english_stop", # Removes English stop words
                        "english_stemmer" # Reduces words to their root form using kstem
                    ]
                }
            }
        }
    }
    
    # Documents carry an exact-match id plus a text field analyzed with custom_english and scored with BM25
    mappings = {
        "properties": {
            "doc_id": {"type": "keyword"},
            "text": {
                "type": "text",
                "analyzer": "custom_english",
                "similarity": "BM25"
            }
        }
    }
    
    # Create the index with the specified settings and mappings
    es.indices.create(
        index=INDEX_NAME,
        settings=settings,
        mappings=mappings
    )
    print(f"✅ Index '{INDEX_NAME}' created")

✅ Index 'ir2025-index' already exists.


In [104]:
import nltk
# NLTK resources needed by word_tokenize, pos_tag, the stop-word list and WordNet.
# NLTK releases that require 'punkt_tab' (instead of 'punkt') also load the POS tagger
# from 'averaged_perceptron_tagger_eng', so both tagger packages are requested; an id
# unknown to the installed NLTK index is only reported by the downloader, not raised.
for nltk_resource in (
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt_tab',  # instead of punkt: NLTK > 3.8.2 !
    'stopwords',
    'wordnet',
    'omw-1.4',
):
    nltk.download(nltk_resource)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [105]:
input_dir = '../data/trec-covid/'

# Read the whole corpus into memory: one parsed JSON object per line of the file
with jsonlines.open(input_dir + 'corpus.jsonl') as reader:
    corpus = list(reader)

In [None]:
# Simulate custom_english Analyzer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # KrovetzStemmer supports up to python 3.10 at best 
import string

# Initialize NLTK components
stop_words = set(stopwords.words('english'))

stemmer = PorterStemmer() # It's "closer" to Krovetz (the kstem filter configured on the index) than Snowball is

def es_like_preprocess(text):
    """
    Approximate the index's custom_english analyzer on the Python side.

    The text is lowercased and stripped, ASCII punctuation is deleted, the result is
    tokenized with NLTK, stop words are dropped and the remaining tokens are
    Porter-stemmed.

    Args:
        text: Raw input string.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    normalized = text.lower().strip()
    # Delete (not replace) every character listed in string.punctuation
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    normalized = normalized.translate(punctuation_stripper)

    stems = []
    for token in word_tokenize(normalized):
        # A token is discarded only when it is a purely alphabetic stop word
        if token in stop_words and token.isalpha():
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import numpy as np
import joblib
import json
import os

def build_and_save_tfidf_model(corpus, output_dir="../models"):
    """
    Build a TF-IDF model from the corpus, print summary statistics and persist everything.

    Args:
        corpus: Non-empty list of documents, each with a 'text' field.
        output_dir: Directory where the vectorizer, IDF scores and statistics are written.

    Returns:
        tuple: (vectorizer, idf_scores) - the fitted TfidfVectorizer and a dict mapping
        every vocabulary term to its IDF weight. The statistics are only written to
        'tfidf_statistics.json'; they are not returned.

    Raises:
        ValueError: If the corpus is empty (no statistics or model can be computed).
    """
    # Fail early with a clear message instead of a ZeroDivisionError further down
    if len(corpus) == 0:
        raise ValueError("Cannot build a TF-IDF model from an empty corpus")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Counters for the preprocessing statistics
    total_tokens = 0
    unique_tokens = set()
    statistics = {}

    # Preprocess every document while collecting token statistics
    print("Preprocessing corpus...")
    preprocessed_corpus = []
    for doc in tqdm(corpus, desc="Preprocessing documents", unit="doc"):
        processed_text = es_like_preprocess(doc["text"])
        tokens = processed_text.split()

        total_tokens += len(tokens)
        unique_tokens.update(tokens)

        preprocessed_corpus.append(processed_text)

    average_tokens_per_doc = total_tokens / len(corpus)
    statistics['preprocessing'] = {
        'total_tokens': total_tokens,
        'unique_tokens': len(unique_tokens),
        'average_tokens_per_doc': average_tokens_per_doc
    }

    print("\nPreprocessing statistics:")
    print(f"- Total tokens: {total_tokens:,}")
    print(f"- Unique tokens: {len(unique_tokens):,}")
    print(f"- Average tokens per document: {average_tokens_per_doc:,.1f}")

    # Build TF-IDF model with a coarse three-step progress bar
    print("\nBuilding TF-IDF model...")
    tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

    with tqdm(total=3, desc="TF-IDF computation") as pbar:
        # Fit the vectorizer
        tfidf_vectorizer.fit(preprocessed_corpus)
        pbar.update(1)

        # Get feature names
        feature_names = tfidf_vectorizer.get_feature_names_out()
        pbar.update(1)

        # Pair every term with its IDF weight
        idf_scores = dict(zip(feature_names, tfidf_vectorizer.idf_))
        pbar.update(1)

    # Summary statistics over the IDF weights
    idf_values = list(idf_scores.values())
    statistics['tfidf'] = {
        'vocabulary_size': len(idf_scores),
        'average_idf': float(np.mean(idf_values)),
        'max_idf': float(max(idf_values)),
        'min_idf': float(min(idf_values))
    }

    print("\nTF-IDF statistics:")
    print(f"- Vocabulary size: {len(idf_scores):,} terms")
    print(f"- Average IDF: {statistics['tfidf']['average_idf']:.2f}")
    print(f"- Max IDF: {statistics['tfidf']['max_idf']:.2f}")
    print(f"- Min IDF: {statistics['tfidf']['min_idf']:.2f}")

    # Persist the vectorizer, the IDF table and the statistics.
    # Saving is best-effort: a failure is reported but the in-memory model is still returned.
    print("\nSaving models and statistics...")
    vectorizer_path = os.path.join(output_dir, 'tfidf_vectorizer.joblib')
    idf_path = os.path.join(output_dir, 'idf_scores.json')
    statistics_path = os.path.join(output_dir, 'tfidf_statistics.json')
    try:
        joblib.dump(tfidf_vectorizer, vectorizer_path)

        with open(idf_path, 'w', encoding='utf-8') as f:
            json.dump(idf_scores, f, ensure_ascii=False, indent=2)

        with open(statistics_path, 'w', encoding='utf-8') as f:
            json.dump(statistics, f, ensure_ascii=False, indent=2)

        print("\n✅ Saved successfully:")
        print(f"- Vectorizer: {vectorizer_path}")
        print(f"- IDF scores: {idf_path}")
        print(f"- Statistics: {statistics_path}")

    except Exception as e:
        print(f"\n❌ Error saving files: {e}")

    return tfidf_vectorizer, idf_scores

# Function to load the saved model
def load_tfidf_model(output_dir="../models"):
    """
    Load the saved TF-IDF vectorizer and IDF scores, printing the stored statistics.

    Args:
        output_dir: Directory where the model files and statistics were saved.

    Returns:
        tuple: (vectorizer, idf_scores) on success, or (None, None) when anything
        fails to load. Both outcomes have two elements, so callers can always
        unpack the result into two names.
    """
    try:
        # Load vectorizer
        vectorizer = joblib.load(os.path.join(output_dir, 'tfidf_vectorizer.joblib'))

        # Load IDF scores
        with open(os.path.join(output_dir, 'idf_scores.json'), 'r', encoding='utf-8') as f:
            idf_scores = json.load(f)

        # Load statistics (used only for the validation printout below)
        with open(os.path.join(output_dir, 'tfidf_statistics.json'), 'r', encoding='utf-8') as f:
            statistics = json.load(f)

        print("\nModel validation:")
        print(f"- Vocabulary size: {len(idf_scores):,}")
        print(f"- Average IDF: {statistics['tfidf']['average_idf']:.2f}")
        print(f"- Max IDF: {statistics['tfidf']['max_idf']:.2f}")
        print(f"- Min IDF: {statistics['tfidf']['min_idf']:.2f}")

        print("✅ Model loaded successfully")
        return vectorizer, idf_scores

    except Exception as e:
        print(f"❌ Error loading model: {e}")
        # Same arity as the success path, so two-name unpacking does not raise
        return None, None
        
# Transform new text using the loaded vectorizer
def transform_text(text, vectorizer):
    """
    Vectorize a single text with an already-fitted vectorizer.

    Args:
        text: The text to transform.
        vectorizer: Object exposing a transform(list_of_texts) method.

    Returns:
        Whatever vectorizer.transform returns for the one-element list [text],
        or None if the transformation raised.
    """
    try:
        return vectorizer.transform([text])
    except Exception as err:
        print(f"❌ Error transforming text: {err}")
    return None

In [108]:
# Build and save the model
# NOTE: this rebuilds the model from the full corpus every time the cell is run
vectorizer, idf_scores = build_and_save_tfidf_model(corpus)
 
# Test loading: the objects read back from disk replace the freshly built ones
vectorizer, idf_scores = load_tfidf_model()


Model validation:
- Vocabulary size: 298,422
- Average IDF: 11.84
- Max IDF: 12.36
- Min IDF: 2.03
✅ Model loaded successfully


In [109]:
from nltk.corpus import wordnet as wn
def get_wordnet_synonyms(word, max_synonyms=3):
    """
    Return up to `max_synonyms` single-word WordNet synonyms of `word`.

    Candidates are the lemma names of every synset of `word`, excluding the word
    itself, multi-word expressions and names with non-alphabetic characters.
    They are ranked by WordNet usage count (highest first); ties are broken
    alphabetically so the result is the same on every run.

    Args:
        word: The word to look up (case-insensitive).
        max_synonyms: Maximum number of synonyms to return; values <= 0 yield [].

    Returns:
        list[str]: The ranked synonyms, lowercase.
    """
    word = word.lower()

    synonyms = set()
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            name = lemma.name().replace("_", " ").lower()

            if name == word:  # Skip the original word
                continue
            # isalpha() is False for anything containing a space, digit or punctuation,
            # so this also rejects multi-word phrases
            if not name.isalpha():
                continue

            synonyms.add(name)

    def usage_count(candidate):
        # Total WordNet count of the lemmas spelled exactly like the candidate
        return sum(
            lemma.count()
            for synset in wn.synsets(candidate)
            for lemma in synset.lemmas()
            if lemma.name().lower() == candidate
        )

    # Sorting a set by count alone leaves ties in hash order, which varies between
    # interpreter runs; the secondary alphabetical key makes the ranking reproducible.
    ranked_synonyms = sorted(synonyms, key=lambda s: (-usage_count(s), s))

    return ranked_synonyms[:max(max_synonyms, 0)]

In [110]:
from nltk import pos_tag

def is_expandable(pos):
    """Return True when the POS tag denotes a noun (NN*) or an adjective (JJ*)."""
    expandable_prefixes = ('NN', 'JJ')  # nouns & adjectives
    return pos.startswith(expandable_prefixes)
    
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Rebinds stop_words to the same NLTK English stop-word set that the preprocessing cell already builds
stop_words = set(stopwords.words('english'))

def expand_query_with_wordnet(query_text, tfidf_vectorizer, max_synonyms=1, n_expand=1):
    """
    Append WordNet synonyms of the most informative query words to the query.

    Nouns and adjectives of the query (alphabetic, non stop word) are scored with the
    corpus IDF of their preprocessed form; the `n_expand` highest-scoring words are
    expanded with up to `max_synonyms` synonyms each.

    Args:
        query_text: The original query string.
        tfidf_vectorizer: Fitted vectorizer exposing `vocabulary_` and `idf_`.
        max_synonyms: Maximum synonyms added per expanded word.
        n_expand: Number of query words to expand.

    Returns:
        str: The original query followed by the expansion terms (space separated),
        or the original query unchanged when no synonym was found.
    """
    # For expansion decisions, work with the original (lowercased) query text
    tokens = word_tokenize(query_text.lower())
    tagged = pos_tag(tokens)

    # Candidate words = noun/adjective, not stopword, alphabetic.
    # dict.fromkeys keeps first-seen order while dropping repeats, so a word that
    # occurs twice in the query cannot take two of the n_expand slots.
    candidates = list(dict.fromkeys(
        word for word, pos in tagged
        if word.isalpha() and word not in stop_words and is_expandable(pos)
    ))

    # Look IDF weights up through the fitted vocabulary (term -> column index) instead
    # of rebuilding a term -> IDF dict over the whole vocabulary on every call.
    vocabulary = tfidf_vectorizer.vocabulary_
    idf_values = tfidf_vectorizer.idf_

    def corpus_idf(word):
        # Preprocess only for the vocabulary lookup; unknown terms score 0.0
        column = vocabulary.get(es_like_preprocess(word))
        return 0.0 if column is None else float(idf_values[column])

    scored = [(word, corpus_idf(word)) for word in candidates]

    # Sort by IDF weight descending (stable, so ties keep query order) and keep top-n
    top_words = [word for word, _ in sorted(scored, key=lambda item: -item[1])[:n_expand]]

    # Expand only the top words
    expanded_terms = []
    for word in top_words:
        expanded_terms.extend(get_wordnet_synonyms(word, max_synonyms))

    # Return original query + expansion terms (Elasticsearch analyzes the text at query time)
    if expanded_terms:
        return query_text + " " + " ".join(expanded_terms)

    return query_text

In [126]:
# Read every query (one parsed JSON object per line) into a list
with jsonlines.open(input_dir + 'queries.jsonl') as reader:
    queries = list(reader)
    print(f"Loaded {len(queries)} queries.")

Loaded 50 queries.


In [127]:
print("Expanding Queries..")
expanded_queries = []
for query in tqdm(queries, unit="query"):
    # Keep every original field and attach the WordNet-expanded text next to it
    expanded_queries.append(
        {**query, "expanded_text": expand_query_with_wordnet(query["text"], vectorizer)}
    )

Expanding Queries..


100%|██████████| 50/50 [00:29<00:00,  1.68query/s]


In [128]:
# Persist the expanded queries as JSON Lines (one query object per line); mode='w' overwrites any previous file
with jsonlines.open("../data/trec-covid/queries_expanded_wordnet.jsonl", mode='w') as writer:
    for q in expanded_queries:
        writer.write(q)
    print("✅ Expanded queries saved to ../data/trec-covid/queries_expanded_wordnet.jsonl")

✅ Expanded queries saved to ../data/trec-covid/queries_expanded_wordnet.jsonl


In [None]:
def process_queries_phase_2(expanded_queries_path):
    """
    Run every expanded query against the index for k = 20, 30 and 50 and save the runs.

    Args:
        expanded_queries_path: Path to a JSON Lines file whose objects contain
            '_id' and 'expanded_text'.

    Returns:
        dict: {"run_<k>": {query_id: {hit_id: score}}} for each k. Each run is also
        written to ../results/phase_2/retrieval_top_<k>.json.
    """
    # Load queries, skipping blank lines (e.g. a trailing newline at end of file)
    with open(expanded_queries_path, 'r', encoding='utf-8') as f:
        queries = [json.loads(line) for line in f if line.strip()]

    INDEX_NAME = "ir2025-index"
    k_values = [20, 30, 50]

    # The output location does not depend on k, so prepare it once
    output_dir = "../results/phase_2"
    os.makedirs(output_dir, exist_ok=True)

    runs = {f"run_{k}": {} for k in k_values}
    for k in k_values:
        run = runs[f"run_{k}"]

        for query in tqdm(queries, desc=f"Processing Expanded Queries with WordNet for run with k = {k}"):
            qid = query["_id"]
            query_text = query["expanded_text"] # already did this: expand_query_with_wordnet()
            response = es.search(
                index=INDEX_NAME,
                # Must be a plain dict: the doubled braces `{{...}}` form a set literal
                # containing a dict, which raises "TypeError: unhashable type: 'dict'"
                query={"match": {"text": query_text}},
                size=k
            )

            run[qid] = {hit["_id"]: hit["_score"] for hit in response["hits"]["hits"]}

        # Save each run
        with open(os.path.join(output_dir, f'retrieval_top_{k}.json'), 'w', encoding='utf-8') as f:
            json.dump(run, f, ensure_ascii=False, indent=4)
            print(f"✅ Results saved to: ../results/phase_2/retrieval_top_{k}.json")

    return runs
    
# Retrieve results for the saved expanded queries; `runs` is the input of the evaluation cell
runs = process_queries_phase_2("../data/trec-covid/queries_expanded_wordnet.jsonl")

Processing Expanded Queries with WordNet for run with k = 20: 100%|██████████| 50/50 [00:01<00:00, 27.48it/s]


✅ Results saved to: ../results/phase_2/retrieval_top_20.json


Processing Expanded Queries with WordNet for run with k = 30: 100%|██████████| 50/50 [00:01<00:00, 43.12it/s]


✅ Results saved to: ../results/phase_2/retrieval_top_30.json


Processing Expanded Queries with WordNet for run with k = 50: 100%|██████████| 50/50 [00:01<00:00, 35.05it/s]

✅ Results saved to: ../results/phase_2/retrieval_top_50.json





In [130]:
def load_qrels(qrels_path="../data/trec-covid/qrels/test.tsv"):
    """
    Load relevance judgments from a tab-separated file.

    The file must have a header row with the columns 'query-id', 'corpus-id' and 'score'.
    The average number of relevant documents (score > 0) per query is printed; it is
    reported as 0 when the file contains no judgments.

    Args:
        qrels_path: Path to the TSV file.

    Returns:
        dict: {query_id: {doc_id: relevance}} with integer relevance values.
    """
    qrels = {}
    # newline='' is the mode the csv module documents for file objects passed to readers
    with open(qrels_path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f, delimiter='\t'):
            qrels.setdefault(row['query-id'], {})[row['corpus-id']] = int(row['score'])

    # Number of relevant (score > 0) documents for each query
    relevant_counts = [
        sum(1 for rel in docs.values() if rel > 0)
        for docs in qrels.values()
    ]
    # Guard against a file without judgments, which would otherwise divide by zero
    average_relevant = int(sum(relevant_counts) / len(relevant_counts)) if relevant_counts else 0
    print("Average number of relevant documents per query:", average_relevant)

    return qrels

# Load the judgments from the default path; `qrels` is passed to compute_metrics below
qrels = load_qrels()

Average number of relevant documents per query: 493


In [131]:
def compute_metrics(qrels, runs, folder, metrics=['map', 'P_5', 'P_10', 'P_15', 'P_20']):
    """
    Evaluate every run against the judgments and write per-query and averaged metrics.

    Args:
        qrels: {query_id: {doc_id: relevance}} judgments.
        runs: {"run_<k>": {query_id: {doc_id: score}}}; the text after the first
            underscore of each key is used as k in messages and file names.
        folder: Sub-directory of ../results that receives the JSON files.
        metrics: Metric keys to average; a key missing from a query's results counts as 0.0.

    Returns:
        None. For each run, 'per_query_metrics_top_<k>.json' and
        'average_metrics_top_<k>.json' are written.
    """
    # The evaluator is asked for the 'map' and 'P' measure families
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'P'})
    target_dir = os.path.join("../results", folder)

    for run_name, run in runs.items():
        k = run_name.split("_")[1]
        print(f"Computing metrics for run with k = {k}")

        per_query_results = evaluator.evaluate(run)
        query_count = len(per_query_results)

        # Accumulate each requested metric over all queries, then turn sums into means
        avg_scores = {name: 0.0 for name in metrics}
        for query_scores in per_query_results.values():
            for name in metrics:
                avg_scores[name] += query_scores.get(name, 0.0)
        for name in metrics:
            avg_scores[name] /= query_count

        # Make sure the destination exists before writing
        os.makedirs(target_dir, exist_ok=True)

        per_query_path = os.path.join(target_dir, f"per_query_metrics_top_{k}.json")
        with open(per_query_path, "w", encoding="utf-8") as out_file:
            json.dump(per_query_results, out_file, indent=4)

        avg_metrics_path = os.path.join(target_dir, f"average_metrics_top_{k}.json")
        with open(avg_metrics_path, "w", encoding="utf-8") as out_file:
            json.dump(avg_scores, out_file, indent=4)

        print(f"✅ Per-query metrics saved to: {per_query_path}")
        print(f"✅ Average metrics saved to: {avg_metrics_path}\n")
        
# Evaluate the Phase 2 runs; the JSON files are written under ../results/phase_2
compute_metrics(qrels, runs, 'phase_2')

Computing metrics for run with k = 20
✅ Per-query metrics saved to: ../results\phase_2\per_query_metrics_top_20.json
✅ Average metrics saved to: ../results\phase_2\average_metrics_top_20.json

Computing metrics for run with k = 30
✅ Per-query metrics saved to: ../results\phase_2\per_query_metrics_top_30.json
✅ Average metrics saved to: ../results\phase_2\average_metrics_top_30.json

Computing metrics for run with k = 50
✅ Per-query metrics saved to: ../results\phase_2\per_query_metrics_top_50.json
✅ Average metrics saved to: ../results\phase_2\average_metrics_top_50.json



### Let's compare the (AVG) results:

In [132]:
def compare_phases(phases, k_values=[20, 30, 50], metrics=['map', 'P_5', 'P_10', 'P_15', 'P_20']):
    """
    Display the averaged retrieval metrics of one or more phases side by side.

    Parameters:
    - phases: dict mapping phase names to path templates with one `{}` placeholder for k, e.g.
        {
            "Phase 1": "../results/phase_1/average_metrics_top_{}.json",
            "Phase 2": "../results/phase_2/average_metrics_top_{}.json",
        }
    - k_values: list of cutoff values to compare ([20, 30, 50])
    - metrics: list of TREC metric keys in any order (['map', 'P_5', 'P_10']).
      'map' is shown as "<phase> MAP", 'P_<n>' as "<phase> avgPre@<n>" and any
      other key as "<phase> <key>".

    Returns:
    - pandas DataFrame indexed by k with one column per (phase, metric)

    A phase whose file for some k is missing is reported and skipped for that k.
    """
    def column_label(phase_name, metric):
        # Label by metric name rather than by list position, so 'map' need not come first
        if metric == "map":
            return f"{phase_name} MAP"
        if metric.startswith("P_"):
            return f"{phase_name} avgPre@{metric[2:]}"
        return f"{phase_name} {metric}"

    comparison = []
    for k in k_values:
        row = {"k": k}
        for phase_name, base_path in phases.items():
            metrics_path = base_path.format(k)
            try:
                with open(metrics_path, "r", encoding="utf-8") as f:
                    phase_metrics = json.load(f)
            except FileNotFoundError:
                print(f"⚠️ File not found: {metrics_path}")
                continue
            for metric in metrics:
                row[column_label(phase_name, metric)] = phase_metrics[metric]
        comparison.append(row)

    # Sort by k and use it as the index for display; chained calls avoid in-place mutation
    df = pd.DataFrame(comparison).sort_values("k").set_index("k")
    display(df)
    return df

In [133]:
# Path templates of the averaged-metrics files per phase; `{}` is filled with k by compare_phases
phases = {
    "Phase 1": "../results/phase_1/average_metrics_top_{}.json",
    "Phase 2": "../results/phase_2/average_metrics_top_{}.json",
    # "Phase 3": "../results/phase_3/average_metrics_top_{}.json",
    # "Phase 4": "../results/phase_4/average_metrics_top_{}.json"
}
# The returned DataFrame is already displayed inside compare_phases, so it is discarded here
_ = compare_phases(phases)

Unnamed: 0_level_0,Phase 1 MAP,Phase 1 avgPre@5,Phase 1 avgPre@10,Phase 1 avgPre@15,Phase 1 avgPre@20,Phase 2 MAP,Phase 2 avgPre@5,Phase 2 avgPre@10,Phase 2 avgPre@15,Phase 2 avgPre@20
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20,0.020569,0.64,0.582,0.564,0.548,0.020473,0.604,0.586,0.554667,0.536
30,0.027753,0.64,0.582,0.564,0.549,0.02828,0.604,0.586,0.554667,0.536
50,0.039911,0.64,0.582,0.564,0.549,0.040742,0.604,0.586,0.554667,0.536
