### Start ElasticSearch manually before running the notebook:
On Windows:
- Make sure you have at least JDK 17
- Open a terminal and execute this (or run it as a Windows service):
```bash
C:\path\to\elasticsearch-8.17.2\bin\elasticsearch.bat
```
- No Greek characters should be present in the path.
- Leave that terminal window open.

- If no password was autogenerated execute this to get one:
```bash
.\bin\elasticsearch-reset-password.bat -u elastic
```

In [65]:
# %pip install -r "..\\requirements.txt"

3210122 + 3210191 = 6420313
- So we get the `trec_covid` IR2025 collection.

In [91]:
from collections import Counter
import jsonlines
import json
import csv
import pandas as pd
from tqdm import tqdm
import pytrec_eval
from IPython.display import display

> Load and Preprocess the Data: 
> 
> Manual Code is not used as it is done by ElasticSearch when we load the data as we have defined the settings and mappings.

In [67]:
# import json
# import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import SnowballStemmer

# nltk.download('stopwords')

# stop_words = set(stopwords.words('english'))
# stemmer = SnowballStemmer("english")

# def preprocess(text):
#     # Lowercase
#     text = text.lower()
#     # Remove punctuation
#     text = re.sub(r"[^\w\s]", "", text)
#     # Tokenize
#     tokens = text.split()
#     # Remove stopwords and apply stemming
#     tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
#     # Join back into string
#     return " ".join(tokens)

# def process_jsonl(input_path="..\\data\\trec-covid\\corpus.jsonl", output_path="..\\data\\corpus_processed.jsonl"):
#     with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
#         for line in infile:
#             obj = json.loads(line)
#             if "text" in obj:
#                 obj["text"] = preprocess(obj["text"])
#             json.dump(obj, outfile)
#             outfile.write("\n")

In [68]:
# # Verify preprocessing works
# example = "The quick brown foxes were jumping over the lazy dogs."
# print(preprocess(example))

In [69]:
# process_jsonl()

### Step 1: Load, Preprocess Data & Create Index

In [70]:
from dotenv import load_dotenv
import os

# Load .env file from the current directory
load_dotenv("..\\secrets\\secrets.env")

# Access environment variables
es_host = os.getenv("ES_HOST")
es_user = os.getenv("ES_USERNAME")
es_pass = os.getenv("ES_PASSWORD")

- Connect to ElasticSearch

In [71]:
from elasticsearch import Elasticsearch

es = Elasticsearch(es_host, basic_auth=(es_user, es_pass))

if es.ping():
    print("✅ Connected to ElasticSearch")
else:
    print("❌ Connection failed")

✅ Connected to ElasticSearch


- Create Index

In [72]:
INDEX_NAME = "ir2025-index"

# Delete the index if it already exists
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)
    print(f"✅ Index '{INDEX_NAME}' deleted.")

# Define the settings and mappings for the index
settings = {
    "analysis": {
        "filter": {
            "english_stop": {
                "type": "stop",
                "stopwords": "_english_"
            }
        },
        "analyzer": {
            "custom_english": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": [
                    "lowercase",
                    "english_stop"
                ]
            }
        }
    }
}

mappings = {
    "properties": {
        "doc_id": {"type": "keyword"},
        "text": {
            "type": "text",
            "analyzer": "custom_english",
            "similarity": "BM25"
        }
    }
}

# Create the index with the specified settings and mappings
es.indices.create(
    index=INDEX_NAME,
    settings=settings,
    mappings=mappings
)
print(f"✅ Index '{INDEX_NAME}' created")

✅ Index 'ir2025-index' deleted.
✅ Index 'ir2025-index' created


### Step 2: Populate Index

In [73]:
from elasticsearch.helpers import streaming_bulk

# Generator function to yield documents
def generate_documents(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            doc = json.loads(line)
            yield {
                "_index": INDEX_NAME,
                "_id": doc["_id"],
                "_source": {
                    "doc_id": doc["_id"],
                    "text": doc["text"]
                }
            }

# Path to the JSONL file
file_path = "../data/trec-covid/corpus.jsonl"

# Count the total number of documents for the progress bar
with open(file_path, 'r', encoding='utf-8') as f:
    total_docs = sum(1 for _ in f)

# Initialize the progress bar
progress = tqdm(unit="docs", total=total_docs)

successes = 0
for ok, action in streaming_bulk(client=es, actions=generate_documents(file_path), chunk_size=500):
    progress.update(1)
    successes += int(ok)

progress.close()
print(f"✅ Indexed {successes}/{total_docs} documents into '{INDEX_NAME}'")

100%|██████████| 171332/171332 [00:40<00:00, 4225.04docs/s]

✅ Indexed 171332/171332 documents into 'ir2025-index'





### Step 3: Execute Queries

In [74]:
def process_queries_phase_1(queries_path):
    # Load queries
    with open(queries_path, 'r', encoding='utf-8') as f:
        queries = [json.loads(line) for line in f]
        
    INDEX_NAME = "ir2025-index"
    k_values = [20, 30, 50] # Number of top documents to retrieve
        
    runs = {f"run_{k}": {} for k in k_values}
    for k in k_values:
        # Prepare output directory
        output_dir = f"../results/phase_1"
        os.makedirs(output_dir, exist_ok=True)
        for query in tqdm(queries, desc=f"Processing Queries for run with k = {k}"):
            qid = query["_id"]
            query_text = query["text"]
            response = es.search(
                index=INDEX_NAME,
                query={"match": {"text": query_text}},
                size=k
            )
            runs[f"run_{k}"][qid] = {hit["_id"]: hit["_score"] for hit in response["hits"]["hits"]}
                
        with open(os.path.join(output_dir, f'retrieval_top_{k}.json'), 'w', encoding='utf-8') as f:
            json.dump(runs[f"run_{k}"], f, ensure_ascii=False, indent=4)
            print(f"✅ Results saved to: ../results/phase_1/retrieval_top_{k}.json")
    
    return runs
    
runs = process_queries_phase_1("../data/trec-covid/queries.jsonl")

Processing Queries for run with k = 20: 100%|██████████| 50/50 [00:02<00:00, 17.88it/s]


✅ Results saved to: ../results/phase_1/retrieval_top_20.json


Processing Queries for run with k = 30: 100%|██████████| 50/50 [00:00<00:00, 72.21it/s]


✅ Results saved to: ../results/phase_1/retrieval_top_30.json


Processing Queries for run with k = 50: 100%|██████████| 50/50 [00:00<00:00, 54.38it/s]

✅ Results saved to: ../results/phase_1/retrieval_top_50.json





### Step 4: Query Evaluation

In [121]:
def load_qrels(qrels_path="../data/trec-covid/qrels/test.tsv"):
    qrels = {}
    with open(qrels_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            qid = row['query-id']
            docid = row['corpus-id']
            relevance = int(row['score'])
            qrels.setdefault(qid, {})[docid] = relevance

    relevant_counts = Counter()
    for qid, docs in qrels.items():
        relevant_counts[qid] = sum(1 for rel in docs.values() if rel > 0)
    print("Average number of relevant documents per query:", int(sum(relevant_counts.values()) / len(relevant_counts)))

    return qrels

qrels = load_qrels()

Average number of relevant documents per query: 493


In [76]:
print(pytrec_eval.supported_measures)

{'11pt_avg', 'utility', 'Rndcg', 'gm_bpref', 'recip_rank', 'relative_P', 'infAP', 'Rprec', 'bpref', 'map_cut', 'P', 'set_P', 'map', 'set_map', 'num_q', 'set_relative_P', 'ndcg', 'gm_map', 'ndcg_rel', 'runid', 'iprec_at_recall', 'Rprec_mult', 'relstring', 'num_ret', 'recall', 'success', 'ndcg_cut', 'num_nonrel_judged_ret', 'binG', 'num_rel', 'set_recall', 'set_F', 'G', 'num_rel_ret'}


In [None]:
def compute_metrics(qrels, runs, folder):    
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'P.5', 'P.10', 'P.15', 'P.20'})
    for run_name, run in runs.items():
        k = run_name.split("_")[1]
        print(f"Computing metrics for run with k = {k}")
        results = evaluator.evaluate(run)
    
        # Compute average metrics
        metrics = ['map', 'P_5', 'P_10', 'P_15', 'P_20']
        avg_scores = {metric: 0.0 for metric in metrics}
        num_queries = len(results)
        
        for res in results.values():
            for metric in metrics:
                avg_scores[metric] += res.get(metric, 0.0)
        
        for metric in metrics:
            avg_scores[metric] /= num_queries
        
        # Prepare output directory
        output_dir = os.path.join("../results", folder)
        os.makedirs(output_dir, exist_ok=True)
        
        # Save per-query metrics
        per_query_path = os.path.join(output_dir, f"per_query_metrics_top_{k}.json")
        with open(per_query_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)
        
        # Save average metrics
        avg_metrics_path = os.path.join(output_dir, f"average_metrics_top_{k}.json")
        with open(avg_metrics_path, "w", encoding="utf-8") as f:
            json.dump(avg_scores, f, indent=4)
        
        print(f"✅ Per-query metrics saved to: {per_query_path}")
        print(f"✅ Average metrics saved to: {avg_metrics_path}\n")
    
compute_metrics(qrels, runs, 'phase_1')

Computing metrics for run with k = 20
✅ Per-query metrics saved to: ../results\phase_1\per_query_metrics_top_20.json
✅ Average metrics saved to: ../results\phase_1\average_metrics_top_20.json

Computing metrics for run with k = 30
✅ Per-query metrics saved to: ../results\phase_1\per_query_metrics_top_30.json
✅ Average metrics saved to: ../results\phase_1\average_metrics_top_30.json

Computing metrics for run with k = 50
✅ Per-query metrics saved to: ../results\phase_1\per_query_metrics_top_50.json
✅ Average metrics saved to: ../results\phase_1\average_metrics_top_50.json



## Phase 2: Query Expansion with Wordnet

In [78]:
import nltk
from nltk.corpus import wordnet as wn

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [79]:
input_dir = '../data/trec-covid/'

with jsonlines.open(input_dir + 'corpus.jsonl') as reader:
    corpus = [obj for obj in reader]

with jsonlines.open(input_dir + 'queries.jsonl') as reader:
    queries = [obj for obj in reader]

test_df = pd.read_csv(input_dir + 'qrels/' + 'test.tsv', sep='\t')

In [125]:
def get_wordnet_synonyms(word, max_synonyms=1):
    synonyms = set()
    word = word.lower()

    for syn in wn.synsets(word):
        # Keep only nouns and adjectives
        if syn.pos() in ('n', 'a'):
            for lemma in syn.lemmas():
                name = lemma.name().replace("_", " ").lower()

                # Filter out:
                if name == word: # Skip the original word
                    continue
                if len(name.split()) > 1:  # Skip multi-word phrases
                    continue
                if not name.isalpha(): # Skip non-alphabetic words
                    continue

                synonyms.add(name)

    # Rank by frequency (most-used synonyms first)
    ranked_synonyms = sorted(synonyms, key=lambda s: -sum(lemma.count() for syn in wn.synsets(s) for lemma in syn.lemmas() if lemma.name().lower() == s))

    return ranked_synonyms[:max_synonyms] # Return up to max_synonyms

In [126]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def expand_query_with_synonyms(query_text):
    expanded_terms = []
    for word in word_tokenize(query_text.lower()):
        if word.isalpha() and word not in stop_words:
            synonyms = get_wordnet_synonyms(word)
            expanded_terms.extend(synonyms)
    return query_text + " " + " ".join(expanded_terms)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mitsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [127]:
expanded_queries = []
for query in tqdm(queries):
    new_query = query.copy()
    new_query["expanded_text"] = expand_query_with_synonyms(query["text"])
    expanded_queries.append(new_query)

100%|██████████| 50/50 [00:00<00:00, 320.00it/s]


In [128]:
with jsonlines.open("../data/trec-covid/queries_expanded_wordnet.jsonl", mode='w') as writer:
    for q in expanded_queries:
        writer.write({
            "_id": q["_id"],
            "text": q["expanded_text"]
        })
    print("✅ Expanded queries saved to queries_expanded_wordnet.jsonl")

✅ Expanded queries saved to queries_expanded_wordnet.jsonl


In [134]:
def process_queries_phase_2(queries_path):
    # Load queries
    with open(queries_path, 'r', encoding='utf-8') as f:
        queries = [json.loads(line) for line in f]

    INDEX_NAME = "ir2025-index"
    k_values = [20, 30, 50]

    runs = {f"run_{k}": {} for k in k_values}
    for k in k_values:
        output_dir = f"../results/phase_2"
        os.makedirs(output_dir, exist_ok=True)

        for query in tqdm(queries, desc=f"Processing Queries for run with k = {k}"):
            qid = query["_id"]
            original_text = query["text"]
            query_text = expand_query_with_synonyms(original_text)
            
            # This tells ElasticSearch:
            # - “Give higher importance to the original query terms (boost=2)”
            # - “But also consider the synonyms (with normal weight)”
            # => This prevents the expanded terms from drowning out the original intent.
            response = es.search(
                index=INDEX_NAME,
                query={
                    "bool": {
                        "should": [
                            { "match": { "text": { "query": original_text, "boost": 3 } }},
                            { "match": { "text": query_text }}
                        ]
                    }
                },
                size=k
            )

            runs[f"run_{k}"][qid] = {hit["_id"]: hit["_score"] for hit in response["hits"]["hits"]}

        # Save each run
        with open(os.path.join(output_dir, f'retrieval_top_{k}.json'), 'w', encoding='utf-8') as f:
            json.dump(runs[f"run_{k}"], f, ensure_ascii=False, indent=4)
            print(f"✅ Results saved to: ../results/phase_2/retrieval_top_{k}.json")

    return runs

In [135]:
runs = process_queries_phase_2("../data/trec-covid/queries_expanded_wordnet.jsonl")

Processing Queries for run with k = 20:   0%|          | 0/50 [00:00<?, ?it/s]

Processing Queries for run with k = 20: 100%|██████████| 50/50 [00:01<00:00, 27.17it/s]


✅ Results saved to: ../results/phase_2/retrieval_top_20.json


Processing Queries for run with k = 30: 100%|██████████| 50/50 [00:01<00:00, 26.62it/s]


✅ Results saved to: ../results/phase_2/retrieval_top_30.json


Processing Queries for run with k = 50: 100%|██████████| 50/50 [00:02<00:00, 23.60it/s]

✅ Results saved to: ../results/phase_2/retrieval_top_50.json





In [136]:
qrels = load_qrels()

Average number of relevant documents per query: 493


In [137]:
compute_metrics(qrels, runs, 'phase_2')

Computing metrics for run with k = 20
✅ Per-query metrics saved to: ../results\phase_2\per_query_metrics_top_20.json
✅ Average metrics saved to: ../results\phase_2\average_metrics_top_20.json

Computing metrics for run with k = 30
✅ Per-query metrics saved to: ../results\phase_2\per_query_metrics_top_30.json
✅ Average metrics saved to: ../results\phase_2\average_metrics_top_30.json

Computing metrics for run with k = 50
✅ Per-query metrics saved to: ../results\phase_2\per_query_metrics_top_50.json
✅ Average metrics saved to: ../results\phase_2\average_metrics_top_50.json



### Let's compare the (AVG) results:

In [143]:
phases = {
    "Phase 1": "../results/phase_1/average_metrics_top_{}.json",
    "Phase 2": "../results/phase_2/average_metrics_top_{}.json",
    # "Phase 3": "../results/phase_3/average_metrics_top_{}.json",
    # "Phase 4": "../results/phase_4/average_metrics_top_{}.json"
}

In [150]:
def compare_phases(phases, k_values=[20, 30, 50], metrics=['map', 'P_5', 'P_10', 'P_15', 'P_20']):
    """
    Display and optionally compare retrieval metrics for 1 to 4 phases.
    Parameters:
    - phases: dict mapping phase names to base file paths, e.g.
        {
            "Phase 1": "../results/phase_1/average_metrics_top_{}.json",
            "Phase 2": "../results/phase_2/average_metrics_top_{}.json",
            ...
        }
    - k_values: list of cutoff values to compare (e.g. [20, 30, 50])
    - metrics: list of TREC metric keys (e.g. ['map', 'P_5', 'P_10'])

    Returns:
    - pandas DataFrame with metrics for all phases at each k
    """
    comparison = []

    for k in k_values:
        row = {"k": k}
        for phase_name, base_path in phases.items():
            try:
                with open(base_path.format(k), "r") as f:
                    phase_metrics = json.load(f)
                row[f"{phase_name} MAP"] = phase_metrics["map"]
                for m in metrics:
                    row[f"{phase_name} avgPre@{m[2:]}"] = phase_metrics[m]
            except FileNotFoundError:
                print(f"⚠️ File not found: {base_path.format(k)}")
        comparison.append(row)

    df = pd.DataFrame(comparison)
    df.sort_values("k", inplace=True)
    df.set_index("k", inplace=True) # Set 'k' column as the index for visualization purposes
    display(df)
    return df

In [151]:
_ = compare_phases(phases)

Unnamed: 0_level_0,Phase 1 MAP,Phase 1 avgPre@p,Phase 1 avgPre@5,Phase 1 avgPre@10,Phase 1 avgPre@15,Phase 1 avgPre@20,Phase 2 MAP,Phase 2 avgPre@p,Phase 2 avgPre@5,Phase 2 avgPre@10,Phase 2 avgPre@15,Phase 2 avgPre@20
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20,0.01988,0.01988,0.632,0.59,0.556,0.545,0.015925,0.015925,0.504,0.51,0.478667,0.464
30,0.026913,0.026913,0.0,0.59,0.0,0.0,0.021949,0.021949,0.0,0.51,0.0,0.0
50,0.03879,0.03879,0.0,0.59,0.0,0.0,0.03175,0.03175,0.0,0.51,0.0,0.0
