### Start ElasticSearch manually before running the notebook:
On Windows:
- Make sure you have at least JDK 17
- Open a terminal and execute this (or run it as a Windows service):
```bash
C:\path\to\elasticsearch-8.17.2\bin\elasticsearch.bat
```
- No Greek characters should be present in the path.
- Leave that terminal window open.

- If no password was autogenerated execute this to get one:
```bash
.\bin\elasticsearch-reset-password.bat -u elastic
```

In [1]:
# %pip install -r "..\\requirements.txt"

# TODO: !
3210122 + 3210191 = 6420313
- So we get the `trec_covid` IR2025 collection.

In [2]:
from collections import Counter
import jsonlines
import json
import csv
import pandas as pd
from tqdm import tqdm
import pytrec_eval
from IPython.display import display

In [3]:
from dotenv import load_dotenv
import os

# Load .env file from the current directory
load_dotenv("..\\secrets\\secrets.env")

# Access environment variables
es_host = os.getenv("ES_HOST")
es_user = os.getenv("ES_USERNAME")
es_pass = os.getenv("ES_PASSWORD")

- Connect to ElasticSearch

In [4]:
from elasticsearch import Elasticsearch

es = Elasticsearch(es_host, basic_auth=(es_user, es_pass))

if es.ping():
    print("✅ Connected to ElasticSearch")
else:
    print("❌ Connection failed")

✅ Connected to ElasticSearch


- Load Index

In [5]:
INDEX_NAME = "ir2025-index"

# Delete the index if it already exists
if es.indices.exists(index=INDEX_NAME):
    print(f"✅ Index '{INDEX_NAME}' already exists.")

else:
    # Define the settings and mappings for the index
    settings = {
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                },
                "english_stemmer": {
                    "type": "kstem"
                }
            },
            "analyzer": {
                "custom_english": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase", # Converts all terms to lowercase
                        "english_stop", # Removes English stop words
                        "english_stemmer" # Reduces words to their root form usign kstem
                    ]
                }
            }
        }
    }
    
    mappings = {
        "properties": {
            "doc_id": {"type": "keyword"},
            "text": {
                "type": "text",
                "analyzer": "custom_english",
                "similarity": "BM25"
            }
        }
    }
    
    # Create the index with the specified settings and mappings
    es.indices.create(
        index=INDEX_NAME,
        settings=settings,
        mappings=mappings
    )
    print(f"✅ Index '{INDEX_NAME}' created")

✅ Index 'ir2025-index' already exists.


In [6]:
input_dir = '../data/trec-covid/'

with jsonlines.open(input_dir + 'corpus.jsonl') as reader:
    corpus = [obj for obj in reader]

In [None]:
# Simulate custom_english Analyzer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # KrovetzStemmer supports up to python 3.10 at best 
import string

# Initialize NLTK components
stop_words = set(stopwords.words('english'))

stemmer = PorterStemmer() # It's "Closer" to Korvetz than Snowball is

def es_like_preprocess(text):
    # Lowercase the text
    text = text.lower().strip()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove invisible or non-ASCII chars (TODO: SEEE IF TOU CAN ADD IT TO ElasticSearch AND THE OTHER PHASES)
    #text = text.encode("ascii", "ignore").decode("ascii")
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords, apply stemming (Porter), (TODO: SEEE IF TOU CAN ADD IT TO ElasticSearch AND THE OTHER PHASES) skip anything not purely arithmetic
    processed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words or not token.isalpha()]
    # Join tokens back into a single string
    return ' '.join(processed_tokens)

In [8]:
from nltk.tokenize import sent_tokenize
# Final list of tokenized sentences
processed_sentences = []
print("Preprocessing corpus into sentences...")
for doc in tqdm(corpus, unit="doc"): # ~8 minutes
    doc_text = doc["text"]
    # Split into sentences
    sentences = sent_tokenize(doc_text)
    for sentence in sentences:
        tokens = es_like_preprocess(sentence).split() 
        if tokens:  # skip empty
            processed_sentences.append(tokens)

Preprocessing corpus into sentences...


100%|██████████| 171332/171332 [08:47<00:00, 325.01doc/s]


- Train a Word2Vec Model

In [9]:
from gensim.models import Word2Vec
# --- Train Word2Vec --- # ~15 minutes
model = Word2Vec(processed_sentences, vector_size=200, window=5, min_count=5, epochs=15, negative=10, sample=1e-4, sg=1, workers=6, seed=42) # Use skip-gram = 1 | Use CBOW 0

In [11]:
import numpy as np
vec_norms = np.linalg.norm(model.wv.vectors, axis=1)
sorted_words = sorted(zip(model.wv.index_to_key, vec_norms), key=lambda x: x[1])
print("10 words with smallest vector norm:")
for word, norm in sorted_words[:10]:
    print(f"{word}: {norm:.4f}")


10 words with smallest vector norm:
wwwactabiomedicait: 0.0384
bioeng: 0.0396
protoc: 1.2187
briq: 1.4147
amer: 1.6496
physiol: 1.7822
bjog: 1.8216
subchapt: 1.8225
patientssubject: 1.8352
highinfect: 1.8380


### 🔧 Word2Vec Hyperparameter Summary

| Parameter     | Chosen Value | Purpose                                   | Pros                                              | Cons                                               |
|---------------|-------------------|-------------------------------------------|---------------------------------------------------|----------------------------------------------------|
| `vector_size` | 200               | Dimensionality of word embeddings         | Captures semantic nuances                         | Higher means more computational cost                          |
| `window`      | 5                 | Context window size                       | Balances syntactic and semantic information       | Too large may introduce noise                      |
| `min_count`   | 5                 | Minimum frequency threshold               | Removes rare noise words                          | May exclude rare but important terms               |
| `sg`          | 1 (Skip-Gram)     | Training algorithm                        | Better for rare words                             | Slower training                                    |
| `epochs`      | 15                | Number of training iterations             | Improves convergence                              | Risk of overfitting with too many epochs           |
| `negative`    | 10                | Number of negative samples                | Enhances embedding quality                        | Too many can slow training                         |
| `sample`      | 1e-4              | Subsampling frequent words                | Reduces dominance of frequent/common words        | May remove useful frequent words if too aggressive |
| `workers`     | 6         | Number of parallel training threads       | Speeds up training                                | Overhead with too many threads                     |
| `seed`        | 42                | Random seed for reproducibility           | Ensures consistent results                        | None                                               |


In [None]:
import os
# --- Save model ---
os.makedirs("../models", exist_ok=True)

# Save only the KeyedVectors part
model.wv.save("../models/w2v_ir2025.kv")
print("✅ Word2Vec model saved (only KeyedVectors) to: ../models/w2v_ir2025.kv")

In [13]:
from gensim.models import KeyedVectors

# --- Load model ---
kv_model = KeyedVectors.load("../models/w2v_ir2025.kv", mmap='r')
print("✅ Word2Vec model (only KeyedVectors) successfully loaded.")

✅ Word2Vec model (only KeyedVectors) successfully loaded.


In [14]:
print(f"Vocabulary size: {len(kv_model.key_to_index)}")

Vocabulary size: 65459


In [15]:
import numpy as np
vector_norms = np.linalg.norm(kv_model.vectors, axis=1)
print(f"Mean vector norm: {np.mean(vector_norms):.4f}")
print(f"Max vector norm: {np.max(vector_norms):.4f}")
print(f"Min vector norm: {np.min(vector_norms):.4f}")

Mean vector norm: 3.7523
Max vector norm: 7.7379
Min vector norm: 0.0384


In [23]:
def expand_query_with_word2vec(query_text, kv_model, topn=3):
    """
    Expands a query by finding similar terms using Word2Vec model.
    
    Args:
        query_text (str): Original query text
        kv_model: Loaded KeyedVectors model
        topn (int): Number of similar terms to add per query term
        
    Returns:
        str: Expanded query text
    """
    # Preprocess query same way as corpus
    query_tokens = es_like_preprocess(query_text).split()
    
    expanded_terms = []

    for token in query_tokens:
        expanded_terms.append(token)

        if token in kv_model.key_to_index:
            similar_terms = kv_model.most_similar(token, topn=topn)
            for term, score in similar_terms:
                if score >= 0.7 and term.isalpha():  # add only if clean and relevant
                    expanded_terms.append(term)

    return " ".join(expanded_terms)

In [24]:
with jsonlines.open(input_dir + 'queries.jsonl') as reader:
    queries = [obj for obj in reader]
    print(f"Loaded {len(queries)} queries.")

Loaded 50 queries.


In [26]:
expanded_queries = []
print("Expanding Queries..")
for query in tqdm(queries, unit="query"):
    new_query = query.copy()
    new_query["expanded_text"] = expand_query_with_word2vec(query["text"], kv_model)
    expanded_queries.append(new_query)

Expanding Queries..


  0%|          | 0/50 [00:00<?, ?query/s]

100%|██████████| 50/50 [00:01<00:00, 34.41query/s]


In [27]:
with jsonlines.open("../data/trec-covid/queries_expanded_word2vec.jsonl", mode='w') as writer:
    for q in expanded_queries:
        writer.write(q)
    print("✅ Expanded queries saved to ../data/trec-covid/queries_expanded_word2vec.jsonl")

✅ Expanded queries saved to ../data/trec-covid/queries_expanded_word2vec.jsonl


In [29]:
def process_queries_phase_3(expanded_queries_path):
    # Load queries
    with open(expanded_queries_path, 'r', encoding='utf-8') as f:
        queries = [json.loads(line) for line in f]

    INDEX_NAME = "ir2025-index"
    k_values = [20, 30, 50]

    runs = {f"run_{k}": {} for k in k_values}
    for k in k_values:
        output_dir = f"../results/phase_3"
        os.makedirs(output_dir, exist_ok=True)

        for query in tqdm(queries, desc=f"Processing Expanded Queries with Word2Vec for run with k = {k}"):
            qid = query["_id"]
            query_text = query["expanded_text"] # already did this: expand_query_with_word2vec()
            
            response = es.search(
                index=INDEX_NAME,
                query={
                    "bool": {
                        "should": [
                            { "match": { "text": query_text }}
                        ]
                    }
                },
                size=k
            )

            runs[f"run_{k}"][qid] = {hit["_id"]: hit["_score"] for hit in response["hits"]["hits"]}

        # Save each run
        with open(os.path.join(output_dir, f'retrieval_top_{k}.json'), 'w', encoding='utf-8') as f:
            json.dump(runs[f"run_{k}"], f, ensure_ascii=False, indent=4)
            print(f"✅ Results saved to: ../results/phase_3/retrieval_top_{k}.json")

    return runs
    
runs = process_queries_phase_3("../data/trec-covid/queries_expanded_word2vec.jsonl")

Processing Expanded Queries with Word2Vec for run with k = 20:   0%|          | 0/50 [00:00<?, ?it/s]

Processing Expanded Queries with Word2Vec for run with k = 20: 100%|██████████| 50/50 [00:02<00:00, 21.69it/s]


✅ Results saved to: ../results/phase_3/retrieval_top_20.json


Processing Expanded Queries with Word2Vec for run with k = 30: 100%|██████████| 50/50 [00:01<00:00, 46.42it/s]


✅ Results saved to: ../results/phase_3/retrieval_top_30.json


Processing Expanded Queries with Word2Vec for run with k = 50: 100%|██████████| 50/50 [00:01<00:00, 32.87it/s]

✅ Results saved to: ../results/phase_3/retrieval_top_50.json





In [30]:
def load_qrels(qrels_path="../data/trec-covid/qrels/test.tsv"):
    qrels = {}
    with open(qrels_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            qid = row['query-id']
            docid = row['corpus-id']
            relevance = int(row['score'])
            qrels.setdefault(qid, {})[docid] = relevance

    relevant_counts = Counter()
    for qid, docs in qrels.items():
        relevant_counts[qid] = sum(1 for rel in docs.values() if rel > 0)
    print("Average number of relevant documents per query:", int(sum(relevant_counts.values()) / len(relevant_counts)))

    return qrels

qrels = load_qrels()

Average number of relevant documents per query: 493


In [32]:
def compute_metrics(qrels, runs, folder, metrics=['map', 'P_5', 'P_10', 'P_15', 'P_20']):    
    # Metrics to Evaluate
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'P'})
    
    for run_name, run in runs.items():
        k = run_name.split("_")[1]
        print(f"Computing metrics for run with k = {k}")
        
        # Verify how many documents were retrieved per query
        # for query_id, docs in run.items():
            # num_docs = len(docs)
            # print(f"Query ID: {query_id} - Retrieved Documents: {num_docs}")
            
        results = evaluator.evaluate(run)
        
        #Print available metrics for debugging
        # first_query = list(results.keys())[0]
        # print(f"Available metrics for {first_query}: {list(results[first_query].keys())}")
        
        # Compute average metrics
        avg_scores = {metric: 0.0 for metric in metrics}
        num_queries = len(results)
        
        for res in results.values():
            for metric in metrics:
                avg_scores[metric] += res.get(metric, 0.0)
        
        for metric in metrics:
            avg_scores[metric] /= num_queries
                                                                                                                                               
        # Prepare output directory
        output_dir = os.path.join("../results", folder)
        os.makedirs(output_dir, exist_ok=True)
        
        # Save per-query metrics
        per_query_path = os.path.join(output_dir, f"per_query_metrics_top_{k}.json")
        with open(per_query_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)
        
        # Save average metrics
        avg_metrics_path = os.path.join(output_dir, f"average_metrics_top_{k}.json")
        with open(avg_metrics_path, "w", encoding="utf-8") as f:
            json.dump(avg_scores, f, indent=4)
        
        print(f"✅ Per-query metrics saved to: {per_query_path}")
        print(f"✅ Average metrics saved to: {avg_metrics_path}\n")
        
compute_metrics(qrels, runs, "phase_3")

Computing metrics for run with k = 20
✅ Per-query metrics saved to: ../results\phase_3\per_query_metrics_top_20.json
✅ Average metrics saved to: ../results\phase_3\average_metrics_top_20.json

Computing metrics for run with k = 30
✅ Per-query metrics saved to: ../results\phase_3\per_query_metrics_top_30.json
✅ Average metrics saved to: ../results\phase_3\average_metrics_top_30.json

Computing metrics for run with k = 50
✅ Per-query metrics saved to: ../results\phase_3\per_query_metrics_top_50.json
✅ Average metrics saved to: ../results\phase_3\average_metrics_top_50.json



In [33]:
def compare_phases(phases, k_values=[20, 30, 50], metrics=['map', 'P_5', 'P_10', 'P_15', 'P_20']):
    """
    Display and optionally compare retrieval metrics for 1 to 4 phases.
    Parameters:
    - phases: dict mapping phase names to base file paths, e.g.
        {
            "Phase 1": "../results/phase_1/average_metrics_top_{}.json",
            "Phase 2": "../results/phase_2/average_metrics_top_{}.json",
            ...
        }
    - k_values: list of cutoff values to compare (e.g. [20, 30, 50])
    - metrics: list of TREC metric keys (e.g. ['map', 'P_5', 'P_10'])

    Returns:
    - pandas DataFrame with metrics for all phases at each k
    """
    comparison = []

    for k in k_values:
        row = {"k": k}
        for phase_name, base_path in phases.items():
            try:
                with open(base_path.format(k), "r") as f:
                    phase_metrics = json.load(f)
                row[f"{phase_name} MAP"] = phase_metrics["map"]
                for m in metrics[1:]: # exclude MAP
                    row[f"{phase_name} avgPre@{m[2:]}"] = phase_metrics[m]
            except FileNotFoundError:
                print(f"⚠️ File not found: {base_path.format(k)}")
        comparison.append(row)

    df = pd.DataFrame(comparison)
    df.sort_values("k", inplace=True)
    df.set_index("k", inplace=True) # Set 'k' column as the index for visualization purposes
    display(df)
    return df

In [34]:
phases = {
    "Phase 1": "../results/phase_1/average_metrics_top_{}.json",
    "Phase 2": "../results/phase_2/average_metrics_top_{}.json",
    "Phase 3": "../results/phase_3/average_metrics_top_{}.json",
    # "Phase 4": "../results/phase_4/average_metrics_top_{}.json"
}
_ = compare_phases(phases)

Unnamed: 0_level_0,Phase 1 MAP,Phase 1 avgPre@5,Phase 1 avgPre@10,Phase 1 avgPre@15,Phase 1 avgPre@20,Phase 2 MAP,Phase 2 avgPre@5,Phase 2 avgPre@10,Phase 2 avgPre@15,Phase 2 avgPre@20,Phase 3 MAP,Phase 3 avgPre@5,Phase 3 avgPre@10,Phase 3 avgPre@15,Phase 3 avgPre@20
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20,0.020569,0.64,0.582,0.564,0.548,0.020473,0.604,0.586,0.554667,0.536,0.006545,0.252,0.224,0.224,0.229
30,0.027753,0.64,0.582,0.564,0.549,0.02828,0.604,0.586,0.554667,0.536,0.008953,0.252,0.224,0.224,0.229
50,0.039911,0.64,0.582,0.564,0.549,0.040742,0.604,0.586,0.554667,0.536,0.013308,0.252,0.224,0.224,0.229
