### Start ElasticSearch manually before running the notebook:
On Windows:
- Make sure you have at least JDK 17
- Open a terminal and execute this (or run it as a Windows service):
```bash
C:\path\to\elasticsearch-8.17.2\bin\elasticsearch.bat
```
- No Greek characters should be present in the path.
- Leave that terminal window open.

- If no password was autogenerated execute this to get one:
```bash
.\bin\elasticsearch-reset-password.bat -u elastic
```

In [1]:
%pip install -r "..\\requirements.txt"

Note: you may need to restart the kernel to use updated packages.


3210122 + 3210191 = 6420313
- So we get the `trec_covid` IR2025 collection.

In [2]:
%pip list

Package                   Version
------------------------- --------------
anyio                     4.9.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 3.0.0
async-lru                 2.0.5
attrs                     25.3.0
babel                     2.17.0
beautifulsoup4            4.13.3
bleach                    6.2.0
certifi                   2025.1.31
cffi                      1.17.1
chardet                   5.2.0
charset-normalizer        3.4.1
click                     8.1.8
colorama                  0.4.6
comm                      0.2.2
contourpy                 1.3.1
cycler                    0.12.1
debugpy                   1.8.13
decorator                 5.2.1
defusedxml                0.7.1
elastic-transport         8.17.1
elasticsearch             8.10.0
et_xmlfile                2.0.0
executing                 2.2.0
faiss-cpu                 1.10.0
fastjsonschema            2.21.1
fonttools    

> Load and Preprocess the Data

In [3]:
# import json
# import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import SnowballStemmer

# nltk.download('stopwords')

# stop_words = set(stopwords.words('english'))
# stemmer = SnowballStemmer("english")

# def preprocess(text):
#     # Lowercase
#     text = text.lower()
#     # Remove punctuation
#     text = re.sub(r"[^\w\s]", "", text)
#     # Tokenize
#     tokens = text.split()
#     # Remove stopwords and apply stemming
#     tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
#     # Join back into string
#     return " ".join(tokens)

# def process_jsonl(input_path="..\\data\\trec-covid\\corpus.jsonl", output_path="..\\data\\corpus_processed.jsonl"):
#     with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
#         for line in infile:
#             obj = json.loads(line)
#             if "text" in obj:
#                 obj["text"] = preprocess(obj["text"])
#             json.dump(obj, outfile)
#             outfile.write("\n")

In [4]:
# # Verify preprocessing works
# example = "The quick brown foxes were jumping over the lazy dogs."
# print(preprocess(example))

In [5]:
# process_jsonl()

### Step 1: Load, Preprocess Data & Create Index

In [6]:
from dotenv import load_dotenv
import os

# Load .env file from the current directory
load_dotenv("..\\secrets\\secrets.env")

# Access environment variables
es_host = os.getenv("ES_HOST")
es_user = os.getenv("ES_USERNAME")
es_pass = os.getenv("ES_PASSWORD")

- Connect to ElasticSearch

In [7]:
from elasticsearch import Elasticsearch

es = Elasticsearch(es_host, basic_auth=(es_user, es_pass))

if es.ping():
    print("✅ Connected to ElasticSearch")
else:
    print("❌ Connection failed")

✅ Connected to ElasticSearch


- Create Index

In [8]:
INDEX_NAME = "ir2025-index"

# Delete the index if it already exists
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)
    print(f"✅ Index '{INDEX_NAME}' deleted.")

# Define the settings and mappings for the index
settings = {
    "analysis": {
        "analyzer": {
            "custom_english": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": [
                    "lowercase",
                    "english_stop"
                ]
            }
        },
        "filter": {
            "english_stop": {
                "type": "stop",
                "stopwords": "_english_"
            }
        }
    }
}

mappings = {
    "properties": {
        "doc_id": {"type": "keyword"},
        "text": {"type": "text", "analyzer": "custom_english", "similarity": "BM25"}
    }
}

# Create the index with the specified settings and mappings
es.indices.create(
    index=INDEX_NAME,
    settings={"analysis": settings["analysis"]},
    mappings=mappings
)
print(f"✅ Index '{INDEX_NAME}' created")

✅ Index 'ir2025-index' deleted.
✅ Index 'ir2025-index' created


### Step 2: Populate Index

In [9]:
import json
from elasticsearch.helpers import streaming_bulk
from tqdm import tqdm

# Generator function to yield documents
def generate_documents(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            doc = json.loads(line)
            yield {
                "_index": INDEX_NAME,
                "_id": doc["_id"],
                "_source": {
                    "doc_id": doc["_id"],
                    "text": doc["text"]
                }
            }

# Path to your JSONL file
file_path = "../data/trec-covid/corpus.jsonl"

# Count the total number of documents for the progress bar
with open(file_path, 'r', encoding='utf-8') as f:
    total_docs = sum(1 for _ in f)

# Initialize the progress bar
progress = tqdm(unit="docs", total=total_docs)

successes = 0
for ok, action in streaming_bulk(client=es, actions=generate_documents(file_path), chunk_size=500):
    progress.update(1)
    successes += int(ok)

progress.close()
print(f"✅ Indexed {successes}/{total_docs} documents into '{INDEX_NAME}'")

100%|██████████| 171332/171332 [00:28<00:00, 5961.71docs/s]

✅ Indexed 171332/171332 documents into 'ir2025-index'





### Step 3: Execute Queries

In [10]:
import json
from tqdm import tqdm

# Path to your queries file
queries_path = "../data/trec-covid/queries.jsonl"

# Load queries
with open(queries_path, 'r', encoding='utf-8') as f:
    queries = [json.loads(line) for line in f]
    
INDEX_NAME = "ir2025-index"
k_values = [20, 30, 50] # Number of top documents to retrieve

responses = {k:{} for k in k_values} # nested dictionary

for query in tqdm(queries, desc="Processing Queries"):
    qid = query["_id"]
    query_text = query["text"]
    for k in k_values:
        response = es.search(
            index=INDEX_NAME,
            query={"match": {"text": query_text}},
            size=k
        )
        responses[k][qid] = [hit["_id"] for hit in response["hits"]["hits"]]

Processing Queries: 100%|██████████| 50/50 [00:01<00:00, 26.20it/s]


In [11]:
import json
output_dir = "../results/"
# Loop over each k and save a JSON file
for k, qid_dict in responses.items():
    nested_json = {}

    for query_id, doc_ids in qid_dict.items():
        nested_json[query_id] = [
            {"doc_id": doc_id, "rank": rank}
            for rank, doc_id in enumerate(doc_ids, start=1)
        ]

    output_path = output_dir + f"retrieval_top_{k}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(nested_json, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved retrieval results for top {k} to: {output_path}")

✅ Saved retrieval results for top 20 to: ../results/retrieval_top_20.json
✅ Saved retrieval results for top 30 to: ../results/retrieval_top_30.json
✅ Saved retrieval results for top 50 to: ../results/retrieval_top_50.json


### Step 4: Query Evaluation

In [14]:
import csv
qrels_path = "../data/trec-covid/qrels/test.tsv"

qrels = {}
with open(qrels_path, 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter='\t')
    for row in reader:
        qid = row['query-id']
        docid = row['corpus-id']
        relevance = int(row['score'])
        qrels.setdefault(qid, {})[docid] = relevance

In [16]:
import pytrec_eval
import pandas as pd

output_dir = "../results/"

# Define evaluator
evaluator = pytrec_eval.RelevanceEvaluator(
    qrels, {'map', 'P.5', 'P.10', 'P.15', 'P.20'}
)

# Store results per k
all_avg_scores = []

for k, qid_dict in responses.items():
    # Build `run` dict with fake scores (1 / rank)
    run = {}
    for qid, doc_ids in qid_dict.items():
        run[qid] = {
            doc_id: 1.0 / (rank + 1)  # e.g., 1.0, 0.5, 0.33, ...
            for rank, doc_id in enumerate(doc_ids)
        }

    # Evaluate using pytrec_eval
    results = evaluator.evaluate(run)

    # Average results across all queries
    metrics = ['map', 'P_5', 'P_10', 'P_15', 'P_20']
    avg_scores = {metric: 0.0 for metric in metrics}

    for res in results.values():
        for metric in metrics:
            avg_scores[metric] += res.get(metric, 0.0)

    num_queries = len(results)
    for metric in metrics:
        avg_scores[metric] /= num_queries

    avg_scores["run_id"] = f"top_{k}"
    all_avg_scores.append(avg_scores)

# Convert to DataFrame
df_metrics = pd.DataFrame(all_avg_scores)
print(df_metrics)

# Optional: Save to Excel
df_metrics.to_excel(output_dir + "evaluation_metrics.xlsx", index=False)
print(f"✅ Evaluation results saved to '{output_dir}evaluation_metrics.xlsx'")

        map    P_5   P_10      P_15   P_20  run_id
0  0.019953  0.636  0.594  0.557333  0.547  top_20
1  0.026907  0.000  0.594  0.000000  0.000  top_30
2  0.038876  0.000  0.594  0.000000  0.000  top_50
✅ Evaluation results saved to '../results/evaluation_metrics.xlsx'
