### Start Elasticsearch manually before running the notebook:
On Windows:
- Make sure you have at least JDK 17
- Open a terminal and execute this (or run it as a Windows service):
```bash
C:\path\to\elasticsearch-8.17.2\bin\elasticsearch.bat
```
- No Greek characters should be present in the path.
- Leave that terminal window open.

- If no password was auto-generated, run this from the Elasticsearch installation directory to generate one:
```bash
.\bin\elasticsearch-reset-password.bat -u elastic
```

In [1]:
# Install the project's dependencies into the environment of the running kernel.
# The requirements file is looked up one directory above the working directory.
%pip install -r "..\\requirements.txt"

Note: you may need to restart the kernel to use updated packages.


3210122 + 32 = 3210154
- So we get the IR2025 collection.

In [2]:
# Show the packages (and versions) installed in the kernel's environment.
%pip list

Package                   Version
------------------------- --------------
anyio                     4.9.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 3.0.0
async-lru                 2.0.5
attrs                     25.3.0
babel                     2.17.0
beautifulsoup4            4.13.3
bleach                    6.2.0
certifi                   2025.1.31
cffi                      1.17.1
chardet                   5.2.0
charset-normalizer        3.4.1
click                     8.1.8
colorama                  0.4.6
comm                      0.2.2
contourpy                 1.3.1
cycler                    0.12.1
debugpy                   1.8.13
decorator                 5.2.1
defusedxml                0.7.1
elastic-transport         8.17.1
elasticsearch             8.10.0
executing                 2.2.0
faiss-cpu                 1.10.0
fastjsonschema            2.21.1
fonttools                 4.56.0
fqdn        

### Load and Preprocess the Data

In [None]:
import json
import re

def preprocess(text):
    """Normalize a string: lowercase it, strip punctuation, tidy whitespace.

    Characters that are neither word characters nor whitespace are removed
    (underscores count as word characters and are kept). Any run of
    whitespace is then collapsed to a single space and the result is
    trimmed at both ends.
    """
    lowered = text.lower()
    # Drop everything that is not a word character or whitespace
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    # Collapse whitespace runs, then trim the edges
    return re.sub(r"\s+", " ", without_punctuation).strip()

def process_jsonl(input_path, output_path):
    """Copy a JSON Lines file, normalizing each record's "text" field.

    Every non-blank line of ``input_path`` is parsed as JSON. When the parsed
    object has a "text" key, its value is replaced by ``preprocess(value)``.
    Each record is then written to ``output_path`` as one JSON document per
    line.

    Blank lines (for example a trailing empty line) are skipped instead of
    raising ``json.JSONDecodeError``.

    Args:
        input_path: Path of the UTF-8 encoded JSONL file to read.
        output_path: Path of the UTF-8 encoded JSONL file to write; an
            existing file is overwritten.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(input_path, 'r', encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Tolerate empty / whitespace-only lines in the input
            if not line.strip():
                continue
            obj = json.loads(line)
            if "text" in obj:
                obj["text"] = preprocess(obj["text"])
            json.dump(obj, outfile)
            outfile.write("\n")


### Connect to Elasticsearch

In [3]:
from dotenv import load_dotenv
import os

# Load the secrets file from ../secrets (relative to the working directory).
# os.path.join keeps the path portable; on Windows it produces the same
# backslash-separated path that was previously hard-coded.
load_dotenv(os.path.join("..", "secrets", "secrets.env"))

# Read the Elasticsearch connection settings from the environment
es_host = os.getenv("ES_HOST")
es_user = os.getenv("ES_USERNAME")
es_pass = os.getenv("ES_PASSWORD")

# Fail early with a clear message instead of handing None to the client later
_missing_env = [
    name
    for name, value in (
        ("ES_HOST", es_host),
        ("ES_USERNAME", es_user),
        ("ES_PASSWORD", es_pass),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(_missing_env)
        + ". Check that the secrets file exists and defines them."
    )

In [4]:
from elasticsearch import Elasticsearch

# Create the client using the host and basic-auth credentials read earlier
es = Elasticsearch(es_host, basic_auth=(es_user, es_pass))

# Report whether the cluster answered the ping
print("✅ Connected to ElasticSearch" if es.ping() else "❌ Connection failed")

✅ Connected to ElasticSearch


### Create and Load the Index

In [None]:
INDEX_NAME = "ir2025-index"

# Delete the old index if it exists, so the cell can be re-run from scratch
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)

# Index definition: a custom analyzer (standard tokenizer + English stop words)
# applied to the "text" field. No similarity is configured here, so
# Elasticsearch's default (BM25) applies.
mapping = {
    "settings": {
        "analysis": {
            "analyzer": {
                "my_english": {
                    "type": "standard",
                    "stopwords": "_english_"
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "doc_id": {"type": "keyword"},
            "text": {
                "type": "text",
                "analyzer": "my_english"
            }
        }
    }
}

# Pass settings/mappings as keyword arguments: the catch-all `body=` parameter
# is deprecated in the 8.x Python client for APIs that expose typed arguments.
es.indices.create(
    index=INDEX_NAME,
    settings=mapping["settings"],
    mappings=mapping["mappings"],
)

In [None]:
import json
from tqdm import tqdm

# Local import: the bulk helper ships with the already-installed elasticsearch package
from elasticsearch.helpers import bulk

# Stream the corpus into Elasticsearch in batches instead of issuing one HTTP
# request per document. The generator is consumed lazily by bulk(), so the
# file is never loaded into memory as a whole and tqdm still shows progress.
with open("data/processed_corpus.jsonl", "r", encoding="utf-8") as f:
    actions = (
        {"_index": INDEX_NAME, "_id": doc["doc_id"], "_source": doc}
        for doc in (
            json.loads(line)
            for line in tqdm(f)
            if line.strip()  # skip blank lines rather than failing to parse them
        )
    )
    # bulk() raises if any document fails to index (raise_on_error defaults to True)
    bulk(es, actions)

print("✅ Done")