# ELSER v2 + BM25 Search — Single-Notebook Edition

**Last generated:** 2025-11-04T02:29:28

**What you get**  
- Robust hybrid search (ELSER `text_expansion` + BM25) with automatic query-time fallback to BM25-only when the hybrid request raises an API error.  
- Zero up‑front license/deployment probing.  
- Easy config cell — just point to your `.xlsx`, `.xls`, or `.csv` with a text column.

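> **Tip:** If ELSER is not available, queries automatically fall back to BM25-only. Indexing does not fall back on its own: set `BM25_ONLY = True` before (re)indexing so that documents are not sent through the ELSER ingest pipeline.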

## 1) Install dependencies

In [None]:
# If running on a fresh environment, uncomment:
# %pip install elasticsearch==8.13.0 pandas openpyxl python-dateutil

## 2) Class definition (BertDescriptionElser)

In [1]:
from __future__ import annotations

from datetime import datetime
from pathlib import Path
from typing import Dict, Any, Iterable, Optional, List, Sequence, Union

import pandas as pd
from dateutil import parser as dtparser
from elasticsearch import Elasticsearch, helpers

# elastic_transport.ApiError moved across versions; guard for broader compat
try:
    from elastic_transport import ApiError
except Exception:
    try:
        from elasticsearch import ApiError  # type: ignore
    except Exception:
        class ApiError(Exception):
            pass

try:
    from elasticsearch.helpers import BulkIndexError
except Exception:
    class BulkIndexError(Exception):
        pass


def _coerce_str(v) -> Optional[str]:
    """Return ``str(v)`` with surrounding whitespace removed, or ``None``.

    ``None`` is returned both when *v* is ``None`` and when the stripped
    text is empty.
    """
    if v is None:
        return None
    text = str(v).strip()
    # An empty string is falsy, so ``or`` maps it to None.
    return text or None


def to_iso(v) -> Optional[str]:
    """Convert *v* to an ISO-8601 string, or return ``None`` if that fails.

    Missing values (as reported by ``pd.isna``) yield ``None``. ``datetime``
    instances are formatted directly; anything else is stringified and
    handed to ``dateutil``'s parser. Any error raised while parsing or
    formatting results in ``None`` rather than an exception.
    """
    if pd.isna(v):
        return None
    try:
        moment = v if isinstance(v, datetime) else dtparser.parse(str(v))
        return moment.isoformat()
    except Exception:
        return None


class BertDescriptionElser:
    """Index one text column into Elasticsearch and query it with BM25, optionally plus ELSER.

    The instance holds an Elasticsearch client together with the index name,
    ingest pipeline id, model id and the name of the text column. When
    ``use_ml`` is true, bulk actions are routed through an inference ingest
    pipeline that writes to ``ml.description_tokens`` and queries add a
    ``text_expansion`` clause next to the BM25 ``match`` clause.
    """

    # Column names probed, in this order, to populate the ``timestamp`` field.
    _TIMESTAMP_CANDIDATES = ("created_dttm", "created_at", "timestamp", "time", "date")

    def __init__(
        self,
        es_url: str = "http://localhost:9200",
        es_user: str = "elastic",
        es_pass: str = "changeme",
        index_name: str = "chat_elser_description_only",
        pipeline_id: str = "elser_v2_description_only",
        model_id: str = ".elser_model_2_linux-x86_64",
        description_col: str = "Description",
        request_timeout: int = 120,
        use_ml: bool = True,
        verify_certs: bool = False,
    ) -> None:
        """Create the Elasticsearch client and remember the configuration.

        Args:
            es_url: Base URL of the Elasticsearch node.
            es_user: Username for HTTP basic authentication.
            es_pass: Password for HTTP basic authentication.
            index_name: Index that documents are written to and searched in.
            pipeline_id: Id of the ingest pipeline used when ML is requested.
            model_id: Model id used by the ingest pipeline and by the
                ``text_expansion`` query clause.
            description_col: Name of the text column / document field.
            request_timeout: Client request timeout passed to ``Elasticsearch``.
            use_ml: Whether to try ELSER in addition to BM25.
            verify_certs: Passed to ``Elasticsearch``. The default ``False``
                keeps the previous behaviour (TLS certificates are not
                verified); set it to ``True`` for clusters with trusted
                certificates.
        """
        self.es = Elasticsearch(
            es_url,
            basic_auth=(es_user, es_pass),
            request_timeout=request_timeout,
            verify_certs=verify_certs,
        )
        self.index_name = index_name
        self.pipeline_id = pipeline_id
        self.model_id = model_id
        self.description_col = description_col
        self.use_ml_requested = use_ml  # user preference to try ELSER

    # --------------------------
    # Mapping and pipeline
    # --------------------------
    def ensure_index(self) -> None:
        """Create the index with a minimal mapping, or update the mapping if it exists.

        The mapping declares the text column, a lenient ``timestamp`` date
        field and the ``ml.description_tokens`` rank_features field. The
        token field is declared unconditionally, whether or not ML is used.
        """
        props: Dict[str, Any] = {
            self.description_col: {"type": "text"},
            "timestamp": {"type": "date", "ignore_malformed": True},
        }
        props.setdefault("ml", {"properties": {}})
        props["ml"]["properties"]["description_tokens"] = {"type": "rank_features"}

        body = {"mappings": {"properties": props}}
        if self.es.indices.exists(index=self.index_name):
            self.es.indices.put_mapping(index=self.index_name, properties=props)
        else:
            self.es.indices.create(index=self.index_name, **body)

    def ensure_pipeline(self) -> None:
        """Create or update the ingest pipeline that writes to ``ml.description_tokens``.

        Does nothing when ML was not requested. Errors raised by the client
        propagate to the caller; use ``ensure_ready()`` for a best-effort call.

        NOTE(review): whether Elasticsearch accepts this pipeline while the
        model is missing or not deployed is not established by this code —
        confirm against the target cluster version.
        """
        if not self.use_ml_requested:
            return
        processors = [
            {
                "inference": {
                    "model_id": self.model_id,
                    "inference_config": {
                        "text_expansion": {"results_field": "ml.description_tokens"}
                    },
                    "field_map": {self.description_col: "text_field"},
                }
            }
        ]
        self.es.ingest.put_pipeline(id=self.pipeline_id, processors=processors)

    def ensure_ready(self) -> None:
        """Best-effort pipeline setup kept for older scripts that call ``ensure_ready()``.

        Only the ingest pipeline is touched, and only when ML was requested;
        the index itself is not created here. Failures never propagate, but
        they are logged as a warning so that a missing pipeline is visible
        before ingestion is attempted.
        """
        if not self.use_ml_requested:
            return
        try:
            self.ensure_pipeline()
        except Exception as exc:  # deliberate best-effort: report, do not raise
            import logging

            logging.getLogger(__name__).warning(
                "Could not create ingest pipeline %r: %s", self.pipeline_id, exc
            )

    # --------------------------
    # Ingestion
    # --------------------------
    def _sanitize_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with a stripped text column and empty rows removed.

        Missing values (``None``/NaN) in the text column count as empty and
        are dropped rather than being stringified to ``"None"``/``"nan"``.

        Raises:
            ValueError: If the text column is absent, or if no row has
                non-empty text after stripping.
        """
        if self.description_col not in df.columns:
            raise ValueError(
                f"Required column '{self.description_col}' not found. "
                f"Available columns: {list(df.columns)}"
            )
        raw = df[self.description_col]
        # Blank out missing values *before* str conversion; astype(str) alone
        # would turn them into the truthy strings "None" / "nan".
        text = raw.astype(object).where(raw.notna(), "").astype(str).str.strip()
        df = df.copy()
        df[self.description_col] = text
        df = df[df[self.description_col].astype(bool)]
        if df.empty:
            raise ValueError(
                f"All rows are empty in '{self.description_col}'. Provide non-empty text."
            )
        return df

    def _iter_actions(self, df: pd.DataFrame, id_field: Optional[str]) -> Iterable[Dict[str, Any]]:
        """Yield one bulk ``index`` action per row of *df*.

        Missing cell values are left out of the document. The first
        timestamp-candidate column holding a parseable value is written to
        ``timestamp`` as an ISO string. The ingest pipeline is attached when
        ML was requested, and ``_id`` is set from *id_field* when that column
        exists and the row has a value for it.
        """
        for _, row in df.iterrows():
            doc: Dict[str, Any] = {}
            for c in df.columns:
                val = row[c]
                if pd.isna(val):
                    continue
                doc[c] = val

            # Optional timestamp detection
            for cand in self._TIMESTAMP_CANDIDATES:
                if cand in df.columns and not pd.isna(row.get(cand)):
                    iso = to_iso(row[cand])
                    if iso:
                        doc["timestamp"] = iso
                        break

            action = {
                "_op_type": "index",
                "_index": self.index_name,
                "_source": doc,
            }
            if self.use_ml_requested:
                action["pipeline"] = self.pipeline_id  # pipeline errors surface at bulk time
            if id_field and id_field in row and pd.notna(row[id_field]):
                action["_id"] = str(row[id_field])
            yield action

    def bulk_index_dataframe(self, df: pd.DataFrame, id_field: Optional[str] = None, chunk_size: int = 500) -> None:
        """Sanitize *df* and bulk-index its rows.

        Args:
            df: Data containing at least the text column.
            id_field: Optional column whose value becomes the document ``_id``.
            chunk_size: Number of actions per bulk request.

        Raises:
            ValueError: From sanitization (missing column / no usable rows).
            RuntimeError: If the bulk helper reports failed documents; the
                message previews the first three errors.
        """
        df = self._sanitize_dataframe(df)
        try:
            helpers.bulk(
                self.es,
                self._iter_actions(df, id_field),
                chunk_size=chunk_size,
                refresh="wait_for",
            )
        except BulkIndexError as bie:
            errors = getattr(bie, "errors", [])
            preview = errors[:3]
            raise RuntimeError(
                f"Bulk indexing failed for {len(errors)} documents. First errors: {preview}"
            ) from bie

    def bulk_index_file(self, csv_or_xlsx: Union[str, Path], id_field: Optional[str] = None) -> None:
        """Read a ``.csv``, ``.xlsx`` or ``.xls`` file and bulk-index its rows.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the file extension is not supported.
        """
        p = Path(csv_or_xlsx)
        if not p.exists():
            raise FileNotFoundError(p)
        suffix = p.suffix.lower()
        if suffix == ".csv":
            df = pd.read_csv(p)
        elif suffix == ".xlsx":
            # openpyxl engine required for some environments
            df = pd.read_excel(p, engine="openpyxl")
        elif suffix == ".xls":
            # openpyxl cannot read the legacy .xls format; let pandas choose
            # the engine for it instead of forcing one that will fail.
            df = pd.read_excel(p)
        else:
            raise ValueError("Only .csv, .xlsx, or .xls are supported")
        self.bulk_index_dataframe(df, id_field=id_field)

    # --------------------------
    # Search
    # --------------------------
    def _build_body(self, question: str, size: int, include_elser: bool, fields_to_return: Optional[Sequence[str]]) -> Dict[str, Any]:
        """Build the search request body.

        A BM25 ``match`` clause (boost 0.6) is always present. A
        ``text_expansion`` clause is added only when *include_elser* is true
        and ML was requested on the instance. ``_source`` filtering is added
        when *fields_to_return* is non-empty.
        """
        should: List[Dict[str, Any]] = []
        # BM25 always present
        should.append({"match": {self.description_col: {"query": question, "boost": 0.6}}})
        # ELSER if requested
        if include_elser and self.use_ml_requested:
            should.append({
                "text_expansion": {
                    "ml.description_tokens": {
                        "model_id": self.model_id,
                        "model_text": question,
                    }
                }
            })

        body: Dict[str, Any] = {
            "size": size,
            "query": {"bool": {"should": should, "minimum_should_match": 1}},
        }
        if fields_to_return:
            body["_source"] = list(fields_to_return)
        return body

    def semantic_search(
        self,
        question: str,
        size: int = 10,
        hybrid: bool = True,
        fields_to_return: Optional[Sequence[str]] = None,
    ) -> pd.DataFrame:
        """Search the index and return the hits as a DataFrame.

        Args:
            question: Query text; must be non-empty after stripping.
            size: Maximum number of hits.
            hybrid: When true (and ML was requested), combine BM25 with ELSER.
            fields_to_return: Optional ``_source`` field filter.

        Returns:
            One row per hit with a ``_score`` column followed by the
            document's source fields; an empty DataFrame when nothing matched.

        Raises:
            ValueError: If *question* is empty.
            ApiError: If the BM25-only request fails.
        """
        if not _coerce_str(question):
            raise ValueError("Provide a non-empty search question.")

        elser_included = hybrid and self.use_ml_requested
        try:
            body = self._build_body(question, size, include_elser=hybrid, fields_to_return=fields_to_return)
            res = self.es.search(index=self.index_name, body=body)
        except ApiError:
            if not elser_included:
                # The failed request was already BM25-only; re-sending the
                # identical request cannot help, so surface the error.
                raise
            body = self._build_body(question, size, include_elser=False, fields_to_return=fields_to_return)
            res = self.es.search(index=self.index_name, body=body)

        rows: List[Dict[str, Any]] = []
        for h in res.get("hits", {}).get("hits", []):
            src = h.get("_source", {})
            rows.append({"_score": h.get("_score", 0.0), **src})
        return pd.DataFrame(rows)

## 3) Helper functions

In [2]:
import pandas as pd
from pathlib import Path

def ensure_indexed(pipe: BertDescriptionElser, file_path: str, reindex: bool) -> None:
    """Make sure the index holds documents, (re)building it from *file_path* when needed.

    The index is rebuilt when *reindex* is true or when it currently reports
    zero documents (a missing index or a failed count is treated as zero).
    Otherwise the existing index is left untouched and only reported.
    """
    es = pipe.es
    target = pipe.index_name

    existing_docs = 0
    if es.indices.exists(index=target):
        try:
            existing_docs = es.count(index=target)["count"]
        except Exception:
            existing_docs = 0

    # Nothing to do: index already populated and no rebuild requested.
    if not reindex and existing_docs != 0:
        print(f"[INFO] Using existing index '{pipe.index_name}' with {existing_docs} docs.")
        return

    # Rebuild from scratch: drop, recreate mapping and pipeline, then ingest.
    if es.indices.exists(index=target):
        es.indices.delete(index=target, ignore_unavailable=True)
    pipe.ensure_index()
    pipe.ensure_pipeline()  # returns immediately when ML was not requested
    pipe.bulk_index_file(file_path, id_field=None)
    indexed_docs = es.count(index=target)["count"]
    print(f"[INFO] Indexed docs: {indexed_docs}")

def do_query(pipe: BertDescriptionElser, q: str, text_col: str, size: int = 10, bm25_only: bool = False):
    """Run one search, show a compact view of the hits, and return the full hits frame.

    The compact view contains ``_score`` plus *text_col* and ``timestamp``
    when those columns are present. ``display`` is tried first; if calling
    it fails for any reason the view is printed instead. When there are no
    hits, ``(no matches)`` is printed and the empty frame is returned.
    """
    # BM25 is always part of the query; ELSER is added unless bm25_only is set.
    hits = pipe.semantic_search(question=q, size=size, hybrid=not bm25_only)
    if hits.empty:
        print("(no matches)")
        return hits

    # Compact view in notebooks
    wanted = ["_score"] + [name for name in (text_col, "timestamp") if name in hits.columns]
    view = hits[wanted] if set(wanted).issubset(hits.columns) else hits
    try:
        display(view)
    except Exception:
        print(view)
    return hits

## 4) Configure here

In [3]:
# === Elasticsearch connection ===
# URL and basic-auth credentials handed to BertDescriptionElser.
# NOTE(review): these look like local-development defaults — avoid committing real credentials.
ES_URL   = "http://localhost:9200"
ES_USER  = "elastic"
ES_PASS  = "changeme"

# === Data file ===  (must be .xlsx/.xls/.csv and contain the TEXT COLUMN below)
# Raw string so the backslashes in the Windows path are not treated as escapes.
DATA_FILE = r"C:\Users\dell\elser-python\sample_descriptions.xlsx"

# === Index / Pipeline / Model ===
# NOTE(review): the model id names a linux-x86_64 build — presumably it has to match
# the model actually available on the cluster; confirm before running.
INDEX_NAME   = "chat_elser_description_only"
PIPELINE_ID  = "elser_v2_description_only"
MODEL_ID     = ".elser_model_2_linux-x86_64"

# === Text column to index & search ===
TEXT_COL = "Description"

# === Query settings ===
ONE_SHOT_QUERY = "BlueSky Airlines safety compliance"  # set None or "" to skip
TOP_K          = 10
BM25_ONLY      = False   # set True to ignore ELSER and use BM25 only

# === Reindex control ===
REINDEX = False          # set True to drop/recreate index and re-ingest the file

## 5) Preview your data

In [4]:
from pathlib import Path
import pandas as pd

# Resolve the configured data file and fail early with a real exception
# (asserts are skipped when Python runs with -O, so they are not used here).
fp = Path(DATA_FILE)
if not fp.exists():
    raise FileNotFoundError(f"Input file not found: {fp}")

# Load the file according to its extension.
_suffix = fp.suffix.lower()
if _suffix in (".xlsx", ".xls"):
    df_preview = pd.read_excel(fp)
elif _suffix == ".csv":
    df_preview = pd.read_csv(fp)
else:
    raise ValueError("Only .xlsx, .xls, or .csv are supported.")

print("=== DATA PREVIEW (first 3 rows) ===")
try:
    # Rich rendering; `display` is only tried, never required.
    display(df_preview.head(3))
except Exception:
    print(df_preview.head(3).to_string(index=False))

print("=== COLUMNS ===")
print(list(df_preview.columns))

# The configured text column must exist before anything is indexed.
if TEXT_COL not in df_preview.columns:
    raise ValueError(f"Column '{TEXT_COL}' not found. Available: {list(df_preview.columns)}")

=== DATA PREVIEW (first 3 rows) ===


Unnamed: 0,Description
0,"John Doe, born on January 14, 1985, currently ..."
1,Mary Johnson joined BlueSky Airlines in 2018 a...
2,"Ahmed Ibrahim, a senior architect at GreenBuil..."


=== COLUMNS ===
['Description']


## 6) Create mapping/pipeline and index (as needed)

In [5]:
# Build the search helper from the values set in the configuration cell.
# ELSER is requested unless BM25_ONLY is set.
pipe = BertDescriptionElser(
    es_url=ES_URL,
    es_user=ES_USER,
    es_pass=ES_PASS,
    index_name=INDEX_NAME,
    pipeline_id=PIPELINE_ID,
    model_id=MODEL_ID,
    description_col=TEXT_COL,
    use_ml=(not BM25_ONLY),
)

# Back-compat shim: tries to create the ingest pipeline when ML is requested;
# it does not create the index and does not raise on failure.
pipe.ensure_ready()

# Create mapping + (optional) pipeline + bulk index file (if reindex or empty).
# When the index already holds documents and REINDEX is False, nothing is re-ingested.
ensure_indexed(pipe, str(fp), reindex=REINDEX)

[INFO] Using existing index 'chat_elser_description_only' with 10 docs.


## 7) One-shot search

In [6]:
# Run the configured query once; a None, empty or whitespace-only query is skipped.
if not ONE_SHOT_QUERY or not str(ONE_SHOT_QUERY).strip():
    print("(Skipped — set ONE_SHOT_QUERY to a non-empty string to run a one-shot search.)")
else:
    print(f"=== SEARCH RESULTS for: {ONE_SHOT_QUERY!r} ===")
    _hits = do_query(pipe, ONE_SHOT_QUERY, TEXT_COL, size=TOP_K, bm25_only=BM25_ONLY)

=== SEARCH RESULTS for: 'BlueSky Airlines safety compliance' ===


Unnamed: 0,_score,Description
0,4.704845,Mary Johnson joined BlueSky Airlines in 2018 a...


## 8) Interactive search loop (optional)

In [None]:
# Simple read-eval-print loop around do_query().
print("Interactive mode. Type your query and press Enter.")
print("Commands: :quit to exit, :help for help.\n")

while True:
    try:
        q = input("query> ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C ends the session cleanly.
        print("\nExiting.")
        break

    if q in (":quit", ":exit"):
        print("Exiting.")
        break
    elif q in (":help", "help", "?"):
        print("Enter any text to search. Use :quit to exit.")
    elif q:
        # Any other non-empty input is treated as a search query;
        # blank input falls through and simply prompts again.
        print(f"\n=== SEARCH RESULTS for: {q!r} ===")
        _hits = do_query(pipe, q, TEXT_COL, size=TOP_K, bm25_only=BM25_ONLY)
        print("")

Interactive mode. Type your query and press Enter.
Commands: :quit to exit, :help for help.

