# ELSER: Excel (Single Column) → Elasticsearch → Semantic Search

**Purpose:** Ingest a single long-text column named **`Description`** from Excel/CSV into Elasticsearch,
generate ELSER tokens via an ingest pipeline, and run semantic search entirely on that column.

### What this notebook does
1. Connects to your local Elasticsearch (auth: `elastic/changeme`, adjustable).
2. Ensures the ELSER v2 model is deployed and the ingest pipeline exists.
3. Creates an index with `content` (text) and `ml.tokens` (rank_features).
4. Reads `Description` from your sheet and indexes it as `content`, using the pipeline to create tokens.
5. Runs a `text_expansion` query using ELSER tokens.

### Requirements
- Python packages in your venv:  
  `pip install elasticsearch==8.14.0 "urllib3<2" pandas python-dateutil openpyxl`
- ELSER v2 model `.elser_model_2_linux-x86_64` loaded and deployable (offline model already configured).


#### 0) Configure connection and paths

In [38]:

# Adjust these for your environment
ES_URL   = "http://localhost:9200"
ES_USER  = "elastic"
ES_PASS  = "changeme"

INDEX_NAME  = "excel_single_col"
FILE_PATH   = r"C:\Users\dell\elser-python\long_distance_runners.xlsx"  # or .csv
SHEET_NAME  = "Sheet1"   # set None for CSV
CONTENT_COL = "Description"  # <-- the ONLY column used for semantics
ID_COL      = "RowID"        # optional; set None if not present
UPDATED_COL = None           # optional timestamp column; set e.g. "updated_at"

TEST_QUERY = "who set records in long distance running?"
TOPK       = 5


#### 1) Imports and ES client

In [39]:

import time
from pathlib import Path
from datetime import datetime

import pandas as pd
from dateutil import parser as dtparser
from elasticsearch import Elasticsearch, helpers

MODEL_ID     = ".elser_model_2_linux-x86_64"
PIPELINE_ID  = "elser_v2_pipeline"
TOKENS_FIELD = "ml.tokens"

ES = Elasticsearch(ES_URL, basic_auth=(ES_USER, ES_PASS), request_timeout=120)


#### 2) Helpers: wait for ES, ensure model/pipeline/index

In [40]:

def wait_es(timeout_s: int = 60):
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            ES.info()
            return
        except Exception:
            time.sleep(1)
    raise RuntimeError("Elasticsearch not responding")


In [41]:

def ensure_model_started():
    """Ensure the ELSER model is deployed (no-op if already started)."""
    try:
        stats = ES.ml.get_trained_models_stats(model_id=MODEL_ID)
        tms = stats.get("trained_model_stats", [])
        if tms:
            dstats = tms[0].get("deployment_stats") or {}
            if dstats.get("state") == "started":
                return
    except Exception:
        pass
    try:
        ES.ml.start_trained_model_deployment(
            model_id=MODEL_ID,
            number_of_allocations=1,
            threads_per_allocation=1,
            queue_capacity=1024,
        )
    except Exception:
        # already started or transient issue—ignore for idempotence
        pass


In [42]:

def ensure_pipeline():
    """Create/update the ELSER ingest pipeline (idempotent)."""
    pipeline = {
        "processors": [
            {
                "inference": {
                    "model_id": MODEL_ID,
                    "input_output": [
                        {"input_field": "content", "output_field": TOKENS_FIELD}
                    ],
                    "inference_config": {"text_expansion": {}}
                }
            }
        ]
    }
    ES.ingest.put_pipeline(id=PIPELINE_ID, processors=pipeline["processors"])


In [43]:

def ensure_index(es: Elasticsearch, index: str, with_extra_fields: dict | None = None):
    """Create an index with the minimal ELSER mapping (idempotent)."""
    if es.indices.exists(index=index):
        return
    props = {
        "content": {"type": "text"},
        "ml": {"properties": {"tokens": {"type": "rank_features"}}}
    }
    if with_extra_fields:
        props.update(with_extra_fields)
    es.indices.create(index=index, body={"mappings": {"properties": props}})


In [44]:

def to_dt(v):
    if pd.isna(v):
        return None
    if isinstance(v, datetime):
        return v
    try:
        return dtparser.parse(str(v))
    except Exception:
        return None


#### 3) Ingest a single long-text column (`Description`)

In [45]:

def ingest_excel_single_column(
    es: Elasticsearch,
    index: str,
    file_path: Path,
    sheet=None,
    *,
    content_col="content",    # the ONLY column used for embeddings
    id_col="id",              # optional
    updated_col=None,         # optional timestamp
    batch=1000,
):
    file_path = Path(file_path)
    # Load table
    if file_path.suffix.lower() == ".xlsx":
        sheet_name = None
        if sheet is not None:
            try:
                sheet_name = int(sheet)
            except ValueError:
                sheet_name = sheet
        df = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl")
    elif file_path.suffix.lower() == ".csv":
        df = pd.read_csv(file_path)
    else:
        raise SystemExit("Unsupported tabular format. Use .xlsx or .csv")

    # Case-insensitive column mapping
    cols = {c.lower().strip(): c for c in df.columns}
    def col(name): return cols.get(str(name).lower(), name)

    content_col = col(content_col)
    id_col      = col(id_col) if id_col else None
    updated_col = col(updated_col) if updated_col else None

    if content_col not in df.columns:
        raise SystemExit(f"Missing required column '{content_col}' in {file_path.name}")

    # Ensure mapping
    ensure_index(es, index, with_extra_fields={
        "id":         {"type": "keyword"},
        "updated_at": {"type": "date"}
    })

    actions = []
    for _, row in df.iterrows():
        rid = row.get(id_col) if id_col and id_col in df.columns else None
        txt = row.get(content_col)

        # Normalize updated_at to ISO if present
        updated_iso = None
        if updated_col and updated_col in df.columns:
            try:
                dt = to_dt(row.get(updated_col))
                if dt is not None:
                    updated_iso = dt.isoformat()
            except Exception:
                pass

        doc = {"content": "" if pd.isna(txt) else str(txt)}
        if rid is not None:
            doc["id"] = rid
        if updated_iso is not None:
            doc["updated_at"] = updated_iso

        actions.append({
            "_op_type": "index",
            "_index": index,
            "_id": str(rid) if rid is not None else None,
            "pipeline": PIPELINE_ID,   # ELSER pipeline expands content -> ml.tokens
            "_source": doc
        })

    if not actions:
        print("No rows to index.")
        return

    print(f"Indexing {len(actions)} rows from '{file_path.name}' → '{index}' via '{PIPELINE_ID}'...")
    success, fail = helpers.bulk(es, actions, stats_only=True, chunk_size=batch, request_timeout=120)
    es.indices.refresh(index=index)
    print(f"Done. success={success}, failed={fail}")


#### 4) Semantic search helper

In [46]:

def semantic_search(index: str, query: str, size: int = 5) -> pd.DataFrame:
    try:
        cnt = ES.count(index=index).get("count", 0)
        if cnt == 0:
            print(f"(Index '{index}' has 0 docs — nothing to search yet.)")
    except Exception:
        pass

    body = {
        "size": size,
        "query": {
            "text_expansion": {
                TOKENS_FIELD: {
                    "model_id": MODEL_ID,
                    "model_text": query
                }
            }
        },
        "_source": ["id", "content", "updated_at"]
    }
    res = ES.search(index=index, body=body)
    hits = res.get("hits", {}).get("hits", [])
    rows = []
    for h in hits:
        src = h.get("_source", {})
        rows.append({
            "score": h.get("_score"),
            "id": src.get("id"),
            "content_preview": (src.get("content") or "")[:200].replace("\n", " "),
            "updated_at": src.get("updated_at"),
        })
    return pd.DataFrame(rows)


#### 5) (Optional) Inspect your sheet columns

In [47]:

from pathlib import Path
if str(FILE_PATH).lower().endswith(".xlsx"):
    df_preview = pd.read_excel(FILE_PATH, sheet_name=SHEET_NAME, engine="openpyxl")
elif str(FILE_PATH).lower().endswith(".csv"):
    df_preview = pd.read_csv(FILE_PATH)
else:
    raise SystemExit("Unsupported file format; use .xlsx or .csv")
print("Columns in your file:", list(df_preview.columns))
df_preview.head(5)


Columns in your file: ['Description']


Unnamed: 0,Description
0,"Eliud Kipchoge, a Kenyan long-distance runner,..."
1,"Mo Farah, originally from Somalia and represen..."
2,Kenenisa Bekele of Ethiopia is a legend in lon...
3,"Brigid Kosgei, another Kenyan superstar, broke..."
4,"Haile Gebrselassie, often considered a pioneer..."


#### 6) Ingest and query

In [48]:

wait_es()
ensure_model_started()
ensure_pipeline()

ingest_excel_single_column(
    ES,
    index=INDEX_NAME,
    file_path=FILE_PATH,
    sheet=SHEET_NAME,
    content_col=CONTENT_COL,
    id_col=ID_COL,
    updated_col=UPDATED_COL,
    batch=1000
)

print("Ready. Run the next cell to query.")


Indexing 5 rows from 'long_distance_runners.xlsx' → 'excel_single_col' via 'elser_v2_pipeline'...
Done. success=5, failed=0
Ready. Run the next cell to query.


  success, fail = helpers.bulk(es, actions, stats_only=True, chunk_size=batch, request_timeout=120)


In [49]:

result_df = semantic_search(INDEX_NAME, TEST_QUERY, size=TOPK)
result_df


Unnamed: 0,score,id,content_preview,updated_at
0,24.061375,,Kenenisa Bekele of Ethiopia is a legend in lon...,
1,24.061375,,Kenenisa Bekele of Ethiopia is a legend in lon...,
2,24.061375,,Kenenisa Bekele of Ethiopia is a legend in lon...,
3,24.061375,,Kenenisa Bekele of Ethiopia is a legend in lon...,
4,24.061375,,Kenenisa Bekele of Ethiopia is a legend in lon...,


> Tip: In Kibana Dev Tools you can also run the equivalent DSL:

```json
POST excel_single_col/_search
{
  "size": 5,
  "query": {
    "text_expansion": {
      "ml.tokens": {
        "model_id": ".elser_model_2_linux-x86_64",
        "model_text": "who set records in long distance running?"
      }
    }
  },
  "_source": ["id","content","updated_at"]
}
```