In [7]:
# --- Config ---
ES_URL  = "http://localhost:9200"
ES_USER = "elastic"
ES_PASS = "changeme"

CSV_OR_XLSX = r"C:\Users\dell\elser-python\sample_descriptions.xlsx"  # <-- path to your sheet
INDEX_NAME  = "chat_elser_description_only"
PIPELINE_ID = "elser_v2_description_only"
MODEL_ID    = ".elser_model_2_linux-x86_64"

DESCRIPTION_COL = "Description"   # <- hard requirement

# --- Setup ---
import os, time, uuid, re
from datetime import datetime
from pathlib import Path
import pandas as pd
from dateutil import parser as dtparser
from elasticsearch import Elasticsearch, helpers

ES = Elasticsearch(ES_URL, basic_auth=(ES_USER, ES_PASS), request_timeout=120)

def wait_es(timeout_s=60):
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            ES.info()
            return
        except Exception:
            time.sleep(1)
    raise RuntimeError("Elasticsearch not responding")

def ensure_model_started(model_id: str):
    try:
        stats = ES.ml.get_trained_models_stats(model_id=model_id)
        tms = stats.get("trained_model_stats", [])
        if tms and (tms[0].get("deployment_stats") or {}).get("state") == "started":
            return
    except Exception:
        pass
    try:
        ES.ml.start_trained_model_deployment(
            model_id=model_id,
            number_of_allocations=1,
            threads_per_allocation=1,
            queue_capacity=1024,
        )
    except Exception:
        pass

def to_iso(v):
    if pd.isna(v): return None
    try:
        if isinstance(v, datetime): return v.isoformat()
        return dtparser.parse(str(v)).isoformat()
    except Exception:
        return None


In [None]:
wait_es()
ensure_model_started(MODEL_ID)

# Ingest pipeline: ONLY Description -> ml.description_tokens
pipeline = {
    "processors": [
        {
            "inference": {
                "model_id": MODEL_ID,
                "input_output": [
                    {"input_field": DESCRIPTION_COL, "output_field": "ml.description_tokens"}
                ],
                "inference_config": {"text_expansion": {}}
            }
        }
    ]
}
ES.ingest.put_pipeline(id=PIPELINE_ID, processors=pipeline["processors"])

# Index mapping: keep Description as text, add rank_features field
mapping = {
    "mappings": {
        "properties": {
            DESCRIPTION_COL: {"type": "text"},
            "ml": {"properties": {"description_tokens": {"type": "rank_features"}}},
            # optional timestamp if present in your CSV (e.g., created_dttm)
            "timestamp": {"type": "date"}
        }
    }
}

if not ES.indices.exists(index=INDEX_NAME):
    ES.indices.create(index=INDEX_NAME, body=mapping)
else:
    # idempotent: ensure the tokens field exists
    ES.indices.put_mapping(index=INDEX_NAME, body={
        "properties": {"ml": {"properties": {"description_tokens": {"type": "rank_features"}}}}
    })

print("Pipeline + index ready.")

Pipeline + index ready.


In [9]:
p = Path(CSV_OR_XLSX)
if p.suffix.lower() == ".csv":
    df = pd.read_csv(p)
elif p.suffix.lower() in (".xlsx", ".xls"):
    df = pd.read_excel(p, engine="openpyxl")
else:
    raise SystemExit("Provide a .csv or .xlsx file")

# Require the Description column
if DESCRIPTION_COL not in df.columns:
    raise SystemExit(f"Column '{DESCRIPTION_COL}' not found in {p.name}. Available: {list(df.columns)}")

# Try to detect a timestamp-ish column (optional)
def find_ts(cols):
    for c in cols:
        if str(c).lower() in ("created_dttm","created_at","timestamp","time","date"):
            return c
    return None

ts_col = find_ts(df.columns)

actions = []
for _, row in df.iterrows():
    doc = {}
    # Persist all columns as-is so you can still see metadata in hits
    for c in df.columns:
        val = row.get(c)
        if pd.isna(val): 
            continue
        doc[c] = val.item() if hasattr(val, "item") else val

    # Optional normalized timestamp
    if ts_col and ts_col in df.columns:
        iso = to_iso(row.get(ts_col))
        if iso:
            doc["timestamp"] = iso

    # IMPORTANT: indexing runs through the pipeline that expands ONLY Description
    actions.append({
        "_op_type": "index",
        "_index": INDEX_NAME,
        "_id": str(uuid.uuid4()),
        "pipeline": PIPELINE_ID,
        "_source": doc
    })

print(f"Indexing {len(actions)} docs → {INDEX_NAME} via {PIPELINE_ID} ...")
success, fail = helpers.bulk(ES, actions, stats_only=True, chunk_size=1000, request_timeout=120)
ES.indices.refresh(index=INDEX_NAME)
print("Bulk done. success=", success, " failed=", fail)

# quick count
print("Doc count:", ES.count(index=INDEX_NAME)["count"])


Indexing 10 docs → chat_elser_description_only via elser_v2_description_only ...


  success, fail = helpers.bulk(ES, actions, stats_only=True, chunk_size=1000, request_timeout=120)


Bulk done. success= 10  failed= 0
Doc count: 10


In [10]:
def search_description(question: str, size=5, hybrid=False):
    should = []
    if hybrid:
        # (optional) mix in keyword search over Description too
        should.append({"match": {DESCRIPTION_COL: {"query": question}}})

    should.append({
        "text_expansion": {
            "ml.description_tokens": {
                "model_id": MODEL_ID,
                "model_text": question
            }
        }
    })

    body = {
        "size": size,
        "query": {"bool": {"should": should, "minimum_should_match": 1}},
        "_source": list(df.columns) + [DESCRIPTION_COL, "timestamp"]
    }
    res = ES.search(index=INDEX_NAME, body=body)
    rows = []
    for h in res.get("hits", {}).get("hits", []):
        src = h.get("_source", {})
        rows.append({"_score": h.get("_score", 0.0), DESCRIPTION_COL: src.get(DESCRIPTION_COL), **{c: src.get(c) for c in df.columns if c != DESCRIPTION_COL}})
    return pd.DataFrame(rows)

# Example:
QUESTION = "Who works at FinEdge Capital?"
hits = search_description(QUESTION, size=5, hybrid=True)
display(hits)


Unnamed: 0,_score,Description
0,20.466158,"Robert Green, born December 9, 1975, serves as..."
1,5.147705,"John Doe, born on January 14, 1985, currently ..."
2,5.063447,"Li Wei, born on August 5, 1993, works for Sino..."
3,4.327348,"Miguel Santos, born July 11, 1991, is a logist..."
4,4.2763,"Ahmed Ibrahim, a senior architect at GreenBuil..."


In [None]:
# Install once per environment
%pip install -q "transformers>=4.44" torch

from transformers import pipeline
import json, re

ner = pipeline("token-classification", model="dslim/bert-base-NER", aggregation_strategy="simple")

DATE_RE = re.compile(
    r"\b("
    r"\d{4}-\d{2}-\d{2}|"
    r"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|"
    r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|"
    r"January|February|March|April|May|June|July|August|September|October|November|December)"
    r"\s+\d{1,2},?\s*\d{2,4}"
    r")\b",
    re.IGNORECASE
)

def extract_profile_bert(text: str):
    if not text or not str(text).strip():
        return {"name": None, "employer": None, "dob": None, "raw_entities": []}
    ents = ner(text)
    raw = [{"text": e["word"], "label": e["entity_group"]} for e in ents]

    persons = [e["text"] for e in raw if e["label"] == "PER"]
    orgs    = [e["text"] for e in raw if e["label"] == "ORG"]
    dates   = [e["text"] for e in raw if e["label"] == "DATE"]

    name = persons[0] if persons else None
    employer = orgs[0] if orgs else None
    dob = dates[0] if dates else None

    m = DATE_RE.search(text)
    if m: dob = dob or m.group(0)

    return {"name": name, "employer": employer, "dob": dob, "raw_entities": raw}

# Run NER on top hit from the last search
if hits is None or hits.empty:
    print("No hits to analyze. Run the search cell first.")
else:
    top_text = hits.iloc[0][DESCRIPTION_COL]
    profile = extract_profile_bert(top_text)
    print(json.dumps(profile, indent=2))
