In [1]:

# --- Config ---
ES_URL  = "http://localhost:9200"
ES_USER = "elastic"
ES_PASS = "changeme"

CSV_PATH      = r"C:\Users\dell\elser-python\llm_outputs_sample.csv"  # <- change to your CSV path
INDEX_NAME    = "chat_elser_dynamic"
PIPELINE_ID   = "elser_v2_dynamic_pipeline"
MODEL_ID      = ".elser_model_2_linux-x86_64"

# Optional: cap the number of text columns to tokenize
MAX_TEXT_COLUMNS = 16

# For large CSVs, you can chunk ingest
BULK_CHUNK_SIZE = 1000


In [2]:

import os, re, time, uuid
from datetime import datetime
from pathlib import Path

import pandas as pd
from dateutil import parser as dtparser
from elasticsearch import Elasticsearch, helpers

ES = Elasticsearch(ES_URL, basic_auth=(ES_USER, ES_PASS), request_timeout=120)

def wait_es(timeout_s=60):
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            ES.info()
            return
        except Exception:
            time.sleep(1)
    raise RuntimeError("Elasticsearch not responding")

def ensure_model_started(model_id: str):
    try:
        stats = ES.ml.get_trained_models_stats(model_id=model_id)
        tms = stats.get("trained_model_stats", [])
        if tms and (tms[0].get("deployment_stats") or {}).get("state") == "started":
            return
    except Exception:
        pass
    try:
        ES.ml.start_trained_model_deployment(
            model_id=model_id,
            number_of_allocations=1,
            threads_per_allocation=1,
            queue_capacity=1024,
        )
    except Exception:
        pass

def norm_field(name: str):
    s = re.sub(r"[^a-zA-Z0-9_]+", "_", str(name).strip())
    s = re.sub(r"_+", "_", s).strip("_").lower()
    return s or "field"


In [4]:

# --- Preview CSV ---
df = pd.read_csv(CSV_PATH)
print("Rows:", len(df), " Columns:", list(df.columns))
df.head(5)


Rows: 5  Columns: ['metadata_id', 'llm_outputs_id', 'incident_id', 'template_type', 'summary_string_llm', 'model_string', 'narrative_strings_llm', 'shipment_other_info', 'shipment_entity_info', 'created_dttm']


Unnamed: 0,metadata_id,llm_outputs_id,incident_id,template_type,summary_string_llm,model_string,narrative_strings_llm,shipment_other_info,shipment_entity_info,created_dttm
0,b89d09b0-12b4-4c6b-b94c-4b3a9f8a9b60,df60a3e3-fc60-4a12-8223-cbbaea58d6ff,INC0000000001,TYPE1,Summary generated by LLM for incident 0001,gpt-4,Shipment delayed due to weather conditions.,SH123456789,FedEx Logistics,2025-10-20 09:15:00
1,c84e78c3-8e9e-4a34-9f2a-202a22dbd510,1afc9e5b-5573-46a3-a012-1adfa0a4104b,INC0000000002,TYPE2,Summary generated for a mechanical failure case.,claude-3,Aircraft engine malfunction detected during ma...,SH987654321,DHL Aviation,2025-10-20 09:20:00
2,f4b273cf-c4ef-4ec8-8099-567b31e0f341,afdf91f9-46b2-4a1a-bd2f-1c25e7a1a14b,INC0000000003,TYPE1,Summary created for cargo loss case.,mistral-7b,Cargo lost during transit between hubs.,SH246810121,UPS Supply Chain,2025-10-20 09:25:00
3,2e5f12df-f44d-49b2-b13a-04d4b53b0ed1,68f04df1-3e92-45c1-a66b-fb2d32c7c9e9,INC0000000004,TYPE3,Summary: customs delay incident.,llama-3,Shipment held at customs for verification.,SH135791113,USPS International,2025-10-20 09:30:00
4,a7cd0a7f-482a-4d2b-9f92-05f373a09b79,cb2041a3-9a73-4d18-bba1-4419c58ff888,INC0000000005,TYPE2,Summary: package misroute resolved.,gemini-1.5,Package rerouted successfully after initial mi...,SH192837465,Amazon Logistics,2025-10-20 09:35:00


In [7]:

# --- Detect text-like columns dynamically ---
def is_text_series(s: pd.Series) -> bool:
    if s.dtype == "O":
        return True
    if pd.api.types.is_numeric_dtype(s) or pd.api.types.is_datetime64_any_dtype(s):
        return False
    try:
        sample = s.dropna().astype(str).head(100)
        if len(sample) == 0:
            return False
        return sample.map(len).mean() >= 8
    except Exception:
        return False

text_cols = [c for c in df.columns if is_text_series(df[c])]
text_cols = text_cols[:MAX_TEXT_COLUMNS]
print("Detected text columns:", text_cols)

# simple heuristic for a time field
def find_timestamp_column(cols):
    for name in cols:
        if str(name).lower() in ("timestamp", "created_dttm", "created_at", "time", "date"):
            return name
    return None

timestamp_col = find_timestamp_column(df.columns)

input_output = []
dynamic_props = {}
for col in text_cols:
    safe = norm_field(col)
    input_output.append({"input_field": col, "output_field": f"ml.{safe}_tokens"})
    dynamic_props[col] = {"type": "text"}
    dynamic_props.setdefault("ml", {"properties": {}})
    dynamic_props["ml"].setdefault("properties", {})[f"{safe}_tokens"] = {"type": "rank_features"}

dynamic_props["content"] = {"type": "text"}
if timestamp_col:
    dynamic_props["timestamp"] = {"type": "date"}

print("Token fields to create:", [f"ml.{norm_field(c)}_tokens" for c in text_cols])


Detected text columns: ['metadata_id', 'llm_outputs_id', 'incident_id', 'template_type', 'summary_string_llm', 'model_string', 'narrative_strings_llm', 'shipment_other_info', 'shipment_entity_info', 'created_dttm']
Token fields to create: ['ml.metadata_id_tokens', 'ml.llm_outputs_id_tokens', 'ml.incident_id_tokens', 'ml.template_type_tokens', 'ml.summary_string_llm_tokens', 'ml.model_string_tokens', 'ml.narrative_strings_llm_tokens', 'ml.shipment_other_info_tokens', 'ml.shipment_entity_info_tokens', 'ml.created_dttm_tokens']


In [8]:

# --- Ensure index and pipeline ---
wait_es()
ensure_model_started(MODEL_ID)

pipeline = {
    "processors": [
        {
            "inference": {
                "model_id": MODEL_ID,
                "input_output": input_output,
                "inference_config": {"text_expansion": {}}
            }
        }
    ]
}
ES.ingest.put_pipeline(id=PIPELINE_ID, processors=pipeline["processors"])

if not ES.indices.exists(index=INDEX_NAME):
    ES.indices.create(index=INDEX_NAME, body={"mappings": {"properties": dynamic_props}})
else:
    for col in text_cols:
        safe = norm_field(col)
        try:
            ES.indices.put_mapping(
                index=INDEX_NAME,
                body={"properties": {"ml": {"properties": {f"{safe}_tokens": {"type": "rank_features"}}}}}
            )
        except Exception:
            pass

print("Index + pipeline ready.")


Index + pipeline ready.


In [9]:

# --- Ingest CSV rows ---
def to_iso(v):
    if pd.isna(v):
        return None
    try:
        if isinstance(v, datetime):
            return v.isoformat()
        return dtparser.parse(str(v)).isoformat()
    except Exception:
        return None

def concat_content(row, cols):
    parts = []
    for c in cols:
        val = row.get(c)
        if pd.isna(val) or val is None:
            continue
        txt = str(val).strip()
        if txt:
            parts.append(f"{c}: {txt}")
    return "\n".join(parts)

actions = []
for _, row in df.iterrows():
    doc = {}
    for c in df.columns:
        val = row.get(c)
        if pd.isna(val):
            continue
        doc[c] = val.item() if hasattr(val, "item") else val
    doc["content"] = concat_content(row, text_cols)
    if timestamp_col and timestamp_col in df.columns:
        iso = to_iso(row.get(timestamp_col))
        if iso:
            doc["timestamp"] = iso

    actions.append({
        "_op_type": "index",
        "_index": INDEX_NAME,
        "_id": str(uuid.uuid4()),
        "pipeline": PIPELINE_ID,
        "_source": doc
    })

print(f"Indexing {len(actions)} docs → {INDEX_NAME} via {PIPELINE_ID} ...")
success, fail = helpers.bulk(ES, actions, stats_only=True, chunk_size=BULK_CHUNK_SIZE, request_timeout=120)
ES.indices.refresh(index=INDEX_NAME)
print("Bulk done. success=", success, " failed=", fail)


Indexing 5 docs → chat_elser_dynamic via elser_v2_dynamic_pipeline ...


  success, fail = helpers.bulk(ES, actions, stats_only=True, chunk_size=BULK_CHUNK_SIZE, request_timeout=120)


Bulk done. success= 5  failed= 0


In [None]:

# --- Semantic search ---
def build_semantic_query(model_text: str, token_fields: list[str], size=5, must_filters=None, hybrid=False):
    should = []
    if hybrid:
        should.append({
            "multi_match": {
                "query": model_text,
                "fields": ["content^1"]
            }
        })
    for tf in token_fields:
        should.append({
            "text_expansion": {
                tf: {"model_id": MODEL_ID, "model_text": model_text}
            }
        })
    q = {"bool": {"should": should, "minimum_should_match": 1}}
    if must_filters:
        q["bool"]["filter"] = must_filters
    return {"size": size, "query": q, "_source": list(df.columns) + ["content", "timestamp"]}

TOKEN_FIELDS = [f"ml.{norm_field(c)}_tokens" for c in text_cols]

def semantic_search(query: str, size=5, hybrid=False, must_filters=None):
    body = build_semantic_query(query, TOKEN_FIELDS, size=size, must_filters=must_filters, hybrid=hybrid)
    res = ES.search(index=INDEX_NAME, body=body)
    rows = []
    for hit in res.get("hits", {}).get("hits", []):
        src = hit.get("_source", {})
        rows.append({"_score": hit.get("_score", 0.0), **{k: src.get(k) for k in (list(df.columns) + ["content","timestamp"])}})
    import pandas as pd
    return pd.DataFrame(rows)

Ready. Example:
semantic_search('Which responses mention high risk shipments?', size=5, hybrid=True)


In [12]:
# Make sure this matches the index you ingested into earlier
print("Index:", INDEX_NAME)

# Quick sanity check: do we have docs?
try:
    print("Doc count:", ES.count(index=INDEX_NAME)["count"])
except Exception as e:
    print("Count error:", e)

# Now run a semantic query and display the results
QUESTION = "Which responses mention high risk shipments?"
df_hits = semantic_search(QUESTION, size=5, hybrid=True)  # hybrid=True mixes keyword + semantic
display(df_hits)


Index: chat_elser_dynamic
Doc count: 5


Unnamed: 0,_score,metadata_id,llm_outputs_id,incident_id,template_type,summary_string_llm,model_string,narrative_strings_llm,shipment_other_info,shipment_entity_info,created_dttm,content,timestamp
0,21.816092,b89d09b0-12b4-4c6b-b94c-4b3a9f8a9b60,df60a3e3-fc60-4a12-8223-cbbaea58d6ff,INC0000000001,TYPE1,Summary generated by LLM for incident 0001,gpt-4,Shipment delayed due to weather conditions.,SH123456789,FedEx Logistics,2025-10-20 09:15:00,metadata_id: b89d09b0-12b4-4c6b-b94c-4b3a9f8a9...,2025-10-20T09:15:00
1,21.152145,2e5f12df-f44d-49b2-b13a-04d4b53b0ed1,68f04df1-3e92-45c1-a66b-fb2d32c7c9e9,INC0000000004,TYPE3,Summary: customs delay incident.,llama-3,Shipment held at customs for verification.,SH135791113,USPS International,2025-10-20 09:30:00,metadata_id: 2e5f12df-f44d-49b2-b13a-04d4b53b0...,2025-10-20T09:30:00
2,20.626652,f4b273cf-c4ef-4ec8-8099-567b31e0f341,afdf91f9-46b2-4a1a-bd2f-1c25e7a1a14b,INC0000000003,TYPE1,Summary created for cargo loss case.,mistral-7b,Cargo lost during transit between hubs.,SH246810121,UPS Supply Chain,2025-10-20 09:25:00,metadata_id: f4b273cf-c4ef-4ec8-8099-567b31e0f...,2025-10-20T09:25:00
3,14.403996,a7cd0a7f-482a-4d2b-9f92-05f373a09b79,cb2041a3-9a73-4d18-bba1-4419c58ff888,INC0000000005,TYPE2,Summary: package misroute resolved.,gemini-1.5,Package rerouted successfully after initial mi...,SH192837465,Amazon Logistics,2025-10-20 09:35:00,metadata_id: a7cd0a7f-482a-4d2b-9f92-05f373a09...,2025-10-20T09:35:00
4,5.509855,c84e78c3-8e9e-4a34-9f2a-202a22dbd510,1afc9e5b-5573-46a3-a012-1adfa0a4104b,INC0000000002,TYPE2,Summary generated for a mechanical failure case.,claude-3,Aircraft engine malfunction detected during ma...,SH987654321,DHL Aviation,2025-10-20 09:20:00,metadata_id: c84e78c3-8e9e-4a34-9f2a-202a22dbd...,2025-10-20T09:20:00
