In [1]:
import csv
import os
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from pprint import pprint
import pandas as pd
import numpy as np
from eland.ml.ltr import LTRModelConfig, QueryFeatureExtractor, FeatureLogger
from tqdm import tqdm
# Enable tqdm's pandas integration.
tqdm.pandas()


# Load environment variables from .env file
load_dotenv()

# Connection settings; each one can be overridden through the environment / .env file.
ELASTICSEARCH_HOST = os.getenv("ELASTICSEARCH_HOST", "localhost")
ELASTICSEARCH_PORT = int(os.getenv("ELASTICSEARCH_PORT", "9200"))
# Defaults to "http", the value that used to be hard-coded in the hosts entry.
ELASTICSEARCH_SCHEME = os.getenv("ELASTICSEARCH_SCHEME", "http")
ELASTICSEARCH_USERNAME = os.getenv("ELASTICSEARCH_USERNAME")
ELASTICSEARCH_PASSWORD = os.getenv("ELASTICSEARCH_PASSWORD")

# A half-configured credential pair is almost certainly a mistake: fail early
# with an explicit message instead of an opaque client-side error.
if bool(ELASTICSEARCH_USERNAME) != bool(ELASTICSEARCH_PASSWORD):
    raise ValueError(
        "Set both ELASTICSEARCH_USERNAME and ELASTICSEARCH_PASSWORD, or neither of them."
    )

_es_client_options = {
    "hosts": [{
        "host": ELASTICSEARCH_HOST,
        "port": ELASTICSEARCH_PORT,
        "scheme": ELASTICSEARCH_SCHEME,
    }],
}
# Only send basic-auth credentials when they are actually configured; a
# (None, None) tuple is not a valid credential pair for the client.
if ELASTICSEARCH_USERNAME and ELASTICSEARCH_PASSWORD:
    _es_client_options["basic_auth"] = (ELASTICSEARCH_USERNAME, ELASTICSEARCH_PASSWORD)

es_client = Elasticsearch(**_es_client_options)

In [2]:
# Ask the cluster for its info document; this is the first request sent through
# es_client, so connection or credential problems surface in this cell.
client_info = es_client.info()

# Bare f-string as the cell's last expression: the notebook shows it as the cell
# output, reporting the cluster name and version number read from the response.
f"Successfully connected to cluster {client_info['cluster_name']} (version {client_info['version']['number']})"

'Successfully connected to cluster my-cluster (version 8.14.0)'

In [3]:
# Load the relevance judgments from a CSV in the working directory. The rendered
# output shows the columns `query`, `document` and `label`, one row per
# (query, document) pair.
# NOTE(review): the file name is spelled "colletion" - confirm it matches the file on disk.
judgments_df = pd.read_csv("reference_colletion.csv")
# Last expression of the cell: display the loaded frame.
judgments_df

Unnamed: 0,query,document,label
0,action comedy anime,14813,1
1,action comedy anime,23847,1
2,action comedy anime,3080,0
3,action comedy anime,34800,11
4,action comedy anime,43007,1
...,...,...,...
94,superhero anime,35988,2
95,survival anime,2333,1
96,survival anime,27911,1
97,survival anime,2810,2


In [76]:
import numpy as np

# Index the features are extracted from; es_client is expected to exist already
# (it is created in the connection cell).
index_name = "anime"

# One entry per query feature: (feature name, document field the query text is matched against).
bm25_feature_fields = [
    ("title_bm25", "title"),
    ("synopsis_bm25", "synopsis"),
]

# LTR model configuration: each extractor pairs a feature name with a `match`
# query on a single field. "{{query}}" is a template placeholder that is filled
# in from the query parameters at search time.
ltr_config = LTRModelConfig(
    feature_extractors=[
        QueryFeatureExtractor(
            feature_name=feature_name,
            query={"match": {field: "{{query}}"}},
        )
        for feature_name, field in bm25_feature_fields
    ]
)

# Feature logger bound to the client, the index and the configuration above.
feature_logger = FeatureLogger(es_client, index_name, ltr_config)



In [93]:
import json
def to_named_query(query, query_name):
    """Wrap ``query`` in a ``bool`` clause labelled with ``query_name``.

    Args:
        query: Query body (a dict) to place under the ``must`` key.
        query_name: Value stored under the ``_name`` key of the ``bool`` clause.

    Returns:
        A dict of the form ``{"bool": {"must": query, "_name": query_name}}``.
        ``query`` is embedded as-is (not copied).
    """
    named_clause = {"must": query}
    named_clause["_name"] = query_name
    return {"bool": named_clause}

def extract_query_features(query_params, doc_ids):
    """Fetch feature values for a set of documents and one query from Elasticsearch.

    Sends a search-template request to the ``index_name`` index with
    ``include_named_queries_score`` enabled. Every query feature extractor of
    ``feature_logger`` becomes a named ``should`` clause, and the hits are
    restricted to documents whose ``anime_id`` is in ``doc_ids``.

    Args:
        query_params: Mapping of template parameters (e.g. ``{"query": "..."}``)
            substituted into the extractor queries.
        doc_ids: List of ``anime_id`` values to fetch features for. It must be
            JSON-serializable because it is embedded in the template source.

    Returns:
        A ``(documents, feature_names)`` tuple:

        * ``documents`` maps each returned hit's ``anime_id`` to a dict with one
          entry per feature name. Query features hold the named-query score
          reported for the hit, or ``0`` when that query did not match; the
          remaining features are copied from the hit's ``_source``.
        * ``feature_names`` lists the query feature names followed by the
          document-field features (``score``, ``score_count``, ``score_rank``).

    Raises:
        KeyError: If a hit's ``_source`` lacks ``anime_id`` or one of the
            document-field features.
    """
    # `_quote` is a private helper of the Elasticsearch client package, imported
    # locally because the request below is issued through `perform_request`.
    from elasticsearch._sync.client import _quote

    # Built by concatenation: nesting the same quote type inside an f-string is
    # a SyntaxError on Python versions before 3.12.
    path = "/" + _quote(index_name) + "/_search/template"
    request_params = {"include_named_queries_score": True}
    request_headers = {"accept": "application/json", "content-type": "application/json"}

    query_extractors = feature_logger._model_config.query_feature_extractors

    # One named query per extractor so each hit reports a score per feature.
    queries = [
        to_named_query(extractor.query, extractor.feature_name)
        for extractor in query_extractors
    ]
    feats_names = [extractor.feature_name for extractor in query_extractors]

    # Document fields that are read from `_source` and used as extra features.
    adicional_features = ["score", "score_count", "score_rank"]

    source = json.dumps({
        "query": {
            "bool": {
                "should": queries,
                "filter": [{"terms": {"anime_id": doc_ids}}],
            }
        },
        # Without an explicit size Elasticsearch returns only its default of 10
        # hits, silently dropping documents when more ids are requested.
        "size": len(doc_ids),
        "_source": ["anime_id", "title"] + adicional_features,
    })

    body = {
        "source": source,
        "params": {**query_params},
    }

    hits = es_client.perform_request(
        "GET", path, params=request_params, headers=request_headers, body=body
    )["hits"]["hits"]

    documents = {}
    for hit in hits:
        # A hit only lists the named queries that matched it; the others get 0.
        matched_queries = hit.get("matched_queries", {})
        dic_features = {
            feature: matched_queries.get(feature, 0) for feature in feats_names
        }

        hit_source = hit["_source"]
        for new_feature in adicional_features:
            dic_features[new_feature] = hit_source[new_feature]

        documents[hit_source["anime_id"]] = dic_features

    return documents, feats_names + adicional_features


In [94]:

def _extract_query_features(query_judgements_group):
    """Add one feature column per extracted feature to a single-query group.

    Intended to be used with ``groupby("query").apply``: every row of the group
    is expected to share the same ``query`` value, which is taken from the
    first row.

    Args:
        query_judgements_group: DataFrame with at least the ``query`` and
            ``document`` columns.

    Returns:
        A new DataFrame containing the group's rows plus one column per feature
        name returned by ``extract_query_features``, or ``None`` (after printing
        the error) when the feature extraction fails. The input group is left
        unmodified.

    Raises:
        KeyError: If a document of the group is absent from the extracted
            features (for example because the search did not return it).
    """
    # Document ids of the group as plain Python ints.
    # NOTE(review): assumes the ids returned by the search are ints as well - confirm.
    doc_ids = query_judgements_group["document"].astype(int).tolist()

    # Resolve query params for the current query group.
    query_params = {"query": query_judgements_group["query"].iloc[0]}

    try:
        # Extract the features for the documents in the query group.
        doc_features, feature_names = extract_query_features(query_params, doc_ids)
    except Exception as exc:
        # Best effort: report the failure and skip this group instead of
        # aborting the whole apply.
        print(f"Error extracting features for query: {query_params}, error: {exc}")
        return None

    # Values are aligned with the group's rows through the order of doc_ids.
    feature_columns = {
        feature_name: np.array(
            [doc_features[doc_id][feature_name] for doc_id in doc_ids]
        )
        for feature_name in feature_names
    }

    # `assign` returns a new frame: mutating the object handed over by
    # `groupby.apply` is not supported by pandas.
    return query_judgements_group.assign(**feature_columns)

# Extract the features one query at a time; group_keys=False keeps the group
# label out of the result's index.
judgments_by_query = judgments_df.groupby("query", group_keys=False)
judgments_with_features = judgments_by_query.apply(_extract_query_features)

print("Judgments with features:")
judgments_with_features

Judgments with features:


Unnamed: 0,query,document,label,title_bm25,synopsis_bm25,score,score_count,score_rank
0,action comedy anime,14813,1,5.451886,0.000000,8.02,739365,533
1,action comedy anime,23847,1,5.091943,0.000000,8.23,594638,292
2,action comedy anime,3080,0,6.393344,3.203357,6.01,5192,8601
3,action comedy anime,34800,11,8.969734,0.000000,6.34,4069,6066
4,action comedy anime,43007,1,5.866586,0.000000,6.06,92035,8354
...,...,...,...,...,...,...,...,...
94,superhero anime,35988,2,4.672631,0.000000,6.39,7957,6283
95,survival anime,2333,1,6.393344,0.000000,7.13,1487,3384
96,survival anime,27911,1,5.133145,0.000000,6.53,16352,5764
97,survival anime,2810,2,5.133145,0.000000,7.10,2309,3534
