In [1]:
from elasticsearch import Elasticsearch

# Create the client for the Elasticsearch node at localhost:9200, addressed over
# plain HTTP; no credentials or TLS options are passed here.
client = Elasticsearch("http://localhost:9200")

In [2]:
# Sanity check: request the cluster's info (name, version, ...) and print it,
# which confirms the node is reachable before any search is issued.
print(client.info())

{'name': 'lyz-MS-7A94', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'DF8C2BvoSxen3wXyIlydcQ', 'version': {'number': '8.12.2', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '48a287ab9497e852de30327444b0809e55d46466', 'build_date': '2024-02-19T10:04:32.774273190Z', 'build_snapshot': False, 'lucene_version': '9.9.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


## Pretty printing Elasticsearch responses

Let's add a helper function to print Elasticsearch responses in a readable format. This function is similar to the one that was used in the [quickstart](https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/search/00-quick-start.ipynb) guide.

In [3]:
def pretty_response(response):
    """Print the hits of an Elasticsearch search response in a readable form.

    Args:
        response: Search response supporting ``response["hits"]["hits"]``,
            which must be a sequence of hit dicts. Each hit needs ``_id`` and
            a ``_source`` containing ``publish_date``, ``title`` and
            ``summary``. ``_rank`` is optional.

    Returns:
        None. One formatted entry per hit is written to stdout, or a single
        "no results" message when the hit list is empty.

    Raises:
        KeyError: If a hit lacks ``_id``, ``_source`` or one of the required
            ``_source`` fields.
    """
    hits = response["hits"]["hits"]
    if len(hits) == 0:
        print("Your search returned no results.")
        return

    for hit in hits:
        source = hit["_source"]
        doc_id = hit["_id"]  # not named `id`, to avoid shadowing the builtin
        publication_date = source["publish_date"]
        title = source["title"]
        summary = source["summary"]
        # A hit may come back without `_rank` (presumably when the search did
        # not use a `rank` clause); show a placeholder instead of raising
        # KeyError so the helper stays usable for those responses too.
        rank = hit.get("_rank", "n/a")
        print(
            f"\nID: {doc_id}"
            f"\nPublication date: {publication_date}"
            f"\nTitle: {title}"
            f"\nSummary: {summary}"
            f"\nRank: {rank}"
        )

# Querying Documents with Hybrid Search

🔐 NOTE: Before you can run the query in this section, you need the `book_index` dataset from our [quick start](https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/search/00-quick-start.ipynb). If you haven't worked through the quick start, please follow the steps described there to create an Elasticsearch deployment with the dataset in it, and then come back to run the query here.

Now we need to perform a query using two different search strategies:
- Semantic search using the "all-MiniLM-L6-v2" embedding model
- Keyword search using the "summary" field

We then use [Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html) to balance the scores to provide a final list of documents, ranked in order of relevance. RRF is a ranking algorithm for combining results from different information retrieval strategies.

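Note that `_score` is null in the RRF results, so we use `_rank` instead to show our top-ranked documents. RRF also requires a license that includes it: on a cluster without one, the search request is rejected with a 403 `security_exception` ("current license is non-compliant for [Reciprocal Rank Fusion (RRF)]").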

In [4]:
from sentence_transformers import SentenceTransformer
# Load the "all-MiniLM-L6-v2" sentence-embedding model; it is used further down
# to turn the query text into the vector passed to the kNN search.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [5]:
# Hybrid search over `book_index`: a lexical `match` query on `summary` plus a
# kNN search on `title_vector`, with both result lists fused by Reciprocal Rank
# Fusion (RRF).
#
# NOTE: the server rejects the `rank={"rrf": {}}` clause with a 403
# security_exception ("current license is non-compliant for [Reciprocal Rank
# Fusion (RRF)]") when the cluster's license does not cover RRF.

# Single source of truth for the query text, so the lexical and vector
# searches cannot drift apart.
query_text = "python programming"

response = client.search(
    index="book_index",
    size=5,
    query={"match": {"summary": query_text}},
    knn={
        "field": "title_vector",
        # generate embedding for query so it can be compared to `title_vector`
        "query_vector": model.encode(query_text).tolist(),
        "k": 5,
        "num_candidates": 10,
    },
    rank={"rrf": {}},
)

pretty_response(response)

AuthorizationException: AuthorizationException(403, 'security_exception', 'current license is non-compliant for [Reciprocal Rank Fusion (RRF)]')

In [9]:
# Diagnostic: display the installed `elasticsearch` Python client version (a
# tuple), e.g. to compare it with the server version reported by client.info().
import elasticsearch
elasticsearch.__version__

(8, 3, 1)