In [1]:
from pprint import pprint
from elasticsearch import Elasticsearch

# Build the client for the Elasticsearch node listening on localhost:9200.
es = Elasticsearch(hosts='http://localhost:9200')

# Ask the cluster for its basic metadata; the result is kept in `client_info`
# and its body (cluster name, version details, ...) is pretty-printed below.
client_info = es.info()
cluster_details = client_info.body

print('Connected to Elasticsearch')
pprint(cluster_details)

Connected to Elasticsearch
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'ZLPKoMvyRwO3jn9eeAD8Ug',
 'name': '310ad8ef32ea',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-08-05T10:05:34.233336849Z',
             'build_flavor': 'default',
             'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.11.1',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.15.0'}}


# Search API

In [2]:
# Start from a clean slate: for each demo index, delete it (tolerating the
# case where it does not exist yet) and then create it again.
for index_name in ('index_1', 'index_2'):
    es.indices.delete(index=index_name, ignore_unavailable=True)
    creation_response = es.indices.create(index=index_name)

# Trailing expression: show the response of the last create call (index_2).
creation_response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'index_2'})

In [3]:
import os
import json

# Load the sample documents. The context manager closes the file handle as
# soon as parsing is done, and the encoding is pinned because JSON is UTF-8.
dummy_path = os.path.join(os.getcwd(), "data", "dummy.json")
with open(dummy_path, encoding="utf-8") as dummy_file:
    dummy_data = json.load(dummy_file)

# Index every sample document into index_1 first, then into index_2.
for index_name in ('index_1', 'index_2'):
    for document in dummy_data:
        response = es.index(index=index_name, body=document)

# Newly indexed documents only become searchable after a refresh. Force one
# so that searches issued immediately afterwards see all of them instead of
# depending on the periodic background refresh.
refresh_response = es.indices.refresh(index='index_1,index_2')

In [4]:
# match_all has no conditions, so every document in the target index matches.
match_all_body = {"query": {"match_all": {}}}
response = es.search(index='index_1', body=match_all_body)

# hits.total.value holds the number of matching documents reported back.
n_hits = response['hits']['total']['value']
print(f"Found {n_hits} documents in index_1")

Found 3 documents in index_1


In [7]:
# Display the hit entries of the most recent search response; the bare
# expression makes the notebook render the list as the cell output.
response['hits']['hits']

[{'_index': 'index_1',
  '_id': 'DthlL5MBnw3A4L5gAAR0',
  '_score': 1.0,
  '_source': {'title': 'Sample Title 1',
   'text': 'This is the first sample document text.',
   'created_on': '2024-09-22'}},
 {'_index': 'index_1',
  '_id': 'D9hlL5MBnw3A4L5gAAS_',
  '_score': 1.0,
  '_source': {'title': 'Sample Title 2',
   'text': 'Here is another example of a document.',
   'created_on': '2024-09-23'}},
 {'_index': 'index_1',
  '_id': 'ENhlL5MBnw3A4L5gAATG',
  '_score': 1.0,
  '_source': {'title': 'Sample Title 3',
   'text': 'The content of the third document goes here.',
   'created_on': '2024-09-24'}}]

In [8]:
# Several indices can be searched in one request by passing their names as a
# comma-separated list.
match_all_body = {"query": {"match_all": {}}}
response = es.search(body=match_all_body, index='index_1,index_2')

# Total number of matching documents across both indices.
n_hits = response['hits']['total']['value']
print(f"Found {n_hits} documents in index_1 and index_2")

Found 6 documents in index_1 and index_2


In [9]:
# A wildcard index expression: 'index*' selects every index whose name
# begins with "index".
match_all_body = {"query": {"match_all": {}}}
response = es.search(body=match_all_body, index='index*')

# Total number of matching documents across the selected indices.
n_hits = response['hits']['total']['value']
print(f"Found {n_hits} documents in all indexes with name starting with 'index'")

Found 6 documents in all indexes with name starting with 'index'


In [15]:
# '_all' targets every index in the cluster, so the count below also includes
# documents from indices other than the demo ones.
match_all_body = {"query": {"match_all": {}}}
response = es.search(body=match_all_body, index='_all')

# Total number of matching documents across the whole cluster.
n_hits = response['hits']['total']['value']
print(f"Found {n_hits} documents in all indexes")

Found 21 documents in all indexes


## Leaf clauses

In [2]:
# Recreate my_index from scratch. ignore_unavailable lets the delete go
# through even when the index does not exist yet.
target_index = 'my_index'
es.indices.delete(index=target_index, ignore_unavailable=True)

# Trailing expression: the create response is shown as the cell output.
es.indices.create(index=target_index)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [3]:
import os
import json

# Load the sample documents. The context manager closes the file handle as
# soon as parsing is done, and the encoding is pinned because JSON is UTF-8.
dummy_path = os.path.join(os.getcwd(), "data", "dummy.json")
with open(dummy_path, encoding="utf-8") as dummy_file:
    dummy_data = json.load(dummy_file)

# Index each sample document into my_index, one request per document.
for document in dummy_data:
    response = es.index(index='my_index', body=document)

# Newly indexed documents only become searchable after a refresh. Force one
# so that queries issued immediately afterwards see all of them instead of
# depending on the periodic background refresh.
refresh_response = es.indices.refresh(index='my_index')

### Term query

In [4]:
# Term query: look for documents whose created_on field holds exactly this
# value (the value is not run through text analysis).
term_clause = {"term": {"created_on": "2024-09-22"}}
response = es.search(index='my_index', body={"query": term_clause})

# Number of documents that matched the term.
n_hits = response['hits']['total']['value']
print(f"Found {n_hits} documents in my_index")

Found 1 documents in my_index


In [5]:
# Pull the list of matching hits out of the most recent search response.
hits_section = response['hits']
retrieved_documents = hits_section['hits']

# Trailing bare expression so the notebook renders the list as cell output.
retrieved_documents

[{'_index': 'my_index',
  '_id': 'XVS9MpMBK_KRk-NKkcDA',
  '_score': 1.0,
  '_source': {'title': 'Sample Title 1',
   'text': 'This is the first sample document text.',
   'created_on': '2024-09-22'}}]

### Match query

In [6]:
# Match query: full-text search for the word "document" in the text field.
# Hits come back ordered by relevance score.
match_clause = {"match": {"text": "document"}}
response = es.search(index='my_index', body={"query": match_clause})

# Number of documents whose text matched.
n_hits = response['hits']['total']['value']
print(f"Found {n_hits} documents in my_index")

Found 3 documents in my_index


In [7]:
# Pull the list of matching hits out of the most recent search response.
hits_section = response['hits']
retrieved_documents = hits_section['hits']

# Trailing bare expression so the notebook renders the list as cell output.
retrieved_documents

[{'_index': 'my_index',
  '_id': 'XVS9MpMBK_KRk-NKkcDA',
  '_score': 0.13606146,
  '_source': {'title': 'Sample Title 1',
   'text': 'This is the first sample document text.',
   'created_on': '2024-09-22'}},
 {'_index': 'my_index',
  '_id': 'XlS9MpMBK_KRk-NKksAu',
  '_score': 0.13606146,
  '_source': {'title': 'Sample Title 2',
   'text': 'Here is another example of a document.',
   'created_on': '2024-09-23'}},
 {'_index': 'my_index',
  '_id': 'X1S9MpMBK_KRk-NKksA4',
  '_score': 0.12874341,
  '_source': {'title': 'Sample Title 3',
   'text': 'The content of the third document goes here.',
   'created_on': '2024-09-24'}}]

### Range query

In [8]:
# Range query: keep documents whose created_on is on or before 2024-09-23
# ("lte" = less than or equal).
created_on_bounds = {"lte": "2024-09-23"}
range_clause = {"range": {"created_on": created_on_bounds}}
response = es.search(index='my_index', body={"query": range_clause})

# Number of documents inside the range.
n_hits = response['hits']['total']['value']
print(f"Found {n_hits} documents in my_index")

Found 2 documents in my_index


In [9]:
# Pull the list of matching hits out of the most recent search response.
hits_section = response['hits']
retrieved_documents = hits_section['hits']

# Trailing bare expression so the notebook renders the list as cell output.
retrieved_documents

[{'_index': 'my_index',
  '_id': 'XVS9MpMBK_KRk-NKkcDA',
  '_score': 1.0,
  '_source': {'title': 'Sample Title 1',
   'text': 'This is the first sample document text.',
   'created_on': '2024-09-22'}},
 {'_index': 'my_index',
  '_id': 'XlS9MpMBK_KRk-NKksAu',
  '_score': 1.0,
  '_source': {'title': 'Sample Title 2',
   'text': 'Here is another example of a document.',
   'created_on': '2024-09-23'}}]

## Compound clauses

In [10]:
# Compound (bool) query: a document is returned only if it satisfies every
# clause listed under "must".
text_clause = {"match": {"text": "third"}}
date_clause = {
    "range": {
        # Identical lower and upper bounds pin the match to a single day.
        "created_on": {"gte": "2024-09-24", "lte": "2024-09-24"}
    }
}
bool_body = {"query": {"bool": {"must": [text_clause, date_clause]}}}

response = es.search(index='my_index', body=bool_body)

# Number of documents satisfying both clauses.
n_hits = response['hits']['total']['value']
print(f"Found {n_hits} documents in my_index")

Found 1 documents in my_index


In [11]:
# Pull the list of matching hits out of the most recent search response.
hits_section = response['hits']
retrieved_documents = hits_section['hits']

# Trailing bare expression so the notebook renders the list as cell output.
retrieved_documents

[{'_index': 'my_index',
  '_id': 'X1S9MpMBK_KRk-NKksA4',
  '_score': 1.94566,
  '_source': {'title': 'Sample Title 3',
   'text': 'The content of the third document goes here.',
   'created_on': '2024-09-24'}}]

In [12]:
# Drop my_index (if present) and create it again so that it starts out empty
# before the next batch of documents is loaded.
target_index = 'my_index'
es.indices.delete(index=target_index, ignore_unavailable=True)

# Trailing expression: the create response is shown as the cell output.
es.indices.create(index=target_index)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [13]:
import os
import json

# Load the second sample file. The context manager closes the file handle as
# soon as parsing is done, and the encoding is pinned because JSON is UTF-8.
dummy_2_path = os.path.join(os.getcwd(), "data", "dummy_2.json")
with open(dummy_2_path, encoding="utf-8") as dummy_file:
    base_documents = json.load(dummy_file)

# Inflate the sample so that paging and aggregations have enough data to work
# on: doubling the list N_DOUBLINGS times repeats it 2**N_DOUBLINGS (= 1024)
# times. Building a new list keeps the freshly loaded one untouched.
N_DOUBLINGS = 10
dummy_data = base_documents * (2 ** N_DOUBLINGS)

# Trailing expression: show how many documents will be indexed.
len(dummy_data)

5120

In [14]:
# A bulk payload alternates two entries per document: an action line naming
# the target index, followed by the document source itself.
operations = [
    entry
    for document in dummy_data
    for entry in ({'index': {'_index': 'my_index'}}, document)
]

# refresh=True makes the indexed documents searchable as soon as the call
# returns, so queries issued right afterwards do not run against a partially
# visible index. The response is left as the trailing expression so that it
# is shown as the cell output.
es.bulk(operations=operations, refresh=True)

ObjectApiResponse({'errors': False, 'took': 1709793, 'items': [{'index': {'_index': 'my_index', '_id': 'YFTPMpMBK_KRk-NKI8D_', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'YVTPMpMBK_KRk-NKI8D_', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'YlTPMpMBK_KRk-NKI8D_', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'Y1TPMpMBK_KRk-NKI8D_', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'ZFTPMpMBK_KRk-NKI8D_', '_version': 1, 'result': 'created', '_shards': {'

## Size & From

In [15]:
# Pagination: "from" is the number of hits to skip and "size" the number of
# hits to return, so from=10 with size=10 yields the second page of ten.
page_request = {
    "query": {"match_all": {}},
    "from": 10,
    "size": 10,
}
response = es.search(index="my_index", body=page_request)

# Print only the stored document of each hit on this page.
page_hits = response['hits']['hits']
for hit in page_hits:
    print(hit['_source'])

{'message': 'This is an important keyword search result.', 'age': 25, 'price': 100.0}
{'message': 'Another search result with an important keyword.', 'age': 30, 'price': 150.0}
{'message': 'Keyword match in this result as well.', 'age': 40, 'price': 200.0}
{'message': 'Important keyword again in this document.', 'age': 35, 'price': 120.0}
{'message': 'Final document with the important keyword.', 'age': 28, 'price': 180.0}
{'message': 'This is an important keyword search result.', 'age': 25, 'price': 100.0}
{'message': 'Another search result with an important keyword.', 'age': 30, 'price': 150.0}
{'message': 'Keyword match in this result as well.', 'age': 40, 'price': 200.0}
{'message': 'Important keyword again in this document.', 'age': 35, 'price': 120.0}
{'message': 'Final document with the important keyword.', 'age': 28, 'price': 180.0}


## Timeout

In [20]:
# "timeout" sets a time budget for the search; the 'timed_out' flag in the
# response reports whether that budget was exceeded.
message_clause = {"match": {"message": "search keyword"}}
timed_request = {
    "query": message_clause,
    "timeout": "10s",
}
response = es.search(index="my_index", body=timed_request)

# Trailing expression: show the raw response body, including 'took' and
# 'timed_out'.
response.body

{'took': 5,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 5120, 'relation': 'eq'},
  'max_score': 0.8941701,
  'hits': [{'_index': 'my_index',
    '_id': 'YFTPMpMBK_KRk-NKI8D_',
    '_score': 0.8941701,
    '_source': {'message': 'This is an important keyword search result.',
     'age': 25,
     'price': 100.0}},
   {'_index': 'my_index',
    '_id': 'YVTPMpMBK_KRk-NKI8D_',
    '_score': 0.8941701,
    '_source': {'message': 'Another search result with an important keyword.',
     'age': 30,
     'price': 150.0}},
   {'_index': 'my_index',
    '_id': 'ZVTPMpMBK_KRk-NKI8D_',
    '_score': 0.8941701,
    '_source': {'message': 'This is an important keyword search result.',
     'age': 25,
     'price': 100.0}},
   {'_index': 'my_index',
    '_id': 'ZlTPMpMBK_KRk-NKI8D_',
    '_score': 0.8941701,
    '_source': {'message': 'Another search result with an important keyword.',
     'age': 30,
     'price': 150.0}},
   

## Aggregation

In [21]:
# Metric aggregation: compute the average of the "age" field over every
# document matched by the query (here: all documents in my_index).
avg_age_aggregation = {"avg": {"field": "age"}}
aggregation_request = {
    "query": {"match_all": {}},
    "aggs": {"avg_age": avg_age_aggregation},
}
response = es.search(index="my_index", body=aggregation_request)

# Aggregation results are returned under 'aggregations', keyed by the name
# chosen in the request ("avg_age").
avg_age_result = response['aggregations']['avg_age']
average_age = avg_age_result['value']
print(f"Average Age: {average_age}")

Average Age: 31.6


## Combining query, aggregation, pagination and timeout

In [22]:
# One request combining everything shown so far: a full-text query, a metric
# aggregation, pagination (skip 20 hits, return 5) and a time budget.
combined_request = {
    "query": {"match": {"message": "important keyword"}},
    "aggs": {"max_price": {"max": {"field": "price"}}},
    "from": 20,
    "size": 5,
    "timeout": "5s",
}
response = es.search(index="my_index", body=combined_request)

# Print the stored document of each hit on the requested page.
page_hits = response['hits']['hits']
for hit in page_hits:
    print(hit['_source'])

# The aggregation covers every document matching the query, not only the
# page printed above, so the maximum can exceed the prices listed there.
max_price_result = response['aggregations']['max_price']
max_price = max_price_result['value']
print(f"Max Price: {max_price}")

{'message': 'Important keyword again in this document.', 'age': 35, 'price': 120.0}
{'message': 'Final document with the important keyword.', 'age': 28, 'price': 180.0}
{'message': 'Important keyword again in this document.', 'age': 35, 'price': 120.0}
{'message': 'Final document with the important keyword.', 'age': 28, 'price': 180.0}
{'message': 'Important keyword again in this document.', 'age': 35, 'price': 120.0}
Max Price: 200.0
