In [1]:
from pprint import pprint
from elasticsearch import Elasticsearch

# Create a client for the local Elasticsearch node and fetch the cluster
# banner (name, UUID, version details) as a quick connectivity check.
es = Elasticsearch(hosts='http://localhost:9200')
client_info = es.info()

print('Connected to Elasticsearch!')
pprint(client_info.body)

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'nkRHUCJSQ-O3ADkv3ZPOkw',
 'name': '118de91b32d2',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-08-05T10:05:34.233336849Z',
             'build_flavor': 'default',
             'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.11.1',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.15.0'}}


# Synonyms

In [2]:
# Synonym rules in Solr format. Comma-separated terms are treated as
# equivalent; "a, b => c" rewrites the left-hand terms to the right-hand one.
synonym_rules = [
    "car, automobile, vehicle",
    "tv, television",
    "smartphone, mobile, cell phone",
    "jupyter, jupyter notebook, jupyterlab",
    "jupiter, mars, earth, venus, mercury, saturn, uranus, neptune => planet",
]

# A single custom analyzer (standard tokenizer -> lowercase -> synonyms) is
# attached to the `description` field. No separate search analyzer is
# configured, so the same analyzer is used for the field's queries too.
settings = {
    "settings": {
        "analysis": {
            "filter": {
                "synonym_filter": {"type": "synonym", "synonyms": synonym_rules},
            },
            "analyzer": {
                "synonym_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "synonym_filter"],
                },
            },
        },
    },
    "mappings": {
        "properties": {
            "description": {"type": "text", "analyzer": "synonym_analyzer"},
        },
    },
}

index_name = "my_synonym_index"

# Drop any index left over from an earlier run so this cell can be re-executed,
# then create it with the analysis settings and mapping defined above.
es.indices.delete(index=index_name, ignore_unavailable=True)
response = es.indices.create(index=index_name, body=settings)
pprint(response.body)

{'acknowledged': True, 'index': 'my_synonym_index', 'shards_acknowledged': True}


In [3]:
import os
import json
from tqdm import tqdm

# Load the sample documents. The context manager closes the file handle, and
# the explicit UTF-8 encoding avoids depending on the platform default.
data_path = os.path.join(os.getcwd(), "data", "dummy_synonyms.json")
with open(data_path, encoding="utf-8") as data_file:
    dummy_data = json.load(data_file)

# The Bulk API expects pairs of entries: an action line naming the target
# index, followed by the document source itself.
operations = []
for document in tqdm(dummy_data, total=len(dummy_data)):
    operations.append({'index': {'_index': index_name}})
    operations.append(document)

# refresh=True makes the indexed documents visible to search as soon as the
# call returns, so the search cells that follow do not race the index's
# periodic refresh when the notebook is executed with "Run All".
response = es.bulk(operations=operations, refresh=True)
pprint(response.body)

100%|██████████| 5/5 [00:00<?, ?it/s]

{'errors': False,
 'items': [{'index': {'_id': 'EcF2NZMBjez6B0JSUmz8',
                      '_index': 'my_synonym_index',
                      '_primary_term': 1,
                      '_seq_no': 0,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'EsF2NZMBjez6B0JSUmz8',
                      '_index': 'my_synonym_index',
                      '_primary_term': 1,
                      '_seq_no': 1,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'E8F2NZMBjez6B0JSUmz8',
                      '_index': 'my_synonym_index',
                      '_primary_term': 1,
                      '_seq_no': 2,
                      '_shards': {'failed




In [4]:
# Match query for "vehicle" on the synonym-analyzed `description` field.
query = {"query": {"match": {"description": "vehicle"}}}

response = es.search(index=index_name, body=query)

print("Search Results:")
matched_sources = [hit["_source"] for hit in response["hits"]["hits"]]
for source in matched_sources:
    print(source)

Search Results:
{'description': 'I love my car and television.'}


In [5]:
# Match query for "planet" on the `description` field.
match_clause = {"description": "planet"}
query = {"query": {"match": match_clause}}

response = es.search(index=index_name, body=query)

print("Search Results:")
# Print only the stored document body of each hit.
for source in (hit["_source"] for hit in response["hits"]["hits"]):
    print(source)

Search Results:
{'description': 'I want to go to Mars.'}
{'description': 'I want to go to Venus.'}


In [6]:
# Search-time synonyms: documents are indexed with a plain analyzer
# (standard tokenizer + lowercase), while queries against `description` go
# through a second analyzer that additionally applies the synonym filter.
settings = {
    "settings": {
        "analysis": {
            "filter": {
                "synonym_filter": {
                    "type": "synonym",
                    "synonyms": [
                        "car, automobile, vehicle",
                        "tv, television"
                    ]
                }
            },
            "analyzer": {
                # Used when documents are indexed: no synonym expansion.
                "index_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase"]
                },
                # Used when query text is analyzed: expands synonyms.
                "search_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "synonym_filter"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "description": {
                "type": "text",
                "analyzer": "index_analyzer",
                "search_analyzer": "search_analyzer"
            }
        }
    }
}

# Recreate the index with the new analysis settings. ignore_unavailable=True
# keeps this cell re-runnable when the index does not exist yet, matching the
# delete call used when the index was first created.
es.indices.delete(index=index_name, ignore_unavailable=True)
response = es.indices.create(index=index_name, body=settings)
pprint(response.body)

{'acknowledged': True, 'index': 'my_synonym_index', 'shards_acknowledged': True}


In [7]:
import os
import json
from tqdm import tqdm

# Re-load the sample documents into the freshly recreated index. The file is
# opened in a context manager so the handle is closed, and read as UTF-8 so
# the result does not depend on the platform's default encoding.
data_path = os.path.join(os.getcwd(), "data", "dummy_synonyms.json")
with open(data_path, encoding="utf-8") as data_file:
    dummy_data = json.load(data_file)

# Each document contributes two bulk entries: the action line that names the
# target index, then the document source.
operations = []
for document in tqdm(dummy_data, total=len(dummy_data)):
    operations.append({'index': {'_index': index_name}})
    operations.append(document)

# refresh=True makes the documents searchable once the call returns, so the
# following search cell sees them even when run immediately afterwards.
response = es.bulk(operations=operations, refresh=True)
pprint(response.body)

100%|██████████| 5/5 [00:00<?, ?it/s]

{'errors': False,
 'items': [{'index': {'_id': 'F8F3NZMBjez6B0JSx2z9',
                      '_index': 'my_synonym_index',
                      '_primary_term': 1,
                      '_seq_no': 0,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'GMF3NZMBjez6B0JSx2z9',
                      '_index': 'my_synonym_index',
                      '_primary_term': 1,
                      '_seq_no': 1,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'GcF3NZMBjez6B0JSx2z9',
                      '_index': 'my_synonym_index',
                      '_primary_term': 1,
                      '_seq_no': 2,
                      '_shards': {'failed




In [8]:
# Same "vehicle" match query as before; with the current mapping the synonym
# filter is applied to the query text by the field's search analyzer.
query = {"query": {"match": {"description": "vehicle"}}}

response = es.search(index=index_name, body=query)

print("Search Results (Search-time synonyms):")
hits = response["hits"]["hits"]
for doc in hits:
    print(doc["_source"])

Search Results (Search-time synonyms):
{'description': 'I love my car and television.'}
