### import libraries

In [3]:
from opensearchpy import OpenSearch
from dotenv import load_dotenv
from pprint import pprint
import re

In [4]:
#load the enviornment variables from .env
load_dotenv()
from os import getenv

USERNAME = getenv("OPENSEARCH_ADMIN_USER")
PASSWORD = getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD")

### connect to host

In [5]:
host = 'localhost'
port = 19200
auth = (USERNAME, PASSWORD) # For testing only. Don't store credentials in code.
# ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA.

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)
client.info()

{'name': 'opensearch-node1',
 'cluster_name': 'opensearch-cluster',
 'cluster_uuid': 'NeKvN4_DTmuqlGdYJ8xw-w',
 'version': {'distribution': 'opensearch',
  'number': '3.0.0',
  'build_type': 'tar',
  'build_hash': 'dc4efa821904cc2d7ea7ef61c0f577d3fc0d8be9',
  'build_date': '2025-05-03T06:25:26.379676844Z',
  'build_snapshot': False,
  'lucene_version': '10.1.0',
  'minimum_wire_compatibility_version': '2.19.0',
  'minimum_index_compatibility_version': '2.0.0'},
 'tagline': 'The OpenSearch Project: https://opensearch.org/'}

### check my index

In [7]:
response = client.search(
    index="my_documents",
    body={
        "_source": ["content"],
        "query": {
            "match_all": {}
        }
    }
)

response

{'took': 35,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 15, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'my_documents',
    '_id': 'j3ibV5cBrn4Kmfkd6u4O',
    '_score': 1.0,
    '_source': {'content': "Effective: March 2020\nPurpose\n\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\nScope\n\nThis policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\nEligibility\n\nEmployees who can perform their work duties remotely and have received approval from their direct supervisor and the HR departm

In [21]:
#delete index
response = client.indices.delete("test-document-01", ignore_unavailable=True)
pprint(response)
# create index
response = client.indices.create("test-document-01")
pprint(response)

{'acknowledged': True}
{'acknowledged': True, 'index': 'test-document-01', 'shards_acknowledged': True}


### confirm the get method works

In [23]:
item_a = client.get(index="document_v4", id="1236")
pprint(item_a)

{'_id': '1236',
 '_index': 'document_v4',
 '_primary_term': 1,
 '_seq_no': 2,
 '_source': {'content': 'Some may say that supercar drivers dont really mind '
                        'risk',
             'content_embedding': {'##ability': 0.0043414626,
                                   '##away': 0.09657053,
                                   '##bility': 0.683933,
                                   '##car': 1.8049147,
                                   '##carriage': 0.18450333,
                                   '##cars': 0.7819286,
                                   '##erving': 0.741623,
                                   '##ever': 0.16756783,
                                   '##free': 0.16112545,
                                   '##hip': 0.08930432,
                                   '##moto': 0.70712644,
                                   '##oot': 0.37504345,
                                   '##ruck': 0.124648124,
                                   '##tical': 0.41809833,
       

In [25]:
item_a['_source']['content']

'Some may say that supercar drivers dont really mind risk'

### testing filter regex

In [8]:
import re
import json

In [9]:
def extract_filters(query):
    filters= []
    category_regex = r'category:([^\s]+)\s*'
    matches = re.search(category_regex, query)
    if matches:
        filters.append({
            'term': {
                'category.keyword': {
                    'value': matches.group(1)
                }
            }
        })

        #remove the category filter from the query
        query = re.sub(category_regex, '', query).strip()

    #year filter
    year_regex = r'year:([^\s]+)\s*'
    matches = re.search(year_regex, query)
    if matches:
        filters.append({
            'range': {
                'updated_at': {
                    'gte': f'{matches.group(1)}||/y',
                    'lte': f'{matches.group(1)}||/y',
                }
            },
        })
        #remove the year filter from the query
        query = re.sub(year_regex, '', query).strip()

    return {'filter': filters}, query

def handle_search():
    filters, parsed_query = extract_filters(query)
    if parsed_query:
        search_query = {
            'must': {
                'multi_match': {
                    'query': parsed_query,
                    'fields': ['name', 'summary', 'content']
                }
            }
        }
    else:
        search_query = {
            'must': {
                'match_all': {}
            }
        }
    results = client.search(
        index="my_documents",
        body={
            'query': {
                'bool': {
                    **search_query,
                    **filters
                }
            },
            'aggs': {
                'category-agg': {
                    'terms': {
                        'field': 'category.keyword',
                    }
                },
                'year-agg': {
                    'date_histogram': {
                        'field': 'updated_at',
                        'calendar_interval': 'year',
                        'format': 'yyyy',
                    },
                },
            },
            'size': 5,
        }
    )
    return results

In [10]:
query = 'work from home category:sharepoint'
results = handle_search()
results

{'took': 138,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 7, 'relation': 'eq'},
  'max_score': 1.0137551,
  'hits': [{'_index': 'my_documents',
    '_id': 'k3ibV5cBrn4Kmfkd6u4O',
    '_score': 1.0137551,
    '_source': {'summary': ': This policy outlines the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Vacation requests must be submitted to supervisors at least',
     'summary_embedding': [0.016417513,
      0.051348794,
      0.0070100795,
      -0.01109405,
      0.026815137,
      0.001640904,
      0.010457618,
      -0.07465126,
      -0.09470862,
      -0.0060991654,
      0.043164972,
      0.023318132,
      -0.07355902,
      0.048781127,
      0.07236117,
      0.013487333,
      0.07267836,
      -0.028688185,
      0.034466524,
     

In [12]:
print(extract_filters("category:news Summary of the latest"))

({'filter': [{'term': {'category.keyword': {'value': 'news'}}}]}, 'Summary of the latest')


In [14]:
pprint(client.transport.perform_request("GET", "/_plugins/_ml/models/eXiYV5cBrn4Kmfkd4u7C"))

{'algorithm': 'TEXT_EMBEDDING',
 'auto_redeploy_retry_times': 0,
 'created_time': 1749521326684,
 'current_worker_node_count': 2,
 'deploy_to_all_nodes': True,
 'is_hidden': False,
 'last_deployed_time': 1749521376024,
 'last_registered_time': 1749521342241,
 'last_updated_time': 1749521376025,
 'model_config': {'all_config': '{"_name_or_path": '
                                '"sentence-transformers/all-MiniLM-L6-v2", '
                                '"architectures": ["BertModel"], '
                                '"attention_probs_dropout_prob": 0.1, '
                                '"classifier_dropout": null, '
                                '"gradient_checkpointing": false, '
                                '"hidden_act": "gelu", "hidden_dropout_prob": '
                                '0.1, "hidden_size": 384, "initializer_range": '
                                '0.02, "intermediate_size": 1536, '
                                '"layer_norm_eps": 1e-12, '
               

### SentenceTransformer model

In [None]:
#the sentence tranfromer is already installed using pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
embedding = model.encode('The quick brown fox jumps over the lazy dog')
pprint(embedding)

array([ 3.54968086e-02,  6.12862743e-02,  5.26920669e-02,  7.07050189e-02,
        3.31014246e-02, -3.06696091e-02,  6.62056403e-03, -6.11833222e-02,
       -1.32600125e-03,  1.06456708e-02,  3.86499278e-02,  3.99532020e-02,
       -3.83675843e-02, -1.66688375e-02, -5.61559061e-03, -2.43558865e-02,
       -3.59968618e-02, -3.02429460e-02,  5.84700331e-02, -4.94961627e-02,
       -7.72954449e-02, -5.23876995e-02,  2.45271660e-02,  2.93105654e-02,
       -7.39091858e-02, -2.49591712e-02, -6.53142035e-02, -4.28864993e-02,
        7.11656436e-02, -1.13819472e-01, -1.26593513e-02,  3.96260768e-02,
       -2.10036356e-02,  1.78063996e-02, -3.18874530e-02, -9.11229625e-02,
        5.91224842e-02, -7.30395922e-03,  3.31367664e-02,  2.99061146e-02,
        4.21688817e-02, -1.69129502e-02, -4.50015813e-02,  2.96744499e-02,
       -9.92584750e-02,  5.32891825e-02, -7.64784813e-02, -1.48680294e-02,
        1.52494898e-02,  1.37893585e-02, -4.41923738e-02, -2.78393030e-02,
        6.73078652e-03,  

### Current mapping of the index


In [15]:
import json

mapping = client.indices.get_mapping(index="my_documents")
print(json.dumps(mapping, indent=2))

{
  "my_documents": {
    "mappings": {
      "properties": {
        "category": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "content": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "created_on": {
          "type": "date"
        },
        "name": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "rolePermissions": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "summary": {
          "type": "text",
          "fields": {
           

In [93]:
#search 
response =  client.search(
    index="my_documents",
    body={
        "query": {
            "match_all":{}
        },
        "size": 1
    }
)

print(json.dumps(response, indent=2))

{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 15,
      "relation": "eq"
    },
    "max_score": 1.0,
    "hits": [
      {
        "_index": "my_documents",
        "_id": "gEvVVZcBpHtcKjIo6AZa",
        "_score": 1.0,
        "_source": {
          "summary": "This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns",
          "summary_embedding": [
            -0.034409862,
            0.045920573,
            0.050532352,
            -0.025691224,
            0.016922206,
            0.034359254,
            -0.0071740476,
            -0.02259518,
      

### Models

In [16]:
def get_model_id(model_name):
    models = client.transport.perform_request("GET", "/_plugins/_ml/models/_search", 
    body={
        "query": {
            "term": {
                "name.keyword": model_name
            }
        },
        "_source": ["model_id"],
        "size": 1,
    }
    )

    if models["hits"]["hits"]:
        return models["hits"]["hits"][0]["_source"]["model_id"] 
    else:
        raise ValueError(f"{model_name} model not found.")
    # for model in models["models"]:
    #     if model["name"] == model_name:
    #         return model["model_id"]
    # raise ValueError(f"Model '{model_name}' not found.")


In [82]:
get_model_id("amazon/neural-sparse/opensearch-neural-sparse-encoding-v2-distill")

'QXlXYJcBrn4KmfkdfQtM'

## Testing App codes

In [66]:
import json
from pprint import pprint
import os
import time

from dotenv import load_dotenv
from opensearchpy import OpenSearch

load_dotenv()


class Search:
    def __init__(self):
        self.ops = OpenSearch(
            hosts=[{"host": "localhost", "port": 19200}],
            http_compress=True,  # enables gzip compression for request bodies
            http_auth=(
                os.getenv("OPENSEARCH_ADMIN_USER"),
                os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD"),
            ),
            use_ssl=True,
            verify_certs=False,
            ssl_assert_hostname=False,
            ssl_show_warn=False,
        )
        client_info = self.ops.info()
        print("Connected to Opensearch!")
        pprint(client_info)

    def get_model_id(self, model_name):
        models = self.ops.transport.perform_request("GET", "/_plugins/_ml/models/_search", 
            body={
                "query": {
                    "term": {
                        "name.keyword": model_name
                    }
                },
                "_source": ["model_id"],
                "size": 1,
            }
        )

        if models["hits"]["hits"]:
            return models["hits"]["hits"][0]["_source"]["model_id"] 
        else:
            raise ValueError(f"{model_name} model not found.")

    def create_index(self):
        self.ops.indices.delete(index="my_documents", ignore_unavailable=True)
        self.ops.indices.create(
            index="my_documents",
            body={
                "settings": {
                    "index.knn": True,
                    "default_pipeline": "embedding-ingest-pipeline-l6mini",
                },
                "mappings": {
                    "properties": {
                        "summary_embedding": {
                            "type": "knn_vector",
                            "dimension": 384,
                            "method": {
                                "name": "hnsw",
                                "space_type": "cosinesimil",
                                "engine": "lucene",
                            },
                        }
                    }
                },
            },
        )

    def insert_document(self, document):
        return self.ops.index(index="my_documents", body=document)

    def insert_documents(self, documents):
        operations = []
        for document in documents:
            operations.append({"index": {"_index": "my_documents"}})
            operations.append(document)
        return self.ops.bulk(body=operations)

    def reindex(self):
        self.create_index()
        with open("data.json", "rt") as f:
            documents = json.loads(f.read())
        return self.insert_documents(documents)

    def search(self, **query_args):
        if "from_" in query_args:
            query_args["from"] = query_args["from_"]
            del query_args["from_"]
        return self.ops.search(index="my_documents", body=query_args, params= {"search_pipeline": "rrf-pipeline"})

    def retrieve_document(self, id):
        return self.ops.get(index="my_documents", id=id)


In [67]:
ops = Search()

Connected to Opensearch!
{'cluster_name': 'opensearch-cluster',
 'cluster_uuid': 'NeKvN4_DTmuqlGdYJ8xw-w',
 'name': 'opensearch-node1',
 'tagline': 'The OpenSearch Project: https://opensearch.org/',
 'version': {'build_date': '2025-05-03T06:25:26.379676844Z',
             'build_hash': 'dc4efa821904cc2d7ea7ef61c0f577d3fc0d8be9',
             'build_snapshot': False,
             'build_type': 'tar',
             'distribution': 'opensearch',
             'lucene_version': '10.1.0',
             'minimum_index_compatibility_version': '2.0.0',
             'minimum_wire_compatibility_version': '2.19.0',
             'number': '3.0.0'}}


In [72]:
def handle_search(query):
    filters, parsed_query = extract_filters(query)
    from_ = 0

    if parsed_query:
        search_query = {
            "must": {
                "multi_match": {
                    "query": parsed_query,
                    "fields": ["name", "summary", "content"],
                }
            }
        }
    else:
        search_query = {"must": {"match_all": {}}}

    # combine the filters and search query with a bool
    bool_query = {"bool": {**search_query, **filters}}

    # neural query setup with filters
    model_id = ops.get_model_id("huggingface/sentence-transformers/all-MiniLM-L6-v2")
    neural_query = {
        "bool": {
            "must": {
                "neural": {
                    "summary_embedding": {
                        "query_text": parsed_query,
                        "model_id": model_id,
                        "k": 50,  # number of results to return
                    }
                }
            },
            **filters,
        }
    }

    # combine both bool queries with a hybrid query
    hybrid_query = {
        "hybrid": {
            "queries": [
                bool_query,
                neural_query,
            ]
        }
    }

    results = ops.search(
        query=hybrid_query,
        from_=from_,
        _source=["name"],
        aggs={
            "category-agg": {
                "terms": {
                    "field": "category.keyword",
                }
            },
            "year-agg": {
                "date_histogram": {
                    "field": "updated_at",
                    "calendar_interval": "year",
                    "format": "yyyy",
                },
            },
        },
    )
    return results


In [73]:
query = "year:2018 summary"
handle_search(query)

{'took': 23,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 8, 'relation': 'eq'},
  'max_score': 0.016393442,
  'hits': [{'_index': 'my_documents',
    '_id': 'mXibV5cBrn4Kmfkd6u4O',
    '_score': 0.016393442,
    '_source': {'name': 'Performance Management Policy'}},
   {'_index': 'my_documents',
    '_id': 'l3ibV5cBrn4Kmfkd6u4O',
    '_score': 0.016129032,
    '_source': {'name': 'Code Of Conduct'}},
   {'_index': 'my_documents',
    '_id': 'm3ibV5cBrn4Kmfkd6u4O',
    '_score': 0.015873017,
    '_source': {'name': 'Compensation Framework For It Teams'}},
   {'_index': 'my_documents',
    '_id': 'lHibV5cBrn4Kmfkd6u4O',
    '_score': 0.015625,
    '_source': {'name': 'Swe Career Matrix'}},
   {'_index': 'my_documents',
    '_id': 'nXibV5cBrn4Kmfkd6u4O',
    '_score': 0.015384615,
    '_source': {'name': 'New Employee Onboarding Guide'}},
   {'_index': 'my_documents',
    '_id': 'mnibV5cBrn4Kmfkd6u4O',
    '_score'

### Checking the Index with a search

In [100]:
result_ = client.search(
    index="my_documents",
    body={
        "query": {
            "match": {
                "name": "work from home"
            }
        },
        "_source": ["name", "summary_sparse_embedding"],
        "size": 2
    }
)

print(json.dumps(result_, indent=2))



{
  "took": 4,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 2,
      "relation": "eq"
    },
    "max_score": 2.4594483,
    "hits": [
      {
        "_index": "my_documents",
        "_id": "FXmdY5cBrn4KmfkdsRa1",
        "_score": 2.4594483,
        "_source": {
          "name": "Work From Home Policy",
          "summary_sparse_embedding": {
            "weekends": 0.25181487,
            "##sen": 0.051348142,
            "munoz": 0.21688324,
            "discourage": 0.034650262,
            "oversee": 0.27850154,
            "scenarios": 0.66006947,
            "remote": 2.8324401,
            "constraints": 0.38286158,
            "walsh": 0.11210626,
            "concern": 0.10828397,
            "skills": 0.24487959,
            "encouraged": 0.3878752,
            "freelance": 0.37577653,
            "worker": 0.6184733,
            "welcome": 0.074457094,
          

In [89]:
#get mapping
mapping = client.indices.get_mapping(index="my_documents")
print(json.dumps(mapping, indent=2))

{
  "my_documents": {
    "mappings": {
      "properties": {
        "category": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "content": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "created_on": {
          "type": "date"
        },
        "name": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "rolePermissions": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "summary": {
          "type": "text",
          "fields": {
           

In [92]:
def handle_search(query):
    filters, parsed_query = extract_filters(query)
    from_ = 0

    if parsed_query.strip():
        lex_query = {
            "bool": {
                "must": [
                    {
                        "multi_match": {
                            "query": parsed_query,
                            "fields": ["name", "summary", "content"],
                        }
                    }
                ],
                **filters,
            }
        }
        # neural query setup with filters
        model_id = ops.get_model_id(
            "amazon/neural-sparse/opensearch-neural-sparse-encoding-v2-distill"
        )
        print(f"Using model ID: {model_id}")
        
        neural_query = {
            "bool": {
                "must": [
                    {
                        "neural": {
                            "summary_sparse_embedding": {
                                "query_text": parsed_query,
                                "model_id": model_id,               
                            }
                        }
                    }
                ],
                **filters,
            }
        }
        # combine the lexical and neural queries with a hybrid query
        search_query = {
            "hybrid": {
				"queries": [
					lex_query,
					neural_query,
				],
				"pagination_depth": 50,  # needed for hybrid queries. It specifies the maximum number of search results to retrieve from each shard for every subquery.
			}
		}
    else:
        search_query = {"bool": {"must": [{"match_all": {}}], **filters}}


    results = ops.search(
        query=search_query,
        aggs={
            "category-agg": {
                "terms": {
                    "field": "category.keyword",
                }
            },
            "year-agg": {
                "date_histogram": {
                    "field": "updated_at",
                    "calendar_interval": "year",
                    "format": "yyyy",
                },
            },
        },
        size=5,
        from_=from_,
    )

    return results


In [95]:
query = "year:2018 summary"
results = handle_search(query)
pprint(results)

Using model ID: QXlXYJcBrn4KmfkdfQtM


TransportError: TransportError(500, 'null_pointer_exception', 'Cannot invoke "java.util.List.iterator()" because "tensorsList" is null')