### import libraries

In [None]:
from opensearchpy import OpenSearch
from dotenv import load_dotenv
from pprint import pprint
import re

In [3]:
from os import getenv

# Load the environment variables from the .env file before reading credentials.
load_dotenv()

# getenv returns None for any variable that is not set.
USERNAME, PASSWORD = (
    getenv("OPENSEARCH_ADMIN_USER"),
    getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD"),
)

### connect to host

In [12]:
host = 'localhost'
port = 19200
# Credentials come from the environment cell; for testing only, never hard-code them here.
auth = (USERNAME, PASSWORD)
# ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA.

# TLS is enabled, but certificate verification AND hostname verification are
# both switched off (verify_certs / ssl_assert_hostname), and the resulting
# warnings are silenced (ssl_show_warn).
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_auth=auth,
    http_compress=True,  # gzip-compress request bodies
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
# Round-trip to the cluster to confirm the connection works.
client.info()

{'name': 'opensearch-node1',
 'cluster_name': 'opensearch-cluster',
 'cluster_uuid': 'GAWfTRd5RW-CucmhKN6hVA',
 'version': {'distribution': 'opensearch',
  'number': '3.0.0',
  'build_type': 'tar',
  'build_hash': 'dc4efa821904cc2d7ea7ef61c0f577d3fc0d8be9',
  'build_date': '2025-05-03T06:25:26.379676844Z',
  'build_snapshot': False,
  'lucene_version': '10.1.0',
  'minimum_wire_compatibility_version': '2.19.0',
  'minimum_index_compatibility_version': '2.0.0'},
 'tagline': 'The OpenSearch Project: https://opensearch.org/'}

### check my index

In [16]:
# Match every document in the index, returning only the `content` field.
response = client.search(
    index="document_v4",
    body={"query": {"match_all": {}}, "_source": ["content"]},
)

response

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'document_v4',
    '_id': '1234',
    '_score': 1.0,
    '_source': {'content': 'There once was a racecar driver that was super fast'}},
   {'_index': 'document_v4',
    '_id': '1235',
    '_score': 1.0,
    '_source': {'content': 'The golf driver used by tiger woods is the TaylorMade Qi10 LS prototype'}},
   {'_index': 'document_v4',
    '_id': '1236',
    '_score': 1.0,
    '_source': {'content': 'Some may say that supercar drivers dont really mind risk'}}]}}

In [21]:
# Drop the scratch index (no error if it is missing), then create it afresh.
response = client.indices.delete(index="test-document-01", ignore_unavailable=True)
pprint(response)
response = client.indices.create(index="test-document-01")
pprint(response)

{'acknowledged': True}
{'acknowledged': True, 'index': 'test-document-01', 'shards_acknowledged': True}


### confirm the get method works

In [23]:
# Fetch one document by id and pretty-print the full response.
item_a = client.get(id="1236", index="document_v4")
pprint(item_a)

{'_id': '1236',
 '_index': 'document_v4',
 '_primary_term': 1,
 '_seq_no': 2,
 '_source': {'content': 'Some may say that supercar drivers dont really mind '
                        'risk',
             'content_embedding': {'##ability': 0.0043414626,
                                   '##away': 0.09657053,
                                   '##bility': 0.683933,
                                   '##car': 1.8049147,
                                   '##carriage': 0.18450333,
                                   '##cars': 0.7819286,
                                   '##erving': 0.741623,
                                   '##ever': 0.16756783,
                                   '##free': 0.16112545,
                                   '##hip': 0.08930432,
                                   '##moto': 0.70712644,
                                   '##oot': 0.37504345,
                                   '##ruck': 0.124648124,
                                   '##tical': 0.41809833,
       

In [25]:
# Show just the stored text of the fetched document.
item_a["_source"]["content"]

'Some may say that supercar drivers dont really mind risk'

### testing filter regex

In [36]:
import re
import json

In [None]:
def _pop_filter_token(pattern, query):
    """Return (first captured value, query with every match removed).

    When `pattern` does not match, returns (None, query) with `query` untouched.
    """
    match = re.search(pattern, query)
    if match is None:
        return None, query
    return match.group(1), re.sub(pattern, '', query).strip()


def extract_filters(query):
    """Split `key:value` filter tokens out of a free-text search query.

    Recognised tokens:
      * ``category:<value>`` -> ``term`` filter on ``category.keyword``
      * ``year:<YYYY>``      -> ``range`` filter on ``updated_at`` for that year

    Tokens are only recognised when they stand alone as whitespace-delimited
    words, so text such as ``subcategory:foo`` is left in the query instead of
    being half-consumed. The year must be exactly four digits; anything else
    stays in the free-text part rather than being sent as malformed date math.

    Args:
        query: Raw query string typed by the user.

    Returns:
        A tuple ``({'filter': [...]}, remaining_query)``. The list is empty
        when no filter token was found; ``remaining_query`` is the query with
        recognised tokens removed (stripped only when something was removed).
    """
    filters = []

    # (?<!\S) anchors the token at the start of a whitespace-delimited word.
    category, query = _pop_filter_token(r'(?<!\S)category:(\S+)\s*', query)
    if category is not None:
        filters.append({
            'term': {
                'category.keyword': {
                    'value': category
                }
            }
        })

    # (?!\S) rejects values with trailing junk such as "year:2024abc".
    year, query = _pop_filter_token(r'(?<!\S)year:(\d{4})(?!\S)\s*', query)
    if year is not None:
        # "||/y" is date-math rounding to the year, applied to both bounds.
        filters.append({
            'range': {
                'updated_at': {
                    'gte': f'{year}||/y',
                    'lte': f'{year}||/y',
                }
            },
        })

    return {'filter': filters}, query

def handle_search(query=None, index="my_documents", size=5):
    """Run a filtered full-text search and return the raw client response.

    The query string is passed through `extract_filters`; the remaining text
    becomes a `multi_match` over `name`, `summary` and `content` (or
    `match_all` when no text is left), and the extracted filters are merged
    into the same `bool` query. The request also asks for a `terms`
    aggregation on `category.keyword` and a yearly `date_histogram` on
    `updated_at`.

    Args:
        query: Query string to search for. When omitted, the notebook-level
            variable `query` is used so existing `handle_search()` calls keep
            working; pass it explicitly to avoid depending on hidden state.
        index: Name of the index to search.
        size: Maximum number of hits to return.

    Returns:
        The response returned by `client.search`.

    Raises:
        NameError: If `query` is omitted and no notebook-level `query` exists.
    """
    if query is None:
        # Backward compatibility: fall back to the global `query`, raising the
        # same NameError the original global lookup would have produced.
        try:
            query = globals()['query']
        except KeyError:
            raise NameError("name 'query' is not defined") from None

    filters, parsed_query = extract_filters(query)
    if parsed_query:
        must_clause = {
            'multi_match': {
                'query': parsed_query,
                'fields': ['name', 'summary', 'content']
            }
        }
    else:
        # Only filter tokens (or nothing) were supplied: match everything.
        must_clause = {'match_all': {}}

    return client.search(
        index=index,
        body={
            'query': {
                'bool': {
                    'must': must_clause,
                    **filters
                }
            },
            'aggs': {
                'category-agg': {
                    'terms': {
                        'field': 'category.keyword',
                    }
                },
                'year-agg': {
                    'date_histogram': {
                        'field': 'updated_at',
                        'calendar_interval': 'year',
                        'format': 'yyyy',
                    },
                },
            },
            'size': size,
        }
    )

In [57]:
# handle_search() is called without arguments, so it picks up this
# notebook-level `query` value.
query = "work from home category:sharepoint"
results = handle_search()
results

{'took': 5,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 7, 'relation': 'eq'},
  'max_score': 1.0137551,
  'hits': [{'_index': 'my_documents',
    '_id': 'lkp6SJcBpHtcKjIoZdp_',
    '_score': 1.0137551,
    '_source': {'content': "Purpose\n\nThe purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\nScope\n\nThis policy applies to all full-time and part-time employees who have completed their probationary period.\nVacation Accrual\n\nFull-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Part-time employees accrue vacation time on a pro-rata basis, calculated according to their scheduled work hours.\n\nVacation time will begin to accrue from the f

In [49]:
# Re-display whatever the notebook-level `results` variable currently holds.
results

{'took': 5,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 1.0790452,
  'hits': [{'_index': 'my_documents',
    '_id': 'lUp6SJcBpHtcKjIoZdp_',
    '_score': 1.0790452,
    '_source': {'summary': "This sales strategy document outlines objectives, focus areas, and action plans for our tech company's sales operations in fiscal year 2024. Our primary goal is to increase revenue, expand market share, and strengthen customer relationships in our target markets. Focus areas include targeting new markets, segmenting customers, enhancing",
     'name': 'Fy2024 Company Sales Strategy',
     'content': "Executive Summary:\nThis sales strategy document outlines the key objectives, focus areas, and action plans for our tech company's sales operations in fiscal year 2024. Our primary goal is to increase revenue, expand market share, and strengthen customer relationships in our target markets

In [35]:
# Sanity check: a leading category token should become a filter and be
# removed from the free-text part of the query.
print(
    extract_filters('category:news Summary of the latest')
)

query: Summary of the latest
({'filter': [{'term': {'category.keyword': {'value': 'news'}}}]}, 'Summary of the latest')


In [72]:
# Fetch the metadata of one registered ML model straight from the REST endpoint.
pprint(
    client.transport.perform_request(
        "GET",
        "/_plugins/_ml/models/BwimQZcBCqL5fqAlD78u",
    )
)

{'algorithm': 'SPARSE_ENCODING',
 'auto_redeploy_retry_times': 0,
 'created_time': 1749153091372,
 'current_worker_node_count': 2,
 'deploy_to_all_nodes': True,
 'is_hidden': False,
 'last_deployed_time': 1749153134637,
 'last_registered_time': 1749153129463,
 'last_updated_time': 1749153134637,
 'model_content_hash_value': 'a7a80f911838c402d74a7ce05e20672642fc63aafaa982b1055ab277abe808d2',
 'model_content_size_in_bytes': 268867313,
 'model_format': 'TORCH_SCRIPT',
 'model_group_id': '10qmQZcBpHtcKjIoDMPE',
 'model_state': 'DEPLOYED',
 'model_version': '1',
 'name': 'amazon/neural-sparse/opensearch-neural-sparse-encoding-v2-distill',
 'planning_worker_node_count': 2,
 'planning_worker_nodes': ['sIlBw43qTWq5uL4CuhvLGg', 'mCPWC0PTSn-OYzlcX2EXsg'],
 'total_chunks': 27}


In [73]:
# Same lookup as above for a second model id.
pprint(
    client.transport.perform_request(
        "GET",
        "/_plugins/_ml/models/BggtPJcBCqL5fqAlzr_v",
    )
)

{'algorithm': 'TEXT_EMBEDDING',
 'auto_redeploy_retry_times': 0,
 'created_time': 1749061324415,
 'current_worker_node_count': 2,
 'deploy_to_all_nodes': True,
 'is_hidden': False,
 'last_deployed_time': 1749061405304,
 'last_registered_time': 1749061355437,
 'last_updated_time': 1749061409228,
 'model_config': {'all_config': '{"_name_or_path": '
                                '"/root/.cache/torch/sentence_transformers/sentence-transformers_msmarco-distilbert-base-tas-b/", '
                                '"activation": "gelu", "architectures": '
                                '["DistilBertModel"], "attention_dropout": '
                                '0.1, "dim": 768, "dropout": 0.1, '
                                '"hidden_dim": 3072, "initializer_range": '
                                '0.02, "max_position_embeddings": 512, '
                                '"model_type": "distilbert", "n_heads": 12, '
                                '"n_layers": 6, "pad_token_id": 0, '
    

### SentenceTransformer model

In [None]:
# sentence-transformers is assumed to be installed already
# (pip install sentence-transformers).
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Encode one sample sentence and inspect the resulting vector.
embedding = model.encode("The quick brown fox jumps over the lazy dog")
pprint(embedding)

array([ 3.54968086e-02,  6.12862743e-02,  5.26920669e-02,  7.07050189e-02,
        3.31014246e-02, -3.06696091e-02,  6.62056403e-03, -6.11833222e-02,
       -1.32600125e-03,  1.06456708e-02,  3.86499278e-02,  3.99532020e-02,
       -3.83675843e-02, -1.66688375e-02, -5.61559061e-03, -2.43558865e-02,
       -3.59968618e-02, -3.02429460e-02,  5.84700331e-02, -4.94961627e-02,
       -7.72954449e-02, -5.23876995e-02,  2.45271660e-02,  2.93105654e-02,
       -7.39091858e-02, -2.49591712e-02, -6.53142035e-02, -4.28864993e-02,
        7.11656436e-02, -1.13819472e-01, -1.26593513e-02,  3.96260768e-02,
       -2.10036356e-02,  1.78063996e-02, -3.18874530e-02, -9.11229625e-02,
        5.91224842e-02, -7.30395922e-03,  3.31367664e-02,  2.99061146e-02,
        4.21688817e-02, -1.69129502e-02, -4.50015813e-02,  2.96744499e-02,
       -9.92584750e-02,  5.32891825e-02, -7.64784813e-02, -1.48680294e-02,
        1.52494898e-02,  1.37893585e-02, -4.41923738e-02, -2.78393030e-02,
        6.73078652e-03,  

### Current mapping of the index


In [85]:
import json

# Retrieve the field mapping of the index and print it as indented JSON.
mapping = client.indices.get_mapping(index="my_documents")
print(
    json.dumps(mapping, indent=2)
)

{
  "my_documents": {
    "mappings": {
      "properties": {
        "category": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "content": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "content_embedding": {
          "type": "knn_vector",
          "dimension": 384,
          "method": {
            "engine": "lucene",
            "space_type": "innerproduct",
            "name": "hnsw",
            "parameters": {}
          }
        },
        "created_on": {
          "type": "date"
        },
        "name": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "rolePermis

In [91]:
# Pull a single document from the index to inspect its stored fields.
response = client.search(
    index="my_documents",
    body={"size": 1, "query": {"match_all": {}}},
)

print(json.dumps(response, indent=2))

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 15,
      "relation": "eq"
    },
    "max_score": 1.0,
    "hits": [
      {
        "_index": "my_documents",
        "_id": "QUvJVZcBpHtcKjIopgaK",
        "_score": 1.0,
        "_source": {
          "summary": "This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns",
          "summary_embedding": [
            -0.034409862,
            0.045920573,
            0.050532352,
            -0.025691224,
            0.016922206,
            0.034359254,
            -0.0071740476,
            -0.02259518,
      