In [1]:
import os
from datetime import datetime, timezone

import pandas as pd
from opensearchpy import OpenSearch

## Connect to OpenSearch
Set the endpoint (host and port), the authentication credentials, and the SSL parameters for the client, then call `client.info()` to verify that the connection works.

In [2]:
# Connection settings are read from the environment so that real credentials
# never have to be saved inside the notebook. Each fallback is the value this
# notebook originally hard-coded (a local node with the admin/admin account),
# so the cell behaves the same when no variables are set.
client = OpenSearch(
    hosts = [{
        "host": os.environ.get("OPENSEARCH_HOST", "localhost"),
        "port": int(os.environ.get("OPENSEARCH_PORT", "9200")),
    }],
    http_auth = (
        os.environ.get("OPENSEARCH_USER", "admin"),
        os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    ),
    use_ssl = True,
    # NOTE(review): certificate and hostname verification are disabled,
    # presumably because the local node uses a self-signed certificate.
    # Enable them before pointing this notebook at a non-local cluster.
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)
# Displaying the cluster info doubles as a connectivity check.
client.info()

{'name': 'opensearch-node1',
 'cluster_name': 'opensearch-cluster',
 'cluster_uuid': 'qPUqDsgnSoKoMiCuxecfyw',
 'version': {'distribution': 'opensearch',
  'number': '2.5.0',
  'build_type': 'tar',
  'build_hash': 'b8a8b6c4d7fc7a7e32eb2cb68ecad8057a4636ad',
  'build_date': '2023-01-18T23:48:48.981786100Z',
  'build_snapshot': False,
  'lucene_version': '9.4.2',
  'minimum_wire_compatibility_version': '7.10.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'The OpenSearch Project: https://opensearch.org/'}

## Create an Index

In [6]:
# Name of the index used by every cell below.
index_name = 'search_index'

# Index definition: field mappings plus the custom analyzer they refer to.
body = {
  "mappings": {
    "properties": {
      # doc_name is analyzed with the custom edge-n-gram analyzer defined in
      # "settings" below; all other text fields use the standard analyzer.
      # NOTE(review): no "search_analyzer" is set, so query text is presumably
      # n-grammed as well -- consider "search_analyzer": "standard" if prefix
      # matches turn out to be too broad.
      "doc_name": {
        "type": "text",
        "analyzer": "edge_ngram_analyzer"
      },
      "doc_type": {
        "type": "text",
        "analyzer": "standard"
      },
      "link": {
        "type": "text",
        "analyzer": "standard"
      },
      "source": {
        "type": "text",
        "analyzer": "standard"
      },
      # Both timestamps must be sent in the strict_date_time_no_millis format.
      "created_date": {
        "type" : "date",
        "format" : "strict_date_time_no_millis"
      },
      "modified_date": {
        "type" : "date",
        "format" : "strict_date_time_no_millis"
      },
      "summary": {
        "type" : "text",
        "analyzer": "standard"
      },
      "file_size": {
        "type" : "integer",
      }
    }
  },
  "settings": {
    "analysis": {
      "analyzer": {
        # Standard tokenizer, then lowercase, then the edge-n-gram filter.
        "edge_ngram_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "edge_ngram_filter"]
        }
      },
      "filter": {
        # Emits the leading 2- to 10-character prefixes of each token.
        "edge_ngram_filter": {
          "type": "edge_ngram",
          "min_gram": 2,
          "max_gram": 10
        }
      }
    }
  }
}

# Creating an index that already exists raises an error, so only create it
# when it is missing. This keeps the cell safe to run more than once.
response = None
if not client.indices.exists(index=index_name):
    response = client.indices.create(index=index_name, body=body)
else:
    print(f"Index '{index_name}' already exists; skipping creation.")

## Add data to index

Build the documents to insert, then check the number of items already in the index; that count is used as the ID of the first new document.

In [9]:
# Timestamp layout sent to the index: date and time without milliseconds,
# followed by a literal "Z" (UTC).
DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'


def build_document(doc_name, doc_type, link, source, created, summary):
    """Assemble one document in the shape expected by the search index.

    Args:
        doc_name: Display name of the document.
        doc_type: Document type, e.g. "txt".
        link: URL pointing at the document.
        source: Name of the system the document comes from.
        created: Timezone-aware creation time. It is converted to UTC before
            formatting so that the trailing "Z" in the output is truthful.
        summary: Free-text summary of the document.

    Returns:
        A dict with the keys doc_name, doc_type, link, source, created_date,
        modified_date, summary and file_size. modified_date is the current UTC
        time; file_size is the UTF-8 encoded length of ``summary`` in bytes.
    """
    return {
        "doc_name": doc_name,
        "doc_type": doc_type,
        "link": link,
        "source": source,
        "created_date": created.astimezone(timezone.utc).strftime(DATE_FORMAT),
        # datetime.utcnow() is deprecated and returns a naive value; ask for
        # an aware UTC timestamp instead.
        "modified_date": datetime.now(timezone.utc).strftime(DATE_FORMAT),
        "summary": summary,
        "file_size": len(summary.encode('utf-8')),
    }


item1 = build_document(
    doc_name="Google",
    doc_type="txt",
    link="https://www.google.com/",
    source="Gmail",
    created=datetime(2022, 2, 18, 12, 30, 0, tzinfo=timezone.utc),
    summary="Google is a multinational technology company that specializes in Internet-related services and products. It was founded in 1998 by Larry Page and Sergey Brin, who were graduate students at Stanford University at the time. Today, Google is one of the largest and most influential companies in the world, with a market capitalization of over $1 trillion.",
)

item2 = build_document(
    doc_name="Youtube",
    doc_type="txt",
    # The original cell reused the Google URL here (copy-paste slip).
    link="https://www.youtube.com/",
    source="Google drive",
    created=datetime(2022, 2, 18, 12, 30, 0, tzinfo=timezone.utc),
    summary="YouTube is a video-sharing website and online platform that allows users to upload, share, and view videos. It was founded in 2005 by three former PayPal employees, and was later acquired by Google in 2006. Today, YouTube is one of the largest websites in the world, with over 2 billion monthly active users and over 1 billion hours of video watched daily.",
)

# Documents are inserted in this order by the indexing cell.
to_be_inserted = [item1, item2]

In [8]:
# Ask the cluster how many documents the index currently holds. The insert
# loop uses this count as the ID of the first new document.
# NOTE(review): if a document with a lower ID has been deleted, the count can
# coincide with an ID that is still in use -- confirm this is acceptable.
response = client.count(index=index_name)
id = int(response["count"])
print(id)

0


In [12]:
# Index each prepared document under a sequential ID, starting at the current
# value of `id`. The loop variable is deliberately not called `body`, so that
# the index definition stored in `body` by the create-index cell is not
# overwritten.
for doc in to_be_inserted:
    # refresh=True makes each document visible to the count and search cells
    # that follow, instead of waiting for the next periodic refresh.
    client.index(index=index_name, id=id, body=doc, refresh=True)
    id += 1

RequestError: RequestError(400, 'mapper_parsing_exception', "failed to parse field [created_date] of type [date] in document with id '0'. Preview of field's value: '2022-02-18T12:30:00Z'")

In [13]:
# Count again after the insert step to see how many documents the index now
# holds; `id` is reset to that count.
response = client.count(index=index_name)
id = int(response["count"])
print(id)

0


## Search Data

In [38]:

# Full-text match query against the doc_name field.
response = client.search(
    body={"query": {"match": {"doc_name": "Youtube"}}},
    index=index_name,
)
# Show the raw response and its Python type.
print(response)
print(type(response))

{'took': 22, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 0.33425623, 'hits': [{'_index': 'search_index', '_id': '0', '_score': 0.33425623, '_source': {'doc_name': 'doc1', 'doc_type': 'txt', 'link': 'https://www.google.com/', 'source': 'Gmail', 'date': None}}, {'_index': 'search_index', '_id': '1', '_score': 0.3085442, '_source': {'doc_name': 'doc2', 'doc_type': 'txt', 'link': 'https://www.youtube.com/', 'source': 'Gmail', 'date': None}}]}}
<class 'dict'>


## Delete document from index

In [26]:
# Remove the document stored under ID "1"; the response is shown as the cell output.
client.delete(id="1", index=index_name)

{'_index': 'search_index',
 '_id': '1',
 '_version': 2,
 'result': 'deleted',
 '_shards': {'total': 2, 'successful': 2, 'failed': 0},
 '_seq_no': 4,
 '_primary_term': 1}

## Delete an index

In [5]:
# Drop the whole index named by index_name; the response is shown as the cell output.
client.indices.delete(index=index_name)

{'acknowledged': True}