# Approach Using OpenSearch

The GUI is hosted at [http://localhost:5601/app/home](http://localhost:5601/app/home)

In [7]:
%pip install opensearch-py

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import json
from opensearchpy import OpenSearch
import requests

In [2]:
# https://opensearch.org/docs/latest/clients/python-low-level/

# Connection settings. Each value can be overridden through an environment
# variable so real credentials never have to be committed with the notebook;
# the fallbacks are the local-development values this notebook used before.
host = os.environ.get('OPENSEARCH_HOST', 'localhost')
port = int(os.environ.get('OPENSEARCH_PORT', '9200'))
auth = (
    os.environ.get('OPENSEARCH_USER', 'admin'),
    os.environ.get('OPENSEARCH_PASSWORD', 'admin'),
)


# Create the client with SSL/TLS enabled, but with certificate verification
# and hostname verification both disabled (and the related warning silenced).
# NOTE(review): this is only acceptable against a local/self-signed cluster;
# enable verify_certs for anything reachable over a network.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

## Creating an Index

In [123]:
index_name = 'pub_med_index'

# Body of the create-index request. 'settings' and 'mappings' are sibling
# top-level keys; 'mappings' must not be nested inside 'settings'.
index_body = {
    'settings': {
        'index': {
            'number_of_shards': 4
        }
    },
    'mappings': {
        # Your index mappings here
    }
}

# Creating an index that already exists raises an error, which would break
# re-running the notebook, so only create it when it is missing.
# `response` is None when the index was already present.
response = None
if not client.indices.exists(index=index_name):
    response = client.indices.create(index=index_name, body=index_body)

In [7]:
pubmed_data_path = "../../data/pubmed_data.json"
pubmed_data_preprocessed_path = "../../data/pubmed_data_preprocessed.json"


def _preprocess_articles(articles):
    """Reduce raw PubMed article records to ``_id`` / ``title`` / ``text`` dicts.

    Articles without an ``Abstract`` entry are skipped.

    Args:
        articles: iterable of dicts, each holding a ``MedlineCitation`` entry
            with ``PMID`` and ``Article`` (``ArticleTitle`` and, optionally,
            ``Abstract`` -> ``AbstractText``).

    Returns:
        A list of dicts with the keys ``_id``, ``title`` and ``text``.
    """
    preprocessed = []
    for entry in articles:
        citation = entry["MedlineCitation"]
        body = citation["Article"]
        if "Abstract" not in body:
            continue
        abstract = body["Abstract"]["AbstractText"]
        # Some abstracts are split in an array; join those with spaces. A plain
        # string is kept as-is, since joining it would space out every character.
        text = abstract if isinstance(abstract, str) else " ".join(abstract)
        preprocessed.append({
            "_id": citation["PMID"],
            "title": body["ArticleTitle"],
            "text": text,
        })
    return preprocessed


# Use the preprocessed file as a cache: build it from the raw export only when
# it does not exist yet, otherwise load it directly.
if not os.path.exists(pubmed_data_preprocessed_path):
    with open(pubmed_data_path, 'r', encoding='utf-8') as f:
        records = json.load(f)["PubmedArticle"]
    preprocessed_records = _preprocess_articles(records)
    with open(pubmed_data_preprocessed_path, 'w', encoding='utf-8') as f:
        json.dump(preprocessed_records, f)
else:
    with open(pubmed_data_preprocessed_path, 'r', encoding='utf-8') as f:
        preprocessed_records = json.load(f)
     

In [8]:
# Preview the first three preprocessed records.
preprocessed_records[:3]

[{'_id': '38085539',
  'title': 'High Seebeck Coefficient Inorganic Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> Core/Polymer Cladding Fibers for Respiration and Body Temperature Monitoring.',
  'text': 'Wearable thermal sensors based on thermoelectric (TE) materials with high sensitivity and temperature resolution are extensively used in medical diagnosis, human-machine interfaces, and advanced artificial intelligence. However, their development is greatly limited by the lack of materials with both a high Seebeck coefficient and superior anticrystallization ability. Here, a new inorganic amorphous TE material, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub>, with a high Seebeck coefficient of 1109 μV/K is reported. Owing to the large difference between the glass-transition temperature and initial crystallization temperature, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> strongly inhibits crystallization during fiber fabrication by thermally codrawing a precast rod comprising a Ge<sub>1

In [128]:
# Build one (action, source) pair per document for the bulk API: the first
# dict is the "index" action line, the second is the document body.
# Only the first 2000 preprocessed records are indexed. The target index is
# taken from `index_name` so it always matches the index created above.
actions = [
    (
        {"index": {"_index": index_name, "_id": doc["_id"]}},
        {"title": doc["title"], "text": doc["text"]},
    )
    for doc in preprocessed_records[:2000]
]

In [129]:
# Flatten the (action, source) pairs into newline-delimited compact JSON,
# one JSON object per line, as the bulk request body.
request = '\n'.join(
    json.dumps(line, separators=(",", ":"))
    for pair in actions
    for line in pair
)

In [131]:
# Send the bulk request and refresh so the documents are searchable right away.
# A bulk call can return normally even when individual documents were
# rejected; those failures are reported through the response's "errors" flag
# and per-item "error" entries, so they are checked explicitly here.
try:
    response = client.bulk(body=request, refresh=True)
except Exception as e:
    print(f"Failed to perform bulk request. Error: {e}")
else:
    if response.get("errors"):
        failed_items = [
            item for item in response.get("items", [])
            if any("error" in result for result in item.values())
        ]
        print(f"Bulk request completed with {len(failed_items)} failed item(s).")
    else:
        print("Bulk request successful.")

Bulk request successful.
