# Docker 

## Run in terminal 

In [None]:
# Download the OpenSearch 1.0.1 image from the registry (docker is invoked through sudo here).
!sudo docker pull opensearchproject/opensearch:1.0.1

In [29]:
# Start a single-node OpenSearch container and publish ports 9200 and 9600 on the host.
# No -d flag is given, so the container runs in the foreground and this cell keeps
# running until the container is stopped -- which is why the heading says to use a terminal.
!sudo docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.0.1




# REST API

## Check connection

In [5]:
import json
import os

In [1]:
# Ask the cluster for its health status over HTTPS with basic auth (admin/admin).
# --insecure tells curl not to verify the server's TLS certificate.
!curl -XGET "https://localhost:9200/_cluster/health" -u admin:admin --insecure

{"cluster_name":"docker-cluster","status":"green","timed_out":false,"number_of_nodes":1,"number_of_data_nodes":1,"discovered_master":true,"discovered_cluster_manager":true,"active_primary_shards":1,"active_shards":1,"relocating_shards":0,"initializing_shards":0,"unassigned_shards":0,"delayed_unassigned_shards":0,"number_of_pending_tasks":0,"number_of_in_flight_fetch":0,"task_max_waiting_in_queue_millis":0,"active_shards_percent_as_number":100.0}

## Create Index (use HTTPS!)

In [22]:
# Create the pub_med index. No request body is sent, so no explicit settings or mappings are supplied.
!curl -XPUT "https://localhost:9200/pub_med" -H 'Content-Type: application/json' -u admin:admin --insecure

{"acknowledged":true,"shards_acknowledged":true,"index":"pub_med2"}

In [6]:
# Parse the PubMed export into Python objects. JSON files are expected to be
# UTF-8, so the encoding is passed explicitly instead of relying on the
# platform default; json.load reads and parses the file in one step.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
with open('/home/chris/University/NLP_project/pubmed_data.json', 'r', encoding='utf-8') as f:
    records = json.load(f)

In [None]:
# While `records` is still the parsed top-level object (a dict, indexed by
# 'PubmedArticle' in the following cells), iterating it yields its keys.
for top_level_key in records:
    print(top_level_key)

In [7]:
# Inspect the first raw article to see the nested structure before flattening it.
print(records['PubmedArticle'][0])

{'MedlineCitation': {'GeneralNote': [], 'OtherID': [], 'CitationSubset': ['IM'], 'KeywordList': [['semiconducting chalcogenide glass', 'thermal sensors', 'thermoelectric fibers', 'wearable electronics']], 'SpaceFlightMission': [], 'OtherAbstract': [], 'PMID': '38085539', 'DateRevised': {'Year': '2023', 'Month': '12', 'Day': '12'}, 'Article': {'ELocationID': ['10.1021/acsami.3c13239'], 'ArticleDate': [{'Year': '2023', 'Month': '12', 'Day': '12'}], 'Language': ['eng'], 'Journal': {'ISSN': '1944-8252', 'JournalIssue': {'PubDate': {'Year': '2023', 'Month': 'Dec', 'Day': '12'}}, 'Title': 'ACS applied materials & interfaces', 'ISOAbbreviation': 'ACS Appl Mater Interfaces'}, 'ArticleTitle': 'High Seebeck Coefficient Inorganic Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> Core/Polymer Cladding Fibers for Respiration and Body Temperature Monitoring.', 'Abstract': {'AbstractText': ['Wearable thermal sensors based on thermoelectric (TE) materials with high sensitivity and temperature resolution a

In [8]:
# `records` starts out as the full parsed JSON document; the articles are the
# list stored under its "PubmedArticle" key. The isinstance guard keeps this
# cell re-runnable: after the first run `records` already is that list, and
# indexing a list with a string would raise TypeError.
if isinstance(records, dict):
    records = records["PubmedArticle"]

# Flatten every article to the three fields that get indexed.
preprocessed_records = []
for article in records:
    article_body = article["MedlineCitation"]["Article"]
    # Articles without an abstract have no text to index, so they are skipped.
    if "Abstract" not in article_body:
        continue
    preprocessed_records.append({
        "id": article["MedlineCitation"]["PMID"],
        "title": article_body["ArticleTitle"],
        # AbstractText is a list (some abstracts are split into several parts).
        "text": " ".join(article_body["Abstract"]["AbstractText"]),
    })
# Optional export of the preprocessed records (left disabled):
#with open('/content/drive/MyDrive/Colab Data/pubmed_data_preprocessed.json', 'w') as f:
#    f.write(json.dumps(preprocessed_records))

In [9]:
# Check the flattened form of the first record: only id, title and text remain.
print(preprocessed_records[0])

{'id': '38085539', 'title': 'High Seebeck Coefficient Inorganic Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> Core/Polymer Cladding Fibers for Respiration and Body Temperature Monitoring.', 'text': 'Wearable thermal sensors based on thermoelectric (TE) materials with high sensitivity and temperature resolution are extensively used in medical diagnosis, human-machine interfaces, and advanced artificial intelligence. However, their development is greatly limited by the lack of materials with both a high Seebeck coefficient and superior anticrystallization ability. Here, a new inorganic amorphous TE material, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub>, with a high Seebeck coefficient of 1109 μV/K is reported. Owing to the large difference between the glass-transition temperature and initial crystallization temperature, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> strongly inhibits crystallization during fiber fabrication by thermally codrawing a precast rod comprising a Ge<sub>15</sub

In [49]:
import json

# Render the first preprocessed record as indented JSON text.
json_object = json.dumps(preprocessed_records[0], indent=4)

# Store it in sample.json so curl can send the file as a request body
# (-d @sample.json in the cells that follow).
with open("sample.json", "w") as sample_file:
    sample_file.write(json_object)


## Write Document into index

In [37]:
# POST to _doc without an id: the server picks the document id (see "_id" in the response).
# The request body is read from sample.json.
!curl -X POST "https://localhost:9200/pub_med/_doc" -u admin:admin --insecure -H 'Content-Type: application/json' -d @sample.json

{"_index":"pub_med","_id":"ZRwNBI0BZKko4otCxqts","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":1,"_primary_term":1}

In [38]:
# PUT to _doc/10 stores the same body under the explicit document id "10".
!curl -X PUT "https://localhost:9200/pub_med/_doc/10" -u admin:admin --insecure -H 'Content-Type: application/json' -d @sample.json

{"_index":"pub_med","_id":"10","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":2,"_primary_term":1}

In [50]:
# Send the same PUT twice: writing to an id that already exists replaces the
# document, which the responses report as "updated" with an increasing _version.
for i in range(2):
    !curl -X PUT "https://localhost:9200/pub_med/_doc/10" -u admin:admin --insecure -H 'Content-Type: application/json' -d @sample.json

{"_index":"pub_med","_id":"10","_version":2,"result":"updated","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":3,"_primary_term":1}{"_index":"pub_med","_id":"10","_version":3,"result":"updated","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":4,"_primary_term":1}

## Add multiple records

In [53]:
for i, record in enumerate(preprocessed_records[0:20]):
    # Serializing json
    json_object = json.dumps(record, indent=4)

    # Writing to sample.json
    with open("sample.json", "w") as outfile:
        outfile.write(json_object)
    
    #url = f"https://localhost:9200/pub_med/_doc/{i}"
        
    !curl -X POST "https://localhost:9200/pub_med/_doc" -u admin:admin --insecure -H 'Content-Type: application/json' -d @sample.json  
    

{"_index":"pub_med","_id":"ZhwrBI0BZKko4otC0quQ","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":5,"_primary_term":1}{"_index":"pub_med","_id":"ZxwrBI0BZKko4otC06sl","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":6,"_primary_term":1}{"_index":"pub_med","_id":"aBwrBI0BZKko4otC06u5","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":7,"_primary_term":1}{"_index":"pub_med","_id":"aRwrBI0BZKko4otC1KtH","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":8,"_primary_term":1}{"_index":"pub_med","_id":"ahwrBI0BZKko4otC1KvP","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":9,"_primary_term":1}{"_index":"pub_med","_id":"axwrBI0BZKko4otC1atk","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":10,"_primary_term":1}{"_index":"pub_med","_id":"bBwrBI0BZKko4otC1a

In [55]:
# Search the index with a match_all query passed as the request body.
# -ku is short for -k (same as --insecure) followed by -u admin:admin.
!curl -X GET "https://localhost:9200/pub_med/_search" -ku admin:admin -H 'Content-Type: application/json' -d '{"query":{"match_all":{}}}'

{"took":841,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":23,"relation":"eq"},"max_score":1.0,"hits":[{"_index":"pub_med","_id":"XRyVA40BZKko4otCQKu-","_score":1.0,"_source":{    "id": "38085539",    "title": "High Seebeck Coefficient Inorganic Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> Core/Polymer Cladding Fibers for Respiration and Body Temperature Monitoring.",    "text": "Wearable thermal sensors based on thermoelectric (TE) materials with high sensitivity and temperature resolution are extensively used in medical diagnosis, human-machine interfaces, and advanced artificial intelligence. However, their development is greatly limited by the lack of materials with both a high Seebeck coefficient and superior anticrystallization ability. Here, a new inorganic amorphous TE material, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub>, with a high Seebeck coefficient of 1109 \u03bcV/K is reported. Owing to the large difference betwe

## Retrieving Document by ID

Get all IDs in the index

In [57]:
!curl -X GET "https://localhost:9200/pub_med/_search" -ku admin:admin -H 'Content-Type: application/json' -d '{"_source": false,"query":{"match_all":{}}}'

{"took":3,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":23,"relation":"eq"},"max_score":1.0,"hits":[{"_index":"pub_med","_id":"XRyVA40BZKko4otCQKu-","_score":1.0},{"_index":"pub_med","_id":"ZRwNBI0BZKko4otCxqts","_score":1.0},{"_index":"pub_med","_id":"10","_score":1.0},{"_index":"pub_med","_id":"ZhwrBI0BZKko4otC0quQ","_score":1.0},{"_index":"pub_med","_id":"ZxwrBI0BZKko4otC06sl","_score":1.0},{"_index":"pub_med","_id":"aBwrBI0BZKko4otC06u5","_score":1.0},{"_index":"pub_med","_id":"aRwrBI0BZKko4otC1KtH","_score":1.0},{"_index":"pub_med","_id":"ahwrBI0BZKko4otC1KvP","_score":1.0},{"_index":"pub_med","_id":"axwrBI0BZKko4otC1atk","_score":1.0},{"_index":"pub_med","_id":"bBwrBI0BZKko4otC1av5","_score":1.0}]}}

In [39]:
# Fetch a single document by id. This is a server-generated id taken from a
# previous indexing run, so it has to be replaced with one from your own index.
!curl -X GET "https://localhost:9200/pub_med/_doc/XRyVA40BZKko4otCQKu-" -u admin:admin --insecure

{"_index":"pub_med","_id":"XRyVA40BZKko4otCQKu-","_version":1,"_seq_no":0,"_primary_term":1,"found":true,"_source":{    "id": "38085539",    "title": "High Seebeck Coefficient Inorganic Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> Core/Polymer Cladding Fibers for Respiration and Body Temperature Monitoring.",    "text": "Wearable thermal sensors based on thermoelectric (TE) materials with high sensitivity and temperature resolution are extensively used in medical diagnosis, human-machine interfaces, and advanced artificial intelligence. However, their development is greatly limited by the lack of materials with both a high Seebeck coefficient and superior anticrystallization ability. Here, a new inorganic amorphous TE material, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub>, with a high Seebeck coefficient of 1109 \u03bcV/K is reported. Owing to the large difference between the glass-transition temperature and initial crystallization temperature, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75

In [40]:
# Fetch the document that was stored under the explicit id "10".
!curl -X GET "https://localhost:9200/pub_med/_doc/10" -u admin:admin --insecure

{"_index":"pub_med","_id":"10","_version":1,"_seq_no":2,"_primary_term":1,"found":true,"_source":{    "id": "38085539",    "title": "High Seebeck Coefficient Inorganic Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> Core/Polymer Cladding Fibers for Respiration and Body Temperature Monitoring.",    "text": "Wearable thermal sensors based on thermoelectric (TE) materials with high sensitivity and temperature resolution are extensively used in medical diagnosis, human-machine interfaces, and advanced artificial intelligence. However, their development is greatly limited by the lack of materials with both a high Seebeck coefficient and superior anticrystallization ability. Here, a new inorganic amorphous TE material, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub>, with a high Seebeck coefficient of 1109 \u03bcV/K is reported. Owing to the large difference between the glass-transition temperature and initial crystallization temperature, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> strongly in

## Search in Document 

In [17]:
# URI search: the query is given in the q= URL parameter instead of a request body; q=* matches everything.
!curl -X GET "https://localhost:9200/pub_med/_search?q=*" -u admin:admin --insecure

{"took":50,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":1,"relation":"eq"},"max_score":1.0,"hits":[{"_index":"pub_med","_id":"XRyVA40BZKko4otCQKu-","_score":1.0,"_source":{    "id": "38085539",    "title": "High Seebeck Coefficient Inorganic Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> Core/Polymer Cladding Fibers for Respiration and Body Temperature Monitoring.",    "text": "Wearable thermal sensors based on thermoelectric (TE) materials with high sensitivity and temperature resolution are extensively used in medical diagnosis, human-machine interfaces, and advanced artificial intelligence. However, their development is greatly limited by the lack of materials with both a high Seebeck coefficient and superior anticrystallization ability. Here, a new inorganic amorphous TE material, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub>, with a high Seebeck coefficient of 1109 \u03bcV/K is reported. Owing to the large difference between

In [31]:
# Term query on an "is_active" field. The documents indexed in this notebook only
# carry id, title and text, so this query finds nothing (0 hits in the recorded output).
!curl -X GET "https://localhost:9200/pub_med/_search" -ku admin:admin -H 'Content-Type: application/json' -d '{"query":{"term":{"is_active":{"value": true}}}}'

{"took":2,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":0,"relation":"eq"},"max_score":null,"hits":[]}}

In [33]:
# Term query: find documents whose text field contains the term "demonstrated".
# NOTE(review): term queries do not analyze the search string; for free-text
# search on a text field a match query is the usual choice -- confirm intent.
!curl -X GET "https://localhost:9200/pub_med/_search" -ku admin:admin -H 'Content-Type: application/json' -d '{"query":{"term":{"text": "demonstrated"}}}'

{"took":4,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":1,"relation":"eq"},"max_score":0.2876821,"hits":[{"_index":"pub_med","_id":"XRyVA40BZKko4otCQKu-","_score":0.2876821,"_source":{    "id": "38085539",    "title": "High Seebeck Coefficient Inorganic Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> Core/Polymer Cladding Fibers for Respiration and Body Temperature Monitoring.",    "text": "Wearable thermal sensors based on thermoelectric (TE) materials with high sensitivity and temperature resolution are extensively used in medical diagnosis, human-machine interfaces, and advanced artificial intelligence. However, their development is greatly limited by the lack of materials with both a high Seebeck coefficient and superior anticrystallization ability. Here, a new inorganic amorphous TE material, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub>, with a high Seebeck coefficient of 1109 \u03bcV/K is reported. Owing to the large differe

In [58]:
# Same term query for "intelligence"; hits come back ordered by relevance score.
!curl -X GET "https://localhost:9200/pub_med/_search" -ku admin:admin -H 'Content-Type: application/json' -d '{"query":{"term":{"text": "intelligence"}}}'

{"took":2,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":20,"relation":"eq"},"max_score":0.29416478,"hits":[{"_index":"pub_med","_id":"bBwrBI0BZKko4otC1av5","_score":0.29416478,"_source":{    "id": "38084803",    "title": "The ASQ did not predict low IQ scores when children born VPT were six years.",    "text": "There is a need for methods that can provide valid assessment tools in a follow-up programme without great financial costs. This study assessed the accuracy of the 60-month Ages and Stages Questionnaire as a screening tool to predict a low intelligence quotient score at 6\u2009years in children born very preterm. Totally, 54 children participated in a six-year follow-up study, which included an intelligence quotient test at 6\u2009years of age and a 60-month Ages and Stages Questionnaire at four and a half or 5\u2009years of age at respond. We used the receiver operating characteristic curve and evaluated the optimal cut-o

In [59]:
# Capture the command's output in `data` as a list of lines instead of printing it.
# As the next cells show, the captured lines also include curl's progress meter,
# so `data` is not clean JSON; writing the response to a file is easier to parse.
data = !curl -X GET "https://localhost:9200/pub_med/_search" -ku admin:admin -H 'Content-Type: application/json' -d '{"query":{"term":{"text": "intelligence"}}}'

In [63]:
# Show the first ten lines of the captured curl output.
for line in data[:10]:
    print(line)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0{"took":2,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":20,"relation":"eq"},"max_score":0.29416478,"hits":[{"_index":"pub_med","_id":"bBwrBI0BZKko4otC1av5","_score":0.29416478,"_source":{    "id": "38084803",    "title": "The ASQ did not predict low IQ scores when children born VPT were six years.",    "text": "There is a need for methods that can provide valid assessment tools in a follow-up programme without great financial costs. This study assessed the accuracy of the 60-month Ages and Stages Questionnaire as a screening tool to predict a low intelligence quotient score at 6\u2009years in children born very preterm. Totally, 54 children participated in a six-year follow-up study, which include

In [67]:
# Look at one captured line. The response is split across several list entries,
# so this is a fragment of the JSON rather than a complete document.
data[5]

'orm SNN without modularity by employing them to classify cortical spike trains. For the first time, a significant improvement was found in our modular SNN. Further, we probed into the factors influencing the performance of the modular SNN and found: (a). The modular SNN outperformed the uniform SNN more significantly when the number of neurons in the networks increased; (b). The performance of the modular SNNs increased as the number of modules dropped. These preliminary but novel findings suggest that modularity may help develop better artificial intelligence and brain-machine interfaces. Also, the modular SNN may serve as a model for the study of neuronal spike synchrony."}},{"_index":"pub_med","_id":"eBwrBI0BZKko4otC3Kvq","_score":0.17788038,"_source":{    "id": "38083782",    "title": "Abnormal Respiratory Sound Identification Using Audio-Spectrogram Vision Transformer.",    "text": "Respiratory disease, the third leading cause of deaths globally, is considered a high-priority ail

In [69]:
!curl -X GET "https://localhost:9200/pub_med/_search" -H 'Content-Type: application/json' -d '{"query":{"term":{"text": "intelligence"}}}' 2>/dev/null > response.json

In [76]:
# Search without a request body: every document matches (23 hits in the recorded output).
!curl -X GET "https://localhost:9200/pub_med/_search" -ku admin:admin -H 'Content-Type: application/json'

{"took":1,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":23,"relation":"eq"},"max_score":1.0,"hits":[{"_index":"pub_med","_id":"XRyVA40BZKko4otCQKu-","_score":1.0,"_source":{    "id": "38085539",    "title": "High Seebeck Coefficient Inorganic Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub> Core/Polymer Cladding Fibers for Respiration and Body Temperature Monitoring.",    "text": "Wearable thermal sensors based on thermoelectric (TE) materials with high sensitivity and temperature resolution are extensively used in medical diagnosis, human-machine interfaces, and advanced artificial intelligence. However, their development is greatly limited by the lack of materials with both a high Seebeck coefficient and superior anticrystallization ability. Here, a new inorganic amorphous TE material, Ge<sub>15</sub>Ga<sub>10</sub>Te<sub>75</sub>, with a high Seebeck coefficient of 1109 \u03bcV/K is reported. Owing to the large difference between

In [77]:
# Redirect the JSON response into response.json. Only stdout is redirected, so
# curl's progress meter still shows up in the cell output; add -s to silence it.
!curl -X GET "https://localhost:9200/pub_med/_search" -ku admin:admin -d '{"query":{"term":{"text": "intelligence"}}}' -H 'Content-Type: application/json' > response.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 15988  100 15945  100    43  1161k   3207 --:--:-- --:--:-- --:--:-- 1201k


In [78]:
import json

# Load the search response that the curl call saved to disk.
with open('response.json', 'r') as response_file:
    data = json.load(response_file)

# The matching documents sit under hits.hits; each hit wraps the stored
# document in its "_source" field. Missing keys fall back to empty values.
hits = data.get('hits', {}).get('hits', [])
for hit in hits:
    print(hit.get('_source', {}))  # or do something else with the document


{'id': '38084803', 'title': 'The ASQ did not predict low IQ scores when children born VPT were six years.', 'text': 'There is a need for methods that can provide valid assessment tools in a follow-up programme without great financial costs. This study assessed the accuracy of the 60-month Ages and Stages Questionnaire as a screening tool to predict a low intelligence quotient score at 6\u2009years in children born very preterm. Totally, 54 children participated in a six-year follow-up study, which included an intelligence quotient test at 6\u2009years of age and a 60-month Ages and Stages Questionnaire at four and a half or 5\u2009years of age at respond. We used the receiver operating characteristic curve and evaluated the optimal cut-off score to predict a low intelligence quotient score. At four and a half years, the optimal cut-off value for predicting a low intelligence quotient score was 242, with a sensitivity of 67% and a specificity of 59%. At 5\u2009years, only one child had 

# Python

In [None]:
!pip install opensearch-py

In [None]:
from opensearchpy import OpenSearch

In [None]:
host = 'localhost'
port = 9200
# Default demo credentials, the same ones the curl calls use.
# For testing only -- do not store real credentials in code.
auth = ('admin', 'admin')

# The server in this notebook is only reachable over HTTPS with basic auth
# (every curl call uses https:// with -u admin:admin), so the client must use
# TLS and send credentials. Certificate and hostname verification stay off
# because the server certificate is not trusted (the Python counterpart of
# curl's --insecure).
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

In [None]:
index_name = 'python-test-index'
# Index settings: split the new index into 4 primary shards.
index_body = {
  'settings': {
    'index': {
      'number_of_shards': 4
    }
  }
}

# Creating an index that already exists raises an error, so only create it
# when it is missing; this keeps the cell re-runnable. `response` is None when
# nothing had to be created.
if client.indices.exists(index=index_name):
    response = None
else:
    response = client.indices.create(index=index_name, body=index_body)