## Notebook serving as an interface to OpenSearch

Contributions:
- Matteo: \
-- Docker Setup \
-- Task oriented completions of the script \
-- Loading of data on OpenSearch. 
- Yusuf: \
-- Solving the connection issue encountered with curl by switching to Python

### Setup and basic definitions

In [1]:
# pip install opensearch-py
from opensearchpy import OpenSearch, helpers
import pandas as pd
import os

# Folder containing the processed CSV files loaded further down.
# Set the PUBMED_DATA_DIR environment variable to point somewhere else without
# editing this cell; when it is unset the original location is used.
path_to_data_folder = os.environ.get(
    "PUBMED_DATA_DIR",
    "/Users/matteom/shared-folder/nlpt_group/project/data",          # <======================
)

# Show the current working directory (pure-Python replacement for `!pwd`,
# so the cell does not depend on a Unix shell).
print(os.getcwd())

/Users/matteom/shared-folder/nlpt_group/project/databases


In [2]:
class OpenSearchHandler:
    """Small wrapper around an ``OpenSearch`` client tied to a single index.

    It groups the operations used in this notebook: creating the index,
    bulk-uploading a DataFrame, running a search and converting a search
    response back into a DataFrame.
    """

    def __init__(self, host='localhost', port='9200', username='admin', password='admin', index_name='pubmed_data'):
        """Store the connection settings and build the client.

        Args:
            host: Host name of the OpenSearch node.
            port: Port of the OpenSearch node.
            username: User name for HTTP authentication.
            password: Password for HTTP authentication.
            index_name: Index every method of this handler operates on.

        NOTE(review): the 'admin'/'admin' defaults are kept for backward
        compatibility; pass real credentials explicitly (e.g. read from
        environment variables) for anything but a local sandbox.
        """
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.index_name = index_name
        self.client = self.create_connection()

    def create_connection(self):
        """Return an ``OpenSearch`` client for ``https://<host>:<port>``.

        Certificate verification is disabled.
        """
        return OpenSearch(
            hosts=f"https://{self.host}:{self.port}",
            http_auth=(self.username, self.password),
            verify_certs=False  # Set to True if you have a valid SSL certificate
        )

    def create_index(self, index_body=None):
        """Create ``self.index_name`` unless it already exists.

        Args:
            index_body: Optional request body passed to ``indices.create``.
        """
        if not self.client.indices.exists(index=self.index_name):
            self.client.indices.create(index=self.index_name, body=index_body)
        else:
            print("It has already been created")

    @staticmethod
    def _none_if_missing(value):
        """Return None for a missing scalar (NaN, NaT, None, pd.NA), else ``value``."""
        # Only scalars are tested: pd.isna on a list/array returns an array,
        # which cannot be used as a single truth value.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    def generate_data(self, df):
        """Yield one bulk action per DataFrame row.

        Each action targets ``self.index_name`` and carries the row as a
        ``{column: value}`` dict in ``_source``. Missing values are emitted as
        ``None`` because NaN has no JSON representation.

        Args:
            df: DataFrame whose rows become documents.

        Yields:
            dict with the keys ``"_index"`` and ``"_source"``.
        """
        for _, row in df.iterrows():
            yield {
                "_index": self.index_name,
                "_source": {
                    column: self._none_if_missing(value)
                    for column, value in row.to_dict().items()
                },
            }

    def bulk_upload(self, df):
        """Upload every row of ``df`` through ``helpers.bulk``.

        The confirmation message is printed only if ``helpers.bulk`` returns
        without raising.
        """
        helpers.bulk(self.client, self.generate_data(df))
        print("Data uploaded to OpenSearch successfully.")

    def search(self, query, verbose=True):
        """Run ``query`` against ``self.index_name`` and return the raw response.

        Args:
            query: Request body passed to the client's ``search``.
            verbose: When True (the default, matching the previous behaviour)
                print the ``_source`` of every hit. Pass False to avoid
                flooding the cell output on large result sets.

        Returns:
            The response object returned by the client.
        """
        response = self.client.search(index=self.index_name, body=query)
        if verbose:
            for doc in response['hits']['hits']:
                print(doc['_source'])
        return response

    @staticmethod
    def response_to_dataframe(response):
        """Build a DataFrame with one row per hit from a search response.

        Args:
            response: Mapping with the hits under ``response['hits']['hits']``,
                each hit holding its document in ``'_source'``.

        Returns:
            ``pd.DataFrame`` built from the list of ``_source`` dicts.
        """
        # Extract data from response
        data = [doc['_source'] for doc in response['hits']['hits']]
        # Create a DataFrame
        return pd.DataFrame(data)

In [4]:
# Handler used by the cells below. Only the index name is passed; host, port
# and credentials fall back to the OpenSearchHandler constructor defaults.
# Constructing it also builds the client (done in __init__).
os_handler = OpenSearchHandler(index_name="pubmed_data")



### Load DataFrame on OpenSearch

Import DataFrames

In [None]:
# Resolve both CSV locations inside the data folder, then read each file
# into its own DataFrame.
part1_csv = os.path.join(path_to_data_folder, "processed_data_part1.csv")
part2_csv = os.path.join(path_to_data_folder, "processed_data_part2.csv")

df_part1 = pd.read_csv(part1_csv)
df_part2 = pd.read_csv(part2_csv)

Utility: Wipe out all documents from the index

In [None]:
# A match_all query selects every document in the index.
delete_query = {
    "query": {
        "match_all": {}
    }
}
# Use the handler's configured index instead of repeating the literal name, so
# this cell always wipes the same index the rest of the notebook works on.
# Keyword arguments make the index/body roles explicit.
os_handler.client.delete_by_query(index=os_handler.index_name, body=delete_query)

Automatically create index

In [None]:
# Create the handler's index if it does not exist yet. No index_body is given,
# so None is passed as the creation body; if the index already exists a
# notice is printed instead.
os_handler.create_index()

Final refinement of the DataFrame: replace missing abstracts, an impurity that escaped earlier cleaning because of the CSV conversion

In [None]:
# Flag the rows of part 1 that have no abstract and write the placeholder
# text 'missing' into them, in place.
abstract_is_missing = df_part1['Abstract'].isna()
df_part1.loc[abstract_is_missing, 'Abstract'] = 'missing'

Upload the DataFrame in two tranches

In [None]:
# Upload the two halves one after the other, part 1 first.
for tranche in (df_part1, df_part2):
    os_handler.bulk_upload(tranche)

### Queries

In [15]:
# Match every document, asking for 50 hits in the response.
query = {"query": {"match_all": {}}, "size": 50}

# search() prints each hit's _source and returns the raw response; the last
# expression renders the hits as a DataFrame.
response = os_handler.search(query)
os_handler.response_to_dataframe(response)

{'PMID': '24645995', 'Title': 'α-1 antitrypsin and chronic fatigue syndrome: a case study from pathophysiology to clinical practice.', 'Abstract': 'SUMMARY Several lines of evidence support the involvement of inflammatory and immunologic abnormalities in chronic fatigue syndrome CFS Since recent studies have shown that α1 antitrypsin AAT possesses antiinflammatory properties the potential therapeutic effect of AAT treatment on CFS has been investigated A 49yearold woman diagnosed with CFS was treated with intravenous infusions of a human plasmaderived AAT concentrate 60 mgkg body weight weekly for 8 consecutive weeks The patients monocyte elastase a regulator of inflammatory processes was 1170 Umg At completion of treatment improvement in maximal workload was observed 540717 of predicted Additionally amelioration in working memory scores 8394 and perceptual organization scores 7583 were detected on the Wechsler Adult Intelligence ScaleIII test Monocyte elastase decreased to a normal ra



Unnamed: 0,PMID,Title,Abstract,Authors,Publication Date,DOI
0,24645995,α-1 antitrypsin and chronic fatigue syndrome: ...,SUMMARY Several lines of evidence support the ...,Jose Alegre Sandra Camprubi Ana GarciaQuintana,2013-Mar,10.2217/pmt.12.84
1,24565439,A data-driven acute inflammation therapy.,Acute inflammation is a severe medical conditi...,Vladan Radosavljevic Kosta Ristovski Zoran Obr...,2013,10.1186/1755-8794-6-S3-S7
2,24505723,Voxelwise spectral diffusional connectivity an...,Human brain connectivity can be studied using ...,Junning Li Yan Jin Yonggang Shi Ivo D Dinov Da...,2013,10.1007/978-3-642-40811-3_82
3,24472488,Systems integrity in health and aging - an ani...,Human lifespan is positively correlated with c...,Marije Oostindjer Gro V Amdam,2013-Jan-07,10.1186/2046-2395-2-2
4,24460364,Multi-agent systems: effective approach for ca...,Physicians in order to study the causes of can...,Niloofar Mohammadzadeh Reza Safdari Azin Rahimi,2013,10.7314/apjcp.2013.14.12.7757
5,24457322,Cyber security: a critical examination of info...,Cyber threats are growing and evolving at an u...,Jason Mallinder Peter Drabwell,Unknown,Unknown
6,24453490,Effectiveness and safety of Nintendo Wii Fit P...,Migraine without aura MoA is a painful syndrom...,Maria Esposito Maria Ruberto Francesca Gimigli...,2013,10.2147/NDT.S53853
7,24408338,The use of nouns and verbs by children with Do...,To verify the use of nouns and verbs by childr...,Suelly Cecilia Olivan Limongi Emilia de Faria ...,2013,10.1590/s2317-17822013000300012
8,24408238,Phonological awareness and the working memory ...,To investigate phonological awareness and work...,Andreia Martins de Souza Cardoso Monica Marins...,2013,10.1590/s2317-17822013000200004
9,24399975,Dysconnectivity in the frontoparietal attentio...,Cognitive impairment is common in patients wit...,Jonathan P Roiser Rebekah Wigton James M Kilne...,2013,10.3389/fpsyt.2013.00176
