## Notebook serving as an interface to OpenSearch

Contributions:
- Matteo: \
-- Docker Setup \
-- Task oriented completions of the script \
-- Loading of data on OpenSearch. 
- Yusuf: \
-- Solved the connection issue encountered with curl by switching to Python

### Setup and basic definitions

In [None]:
# pip install opensearch-py
from opensearchpy import OpenSearch, helpers
import pandas as pd
import os

# Absolute path of the folder holding the processed CSV files read later in
# this notebook; machine-specific, so adjust it before running.
path_to_data_folder="/Users/matteom/shared-folder/nlpt_group/project/data"          # <======================

# IPython shell escape: print the notebook's current working directory.
!pwd

In [None]:
class OpenSearchHandler:
    """Small convenience wrapper around an OpenSearch client bound to one index.

    The connection is opened in the constructor and exposed as ``self.client``;
    the helper methods below all operate on ``self.index_name``.
    """

    def __init__(self, host='localhost', port='9200', username='admin', password='admin',
                 index_name='pubmed_data', verify_certs=False):
        """Store the connection settings and open the client connection.

        Args:
            host: Host name of the OpenSearch node.
            port: Port of the OpenSearch node.
            username: User name for HTTP basic authentication.
            password: Password for HTTP basic authentication.
            index_name: Index that every helper method of this handler targets.
            verify_certs: Whether to verify the server's TLS certificate.
                Defaults to False; set to True if you have a valid SSL
                certificate.
        """
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.index_name = index_name
        self.verify_certs = verify_certs
        self.client = self.create_connection()

    def create_connection(self):
        """Build and return an ``OpenSearch`` client for the configured HTTPS endpoint."""
        return OpenSearch(
            hosts=f"https://{self.host}:{self.port}",
            http_auth=(self.username, self.password),
            verify_certs=self.verify_certs,
        )

    def create_index(self, index_body=None):
        """Create ``self.index_name`` unless it already exists.

        Args:
            index_body: Optional body (settings/mappings) forwarded to the
                index-creation call.
        """
        if not self.client.indices.exists(index=self.index_name):
            self.client.indices.create(index=self.index_name, body=index_body)
        else:
            print("It has already been created")

    @staticmethod
    def _clean_value(value):
        """Return None for scalar missing markers (NaN, NaT, None, pd.NA).

        Non-scalar values are passed through untouched because ``pd.isna``
        would return an array for them.
        """
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    def generate_data(self, df):
        """Yield one bulk-index action per DataFrame row.

        Missing values are converted to None so that they are serialized as
        JSON ``null``: NaN has no valid JSON representation, and a document
        containing it is typically rejected at indexing time.

        Args:
            df: DataFrame whose rows become the documents' ``_source``.

        Yields:
            dict: ``{"_index": <index name>, "_source": <row as dict>}``.
        """
        for _, row in df.iterrows():
            yield {
                "_index": self.index_name,
                "_source": {
                    column: self._clean_value(value)
                    for column, value in row.to_dict().items()
                },
            }

    def bulk_upload(self, df):
        """Send every row of ``df`` to the index through ``helpers.bulk``."""
        helpers.bulk(self.client, self.generate_data(df))
        print("Data uploaded to OpenSearch successfully.")

    def search(self, query):
        """Run ``query`` against the index, print each hit's source, return the response.

        Args:
            query: Search body passed to the client's ``search`` call.

        Returns:
            The raw response returned by the client.
        """
        response = self.client.search(index=self.index_name, body=query)
        for doc in response['hits']['hits']:
            print(doc['_source'])
        return response

    @staticmethod
    def response_to_dataframe(response):
        """Convert a search response into a DataFrame with one row per hit.

        Args:
            response: Mapping shaped like a search response, i.e. exposing
                ``response['hits']['hits']`` with a ``_source`` in each hit.

        Returns:
            pandas.DataFrame built from the hits' ``_source`` dictionaries.
        """
        data = [doc['_source'] for doc in response['hits']['hits']]
        return pd.DataFrame(data)

### Main

Load the DataFrames and instantiate the OpenSearchHandler

In [None]:
# Read both halves of the processed dataset from the data folder.
df_part1, df_part2 = (
    pd.read_csv(os.path.join(path_to_data_folder, csv_name))
    for csv_name in ("processed_data_part1.csv", "processed_data_part2.csv")
)

# Handler bound to the index used throughout the rest of the notebook.
os_handler = OpenSearchHandler(index_name="pubmed_data")

Utility: wipe out all documents from the index

In [None]:
# Query matching every document, so the delete below empties the index.
delete_query = {
    "query": {
        "match_all": {}
    }
}
# Target the handler's own index rather than a hard-coded name, so this cell
# keeps wiping the right index if the handler is created with another one.
# Keyword arguments mirror the client calls made inside OpenSearchHandler.
os_handler.client.delete_by_query(index=os_handler.index_name, body=delete_query)

Create the index if it does not already exist

In [None]:
# Create the handler's index with no explicit body; if the index already
# exists, the handler only prints a notice.
os_handler.create_index()

Last refinement of the DataFrame: fix impurities introduced by the CSV conversion that escaped the earlier cleaning

In [None]:
# Give rows without an abstract an explicit placeholder value.
df_part1['Abstract'] = df_part1['Abstract'].fillna('missing')

Upload the DataFrame in two tranches

In [None]:
# Upload the two halves one after the other, in order.
for tranche in (df_part1, df_part2):
    os_handler.bulk_upload(tranche)