In [None]:
# library
import numpy as np
import torch
import transformers
from elasticsearch import Elasticsearch
from transformers import BertJapaneseTokenizer

In [None]:
# An Elasticsearch index plays roughly the same role as a table in SQL.
# The value below looks like a placeholder - presumably to be replaced with a real index name.
INDEX_ID = "INDEX_ID"

In [None]:
# create index
def create_user(index_id):
    """Create the Elasticsearch index ``index_id`` with an example mapping.

    Note that, despite its name, this function creates an index.

    Args:
        index_id: Name of the index to create.

    Returns:
        None. The response of the create call is discarded.
    """
    client = Elasticsearch(
        "END POINT URL",
        http_auth=("USER_ID", "USER PASSWORD"),
    )

    # Example schema only: list every field that should be stored together
    # with its Elasticsearch data type.
    field_types = {
        "timestamp": {"type": "date"},
        "COLUMN_NAME_1": {"type": "text"},
        "COLUMN_NAME_2": {"type": "text"},
        "COLUMN_NAME_3": {"type": "long"},
    }
    index_definition = {"mappings": {"properties": field_types}}

    client.indices.create(index=index_id, body=index_definition)

In [None]:
# fetch all data from index
def fetch_all(index_id):
    """Fetch the documents of an index, newest first.

    Runs a ``match_all`` query against ``index_id`` sorted by ``timestamp``
    in descending order and flattens every hit into a plain dict.

    Args:
        index_id: Name of the index to query.

    Returns:
        A list of dicts with the keys ``"id"``, ``"COLUMN_NAME_1"`` and
        ``"COLUMN_NAME_2"``, one per hit, in the order returned by the search.
        At most 10000 documents are requested.

    Raises:
        KeyError: If a hit's ``_source`` lacks one of the copied columns.
    """
    es = Elasticsearch(
        "END POINT URL",
        http_auth=("USER ID", "USER PASSWORD"),
    )
    query = {
        "query": {"match_all": {}},
        "sort": [{"timestamp": "desc"}],
    }
    # Search the index the caller asked for.
    # NOTE(review): everything is requested in a single page of 10000 hits;
    # presumably indices larger than that need scroll/search_after - confirm.
    result = es.search(index=index_id, body=query, size=10000)

    # Flatten each hit into a JSON-like record.
    return [
        {
            "id": document["_id"],
            "COLUMN_NAME_1": document["_source"]["COLUMN_NAME_1"],
            "COLUMN_NAME_2": document["_source"]["COLUMN_NAME_2"],
        }
        for document in result["hits"]["hits"]
    ]

In [None]:
# A key difference from other database services is that Elasticsearch can compute cosine similarity.
# Our service uses this feature to calculate text similarity with BERT embeddings.

In [None]:
# Bert

class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector with a Japanese BERT model.

    The vector returned by :meth:`vectorize` is the final-layer hidden state
    of the first token of the encoded sentence.

    Args:
        model_name: Name of the pretrained checkpoint to load for both the
            tokenizer and the model.
        max_len: Number of token ids fed to the model; shorter inputs are
            padded, longer ones are truncated.
    """

    def __init__(self, model_name='cl-tohoku/bert-base-japanese-whole-word-masking', max_len=128):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # The model is only used for inference, so keep dropout disabled.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Encode ``sentence`` and return the hidden state of its first token.

        Args:
            sentence: Text to embed.

        Returns:
            A 1-D numpy array on the CPU (768 dimensions for the default
            model, per the original comment).
        """
        token_ids = self.tokenizer.encode(sentence)
        n_tokens = len(token_ids)

        if n_tokens >= self.max_len:
            # NOTE(review): plain truncation presumably drops the trailing
            # [SEP] token added by encode() - confirm this is intended.
            inputs = token_ids[:self.max_len]
            masks = [1] * self.max_len
        else:
            n_pad = self.max_len - n_tokens
            # Pad with id 0 (presumably the tokenizer's [PAD] id - TODO
            # confirm); padded positions are masked out below.
            inputs = token_ids + [0] * n_pad
            masks = [1] * n_tokens + [0] * n_pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        # No gradients are needed for inference; skipping autograd avoids
        # building a graph on every call.
        with torch.no_grad():
            seq_out = self.bert_model(inputs_tensor, attention_mask=masks_tensor).last_hidden_state

        # Index 0 is the [CLS] token, used as the sentence feature.
        # .cpu() is a no-op for CPU tensors, so one code path serves both devices.
        return seq_out[0][0].detach().cpu().numpy()

In [None]:
# search with text similarity
def search(self, target_text, index_id=None, size=10, es=None, vectorizer=None):
    """Rank the documents of an index by text similarity to ``target_text``.

    The text is embedded with a vectorizer and every document is scored with
    ``cosineSimilarity`` between that embedding and its ``content_vector``
    field.

    Args:
        self: Unused; kept so existing ``search(self, text)`` calls still work.
        target_text: Text to search for.
        index_id: Index to query. Defaults to the module-level ``INDEX_ID``.
        size: Maximum number of hits to return.
        es: Elasticsearch client to use. When omitted, a client is created
            the same way the other helpers in this notebook create theirs.
        vectorizer: Object with a ``vectorize(text)`` method. When omitted, a
            new ``BertSequenceVectorizer`` is built, which reloads the model;
            pass one in to reuse it across calls.

    Returns:
        A list of dicts with the keys ``"id"``, ``"score"``, ``"title"``
        (from ``COLUMN_NAME_1``) and ``"content"`` (from ``COLUMN_NAME_2``),
        in the order returned by the search.
    """
    if vectorizer is None:
        vectorizer = BertSequenceVectorizer()
    if es is None:
        es = Elasticsearch(
            "END POINT URL",
            http_auth=("USER_ID", "USER PASSWORD"),
        )
    if index_id is None:
        index_id = INDEX_ID

    # Plain Python floats keep the request body JSON-serializable whatever
    # array type the vectorizer returns.
    query_vector = [float(value) for value in vectorizer.vectorize(target_text)]

    # "+ 1.0" shifts the cosine similarity from [-1, 1] to [0, 2], so the
    # score is never negative.
    # NOTE(review): this assumes the index has a `content_vector` vector field;
    # the example mapping in create_user does not define one - confirm.
    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, doc['content_vector']) + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }

    response = es.search(
        index=index_id,
        body={
            "size": size,
            "query": script_query,
            "_source": {"includes": ["COLUMN_NAME_1", "COLUMN_NAME_2"]}
        }
    )

    result = []
    for hit in response["hits"]["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"]["COLUMN_NAME_1"])
        result.append({
            "id": hit["_id"],
            "score": hit["_score"],
            "title": hit["_source"]["COLUMN_NAME_1"],
            "content": hit["_source"]["COLUMN_NAME_2"],
        })

    return result