In [None]:
import os
from collections import OrderedDict

import pandas as pd
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence-embedding model shared by the kNN-based search helpers below
# (they call `model.encode` on the query text).
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Credentials may be supplied through the environment so that real passwords
# never have to be saved in the notebook; the fallbacks are the original
# hard-coded development defaults, so existing setups keep working unchanged.
ES_USER = os.environ.get('ES_USER', 'elastic')
ES_PASSWORD = os.environ.get('ES_PASSWORD', 'elastic')

# Client for the local Elasticsearch node used by every search helper.
es = Elasticsearch('https://localhost:9200',
                   basic_auth=(ES_USER, ES_PASSWORD),
                   # NOTE(review): TLS certificate verification is disabled
                   # (kept from the original) -- acceptable only for a local
                   # development node; confirm before pointing elsewhere.
                   verify_certs=False,
                   # `timeout=` is deprecated in the 8.x client; the
                   # supported spelling is `request_timeout=` (seconds).
                   request_timeout=60,
                   max_retries=5)
# Name of the index that all search helpers in this notebook query.
index = 'wiki_index'

In [None]:
def vector_search(query):
    """Run a kNN search on the ``vector`` field for the given text.

    The text is encoded with the module-level ``model`` (normalized
    embeddings) and sent through the module-level ``es`` client against
    ``index``.

    Args:
        query: Text handed to ``model.encode``.

    Returns:
        The object returned by ``es.search``. The request asks for at most
        10 hits and only the ``title`` field of each hit's ``_source``.
    """
    embedding = model.encode(query, normalize_embeddings=True)
    knn_clause = {
        'field': 'vector',
        'query_vector': list(embedding),
        'k': 100,
        'num_candidates': 1000,
    }
    return es.search(
        index=index,
        size=10,
        source_includes=['title'],
        knn=knn_clause,
    )


def keyword_search(query):
    """Run a full-text ``match`` query on the ``abstract`` field.

    Uses the module-level ``es`` client and ``index``.

    Args:
        query: Text to match against ``abstract``.

    Returns:
        The object returned by ``es.search``; at most 10 hits are requested.
        Unlike the kNN helpers in this notebook, no ``source_includes``
        filter is passed here.
    """
    match_clause = {'match': {'abstract': query}}
    return es.search(index=index, size=10, query=match_clause)


def hybrid_search(query):
    """Send a ``match`` query and a kNN clause in a single search request.

    The same text is used twice: as the ``match`` query on ``abstract`` and,
    after encoding with the module-level ``model`` (normalized embeddings),
    as the kNN query vector on ``vector``. How the two result sets are
    combined and scored is decided by Elasticsearch, not by this function.

    Args:
        query: Text used for both the keyword and the vector part.

    Returns:
        The object returned by ``es.search``. The request asks for at most
        10 hits and only the ``title`` field of each hit's ``_source``.
    """
    embedding = model.encode(query, normalize_embeddings=True)
    match_clause = {'match': {'abstract': query}}
    knn_clause = {
        'field': 'vector',
        'query_vector': list(embedding),
        'k': 3,
        'num_candidates': 10,
    }
    return es.search(
        index=index,
        size=10,
        source_includes=['title'],
        query=match_clause,
        knn=knn_clause,
    )


def format_response(response):
    """Summarise a search response as a table of titles and scores.

    Args:
        response: Mapping-like search response. ``response['hits']['hits']``
            must be a sequence of hits, each providing ``_score`` and a
            ``_source`` mapping that contains ``title``.

    Returns:
        pandas.DataFrame with one row per hit, in response order, and the
        columns ``title`` and ``score``. When there are no hits the frame is
        empty but still carries both columns, so downstream code can rely on
        a stable schema.

    Raises:
        KeyError: If the response or a hit lacks one of the keys above.
    """
    hits = response['hits']['hits']
    # Read the hit dicts directly; wrapping them in a temporary DataFrame
    # just to walk it with iterrows() adds cost and no information.
    records = [
        {'title': hit['_source']['title'], 'score': hit['_score']}
        for hit in hits
    ]
    # Explicit columns fix the column order and keep it for empty results.
    return pd.DataFrame(records, columns=['title', 'score'])

In [None]:
# Keyword search demo: match the sample term and print the title/score table.
query = 'コンピュータ'
response = keyword_search(query)
keyword_table = format_response(response)
print(keyword_table)

In [None]:
# Vector search demo: same sample term, searched via the kNN helper.
query = 'コンピュータ'
response = vector_search(query)
vector_table = format_response(response)
print(vector_table)

In [None]:
# Hybrid search demo: same sample term, keyword and kNN clauses combined.
query = 'コンピュータ'
response = hybrid_search(query)
hybrid_table = format_response(response)
print(hybrid_table)

In [None]:
# Diagnostics: call the client's cat `shards` endpoint. Left as a bare
# expression so the notebook displays the returned value as the cell output.
es.cat.shards()

In [None]:
# Diagnostics: call the client's cat `health` endpoint. Left as a bare
# expression so the notebook displays the returned value as the cell output.
es.cat.health()

In [None]:
# Close the Elasticsearch client created at the top of the notebook; the
# search helpers above should not be called after this cell has run.
es.close()