**Generating Elasticsearch Top-k Results for Re-ranking**

In [1]:
!pip3 install elasticsearch



In [2]:
!pip3 install elasticsearch-dsl



In [3]:
import os
import sys

from elasticsearch import Elasticsearch

sys.path.insert(0, '../../../BERT-FAQ/')

# import required classes and functions
from reranker import ReRanker
from evaluation import get_relevance_label_df

# import utility functions
from shared.utils import load_from_json
from shared.utils import dump_to_json
from shared.utils import make_dirs

In [4]:
# Load the relevance-label frame, then collect the distinct questions whose
# query_type is 'user_query'; these are the test queries handed to ReRanker.
query_answer_pair_filepath = '../../../BERT-FAQ/data/CovidFAQ/query_answer_pairs.json'
relevance_label_df = get_relevance_label_df(query_answer_pair_filepath)
test_queries = relevance_label_df.loc[
    relevance_label_df['query_type'].eq('user_query'), 'question'
].unique()

In [5]:
# Preview the first 5 entries of test_queries
test_queries[:5]

array(['What does Corona and Covid mean?',
       'when will the social distancing end? and what is the economic consequent of the pandemic?',
       'How do I go grocery shopping? ', 'what is covid 19',
       "What is COVID-19's definition?"], dtype=object)

In [6]:
# Inspect relevance_label_df (columns: label, query_type, question, answer, id);
# the notebook renders a truncated view with the first and last rows.
relevance_label_df

Unnamed: 0,label,query_type,question,answer,id
0,1,faq,What is COVID-19?,COVID-19 is a new coronavirus that we have not...,1
1,1,faq,What is a coronavirus?,Coronavirus are a type of virus - there are ma...,2
2,1,faq,Any advice about how to minimize risk during g...,"Currently, there is no evidence of food or foo...",3
3,1,faq,Why do I have to stay at home?,Socially distancing is a practice that aims to...,4
4,1,faq,What is social distancing and how does it help...,Social distancing is one of the most effective...,5
...,...,...,...,...,...
1450,1,user_query,will coronavirus stop in summer,"We do not know. Some viruses, like the common ...",1451
1451,1,user_query,will coronavirus stop in the summer,"We do not know. Some viruses, like the common ...",1452
1452,1,user_query,will coronavirus survive in the summer,"We do not know. Some viruses, like the common ...",1453
1453,1,user_query,will coronavirus survive on surfaces,A recent study shows that the virus can live i...,1454


In [7]:
# Define instance of ReRanker class.
# bert_model_path is left empty; presumably no BERT model is needed when only
# Elasticsearch top-k results are generated - TODO confirm against reranker.py.
#
# test_queries comes from Series.unique() and is therefore a NumPy array. The
# recorded run of get_es_topk_results failed with NumPy's "truth value of an
# array ... is ambiguous" ValueError, which presumably stems from a truthiness
# check on this array inside ReRanker (TODO confirm), so a plain list is passed.
r = ReRanker(
    bert_model_path='',
    test_queries=list(test_queries),
    relevance_label_df=relevance_label_df,
)

In [8]:
# Directory that will hold the Elasticsearch top-k result files.
# make_dirs is a project helper from shared.utils; presumably it creates the
# directory (and parents) if missing - verify against its implementation.
output_path = "../../../BERT-FAQ/data/CovidFAQ/rank_results/unsupervised"
make_dirs(output_path)

In [9]:
# Select top-k value: passed as top_k to every get_es_topk_results call below
top_k = 100

In [10]:
# Get top-k Elasticsearch results

# Connection settings are read from the environment so that no secret lives in
# the notebook. ES_PASSWORD is required (KeyError if it is unset); the other
# variables fall back to the values this notebook used before.
# NOTE(review): the password that was previously committed in this cell should
# be treated as compromised and rotated.
es_host = os.environ.get('ES_HOST', 'localhost')
es_port = int(os.environ.get('ES_PORT', '9200'))
es_user = os.environ.get('ES_USER', 'elastic')
es = Elasticsearch(
    [{'host': es_host, 'port': es_port}],
    http_auth=(es_user, os.environ['ES_PASSWORD']),
)

# Same index and top_k for every call; only query_by varies.
es_query_by_question = r.get_es_topk_results(es=es, index='covidfaq', query_by=['question'], top_k=top_k)
es_query_by_answer = r.get_es_topk_results(es=es, index='covidfaq', query_by=['answer'], top_k=top_k)
es_query_by_question_answer = r.get_es_topk_results(es=es, index='covidfaq', query_by=['question', 'answer'], top_k=top_k)
es_query_by_question_answer_concat = r.get_es_topk_results(es=es, index='covidfaq', query_by=['question_answer'], top_k=top_k)

2021-06-07 14:05:52 - Generating ES top-k results ...


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
# Write each Elasticsearch result set to its own JSON file under output_path.
# Pairs are (results, file suffix); they are dumped in the order listed.
for es_results, file_suffix in (
    (es_query_by_question, '/es_query_by_question.json'),
    (es_query_by_answer, '/es_query_by_answer.json'),
    (es_query_by_question_answer, '/es_query_by_question_answer.json'),
    (es_query_by_question_answer_concat, '/es_query_by_question_answer_concat.json'),
):
    dump_to_json(es_results, output_path + file_suffix)