In [1]:
# install required libraries
!pip3 install sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/35/aa/f672ce489063c4ee7a566ebac1b723c53ac0cea19d9e36599cc241d8ed56/sentence-transformers-1.0.4.tar.gz (74kB)
[K     |████████████████████████████████| 81kB 6.0MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 12.9MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 55.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |

In [2]:
!pip3 install elasticsearch

Collecting elasticsearch
[?25l  Downloading https://files.pythonhosted.org/packages/09/93/461a042becf2a35a666fb7dbb2fa31f0f766dfd1b01e7d971f4ad51f0d69/elasticsearch-7.12.0-py2.py3-none-any.whl (334kB)
[K     |█                               | 10kB 25.5MB/s eta 0:00:01[K     |██                              | 20kB 14.2MB/s eta 0:00:01[K     |███                             | 30kB 12.8MB/s eta 0:00:01[K     |████                            | 40kB 11.3MB/s eta 0:00:01[K     |█████                           | 51kB 7.6MB/s eta 0:00:01[K     |█████▉                          | 61kB 7.8MB/s eta 0:00:01[K     |██████▉                         | 71kB 8.9MB/s eta 0:00:01[K     |███████▉                        | 81kB 9.4MB/s eta 0:00:01[K     |████████▉                       | 92kB 8.3MB/s eta 0:00:01[K     |█████████▉                      | 102kB 7.4MB/s eta 0:00:01[K     |██████████▊                     | 112kB 7.4MB/s eta 0:00:01[K     |███████████▊                    | 

In [3]:
from google.colab import drive

In [4]:
# mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# switch to the project folder on Drive so the local modules (evaluation, reranker, shared)
# can be imported and the relative data/ and output/ paths used below resolve
%cd /content/drive/MyDrive/BERT-FAQ

/content/drive/MyDrive/BERT-FAQ


In [6]:
# list the contents of the current (project) directory
!ls

data			     __init__.py  reranker.py
evaluation.py		     metric.py	  searcher.py
faq_bert_finetuning.py	     notebook	  shared
faq_bert.py		     output	  training_data_generator.py
faq_bert_ranker.py	     parser	  webserver.py
hard_negatives_generator.py  __pycache__
indexer.py		     README.md


In [7]:
# import required dependencies
from evaluation import get_relevance_label_df
from shared.utils import load_from_json
from shared.utils import dump_to_json
from shared.utils import make_dirs
from reranker import ReRanker

In [8]:
# root folder for all StackFAQ ranking results
output_path = "data/StackFAQ/rank_results"

# Elasticsearch results for the user queries, one JSON file per ES query variant
es_output_path = f"{output_path}/unsupervised"
es_query_by_question = load_from_json(f"{es_output_path}/es_query_by_question.json")
es_query_by_answer = load_from_json(f"{es_output_path}/es_query_by_answer.json")
es_query_by_question_answer = load_from_json(f"{es_output_path}/es_query_by_question_answer.json")
es_query_by_question_answer_concat = load_from_json(f"{es_output_path}/es_query_by_question_answer_concat.json")

In [9]:
# build the relevance-label table from the query/answer pairs file
query_answer_pair_filepath = 'data/StackFAQ/query_answer_pairs.json'
relevance_label_df = get_relevance_label_df(query_answer_pair_filepath)

# the test set is every distinct question whose query_type is 'user_query'
test_queries = relevance_label_df.loc[
    relevance_label_df['query_type'] == 'user_query', 'question'
].unique()

In [10]:
# preview the first ten test queries
test_queries[:10]

array(['How to make font strikethrough on github.',
       'Is it possible to get  strikethrough letter formatting on github markdown.',
       'Making the text on github crossed out.',
       'Introducing stikethrough formatting on markdown for github.',
       'The <s> tag for font on github markdown doesnt work, is there an alternative?',
       'Making the letters i write on github striked through.',
       'Producing strikethrough text in github.',
       'Does github support strikethrough letters?',
       'How can I cross out my text on git hub?',
       'I want to have strikethrough text on github, is this possible?'],
      dtype=object)

In [11]:
# how many distinct test queries there are
test_queries.shape[0]

1249

**1. Generating BERT prediction results from Answer (BERT-Q-a)**

In [12]:
# rank field passed to every ReRanker in this section
rank_field = "BERT-Q-a"

**query_type="user_query"; neg_type="hard"; loss_type='triplet'**

In [None]:
# select the fine-tuned model variant to evaluate
query_type = "user_query"
neg_type = "hard"
version = "1.1"
loss_type = "triplet"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [None]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [None]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-05 07:33:27 - Generating BERT top-k results ...
2021-04-05 07:33:27 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-05 07:33:27 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-05 07:33:46 - Use pytorch device: cuda
100%|██████████| 1249/1249 [38:18<00:00,  1.84s/it]
2021-04-05 08:12:08 - Generating BERT top-k results ...
2021-04-05 08:12:08 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-05 08:12:08 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-05 08:12:10 - Use pytorch device: cuda
100%|██████████| 1249/1249 [42:14<00:00,  2.03s/it]
2021-04-05 08:54:28 - Generating BERT top-k results ...
2021-04-05 08:54:28 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-05 08:54:28 - Load SentenceTransformer from folder: output/StackFAQ/models

**query_type="user_query"; neg_type="simple"; loss_type='triplet'**

In [13]:
# select the fine-tuned model variant to evaluate
query_type = "user_query"
neg_type = "simple"
version = "1.1"
loss_type = "triplet"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [14]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [15]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-07 17:05:07 - Generating BERT top-k results ...
2021-04-07 17:05:07 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_user_query_1.1
2021-04-07 17:05:07 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_user_query_1.1
2021-04-07 17:05:23 - Use pytorch device: cuda
100%|██████████| 1249/1249 [38:35<00:00,  1.85s/it]
2021-04-07 17:44:01 - Generating BERT top-k results ...
2021-04-07 17:44:01 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_user_query_1.1
2021-04-07 17:44:01 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_user_query_1.1
2021-04-07 17:44:03 - Use pytorch device: cuda
100%|██████████| 1249/1249 [42:48<00:00,  2.06s/it]
2021-04-07 18:26:55 - Generating BERT top-k results ...
2021-04-07 18:26:55 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_user_query_1.1
2021-04-07 18:26:55 - Load SentenceTransformer from folder: output/Stack

**query_type="faq"; neg_type="hard"; loss_type='triplet'**

In [16]:
# select the fine-tuned model variant to evaluate
query_type = "faq"
neg_type = "hard"
version = "1.1"
loss_type = "triplet"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [17]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [18]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-07 19:49:42 - Generating BERT top-k results ...
2021-04-07 19:49:42 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-07 19:49:42 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-07 19:49:54 - Use pytorch device: cuda
100%|██████████| 1249/1249 [37:48<00:00,  1.82s/it]
2021-04-07 20:27:45 - Generating BERT top-k results ...
2021-04-07 20:27:45 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-07 20:27:45 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-07 20:27:47 - Use pytorch device: cuda
100%|██████████| 1249/1249 [42:19<00:00,  2.03s/it]
2021-04-07 21:10:09 - Generating BERT top-k results ...
2021-04-07 21:10:09 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-07 21:10:09 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-07 21

**query_type="faq"; neg_type="simple"; loss_type='triplet'**

In [19]:
# select the fine-tuned model variant to evaluate
query_type = "faq"
neg_type = "simple"
version = "1.1"
loss_type = "triplet"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [20]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [21]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-07 22:32:26 - Generating BERT top-k results ...
2021-04-07 22:32:26 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-07 22:32:26 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-07 22:32:35 - Use pytorch device: cuda
100%|██████████| 1249/1249 [38:07<00:00,  1.83s/it]
2021-04-07 23:10:46 - Generating BERT top-k results ...
2021-04-07 23:10:46 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-07 23:10:46 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-07 23:10:48 - Use pytorch device: cuda
100%|██████████| 1249/1249 [42:51<00:00,  2.06s/it]
2021-04-07 23:53:43 - Generating BERT top-k results ...
2021-04-07 23:53:43 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-07 23:53:43 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_faq_1.1
2

**query_type="user_query"; neg_type="hard"; loss_type='softmax'**

In [13]:
# select the fine-tuned model variant to evaluate
query_type = "user_query"
neg_type = "hard"
version = "1.1"
loss_type = "softmax"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [14]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [None]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-08 01:16:38 - Generating BERT top-k results ...
2021-04-08 01:16:57 - Use pytorch device: cuda
100%|██████████| 1249/1249 [33:11<00:00,  1.59s/it]
2021-04-08 01:50:11 - Generating BERT top-k results ...
2021-04-08 01:50:14 - Use pytorch device: cuda
100%|██████████| 1249/1249 [39:02<00:00,  1.88s/it]
2021-04-08 02:29:19 - Generating BERT top-k results ...
2021-04-08 02:29:22 - Use pytorch device: cuda
 99%|█████████▉| 1242/1249 [37:16<00:12,  1.74s/it]

In [15]:
# NOTE(review): this cell repeats the last two steps of the full generation cell above —
# presumably a re-run after that cell's captured output stopped at 99% of its third pass; confirm
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# BERT top-k predictions for the question+answer ES results, written to JSON
bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

# BERT top-k predictions for the concatenated question+answer ES results, written to JSON
bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-08 08:32:18 - Generating BERT top-k results ...
2021-04-08 08:32:31 - Use pytorch device: cuda
100%|██████████| 1249/1249 [38:11<00:00,  1.83s/it]
2021-04-08 09:10:46 - Generating BERT top-k results ...
2021-04-08 09:10:49 - Use pytorch device: cuda
100%|██████████| 1249/1249 [36:30<00:00,  1.75s/it]


**query_type="user_query"; neg_type="simple"; loss_type='softmax'**

In [16]:
# select the fine-tuned model variant to evaluate
query_type = "user_query"
neg_type = "simple"
version = "1.1"
loss_type = "softmax"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [17]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [18]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-08 09:47:23 - Generating BERT top-k results ...
2021-04-08 09:47:37 - Use pytorch device: cuda
100%|██████████| 1249/1249 [33:23<00:00,  1.60s/it]
2021-04-08 10:21:03 - Generating BERT top-k results ...
2021-04-08 10:21:06 - Use pytorch device: cuda
100%|██████████| 1249/1249 [39:02<00:00,  1.88s/it]
2021-04-08 11:00:12 - Generating BERT top-k results ...
2021-04-08 11:00:15 - Use pytorch device: cuda
100%|██████████| 1249/1249 [37:40<00:00,  1.81s/it]
2021-04-08 11:37:58 - Generating BERT top-k results ...
2021-04-08 11:38:01 - Use pytorch device: cuda
100%|██████████| 1249/1249 [36:13<00:00,  1.74s/it]


**query_type="faq"; neg_type="hard"; loss_type='softmax'**

In [19]:
# select the fine-tuned model variant to evaluate
query_type = "faq"
neg_type = "hard"
version = "1.1"
loss_type = "softmax"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [20]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [21]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-08 12:14:18 - Generating BERT top-k results ...
2021-04-08 12:14:32 - Use pytorch device: cuda
100%|██████████| 1249/1249 [32:59<00:00,  1.58s/it]
2021-04-08 12:47:34 - Generating BERT top-k results ...
2021-04-08 12:47:37 - Use pytorch device: cuda
100%|██████████| 1249/1249 [38:58<00:00,  1.87s/it]
2021-04-08 13:26:39 - Generating BERT top-k results ...
2021-04-08 13:26:42 - Use pytorch device: cuda
100%|██████████| 1249/1249 [37:56<00:00,  1.82s/it]
2021-04-08 14:04:41 - Generating BERT top-k results ...
2021-04-08 14:04:44 - Use pytorch device: cuda
100%|██████████| 1249/1249 [36:00<00:00,  1.73s/it]


**query_type="faq"; neg_type="simple"; loss_type='softmax'**

In [22]:
# select the fine-tuned model variant to evaluate
query_type = "faq"
neg_type = "simple"
version = "1.1"
loss_type = "softmax"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [23]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [24]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-08 14:40:48 - Generating BERT top-k results ...
2021-04-08 14:40:57 - Use pytorch device: cuda
100%|██████████| 1249/1249 [33:13<00:00,  1.60s/it]
2021-04-08 15:14:13 - Generating BERT top-k results ...
2021-04-08 15:14:16 - Use pytorch device: cuda
100%|██████████| 1249/1249 [39:18<00:00,  1.89s/it]
2021-04-08 15:53:38 - Generating BERT top-k results ...
2021-04-08 15:53:41 - Use pytorch device: cuda
100%|██████████| 1249/1249 [37:56<00:00,  1.82s/it]
2021-04-08 16:31:41 - Generating BERT top-k results ...
2021-04-08 16:31:43 - Use pytorch device: cuda
100%|██████████| 1249/1249 [37:14<00:00,  1.79s/it]


**2. Generating BERT prediction results from Question (BERT-Q-q)**


In [12]:
# rank field passed to every ReRanker in this section
rank_field = "BERT-Q-q"

**query_type="user_query"; neg_type="hard"; loss_type='triplet'**

In [13]:
# select the fine-tuned model variant to evaluate
query_type = "user_query"
neg_type = "hard"
version = "1.1"
loss_type = "triplet"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [14]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [None]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-08 17:09:03 - Generating BERT top-k results ...
2021-04-08 17:09:03 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-08 17:09:04 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-08 17:09:17 - Use pytorch device: cuda
100%|██████████| 1249/1249 [23:00<00:00,  1.11s/it]
2021-04-08 17:32:21 - Generating BERT top-k results ...
2021-04-08 17:32:21 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-08 17:32:21 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-08 17:32:22 - Use pytorch device: cuda
100%|██████████| 1249/1249 [22:33<00:00,  1.08s/it]
2021-04-08 17:55:00 - Generating BERT top-k results ...
2021-04-08 17:55:00 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-08 17:55:00 - Load SentenceTransformer from folder: output/StackFAQ/models

In [16]:
# NOTE(review): this cell repeats only the final step of the full generation cell above —
# presumably a partial re-run after that cell did not finish; confirm
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# BERT top-k predictions for the concatenated question+answer ES results, written to JSON
bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-08 19:25:06 - Generating BERT top-k results ...
2021-04-08 19:25:06 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-08 19:25:06 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_user_query_1.1
2021-04-08 19:25:17 - Use pytorch device: cuda
100%|██████████| 1249/1249 [14:27<00:00,  1.44it/s]


**query_type="user_query"; neg_type="simple"; loss_type='triplet'**

In [17]:
# select the fine-tuned model variant to evaluate
query_type = "user_query"
neg_type = "simple"
version = "1.1"
loss_type = "triplet"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [18]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [19]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-08 19:39:47 - Generating BERT top-k results ...
2021-04-08 19:39:47 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_user_query_1.1
2021-04-08 19:39:47 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_user_query_1.1
2021-04-08 19:39:57 - Use pytorch device: cuda
100%|██████████| 1249/1249 [14:12<00:00,  1.47it/s]
2021-04-08 19:54:11 - Generating BERT top-k results ...
2021-04-08 19:54:11 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_user_query_1.1
2021-04-08 19:54:11 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_user_query_1.1
2021-04-08 19:54:13 - Use pytorch device: cuda
100%|██████████| 1249/1249 [14:06<00:00,  1.47it/s]
2021-04-08 20:08:22 - Generating BERT top-k results ...
2021-04-08 20:08:22 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_user_query_1.1
2021-04-08 20:08:22 - Load SentenceTransformer from folder: output/Stack

**query_type="faq"; neg_type="hard"; loss_type='triplet'**

In [20]:
# select the fine-tuned model variant to evaluate
query_type = "faq"
neg_type = "hard"
version = "1.1"
loss_type = "triplet"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [21]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [22]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-08 20:37:22 - Generating BERT top-k results ...
2021-04-08 20:37:22 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-08 20:37:22 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-08 20:37:33 - Use pytorch device: cuda
100%|██████████| 1249/1249 [14:04<00:00,  1.48it/s]
2021-04-08 20:51:40 - Generating BERT top-k results ...
2021-04-08 20:51:40 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-08 20:51:40 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-08 20:51:42 - Use pytorch device: cuda
100%|██████████| 1249/1249 [14:11<00:00,  1.47it/s]
2021-04-08 21:05:56 - Generating BERT top-k results ...
2021-04-08 21:05:56 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-08 21:05:56 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_hard_faq_1.1
2021-04-08 21

**query_type="faq"; neg_type="simple"; loss_type='triplet'**

In [16]:
# select the fine-tuned model variant to evaluate
query_type = "faq"
neg_type = "simple"
version = "1.1"
loss_type = "triplet"

# model folder name pattern: <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = "output/StackFAQ/models/" + "_".join([loss_type, neg_type, query_type, version])

In [17]:
# build the ReRanker for the selected model, rank field and test queries
r = ReRanker(
    bert_model_path=bert_model_path,
    rank_field=rank_field,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
)

In [None]:
# results go to <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join([output_path, "supervised", rank_field, loss_type, query_type, neg_type])
make_dirs(pred_output_path)

# produce BERT top-k predictions for each set of Elasticsearch results,
# writing each result to its JSON file before starting the next one
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(bert_query_by_question_answer, f"{pred_output_path}/bert_query_by_question_answer.json")

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(bert_query_by_question_answer_concat, f"{pred_output_path}/bert_query_by_question_answer_concat.json")

2021-04-08 21:34:16 - Generating BERT top-k results ...
2021-04-08 21:34:16 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-08 21:34:16 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-08 21:34:25 - Use pytorch device: cuda
100%|██████████| 1249/1249 [14:03<00:00,  1.48it/s]
2021-04-08 21:48:30 - Generating BERT top-k results ...
2021-04-08 21:48:30 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-08 21:48:30 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-08 21:48:32 - Use pytorch device: cuda
100%|██████████| 1249/1249 [14:07<00:00,  1.47it/s]
2021-04-08 22:02:42 - Generating BERT top-k results ...
2021-04-08 22:02:42 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-08 22:02:42 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_faq_1.1
2

In [18]:
# Directory for this configuration's predictions:
# <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join(
    [output_path, "supervised", rank_field, loss_type, query_type, neg_type]
)
make_dirs(pred_output_path)

# Only the es_query_by_question_answer_concat predictions are generated and
# dumped in this cell.
bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(
    bert_query_by_question_answer_concat,
    f"{pred_output_path}/bert_query_by_question_answer_concat.json",
)

2021-04-09 04:37:02 - Generating BERT top-k results ...
2021-04-09 04:37:02 - Load pretrained SentenceTransformer: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-09 04:37:02 - Load SentenceTransformer from folder: output/StackFAQ/models/triplet_simple_faq_1.1
2021-04-09 04:37:11 - Use pytorch device: cuda
100%|██████████| 1249/1249 [13:27<00:00,  1.55it/s]


**query_type="user_query"; neg_type="hard"; loss_type='softmax'**

In [19]:
# Settings that identify the fine-tuned model evaluated in this section
query_type = "user_query"
neg_type = "hard"
version = "1.1"
loss_type = "softmax"

# Model folder name is <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = f"output/StackFAQ/models/{loss_type}_{neg_type}_{query_type}_{version}"

In [20]:
# Build a ReRanker around the model stored at bert_model_path, passing in the
# test queries, the relevance-label frame and the field to rank on.
r = ReRanker(
    bert_model_path=bert_model_path,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
    rank_field=rank_field,
)

In [21]:
# Directory for this configuration's predictions:
# <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join(
    [output_path, "supervised", rank_field, loss_type, query_type, neg_type]
)
make_dirs(pred_output_path)

# Generate BERT top-k predictions for each es_* result set and dump each one
# to its own JSON file inside pred_output_path.
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(
    bert_query_by_question_answer,
    f"{pred_output_path}/bert_query_by_question_answer.json",
)

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(
    bert_query_by_question_answer_concat,
    f"{pred_output_path}/bert_query_by_question_answer_concat.json",
)

2021-04-09 04:50:41 - Generating BERT top-k results ...
2021-04-09 04:50:53 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:27<00:00,  1.67it/s]
2021-04-09 05:03:23 - Generating BERT top-k results ...
2021-04-09 05:03:25 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:32<00:00,  1.66it/s]
2021-04-09 05:16:00 - Generating BERT top-k results ...
2021-04-09 05:16:02 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:32<00:00,  1.66it/s]
2021-04-09 05:28:37 - Generating BERT top-k results ...
2021-04-09 05:28:39 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:34<00:00,  1.66it/s]


**query_type="user_query"; neg_type="simple"; loss_type='softmax'**

In [22]:
# Settings that identify the fine-tuned model evaluated in this section
query_type = "user_query"
neg_type = "simple"
version = "1.1"
loss_type = "softmax"

# Model folder name is <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = f"output/StackFAQ/models/{loss_type}_{neg_type}_{query_type}_{version}"

In [23]:
# Build a ReRanker around the model stored at bert_model_path, passing in the
# test queries, the relevance-label frame and the field to rank on.
r = ReRanker(
    bert_model_path=bert_model_path,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
    rank_field=rank_field,
)

In [24]:
# Directory for this configuration's predictions:
# <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join(
    [output_path, "supervised", rank_field, loss_type, query_type, neg_type]
)
make_dirs(pred_output_path)

# Generate BERT top-k predictions for each es_* result set and dump each one
# to its own JSON file inside pred_output_path.
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(
    bert_query_by_question_answer,
    f"{pred_output_path}/bert_query_by_question_answer.json",
)

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(
    bert_query_by_question_answer_concat,
    f"{pred_output_path}/bert_query_by_question_answer_concat.json",
)

2021-04-09 05:41:16 - Generating BERT top-k results ...
2021-04-09 05:41:30 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:32<00:00,  1.66it/s]
2021-04-09 05:54:05 - Generating BERT top-k results ...
2021-04-09 05:54:07 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:33<00:00,  1.66it/s]
2021-04-09 06:06:43 - Generating BERT top-k results ...
2021-04-09 06:06:45 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:36<00:00,  1.65it/s]
2021-04-09 06:19:24 - Generating BERT top-k results ...
2021-04-09 06:19:26 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:41<00:00,  1.64it/s]


**query_type="faq"; neg_type="hard"; loss_type='softmax'**

In [25]:
# Settings that identify the fine-tuned model evaluated in this section
query_type = "faq"
neg_type = "hard"
version = "1.1"
loss_type = "softmax"

# Model folder name is <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = f"output/StackFAQ/models/{loss_type}_{neg_type}_{query_type}_{version}"

In [26]:
# Build a ReRanker around the model stored at bert_model_path, passing in the
# test queries, the relevance-label frame and the field to rank on.
r = ReRanker(
    bert_model_path=bert_model_path,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
    rank_field=rank_field,
)

In [27]:
# Directory for this configuration's predictions:
# <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join(
    [output_path, "supervised", rank_field, loss_type, query_type, neg_type]
)
make_dirs(pred_output_path)

# Generate BERT top-k predictions for each es_* result set and dump each one
# to its own JSON file inside pred_output_path.
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(
    bert_query_by_question_answer,
    f"{pred_output_path}/bert_query_by_question_answer.json",
)

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(
    bert_query_by_question_answer_concat,
    f"{pred_output_path}/bert_query_by_question_answer_concat.json",
)

2021-04-09 06:32:10 - Generating BERT top-k results ...
2021-04-09 06:32:22 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:39<00:00,  1.64it/s]
2021-04-09 06:45:04 - Generating BERT top-k results ...
2021-04-09 06:45:06 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:38<00:00,  1.65it/s]
2021-04-09 06:57:47 - Generating BERT top-k results ...
2021-04-09 06:57:49 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:38<00:00,  1.65it/s]
2021-04-09 07:10:31 - Generating BERT top-k results ...
2021-04-09 07:10:33 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:38<00:00,  1.65it/s]


**query_type="faq"; neg_type="simple"; loss_type='softmax'**

In [28]:
# Settings that identify the fine-tuned model evaluated in this section
query_type = "faq"
neg_type = "simple"
version = "1.1"
loss_type = "softmax"

# Model folder name is <loss_type>_<neg_type>_<query_type>_<version>
bert_model_path = f"output/StackFAQ/models/{loss_type}_{neg_type}_{query_type}_{version}"

In [29]:
# Build a ReRanker around the model stored at bert_model_path, passing in the
# test queries, the relevance-label frame and the field to rank on.
r = ReRanker(
    bert_model_path=bert_model_path,
    test_queries=test_queries,
    relevance_label_df=relevance_label_df,
    rank_field=rank_field,
)

In [30]:
# Directory for this configuration's predictions:
# <output_path>/supervised/<rank_field>/<loss_type>/<query_type>/<neg_type>
pred_output_path = "/".join(
    [output_path, "supervised", rank_field, loss_type, query_type, neg_type]
)
make_dirs(pred_output_path)

# Generate BERT top-k predictions for each es_* result set and dump each one
# to its own JSON file inside pred_output_path.
bert_query_by_question = r.get_bert_topk_preds(es_query_by_question)
dump_to_json(bert_query_by_question, f"{pred_output_path}/bert_query_by_question.json")

bert_query_by_answer = r.get_bert_topk_preds(es_query_by_answer)
dump_to_json(bert_query_by_answer, f"{pred_output_path}/bert_query_by_answer.json")

bert_query_by_question_answer = r.get_bert_topk_preds(es_query_by_question_answer)
dump_to_json(
    bert_query_by_question_answer,
    f"{pred_output_path}/bert_query_by_question_answer.json",
)

bert_query_by_question_answer_concat = r.get_bert_topk_preds(es_query_by_question_answer_concat)
dump_to_json(
    bert_query_by_question_answer_concat,
    f"{pred_output_path}/bert_query_by_question_answer_concat.json",
)

2021-04-09 07:23:15 - Generating BERT top-k results ...
2021-04-09 07:23:24 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:35<00:00,  1.65it/s]
2021-04-09 07:36:02 - Generating BERT top-k results ...
2021-04-09 07:36:04 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:41<00:00,  1.64it/s]
2021-04-09 07:48:48 - Generating BERT top-k results ...
2021-04-09 07:48:50 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:43<00:00,  1.64it/s]
2021-04-09 08:01:37 - Generating BERT top-k results ...
2021-04-09 08:01:39 - Use pytorch device: cuda
100%|██████████| 1249/1249 [12:41<00:00,  1.64it/s]
