# Question-Answering using Simple Wikipedia Index

This example demonstrates the setup for Query / Question-Answer-Retrieval.

You can input a query or a question. The script then uses semantic search
to find relevant passages in Simple English Wikipedia (as it is smaller and fits better in RAM).

For semantic search, we use SentenceTransformer('msmarco-distilbert-base-v2') and retrieve
100 passages that potentially answer the input query.

Next, we use a more powerful CrossEncoder (cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')) that
scores the query and all retrieved passages for their relevance. The cross-encoder is necessary to filter out noise
that might be retrieved by the semantic search step.


In [None]:
# !df -h
# !cat /proc/cpuinfo
# !cat /proc/meminfo

In [None]:
!pip install -U sentence-transformers rank_bm25

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/c4/87/49dc49e13ac107ce912c2f3f3fd92252c6d4221e88d1e6c16747044a11d8/sentence-transformers-1.1.0.tar.gz (78kB)
[K     |████████████████████████████████| 81kB 6.0MB/s 
[?25hCollecting rank_bm25
  Downloading https://files.pythonhosted.org/packages/16/5a/23ed3132063a0684ea66fb410260c71c4ffda3b99f8f1c021d1e245401b5/rank_bm25-0.2.1-py3-none-any.whl
Collecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 18.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 42.1MB/s 
Collecting sacre

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the model and data paths used by later cells
# all live under /content/drive/MyDrive. force_remount=True is passed so the mount is
# requested again even when one is already present.
drive.mount('/content/drive',force_remount=True)

In [None]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import time
import gzip
import os
import torch

# Only warn (do not abort) when no CUDA device is visible to torch.
if not torch.cuda.is_available():
  print("Warning: No GPU found. Please add GPU to your notebook")


# We use the Bi-Encoder to encode all passages, so that we can use it for semantic search
model_name = 'msmarco-distilbert-base-v2'

# Alternative bi-encoder checkpoints stored on Drive (uncomment one to use it instead):
# model_name = "/content/drive/MyDrive/SBRT_output/training_biker_distilroberta_base_bi-encoder-min_50distilroberta-base-2021-02-23_02-33-56"
# model_name = "/content/drive/MyDrive/SBRT_output/training_biker_bi-encoder-min_5_max_10_ir_10_distilroberta-base_3_iter"
# model_name = "/content/drive/MyDrive/SBRT_output/training_biker_bi-encoder-min_5_max_10_ir_10_distilroberta-base-full-best"
bi_encoder = SentenceTransformer(model_name)
top_k = 100     # Number of passages to retrieve with the bi-encoder. NOTE(review): evaluate() works with its own top_k of 50 and does not read this value.

# The cross-encoder re-ranks the passages retrieved by the bi-encoder to improve the quality
# of the result list. It is loaded from a checkpoint stored on the mounted Drive.
cross_encoder = CrossEncoder('/content/drive/MyDrive/SBRT_output/training_biker_cross-encoder-30_iter_TinyBERT-full-best')


In [None]:
### Load the question corpus, its answers, and the evaluation / query index lists
data_folder = '/content/drive/MyDrive/biker_data/min_5_max_10_ir_10_30k'

os.makedirs(data_folder, exist_ok=True)


def _load_json(filename):
  """Parse and return the JSON document stored as `filename` inside data_folder."""
  with open(os.path.join(data_folder, filename), 'r', encoding='utf8') as fIn:
    return json.load(fIn)


def _load_ordered_values(filename):
  """Return the values of a JSON object keyed "0", "1", ... as a list in numeric key order."""
  data = _load_json(filename)
  return [data[str(k)] for k in range(len(data))]


# Both lists are positional: entry i comes from JSON key str(i), starting at "0".
corpus = _load_ordered_values('Corpus_dict.json')
#"evaluate_Corpus_min_2_max_10.json"
Answers = _load_ordered_values('Answers_dict.json')

# evaluate_Corpus.json holds positions into corpus / Answers that form the search collection.
_evaluate_ids = _load_json('evaluate_Corpus.json')
evaluate_corpus = [corpus[k] for k in _evaluate_ids]
evaluate_answers = [Answers[k] for k in _evaluate_ids]

# evaluate_multi_queries.json holds positions of the entries that are used as queries.
_query_ids = _load_json('evaluate_multi_queries.json')
queries = [corpus[k] for k in _query_ids]
queries_answers = [Answers[k] for k in _query_ids]


In [None]:
import pandas as pd

In [None]:
# df_test= pd.read_csv("/content/drive/MyDrive/biker_data/min_5_max_10_ir_10_30k/Biker_test_filtered.csv")
# # df_test= pd.read_csv("/content/drive/MyDrive/biker_data/min_5_max_10_ir_10_30k/SO_test_filtered.csv")
# queries = df_test["title"].to_list()
# queries_answers = df_test["answer"].to_list()
# queries_answers=[str(list(eval(x))) for x in queries_answers]

In [None]:
# Display how many evaluation queries were loaded.
len(queries)

1000

In [None]:
# Peek at the first ten queries (each query is an entry taken from the corpus list).
queries[:10]

['Sorting custom class array-list string using Collections.sort',
 'String IdentityHashMap vs HashMap performance',
 'Thread.sleep() VS Executor.scheduleWithFixedDelay()',
 'Number of subfolders in a folder directory',
 'How to test whether a char is NOT in a string? (java, junit)',
 'String.format option for locale specific double formatting like Double.toString()?',
 'Is there a way to know if a Java program was started from the command line or from a jar file?',
 'How to escape special characters in the regex ***(.*)',
 'GWT interaction with external standalone application',
 'Java Collection compare generic class that extends interface that extends comparable']

In [None]:
# df= pd.read_csv("/content/drive/MyDrive/biker_data/min_5_max_10_ir_10_30k/Big_train.QApair.csv")

# Remove from the search collection every passage whose text is also used as a query,
# so that a query can never retrieve an exact copy of itself.
print(len(evaluate_corpus))
print(len(evaluate_answers))

# Build the lookup set once: membership tests against the plain list would rescan
# all queries for every single passage.
_query_set = set(queries)

filtered_evaluate_corpus = []
filtered_evaluate_answers = []
for passage_text, passage_answer in zip(evaluate_corpus, evaluate_answers):
  if passage_text not in _query_set:
    filtered_evaluate_corpus.append(passage_text)
    filtered_evaluate_answers.append(passage_answer)

evaluate_corpus = filtered_evaluate_corpus
evaluate_answers = filtered_evaluate_answers
print(len(evaluate_corpus))
print(len(evaluate_answers))
#   queries_answers[idx] = str(list(eval(queries_answers[idx])))
#   df=df[~(df["title"].isin(queries))]
# evaluate_corpus=df["title"].to_list()
# evaluate_answers=df["answer"].to_list()

20481
20481
19532
19532


In [None]:
# `passages` is the collection that is searched; the lexical index built later and
# evaluate() both index into it by position.
passages = evaluate_corpus
# passages = corpus
# Embed every passage once with the bi-encoder; the resulting tensor is reused for all queries.
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
# passages = evaluate_corpus
# passages = corpus

HBox(children=(FloatProgress(value=0.0, description='Batches', max=611.0, style=ProgressStyle(description_widt…




In [None]:
# We also compare the results to lexical search (keyword search). Here, we use
# the BM25 algorithm which is implemented in the rank_bm25 package.

from rank_bm25 import BM25Okapi
# The stop-word list is imported from its public location. The former
# `sklearn.feature_extraction.stop_words` module was deprecated in scikit-learn 0.22
# and removed in 0.24, so importing it fails on current releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import string
from tqdm.autonotebook import tqdm
import numpy as np


def bm25_tokenizer(text):
  """Split `text` into lower-cased tokens suitable for BM25 indexing.

  The text is lower-cased and split on whitespace; leading/trailing punctuation is
  stripped from every token, and empty tokens and English stop words are dropped.

  Args:
    text: the passage or query string to tokenize.

  Returns:
    A list of the remaining tokens, in their original order.
  """
  tokenized_doc = []
  for token in text.lower().split():
    token = token.strip(string.punctuation)

    if token and token not in ENGLISH_STOP_WORDS:
      tokenized_doc.append(token)
  return tokenized_doc


# Tokenize every passage (with a progress bar) and build the BM25 index over them.
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(passages)]

bm25 = BM25Okapi(tokenized_corpus)



HBox(children=(FloatProgress(value=0.0, max=19532.0), HTML(value='')))




In [None]:
def _rank_metrics(ranked_hits, query, answer, limit, skip_self_for_recall, announce_first_hit=False):
  """Score one ranked hit list against the gold answers of a query.

  Args:
    ranked_hits: hit dicts (each with 'corpus_id' and 'score'), best hit first.
    query: the query text.
    answer: list of gold answer items for the query.
    limit: number of leading hits to inspect; also the length of the returned flag lists.
    skip_self_for_recall: when True, a hit whose passage text equals the query (after
      replacing newlines and removing "?") cannot contribute to first rank / MAP / recall.
    announce_first_hit: when True, print score, passage and answers of the first counted hit.

  Returns:
    (first_rank, mrr, average_precision, hit_flags, recall_flags) where first_rank is the
    1-based rank of the first counted hit (-1 if none), hit_flags[i] is 1 when hit i shares
    an answer item with `answer` and is not a self-match, and recall_flags[i] is 1 when
    hit i contributed at least one gold item that no better-ranked hit had contributed.
  """
  first_rank = -1
  matched = 0
  precision_sum = 0
  hit_flags = [0] * limit
  recall_flags = [0] * limit
  remaining = answer[:]  # gold items not yet found; each one is counted at most once

  for pos, hit in enumerate(ranked_hits[:limit]):
    corpus_id = hit['corpus_id']
    passage_text = passages[corpus_id].replace("\n", " ")
    raw_candidate = evaluate_answers[corpus_id].replace("\n", " ")
    is_self_match = query == passage_text.replace("?", "")

    # NOTE(security): the stored answers are string-encoded collections that are decoded
    # with eval(); this executes whatever the data file contains. Only use trusted data.
    for api in eval(raw_candidate):
      if api in answer and not is_self_match:
        hit_flags[pos] = 1

      if api not in remaining or (skip_self_for_recall and is_self_match):
        continue

      if first_rank == -1:
        first_rank = pos + 1
        if announce_first_hit:
          print("\t{:.3f}\t{}\t{}".format(hit['score'], passage_text, raw_candidate))
      matched += 1
      recall_flags[pos] = 1
      precision_sum += matched / (pos + 1)
      remaining.remove(api)

  mrr = 1 / first_rank if first_rank != -1 else 0.0
  # A query without gold answers gets a MAP of 0.0 instead of raising ZeroDivisionError.
  average_precision = precision_sum / len(answer) if answer else 0.0
  return first_rank, mrr, average_precision, hit_flags, recall_flags


def evaluate(query, answer, top_k=50):
  """Evaluate BM25, bi-encoder and cross-encoder retrieval for a single query.

  The query is searched in the module-level `passages` collection three ways: lexically
  with `bm25`, semantically with `bi_encoder` / `corpus_embeddings`, and with the
  bi-encoder hits re-ranked by `cross_encoder`. Each ranking is scored against the gold
  answers by comparing them with the answers stored in `evaluate_answers`.

  Args:
    query: the query text.
    answer: the gold answers as a string that eval() turns into a list.
    top_k: number of hits retrieved and scored per ranking (defaults to 50).

  Returns:
    A 14-tuple:
      BM25 first-hit rank (-1 if none), BM25 reciprocal rank, BM25 average precision,
      the same three values for the bi-encoder, the same three for the cross-encoder,
      str(answer list),
      bi-encoder hit flags, bi-encoder recall flags,
      cross-encoder hit flags, cross-encoder recall flags (each a list of length top_k).
  """
  # NOTE(security): eval() executes the string; the answer data must come from a trusted source.
  answer = eval(answer)
  print(query, answer)

  # --- Lexical search (BM25) ---
  bm25_scores = bm25.get_scores(bm25_tokenizer(query))
  # Never ask argpartition for more entries than the corpus has, otherwise it raises.
  num_lexical = min(top_k, len(bm25_scores))
  top_n = np.argpartition(bm25_scores, -num_lexical)[-num_lexical:] if num_lexical > 0 else []
  bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
  bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

  # NOTE(review): BM25 and the cross-encoder ranking count self-matching passages towards
  # rank / MAP / recall, while the bi-encoder ranking excludes them. This asymmetry is kept
  # from the original evaluation so reported numbers stay comparable; confirm it is intended.
  BM25_counter, BM25_tmp_mrr, BM25_tmp_map, _, _ = _rank_metrics(
      bm25_hits, query, answer, top_k, skip_self_for_recall=False)

  # --- Semantic search (bi-encoder) followed by cross-encoder scoring ---
  question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
  Encoder_hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
  Encoder_hits = Encoder_hits[0]  # Get the hits for the first (and only) query

  cross_inp = [[query, passages[hit['corpus_id']]] for hit in Encoder_hits]
  cross_scores = cross_encoder.predict(cross_inp)
  for hit, cross_score in zip(Encoder_hits, cross_scores):
    hit['cross-score'] = cross_score

  # Bi-encoder ranking: order by the bi-encoder similarity score.
  Encoder_hits = sorted(Encoder_hits, key=lambda x: x['score'], reverse=True)
  (Bi_Encoder_counter, Bi_Encoder_tmp_mrr, Bi_Encoder_tmp_map,
   Bi_Encoder_hit_list, Bi_Encoder_hit_recall_list) = _rank_metrics(
      Encoder_hits, query, answer, top_k, skip_self_for_recall=True, announce_first_hit=True)

  # Cross-encoder ranking: re-order the same hits by the cross-encoder score.
  Encoder_hits = sorted(Encoder_hits, key=lambda x: x['cross-score'], reverse=True)
  (Cross_Encoder_counter, Cross_Encoder_tmp_mrr, Cross_Encoder_tmp_map,
   Cross_Encoder_hit_list, Cross_Encoder_hit_recall_list) = _rank_metrics(
      Encoder_hits, query, answer, top_k, skip_self_for_recall=False)

  return BM25_counter,BM25_tmp_mrr,BM25_tmp_map,Bi_Encoder_counter,Bi_Encoder_tmp_mrr,Bi_Encoder_tmp_map,Cross_Encoder_counter,Cross_Encoder_tmp_mrr,Cross_Encoder_tmp_map,str(answer),Bi_Encoder_hit_list,Bi_Encoder_hit_recall_list,Cross_Encoder_hit_list,Cross_Encoder_hit_recall_list

In [None]:
# Run evaluate() for every query and average MRR, MAP, precision@n and recall@n
# over all queries for the BM25, bi-encoder and cross-encoder rankings.
BM25_mrr = 0
BM25_map = 0

Bi_Encoder_mrr = 0
Bi_Encoder_map = 0

Cross_Encoder_mrr = 0
Cross_Encoder_map = 0

api_list =[]
good_result=[]

# Rank cutoffs n at which precision@n and recall@n are reported.
_CUTOFFS = (1, 3, 5, 10)

Bi_Encoder_precision=[0]*len(_CUTOFFS)
Bi_Encoder_recall=[0]*len(_CUTOFFS)

Cross_Encoder_precision=[0]*len(_CUTOFFS)
Cross_Encoder_recall=[0]*len(_CUTOFFS)


def _precision_recall_at_cutoffs(hit_list, hit_recall_list, num_relevant):
  """Return (precision@n list, recall@n list) for every n in _CUTOFFS.

  precision@n is the share of the first n hit flags that are set; recall@n is the number
  of set recall flags among the first n divided by num_relevant (0.0 when num_relevant is 0).
  """
  precision = [sum(hit_list[:n]) / n for n in _CUTOFFS]
  recall = [sum(hit_recall_list[:n]) / num_relevant if num_relevant else 0.0 for n in _CUTOFFS]
  return precision, recall


# `idx` stays bound to the query position for the whole iteration (the cutoff loops no
# longer reuse that name), so the commented-out good_result code below indexes correctly.
for idx, (query_text, query_answer) in enumerate(zip(queries, queries_answers)):

  BM25_counter,BM25_tmp_mrr,BM25_tmp_map,Bi_Encoder_counter,Bi_Encoder_tmp_mrr,Bi_Encoder_tmp_map,Cross_Encoder_counter,Cross_Encoder_tmp_mrr,Cross_Encoder_tmp_map,answer_api,Bi_Encoder_hit_list,Bi_Encoder_hit_recall_list,Cross_Encoder_hit_list,Cross_Encoder_hit_recall_list = evaluate(query = query_text, answer = query_answer)

  print(BM25_counter, Bi_Encoder_counter, Cross_Encoder_counter)

  # answer_api is the str() of the gold-answer list returned by evaluate(); it is decoded
  # again here only to count the gold answers. NOTE(security): eval() executes the string.
  len_api = len(eval(answer_api))
  print()

  temp_precision, temp_recall = _precision_recall_at_cutoffs(Bi_Encoder_hit_list, Bi_Encoder_hit_recall_list, len_api)
  Bi_Encoder_precision = [x + y for (x, y) in zip(Bi_Encoder_precision, temp_precision)]
  Bi_Encoder_recall = [x + y for (x, y) in zip(Bi_Encoder_recall, temp_recall)]

  temp_precision, temp_recall = _precision_recall_at_cutoffs(Cross_Encoder_hit_list, Cross_Encoder_hit_recall_list, len_api)
  Cross_Encoder_precision = [x + y for (x, y) in zip(Cross_Encoder_precision, temp_precision)]
  Cross_Encoder_recall = [x + y for (x, y) in zip(Cross_Encoder_recall, temp_recall)]


  # print(BM25_tmp_mrr, Bi_Encoder_tmp_mrr, Cross_Encoder_tmp_mrr)
  # print(BM25_tmp_map, Bi_Encoder_tmp_map, Cross_Encoder_tmp_map)
  # if -1<Bi_Encoder_counter < 3 or -1<Cross_Encoder_counter < 3:
  #   good_result.append([queries[idx], queries_answers[idx]])

  api_list.append(answer_api)
  BM25_mrr+=BM25_tmp_mrr
  BM25_map+=BM25_tmp_map

  Bi_Encoder_mrr+=Bi_Encoder_tmp_mrr
  Bi_Encoder_map+=Bi_Encoder_tmp_map

  Cross_Encoder_mrr+=Cross_Encoder_tmp_mrr
  Cross_Encoder_map+=Cross_Encoder_tmp_map

# Turn the accumulated sums into per-query averages.
BM25_mrr/=len(queries)
BM25_map/=len(queries)

Bi_Encoder_mrr/=len(queries)
Bi_Encoder_map/=len(queries)

Cross_Encoder_mrr/=len(queries)
Cross_Encoder_map/=len(queries)

Bi_Encoder_precision = [x/len(queries) for x in Bi_Encoder_precision]
Bi_Encoder_recall = [x/len(queries) for x in Bi_Encoder_recall]


Cross_Encoder_precision = [x/len(queries) for x in Cross_Encoder_precision]
Cross_Encoder_recall = [x/len(queries) for x in Cross_Encoder_recall]

print("Bi_Encoder_precision")
print(Bi_Encoder_precision)

print("Bi_Encoder_recall")
print(Bi_Encoder_recall)

print("Cross_Encoder_precision")
print(Cross_Encoder_precision)

print("Cross_Encoder_recall")
print(Cross_Encoder_recall)

print(BM25_mrr,Bi_Encoder_mrr,Cross_Encoder_mrr)
print(BM25_map,Bi_Encoder_map,Cross_Encoder_map)
# Number of distinct gold-answer lists seen across all queries.
print(len(set(api_list)))

In [None]:
# Display good_result. The evaluation cell initialises it to an empty list and its only
# append is commented out there, so this shows [] unless that code is re-enabled.
good_result

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
# NOTE(review): the same mount point is already requested near the top of the notebook,
# so this trailing cell looks redundant — confirm before removing.
drive.mount('/content/drive')