In [None]:
import pandas as pd
import numpy as np
!pip install pickle5
import pickle5 as pickle
import os
from collections import Counter
import re

from google.colab import drive



## Files and Folders


In [None]:
# Mount Google Drive under /content/drive; the project paths configured below live there.
drive.mount('/content/drive', force_remount=True)  # force_remount=True: remount so data uploaded since the last mount is picked up
# !ls "/content/drive/My Drive/Master/2 - FSS 2021/Information Retrieval/IR Projekt/"

Mounted at /content/drive


In [None]:
# File and folder names. Every path below is derived from the project folder inside "My Drive".
ir_project_drive_folder = "IR Projekt"  # TODO: does this work for everyone?
full_ir_project_drive_folder = "/content/drive/My Drive/{}/data/wikipedia".format(ir_project_drive_folder)

# Preprocessed files
preprocessed_folder = full_ir_project_drive_folder + '/no-pron/preprocessed'

preprocessed_wikipedia_evidence_file = preprocessed_folder + '/preprocessed_wikipedia_evidence_dict.pkl'
preprocessed_qa_wikipedia_verified_dev_filename = preprocessed_folder + '/qa/verified-wikipedia-dev.pkl'
preprocessed_qa_wikipedia_dev_filename = preprocessed_folder + '/qa/wikipedia-dev.pkl'
preprocessed_qa_wikipedia_test_without_answers_filename = preprocessed_folder + '/qa/wikipedia-test-without-answers.pkl'
preprocessed_qa_wikipedia_train_filename = preprocessed_folder + '/qa/wikipedia-train.pkl'

## Retrieval results produced with BM25
bm25_results_folder = full_ir_project_drive_folder + '/no-pron/bm25'

bm25_retrieval_wiki_dev_docs_scores = bm25_results_folder + "/retrieval_wiki_dev_docs_scores.pkl"
bm25_retrieval_wiki_dev_verified_docs_scores = bm25_results_folder + "/retrieval_wiki_dev_verified_docs_scores.pkl"
bm25_retrieval_wiki_test_docs_scores = bm25_results_folder + "/retrieval_wiki_test_docs_scores.pkl"
bm25_retrieval_wiki_train_docs_scores = bm25_results_folder + "/retrieval_wiki_train_docs_50_scores.pkl" # limited to the top 50 documents per query, as the full result set exceeds RAM


## Retrieval results produced with the VSM (vector space model)
vsm_results_folder = full_ir_project_drive_folder + '/no-pron/vsm'

vsm_retrieval_wiki_dev_docs_scores = vsm_results_folder + "/retrieval_wiki_dev_docs_scores.pkl"
vsm_retrieval_wiki_dev_verified_docs_scores = vsm_results_folder + "/retrieval_wiki_dev_verified_docs_scores.pkl"
vsm_retrieval_wiki_test_docs_scores = vsm_results_folder + "/retrieval_wiki_test_docs_scores.pkl"
vsm_retrieval_wiki_train_docs_scores = vsm_results_folder + "/retrieval_wiki_train_docs_scores.pkl"

## Output files: BM25 true/false positives, used as "smart" irrelevant samples during NN training
bm25_retrieved_train_TP_FP = full_ir_project_drive_folder + "/neural-retrieval-datasets/bm25_tp_fp_train.pkl"
bm25_retrieved_dev_TP_FP = full_ir_project_drive_folder + "/neural-retrieval-datasets/bm25_tp_fp_dev.pkl"


In [None]:
def save_as_pickle(obj, filename):
    """
    Serialize an object with pickle and write it to a file.

    The target is opened in binary write mode, so an existing file is replaced.
    :param obj: object to serialize
    :param filename: path of the pickle file to write
    :return: None
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(filename):
    """
    Read a pickle file and return the object stored in it.

    Unpickling can execute arbitrary code, so only call this on files from a
    trusted source.
    :param filename: path of the pickle file to read
    :return: the deserialized object
    """
    with open(filename, mode='rb') as source:
        loaded = pickle.load(source)
    return loaded

## Load Data

In [None]:
def to_retrieval_ready(qas):
    """
    Index question/answer records by question id for retrieval evaluation.

    :param qas: iterable of records providing "QuestionId", "EntityPages"
                (a sequence of entries with a "Filename") and "Question_preprocessed"
    :return: dict mapping each question id to
             {"matching_documents": [filenames], "preprocessed_question": ...};
             a later record with the same id replaces an earlier one
    """
    by_question = {}

    for record in qas:
        qid = record["QuestionId"]

        filenames = []
        for page in record["EntityPages"]:
            filenames.append(page["Filename"])

        by_question[qid] = dict(
            matching_documents=filenames,
            preprocessed_question=record["Question_preprocessed"],
        )

    return by_question

In [None]:
# Each pickle is loaded and re-keyed by question id via to_retrieval_ready.

# Query data: Wiki dev set (preprocessed)
retrieval_ready_qa_wiki_dev_dict = to_retrieval_ready(load_pickle(preprocessed_qa_wikipedia_dev_filename))

# Query data: Wiki dev set - verified (preprocessed)
retrieval_ready_qa_wiki_dev_verified_dict = to_retrieval_ready(load_pickle(preprocessed_qa_wikipedia_verified_dev_filename))

# Query data: Wiki train set (preprocessed)
retrieval_ready_qa_wiki_train_dict = to_retrieval_ready(load_pickle(preprocessed_qa_wikipedia_train_filename))

# Query data: Wiki test set (preprocessed)
retrieval_ready_qa_wiki_test_dict = to_retrieval_ready(load_pickle(preprocessed_qa_wikipedia_test_without_answers_filename))

In [None]:
# Preview only the first few entries; displaying the whole train dict would flood the cell output.
list(retrieval_ready_qa_wiki_train_dict.items())[:5]

BM25

In [None]:
# Retrieved documents: Wiki dev set (preprocessed) - top 500 documents per query
bm25_retrieval_wiki_dev_docs_scores_dict = load_pickle(bm25_retrieval_wiki_dev_docs_scores)

# Retrieved documents: Wiki dev set - verified (preprocessed) - top 500 documents per query
bm25_retrieval_wiki_dev_verified_docs_scores_dict = load_pickle(bm25_retrieval_wiki_dev_verified_docs_scores)

# Retrieved documents: Wiki train set (preprocessed) - top 50 documents per query
bm25_retrieval_wiki_train_docs_scores_dict = load_pickle(bm25_retrieval_wiki_train_docs_scores)

# Retrieved documents: Wiki test set (preprocessed)
bm25_retrieval_wiki_test_docs_scores_dict = load_pickle(bm25_retrieval_wiki_test_docs_scores)

VSM

In [None]:
## Run this cell locally: loading all VSM result files exceeds the Colab RAM

# Retrieved documents: Wiki dev set (preprocessed)
vsm_retrieval_wiki_dev_docs_scores_dict = load_pickle(vsm_retrieval_wiki_dev_docs_scores)

# Retrieved documents: Wiki dev set - verified (preprocessed)
vsm_retrieval_wiki_dev_verified_docs_scores_dict = load_pickle(vsm_retrieval_wiki_dev_verified_docs_scores)

# Retrieved documents: Wiki train set (preprocessed)
vsm_retrieval_wiki_train_docs_scores_dict = load_pickle(vsm_retrieval_wiki_train_docs_scores)

# Retrieved documents: Wiki test set (preprocessed)
vsm_retrieval_wiki_test_docs_scores_dict = load_pickle(vsm_retrieval_wiki_test_docs_scores)

## Evaluation Utils

Precision = tp / (tp + fp)
Recall = tp / (tp + fn)

In [None]:
## number of retrieved docs (tp + fp)
def number_of_retrieved_docs(query_results, output):
  """
  Record for every query how many documents were retrieved (TP + FP).

  :param query_results: dict mapping a query id to its sized collection of retrieved items
  :param output: dict updated in place; output[query_id] is replaced by
                 {"TP+FP": <number of retrieved items>}
  :return: None
  """
  for query_id, retrieved in query_results.items():
    retrieved_count = len(retrieved)
    output[query_id] = {"TP+FP": retrieved_count}

In [None]:
## Calculate number of true positive retrievals - EVALUATION ONLY
def true_pos(query_dict, query_results, output):
  """
  Count, per query, how many retrieved documents are relevant (true positives).

  A hit is counted for every (relevant document, retrieved entry) pair whose
  names are equal, which is the same counting rule as before.

  :param query_dict: dict mapping a query id to {"matching_documents": [names], ...}
  :param query_results: dict mapping a query id to a list of entries whose
                        first element is the retrieved document name
  :param output: dict updated in place; output[query_id] receives
                 "TP+FN" (number of relevant documents) and "TP".
                 A missing entry is created instead of raising KeyError.
  :return: None
  """
  for key, retrieved in query_results.items():
    relevant = query_dict[key]["matching_documents"]
    retrieved_names = [hit[0] for hit in retrieved]

    # setdefault: do not depend on number_of_retrieved_docs having run first
    stats = output.setdefault(key, {})
    stats["TP+FN"] = len(relevant)
    stats["TP"] = sum(retrieved_names.count(doc) for doc in relevant)

In [None]:
## Calculate number of true positive retrievals - PREP FOR NEURAL RETRIEVAL
def true_pos_NN(query_dict, query_results, output, query_dict_forNN):
  """
  Split each query's retrieved documents into true and false positives.

  The false-positive list used to be filled inside the relevant x retrieved
  double loop. With two or more relevant documents, a correctly retrieved
  document was therefore also recorded as a false positive, and real false
  positives were duplicated; with no relevant documents nothing was recorded.
  Each retrieved document is now classified exactly once.

  :param query_dict: dict mapping a query id to {"matching_documents": [names], ...}
  :param query_results: dict mapping a query id to a list of entries whose
                        first element is the retrieved document name
  :param output: dict updated in place; output[query_id] receives
                 "TP+FN" (number of relevant documents) and "TP".
                 A missing entry is created instead of raising KeyError.
  :param query_dict_forNN: dict updated in place; query_dict_forNN[query_id] becomes
                 {"matching_documents": relevant documents that were retrieved,
                  "false positive document": retrieved documents that are not
                  relevant, in retrieval order}
  :return: None
  """
  for key, retrieved in query_results.items():
    relevant = query_dict[key]["matching_documents"]
    retrieved_names = [hit[0] for hit in retrieved]

    # One entry per (relevant, retrieved) match, in the order of the relevant
    # documents; this keeps "TP" identical to what true_pos reports.
    matched = []
    for doc in relevant:
      matched.extend([doc] * retrieved_names.count(doc))

    false_positives = [name for name in retrieved_names if name not in relevant]

    query_dict_forNN[key] = {
        'matching_documents': matched,
        "false positive document": false_positives,
    }

    stats = output.setdefault(key, {})
    stats["TP+FN"] = len(relevant)
    stats["TP"] = len(matched)

In [None]:
def precision_recall(bm25_wiki_dev_eval):
  """
  Add "Precision" and "Recall" to every per-query entry of an evaluation dict.

  Precision = TP / (TP + FP) and Recall = TP / (TP + FN). A zero denominator
  yields 0 for that metric; previously only Precision was guarded, so a query
  without relevant documents raised ZeroDivisionError.

  :param bm25_wiki_dev_eval: dict mapping a query id to a dict with the keys
                             "TP", "TP+FP" and "TP+FN"; updated in place.
                             Despite the name, nothing here is BM25-specific.
  :return: None
  """
  for stats in bm25_wiki_dev_eval.values():
    retrieved_total = stats['TP+FP']
    stats["Precision"] = stats['TP'] / retrieved_total if retrieved_total > 0 else 0

    relevant_total = stats['TP+FN']
    stats["Recall"] = stats['TP'] / relevant_total if relevant_total > 0 else 0

In [None]:
# Average Precision
def MAP(query_dict, query_results, output, map_unnorm=0, MAP=0):
  """
  Compute the average precision (AP) per query and the mean average precision.

  For each query the precision at every rank holding a relevant document is
  summed and divided by output[query_id]["TP+FN"] (the number of relevant
  documents). The mean is taken over len(query_dict), so queries that have no
  entry in query_results contribute an AP of 0.

  :param query_dict: dict mapping a query id to {"matching_documents": [names], ...}
  :param query_results: dict mapping a query id to a ranked list of entries whose
                        first element is the retrieved document name
  :param output: dict updated in place; output[query_id]["AP"] is set.
                 output[query_id]["TP+FN"] must already be present.
  :param map_unnorm: ignored; kept (now optional) for backward compatibility
  :param MAP: ignored; kept (now optional) for backward compatibility
  :return: the mean average precision, which is also printed
  """
  map_unnorm = 0
  for q_id, retrieved in query_results.items():
    relevant = query_dict[q_id]["matching_documents"]
    precision_sum = 0
    correct = 0
    for rank, hit in enumerate(retrieved, start=1):
      if hit[0] in relevant:
        correct += 1
        precision_sum += correct / rank

    relevant_total = output[q_id]["TP+FN"]
    # A query without relevant documents scores 0 instead of raising ZeroDivisionError.
    average_precision = precision_sum / relevant_total if relevant_total else 0.0
    output[q_id]["AP"] = average_precision
    map_unnorm += average_precision

  query_count = len(query_dict)
  MAP = map_unnorm / query_count if query_count else 0.0
  print(f"MAP: {MAP}")
  return MAP

In [None]:
def eval_at_k(actual, predicted, k=50):
    """
    Evaluate rankings after cutting every result list to its first k entries.

    Runs number_of_retrieved_docs, true_pos, precision_recall and MAP on the
    truncated rankings, then prints the first ten per-query results and the
    check_perf summary. Nothing is returned.

    :param actual: dict mapping a query id to {"matching_documents": [names], ...}
    :param predicted: dict mapping a query id to a ranked list of retrieved entries
    :param k: number of top-ranked entries to keep per query
    :return: None
    """
    truncated = {}
    for query_id, ranking in predicted.items():
        truncated[query_id] = ranking[:k]

    metrics = dict()

    number_of_retrieved_docs(truncated, metrics)
    true_pos(actual, truncated, metrics)
    precision_recall(metrics)
    # MAP resets its last two arguments internally, so plain zeros are passed.
    MAP(actual, truncated, metrics, 0, 0)

    preview = list(metrics.items())[:10]
    print(preview)
    check_perf(metrics)

In [None]:
## Check how many queries could not be answered
def check_perf(evaluation):
  """
  Print how many queries had no relevant document among their retrieved ones.

  :param evaluation: dict mapping a query id to a dict containing "TP".
                     An empty dict reports 0 queries and 0.0 % instead of
                     raising ZeroDivisionError.
  :return: None
  """
  total = len(evaluation)
  not_found = sum(1 for stats in evaluation.values() if stats["TP"] == 0)
  share = round(not_found*100/total, 2) if total else 0.0

  print(f"For {not_found} queries out of {total} no correct document was retrieved ({share} %)")

## BM25 Evaluation


Wiki dev set


In [None]:
# Inspect the BM25 ranking stored for a single dev query ("dpql_452").
bm25_retrieval_wiki_dev_docs_scores_dict["dpql_452"]

[('Rib_cage.txt', 33.270124132973216),
 ('Thoracic_diaphragm.txt', 29.476921507776108),
 ('Cat_anatomy.txt', 28.295665931495602),
 ('Intercostal_arteries.txt', 28.08823428306207),
 ('Book_Muscular_system.txt', 27.56462241715836),
 ('Intercostal_muscle.txt', 26.334377454065105),
 ('Respiratory_system.txt', 26.282152405528134),
 ('Internal_intercostal_muscles.txt', 26.279012909280617),
 ('Human_back.txt', 26.272276204133757),
 ('Sternum.txt', 25.04875056501594),
 ('Pleural_cavity.txt', 24.918872881117373),
 ('Thorax.txt', 24.73558665975589),
 ('Pulmonary_pleurae.txt', 23.806616238740023),
 ('Shortness_of_breath.txt', 22.49101249966233),
 ('Breathing.txt', 22.460497552744087),
 ('Lung.txt', 21.396523080682098),
 ('Medulla_oblongata.txt', 20.943089684401762),
 ('General_anaesthesia.txt', 20.237431522431727),
 ('Esophagus.txt', 20.213168513870574),
 ('Inhalation.txt', 19.78215353876555),
 ('Turtle_shell.txt', 18.770611325627357),
 ('Biofeedback.txt', 18.356239280584532),
 ('Spinal_cord.txt'

In [None]:
# WIKI DEV set: evaluate the BM25 rankings, cut to eval_at_k's default of 50 documents per query
eval_at_k(retrieval_ready_qa_wiki_dev_dict, bm25_retrieval_wiki_dev_docs_scores_dict)

MAP: 0.5452675859909932
[('tc_33', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.3333333333333333}), ('tc_40', {'TP+FP': 50, 'TP+FN': 2, 'TP': 2, 'Precision': 0.04, 'Recall': 1.0, 'AP': 0.28125}), ('tc_397', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_455', {'TP+FP': 50, 'TP+FN': 4, 'TP': 4, 'Precision': 0.08, 'Recall': 1.0, 'AP': 0.8041666666666667}), ('tc_217', {'TP+FP': 50, 'TP+FN': 2, 'TP': 2, 'Precision': 0.04, 'Recall': 1.0, 'AP': 0.16666666666666669}), ('tc_219', {'TP+FP': 50, 'TP+FN': 2, 'TP': 2, 'Precision': 0.04, 'Recall': 1.0, 'AP': 0.19642857142857142}), ('tc_510', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_515', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.3333333333333333}), ('tc_280', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_282', {'TP+FP': 50, 'TP+FN': 2, 'TP': 0, 'Precision': 

In [None]:
## Check one example: the BM25 ranking and the expected documents for query "tc_33"
print(bm25_retrieval_wiki_dev_docs_scores_dict["tc_33"])
print(retrieval_ready_qa_wiki_dev_dict["tc_33"])

[('Julian_Lloyd_Webber.txt', 29.39136583238563), ('Adelphi_Theatre.txt', 29.352536085427317), ('Andrew_Lloyd_Webber.txt', 29.001668204913347), ('Sunset_Boulevard_(musical).txt', 28.975076592109684), ('The_Phantom_of_the_Opera_(1986_musical).txt', 28.945679198257686), ("I_Don't_Know_How_to_Love_Him.txt", 28.188574982451904), ('Michael_Ball_(singer).txt', 28.114409499610833), ('Michael_Crawford.txt', 27.429953679132367), ('Sarah_Brightman.txt', 27.384611555644376), ('Marti_Webb.txt', 26.723444003361852), ('Sing_(Gary_Barlow_song).txt', 26.688838912025385), ('Joseph_and_the_Amazing_Technicolor_Dreamcoat.txt', 26.23422836326275), ('Requiem_(Lloyd_Webber).txt', 25.8668404655725), ('Whistle_Down_the_Wind_(musical).txt', 25.6791474939417), ("Don't_Cry_for_Me_Argentina.txt", 25.168766749006544), ('Palace_Theatre,_London.txt', 24.91785442698989), ('Tim_Rice.txt', 24.90574620133379), ('Stephen_Ward_(musical).txt', 24.835449601238025), ('Gielgud_Theatre.txt', 24.723981890761305), ('Cameron_Mackin

Wiki dev set (verified)


In [None]:
## WIKI DEV set (VERIFIED) - a subset of the dev set, so exclude it from the overall MAP calculation
eval_at_k(retrieval_ready_qa_wiki_dev_verified_dict, bm25_retrieval_wiki_dev_verified_docs_scores_dict)

MAP: 0.5655986571298662
[('tc_2957', {'TP+FP': 50, 'TP+FN': 2, 'TP': 1, 'Precision': 0.02, 'Recall': 0.5, 'AP': 0.25}), ('tc_3130', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_1250', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('tc_1348', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_2090', {'TP+FP': 50, 'TP+FN': 2, 'TP': 2, 'Precision': 0.04, 'Recall': 1.0, 'AP': 0.5344827586206896}), ('tc_2580', {'TP+FP': 50, 'TP+FN': 2, 'TP': 2, 'Precision': 0.04, 'Recall': 1.0, 'AP': 0.5588235294117647}), ('qz_5999', {'TP+FP': 50, 'TP+FN': 4, 'TP': 4, 'Precision': 0.08, 'Recall': 1.0, 'AP': 0.24783549783549785}), ('qz_6687', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('qz_3745', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('qz_4056', {'TP+FP': 50, 'TP+FN': 3, 'TP': 3, 'Precision': 0.06, 'Recall': 1.0, 'A

Wiki test set

In [None]:
## WIKI TEST set: evaluate the BM25 rankings at the default cut-off (k=50)
eval_at_k(retrieval_ready_qa_wiki_test_dict, bm25_retrieval_wiki_test_docs_scores_dict)

MAP: 0.5427824388731309
[('tc_118', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.3333333333333333}), ('tc_121', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.5}), ('tc_51', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.5}), ('tc_62', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('tc_293', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.5}), ('tc_312', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_233', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.030303030303030304}), ('tc_250', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.5}), ('tc_142', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_143', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.3333333333333333})]
F

Wiki train set

In [None]:
## WIKI TRAIN set: evaluate the BM25 rankings at the default cut-off (k=50)
eval_at_k(retrieval_ready_qa_wiki_train_dict, bm25_retrieval_wiki_train_docs_scores_dict)

MAP: 0.5431865737073746
[('tc_3', {'TP+FP': 50, 'TP+FN': 2, 'TP': 1, 'Precision': 0.02, 'Recall': 0.5, 'AP': 0.08333333333333333}), ('tc_8', {'TP+FP': 50, 'TP+FN': 3, 'TP': 2, 'Precision': 0.04, 'Recall': 0.6666666666666666, 'AP': 0.5}), ('tc_31', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('tc_32', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.25}), ('tc_38', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('tc_39', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_21', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_22', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_47', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('tc_48', {'TP+FP': 50, 'TP+FN': 2, 'TP': 1, 'Precision': 0.02, 'Recall': 0.5, 'AP': 0.08333333333333333})]
For 4126 que

## VSM Evaluation

Wiki dev set

In [None]:
# WIKI DEV set: evaluate the VSM rankings at the default cut-off (k=50)
eval_at_k(retrieval_ready_qa_wiki_dev_dict, vsm_retrieval_wiki_dev_docs_scores_dict)

MAP: 0.25114245817270914
[('tc_106', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('tc_137', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('tc_517', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.5}), ('tc_538', {'TP+FP': 50, 'TP+FN': 2, 'TP': 2, 'Precision': 0.04, 'Recall': 1.0, 'AP': 0.07045454545454546}), ('tc_217', {'TP+FP': 50, 'TP+FN': 2, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('tc_219', {'TP+FP': 50, 'TP+FN': 2, 'TP': 2, 'Precision': 0.04, 'Recall': 1.0, 'AP': 0.05772357723577236}), ('tc_540', {'TP+FP': 50, 'TP+FN': 2, 'TP': 1, 'Precision': 0.02, 'Recall': 0.5, 'AP': 0.5}), ('tc_543', {'TP+FP': 50, 'TP+FN': 2, 'TP': 2, 'Precision': 0.04, 'Recall': 1.0, 'AP': 0.13333333333333333}), ('tc_241', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.3333333333333333}), ('tc_261', {'TP+FP': 50, 'TP+FN': 2, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0

Wiki dev set (verified)


In [None]:
## WIKI DEV set (VERIFIED): evaluate the VSM rankings at the default cut-off (k=50)
eval_at_k(retrieval_ready_qa_wiki_dev_verified_dict, vsm_retrieval_wiki_dev_verified_docs_scores_dict)

MAP: 0.2749917425247687
[('tc_2957', {'TP+FP': 50, 'TP+FN': 2, 'TP': 2, 'Precision': 0.04, 'Recall': 1.0, 'AP': 0.5238095238095238}), ('tc_3130', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.022727272727272728}), ('qz_611', {'TP+FP': 50, 'TP+FN': 4, 'TP': 4, 'Precision': 0.08, 'Recall': 1.0, 'AP': 0.3566270860077022}), ('qz_881', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('qz_2866', {'TP+FP': 50, 'TP+FN': 3, 'TP': 1, 'Precision': 0.02, 'Recall': 0.3333333333333333, 'AP': 0.011494252873563218}), ('qz_3358', {'TP+FP': 50, 'TP+FN': 2, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('qz_2135', {'TP+FP': 50, 'TP+FN': 2, 'TP': 2, 'Precision': 0.04, 'Recall': 1.0, 'AP': 0.7}), ('qz_2430', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.0625}), ('qz_5999', {'TP+FP': 50, 'TP+FN': 4, 'TP': 2, 'Precision': 0.04, 'Recall': 0.5, 'AP': 0.017208672086720868}), ('qz_6687', {'TP+FP': 50, 'TP+FN':

Wiki test set

In [None]:
## WIKI TEST set: evaluate the VSM rankings at the default cut-off (k=50)
eval_at_k(retrieval_ready_qa_wiki_test_dict, vsm_retrieval_wiki_test_docs_scores_dict)

MAP: 0.2484221253801673
[('tc_118', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.022727272727272728}), ('tc_121', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.25}), ('tc_51', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.043478260869565216}), ('tc_62', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('tc_293', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 1.0}), ('tc_312', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.06666666666666667}), ('tc_233', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.08333333333333333}), ('tc_250', {'TP+FP': 50, 'TP+FN': 1, 'TP': 1, 'Precision': 0.02, 'Recall': 1.0, 'AP': 0.14285714285714285}), ('tc_142', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}), ('tc_143', {'TP+FP': 50, 'TP+FN': 1, 'TP': 0, 'Precision': 

Wiki train set

In [None]:
## WIKI TRAIN set
# The evaluation steps are run one by one on the VSM result lists exactly as loaded;
# unlike eval_at_k, no top-k cut is applied here.
vsm_wiki_train_eval = {}
# Placeholders for MAP's last two parameters. MAP resets both internally and only
# prints the score, so these variables are not updated by the call below.
vsm_wiki_train_map_unnorm = 0
vsm_wiki_train_map = 0

number_of_retrieved_docs(vsm_retrieval_wiki_train_docs_scores_dict, vsm_wiki_train_eval)
true_pos(retrieval_ready_qa_wiki_train_dict, vsm_retrieval_wiki_train_docs_scores_dict, vsm_wiki_train_eval)
precision_recall(vsm_wiki_train_eval)
MAP(retrieval_ready_qa_wiki_train_dict, vsm_retrieval_wiki_train_docs_scores_dict ,vsm_wiki_train_eval, vsm_wiki_train_map_unnorm, vsm_wiki_train_map)

#print(list(vsm_wiki_train_eval.items())[:10])
check_perf(vsm_wiki_train_eval)

## Prep smart negatives for BM25 Re-ranking

In [None]:
def rerank_topX(retrieved, X, topX):
  """
  Copy the first X retrieved entries of every query into topX.

  :param retrieved: dict mapping a query id to a sliceable ranked result list
  :param X: number of leading entries to keep per query
  :param topX: dict updated in place with topX[query_id] = retrieved[query_id][:X]
  :return: None
  """
  topX.update((query_id, ranking[:X]) for query_id, ranking in retrieved.items())

In [None]:
# BM25: shorten every result list to 50 documents so it can be compared with neural re-ranking.
# NOTE(review): the working copy is built from the *dev* results although it is named
# "train" - confirm which split is intended.
bm25_retrieval_wiki_train = bm25_retrieval_wiki_dev_docs_scores_dict.copy()

top_50_bm25={}

# This cell used to pass `bm25_retrieval_wiki_test`, a name that is never defined
# (NameError on a fresh kernel). It now uses the copy created above.
rerank_topX(bm25_retrieval_wiki_train, 50, top_50_bm25)
print(len(bm25_retrieval_wiki_train))
bm25_retrieval_wiki_train = None  # drop the reference to the working copy

len(top_50_bm25)

In [None]:
## BM25 - WIKI DEV set - collect per-query true/false positives and store them for neural retrieval
# NOTE(review): the heading used to say "50 documents retrieved per query", but the
# dev result dict is passed as loaded, not `top_50_bm25`. The comment where it is
# loaded mentions the top 500 documents per query - confirm which is intended.
bm25_wiki_dev_eval_50 = {}
bm25_relevant_docs_for_NN_dev = {}
# The next three variables are only referenced by the commented-out calls below.
bm25_wiki_test_map_unnorm = 0
bm25_wiki_test_map = 0
AP_bm25_50 = []

number_of_retrieved_docs(bm25_retrieval_wiki_dev_docs_scores_dict, bm25_wiki_dev_eval_50)
# Fills bm25_relevant_docs_for_NN_dev with the per-query 'matching_documents' and
# 'false positive document' lists.
true_pos_NN(retrieval_ready_qa_wiki_dev_dict, bm25_retrieval_wiki_dev_docs_scores_dict, bm25_wiki_dev_eval_50, bm25_relevant_docs_for_NN_dev)
#precision_recall(bm25_wiki_dev_eval_50)
#MAP(retrieval_ready_qa_wiki_dev_dict, bm25_retrieval_wiki_dev_docs_scores_dict ,bm25_wiki_dev_eval_50, bm25_wiki_test_map_unnorm, bm25_wiki_test_map)

#print(list(bm25_wiki_dev_eval_50.items())[:10])
# check_perf(bm25_wiki_dev_eval_50)
# APhistogram(bm25_wiki_dev_eval_50, AP_bm25_50)
#AP_bm25_50
#bm25_relevant_docs_for_NN

# Store the per-query TP/FP lists for the neural retrieval step.
save_as_pickle(bm25_relevant_docs_for_NN_dev, bm25_retrieved_dev_TP_FP)


In [None]:
## BM25 - WIKI TRAIN set - collect per-query true/false positives and store them for neural retrieval
# The train results are read from the "..._docs_50_scores.pkl" file, described in the
# path configuration as limited to the top 50 documents per query.
bm25_wiki_train_eval_50 = {}
bm25_relevant_docs_for_NN_train = {}
# The next three variables are only referenced by the commented-out calls below.
bm25_wiki_test_map_unnorm = 0
bm25_wiki_test_map = 0
AP_bm25_50 = []

number_of_retrieved_docs(bm25_retrieval_wiki_train_docs_scores_dict, bm25_wiki_train_eval_50)
# Fills bm25_relevant_docs_for_NN_train with the per-query 'matching_documents' and
# 'false positive document' lists.
true_pos_NN(retrieval_ready_qa_wiki_train_dict, bm25_retrieval_wiki_train_docs_scores_dict, bm25_wiki_train_eval_50, bm25_relevant_docs_for_NN_train)
# precision_recall(bm25_wiki_train_eval_50)
# MAP(retrieval_ready_qa_wiki_train_dict, bm25_retrieval_wiki_train_docs_scores_dict ,bm25_wiki_train_eval_50, bm25_wiki_test_map_unnorm, bm25_wiki_test_map)

#print(list(bm25_wiki_train_eval_50.items())[:10])
# check_perf(bm25_wiki_train_eval_50)
# APhistogram(bm25_wiki_train_eval_50, AP_bm25_50)
#AP_bm25_50
#bm25_relevant_docs_for_NN

# Store the per-query TP/FP lists for the neural retrieval step.
save_as_pickle(bm25_relevant_docs_for_NN_train,bm25_retrieved_train_TP_FP)