# BM25

For reference see: [Improvements to BM25 and Language Models Examined](http://www.cs.otago.ac.nz/homepages/andrew/papers/2014-2.pdf) 

In [None]:
import pandas as pd
import numpy as np
!pip install pickle5
import pickle5 as pickle
import os
from collections import Counter
import re


from google.colab import drive



In [None]:
 !python --version

Python 3.7.10


## Files and Folders


In [None]:
# Mount Google Drive so that the project data below is reachable from this runtime.
drive.mount('/content/drive', force_remount=True)  # use force_remount=True param after upload of new data

# file and folder names
ir_project_drive_folder = "IR Projekt"  # TODO: does this folder name work for everyone?
full_ir_project_drive_folder = "/content/drive/My Drive/{}/data/wikipedia".format(ir_project_drive_folder)
#full_ir_project_drive_folder = '../data/wikipedia'

# preprocessed files (inputs of this notebook)
preprocessed_folder = full_ir_project_drive_folder + '/no-pron/preprocessed'

# evidence documents and the question/answer sets, all stored as pickle files
preprocessed_wikipedia_evidence_file = preprocessed_folder + '/preprocessed_wikipedia_evidence_dict.pkl'
preprocessed_qa_wikipedia_verified_dev_filename = preprocessed_folder + '/qa/verified-wikipedia-dev.pkl'
preprocessed_qa_wikipedia_dev_filename = preprocessed_folder + '/qa/wikipedia-dev.pkl'
preprocessed_qa_wikipedia_test_without_answers_filename = preprocessed_folder + '/qa/wikipedia-test-without-answers.pkl'
preprocessed_qa_wikipedia_train_filename = preprocessed_folder + '/qa/wikipedia-train.pkl'

## retrieval (outputs of this notebook)
bm25_results_folder = full_ir_project_drive_folder + '/no-pron/bm25'

# one result file per query set
bm25_retrieval_wiki_dev_docs_scores = bm25_results_folder + "/retrieval_wiki_dev_docs_scores.pkl"
bm25_retrieval_wiki_dev_verified_docs_scores = bm25_results_folder + "/retrieval_wiki_dev_verified_docs_scores.pkl"
bm25_retrieval_wiki_test_docs_scores = bm25_results_folder + "/retrieval_wiki_test_docs_scores.pkl"
bm25_retrieval_wiki_train_docs_scores = bm25_results_folder + "/retrieval_wiki_train_docs_scores.pkl"

Mounted at /content/drive


In [None]:
def save_as_pickle(obj, filename):
    """
    save an object in a pickle file dump

    Missing parent directories of ``filename`` are created first. A bare
    filename without any directory part is written to the current working
    directory.

    :param obj: object to dump
    :param filename: target file
    :return:
    """
    directory = os.path.dirname(filename)
    # dirname() is '' for a bare filename and os.makedirs('') raises, so only
    # create directories when there is a directory component. exist_ok=True
    # avoids the race between an existence check and the creation.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filename, 'wb') as file:
        pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(filename):
    """
    read a pickle file and return the object stored in it

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.

    :param filename: path of the pickle file to read
    :return: the unpickled object
    """
    with open(filename, mode='rb') as source:
        loaded_object = pickle.load(source)
    return loaded_object

## Load Data

In [None]:
# Evidence data: Wiki (Preprocessed)
# The loaded dictionary is iterated further below as (document, text) pairs.

documents_dict_wiki = load_pickle(preprocessed_wikipedia_evidence_file)
print(len(documents_dict_wiki))  # number of evidence documents


73930


In [None]:
# Each query set is a sequence of question records; the retrieval code below
# reads the 'QuestionId' and 'Question_preprocessed' entries of every record.

# Query Data: Wiki Dev Set (Preprocessed)
qa_wiki_dev_dict = load_pickle(preprocessed_qa_wikipedia_dev_filename)

# Query Data: Wiki Dev Set - Verified (Preprocessed)
qa_wiki_dev_verified_dict = load_pickle(preprocessed_qa_wikipedia_verified_dev_filename)

# Query Data: Wiki Train Set (Preprocessed)
qa_wiki_train_dict = load_pickle(preprocessed_qa_wikipedia_train_filename)

# Query Data: Wiki Test Set (Preprocessed)
qa_wiki_test_dict = load_pickle(preprocessed_qa_wikipedia_test_without_answers_filename)


In [None]:
# Preview only the first records; displaying the whole query set floods the
# cell output and bloats the saved notebook.
qa_wiki_dev_dict[:2]

[{'Answer': {'Aliases': ['Sunset Blvd',
    'West Sunset Boulevard',
    'Sunset Boulevard',
    'Sunset Bulevard',
    'Sunset Blvd.'],
   'MatchedWikiEntityName': 'Sunset Boulevard',
   'NormalizedAliases': ['sunset boulevard',
    'sunset bulevard',
    'west sunset boulevard',
    'sunset blvd'],
   'NormalizedMatchedWikiEntityName': 'sunset boulevard',
   'NormalizedValue': 'sunset boulevard',
   'Type': 'WikipediaEntity',
   'Value': 'Sunset Boulevard'},
  'EntityPages': [{'DocSource': 'TagMe',
    'Filename': 'Andrew_Lloyd_Webber.txt',
    'Title': 'Andrew Lloyd Webber'}],
  'Question': 'Which Lloyd Webber musical premiered in the US on 10th December 1993?',
  'QuestionId': 'tc_33',
  'QuestionSource': 'http://www.triviacountry.com/',
  'Question_preprocessed': 'lloyd webber music premier 10th decemb 1993'},
 {'Answer': {'Aliases': ['Sir Henry Campbell-Bannerman',
    'Campbell-Bannerman',
    'Campbell Bannerman',
    'Sir Henry Campbell Bannerman',
    'Henry Campbell Bannerma

### Helper functions

In [None]:
def term_counter_for_text(text):
  """
  count how often each term occurs in a text

  The text is split on single space characters only; empty tokens (caused by
  leading, trailing or repeated spaces) are not counted.

  :param text: string of space separated terms
  :return: Counter mapping term -> number of occurrences
  """
  non_empty_tokens = (token for token in text.split(" ") if token)
  return Counter(non_empty_tokens)

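Helper function that prints the approximate memory footprint of a dictionary (the size of the dictionary itself plus the shallow size of each of its values):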


In [None]:
import sys

def print_size_of_dict(name, dictionary):
  """
  print the approximate size of a dictionary in whole megabytes

  The size is sys.getsizeof of the dictionary object plus sys.getsizeof of
  every value. Keys and objects nested inside the values are not followed.

  :param name: label used in the printed line
  :param dictionary: dictionary to measure
  :return:
  """
  size_in_bytes = sys.getsizeof(dictionary)
  size_in_bytes += sum(sys.getsizeof(value) for value in dictionary.values())

  size_in_megabytes = int(size_in_bytes / 1024 / 1024)
  print('Size of dict', name, ':', size_in_megabytes, 'MB')

### Inverted Index

In [None]:
# term -> postings list (names of all documents that contain the term)
inverted_index_wiki_dict = {}

# document -> Counter with the raw term counts of that document
term_counters_by_document = {}
# document -> total number of terms in that document
document_lengths = {}

progress = 0
for document, text in documents_dict_wiki.items():

  term_counter = term_counter_for_text(text)

  # some documents in the corpus are empty: report their name and skip them
  if not term_counter:
    print(document)
    continue

  term_counters_by_document[document] = term_counter
  document_lengths[document] = sum(term_counter.values())

  # every distinct term of the document gets the document appended to its postings list
  for term in term_counter:
    if term not in inverted_index_wiki_dict:
      inverted_index_wiki_dict[term] = []
    inverted_index_wiki_dict[term].append(document)

print("Number of words in corpus: ", len(inverted_index_wiki_dict))

AFAIK.txt
Against_All_Odds_(Take_a_Look_at_Me_Now).txt
Ailurophile.txt
Alexander_I_of_Yugoslavia.txt
Bathophobia.txt
Bernicia.txt
Category_Australian_poets.txt
Category_Battles_of_the_Franco-Prussian_War.txt
Category_BBC_Radio_5_Live_presenters.txt
Category_Chess_notation.txt
Category_Children's_television_channels_in_India.txt
Category_Detective_television_series.txt
Category_Directors_of_the_National_Gallery,_London.txt
Category_Landlocked_countries.txt
Category_Lists_of_lakes_by_country.txt
Category_Lists_of_rivers_by_country.txt
Category_Mexican_bullfighters.txt
Category_New_Zealand_opera_singers.txt
Category_Olympic_gold_medalists_for_Finland.txt
Category_Severn_drainage_basin.txt
Category_Stop-motion_animated_television_series.txt
Category_1945_films.txt
Category_1950s_3D_films.txt
Category_19th-century_French_people.txt
Category_Airports_in_the_United_States.txt
Category_Terrestrial_planets.txt
Category_Tributaries_of_the_River_Severn.txt
Category_Units_of_measurement.txt
Daisy_

In [None]:
# Report how much memory the two main index structures take up.
print_size_of_dict('inverted index', inverted_index_wiki_dict)
print_size_of_dict('term_counters_by_document', term_counters_by_document)

Size of dict inverted index : 511 MB
Size of dict term_counters_by_document : 1738 MB


### Compute idf weights for all terms

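The idf formula is taken from https://en.wikipedia.org/wiki/Okapi_BM25#The_ranking_function: $\mathrm{idf}(t) = \ln\left(1 + \frac{N - n_t + 0.5}{n_t + 0.5}\right)$, where $N$ is the number of (non-empty) documents and $n_t$ is the number of documents that contain the term $t$.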

In [None]:
# term -> inverse document frequency weight
idfs = {}
# only non-empty documents were added to term_counters_by_document
number_of_documents = len(term_counters_by_document)
for term, posting_list in inverted_index_wiki_dict.items():
  # the postings list holds each containing document once, so its length is the document frequency
  number_of_docs_with_term = len(posting_list)
  numerator = number_of_documents - number_of_docs_with_term + 0.5
  denominator = number_of_docs_with_term + 0.5
  idfs[term] = np.log(1 + numerator / denominator)

#print_size_of_dict('idfs', idfs)
#list(idfs.items())[:10]

### BM25 Scoring function

In [None]:
# mean number of terms per indexed document
average_document_length = sum(document_lengths.values()) / number_of_documents

def score(term, document, k1, b):
  """
  BM25 contribution of a single term to the score of a single document

  :param term: query term; must be a key of idfs
  :param document: document name; must be a key of term_counters_by_document and document_lengths
  :param k1: term frequency saturation parameter
  :param b: document length normalisation parameter
  :return: idf weight of the term times its length-normalised, saturated term frequency
  """
  term_frequency = term_counters_by_document[document][term]
  idf = idfs[term]
  # length normalisation: documents longer than average are penalised, scaled by b
  length_norm = k1 * (1 - b + b * document_lengths[document] / average_document_length)
  return idf * ((term_frequency * (k1 + 1)) / (term_frequency + length_norm))

In [None]:
def BM25rank(query, top_k, k1 = 1.5, b = 0.75):
  """
  rank the indexed documents for a query with BM25

  Every distinct query term is considered once. Only documents that contain
  at least one query term receive a score.

  :param query: preprocessed query, terms separated by single spaces
  :param top_k: number of best scoring documents to return
  :param k1: term frequency saturation parameter passed to score
  :param b: document length normalisation parameter passed to score
  :return: list of (document, score) tuples, best score first, at most top_k long
  """
  accumulated_scores = {}

  for term in term_counter_for_text(query):
    # terms unknown to the index have no postings and contribute nothing
    for document in inverted_index_wiki_dict.get(term, []):
      previous = accumulated_scores.get(document, 0)
      accumulated_scores[document] = previous + score(term, document, k1, b)

  ranking = list(accumulated_scores.items())
  ranking.sort(key=lambda pair: pair[1], reverse=True)
  return ranking[:top_k]


In [None]:
# Sanity check: top 10 documents for a preprocessed question of the dev set.
BM25rank("lloyd webber music premier 10th decemb 1993", 10)

[('Julian_Lloyd_Webber.txt', 29.39136583238563),
 ('Adelphi_Theatre.txt', 29.352536085427317),
 ('Andrew_Lloyd_Webber.txt', 29.001668204913347),
 ('Sunset_Boulevard_(musical).txt', 28.975076592109684),
 ('The_Phantom_of_the_Opera_(1986_musical).txt', 28.945679198257686),
 ("I_Don't_Know_How_to_Love_Him.txt", 28.188574982451904),
 ('Michael_Ball_(singer).txt', 28.114409499610833),
 ('Michael_Crawford.txt', 27.429953679132367),
 ('Sarah_Brightman.txt', 27.384611555644376),
 ('Marti_Webb.txt', 26.723444003361852)]

## Save the documents retrieved for each query, together with their scores, for later evaluation

In [None]:
import multiprocessing as mp
import datetime

def runBM25(idxKeyQuestion):
    """
    rank the documents for one question; written to be mapped over by a worker pool

    :param idxKeyQuestion: tuple (idx, (key, question)) where idx is the running
        number of the question, key its id and question the preprocessed query text
    :return: tuple (key, result of BM25rank for the question with the top 500 documents)
    """
    idx, key_and_question = idxKeyQuestion
    key, question = key_and_question

    # log a timestamp for every 500th question as a progress indicator
    if not idx % 500:
        print(idx, datetime.datetime.now())

    ranked_documents = BM25rank(question, 500)
    return key, ranked_documents

def query_retrieved_docs(qas):
    """
    run the BM25 retrieval for a whole set of questions in parallel

    :param qas: sequence of question records, each providing 'QuestionId' and
        'Question_preprocessed'
    :return: dictionary mapping each QuestionId to the ranking returned by runBM25
    """
    print("querying docs:", len(qas))

    with mp.Pool(mp.cpu_count()) as pool:
        # reduce every record to the (id, query text) pair that the workers need
        simple_qas = [(qa['QuestionId'], qa["Question_preprocessed"]) for qa in qas]
        # results arrive in arbitrary order; they are keyed by question id, so
        # the order does not matter. The dict must be built while the pool is alive.
        result = dict(pool.imap_unordered(runBM25, enumerate(simple_qas), chunksize=2))

    return result

In [None]:
## Retrieve documents for the dev set, store the result as a pickle file
## and print the approximate in-memory size of the result
wiki_dev_query_results = query_retrieved_docs(qa_wiki_dev_dict)
save_as_pickle(wiki_dev_query_results, bm25_retrieval_wiki_dev_docs_scores)
print_size_of_dict('wiki_dev_query_results', wiki_dev_query_results)

querying docs: 7993
0 2021-05-02 17:46:26.302403
500 2021-05-02 17:46:54.561611
1000 2021-05-02 17:47:20.058926
1500 2021-05-02 17:47:45.744798
2000 2021-05-02 17:48:14.009692
2500 2021-05-02 17:48:49.026961
3000 2021-05-02 17:49:20.642811
3500 2021-05-02 17:49:58.235993
4000 2021-05-02 17:50:27.700489
4500 2021-05-02 17:50:54.731572
5000 2021-05-02 17:51:21.909946
5500 2021-05-02 17:51:50.045499
6000 2021-05-02 17:52:19.026224
6500 2021-05-02 17:52:53.354395
7000 2021-05-02 17:53:22.672165
7500 2021-05-02 17:53:47.204547
Size of dict wiki_dev_query_results : 35 MB


In [None]:
## Retrieve documents for the verified dev set (a subset of the dev set),
## store the result as a pickle file and print its approximate in-memory size
wiki_dev_ver_query_results = query_retrieved_docs(qa_wiki_dev_verified_dict)
save_as_pickle(wiki_dev_ver_query_results, bm25_retrieval_wiki_dev_verified_docs_scores)
print_size_of_dict('wiki_dev_ver_query_results', wiki_dev_ver_query_results)

querying docs: 318
0 2021-05-02 17:45:29.426422
Size of dict wiki_dev_ver_query_results : 1 MB


In [None]:
## Retrieve documents for the train set, store the result as a pickle file
## and print the approximate in-memory size of the result
wiki_train_query_results = query_retrieved_docs(qa_wiki_train_dict)
save_as_pickle(wiki_train_query_results, bm25_retrieval_wiki_train_docs_scores)
print_size_of_dict('wiki_train_query_results', wiki_train_query_results)

querying docs: 61888
0 2021-05-02 17:54:50.615154
500 2021-05-02 17:55:15.026945
1000 2021-05-02 17:55:38.485572
1500 2021-05-02 17:56:01.901730
2000 2021-05-02 17:56:23.669999
2500 2021-05-02 17:56:38.497276
3000 2021-05-02 17:56:51.650109
3500 2021-05-02 17:57:05.152251
4000 2021-05-02 17:57:18.026136
4500 2021-05-02 17:57:30.435087
5000 2021-05-02 17:57:42.456308
5500 2021-05-02 17:57:54.665583
6000 2021-05-02 17:58:06.956980
6500 2021-05-02 17:58:19.243857
7000 2021-05-02 17:58:31.929078
7500 2021-05-02 17:58:43.969402
8000 2021-05-02 17:58:56.316406
8500 2021-05-02 17:59:09.766235
9000 2021-05-02 17:59:23.011739
9500 2021-05-02 17:59:36.983235
10000 2021-05-02 17:59:49.605091
10500 2021-05-02 18:00:00.787952
11000 2021-05-02 18:00:12.234456
11500 2021-05-02 18:00:22.827254
12000 2021-05-02 18:00:34.427009
12500 2021-05-02 18:00:46.188072
13000 2021-05-02 18:00:57.683369
13500 2021-05-02 18:01:10.922699
14000 2021-05-02 18:01:28.169462
14500 2021-05-02 18:01:46.112994
15000 2021-05

In [None]:
## Retrieve documents for the test set, store the result as a pickle file
## and print the approximate in-memory size of the result
wiki_test_query_results = query_retrieved_docs(qa_wiki_test_dict)
save_as_pickle(wiki_test_query_results, bm25_retrieval_wiki_test_docs_scores)
print_size_of_dict('wiki_test_query_results', wiki_test_query_results)

querying docs: 7701
0 2021-05-02 18:25:28.834834
500 2021-05-02 18:25:41.102183
1000 2021-05-02 18:25:53.547694
1500 2021-05-02 18:26:05.584208
2000 2021-05-02 18:26:19.597481
2500 2021-05-02 18:26:34.462380
3000 2021-05-02 18:26:50.997676
3500 2021-05-02 18:27:07.555352
4000 2021-05-02 18:27:19.178095
4500 2021-05-02 18:27:31.533639
5000 2021-05-02 18:27:44.250248
5500 2021-05-02 18:27:57.586517
6000 2021-05-02 18:28:13.736967
6500 2021-05-02 18:28:28.335430
7000 2021-05-02 18:28:41.625128
7500 2021-05-02 18:28:53.770278
Size of dict wiki_test_query_results : 34 MB
