# VSM

In [3]:
import pandas as pd
import numpy as np
!pip install pickle5
import pickle5 as pickle
import os
from collections import Counter
import re

from google.colab import drive



## Files and Folders


In [4]:
# Mount Google Drive into the Colab runtime at /content/drive.
# force_remount=True is used so that data uploaded after an earlier mount is picked up.
drive.mount('/content/drive', force_remount=True)
# Disabled debug helper: list the project folder to check the mount.
# !ls "/content/drive/My Drive/Master/2 - FSS 2021/Information Retrieval/IR Projekt/"

Mounted at /content/drive


In [5]:
# Name of the project folder inside "My Drive" and the data root derived from it.
ir_project_drive_folder = "IR Projekt" 
full_ir_project_drive_folder = "/content/drive/My Drive/{}/data/wikipedia".format(ir_project_drive_folder)
# Alternative data root for running outside Colab:
# full_ir_project_drive_folder = '../data/wikipedia' # run locally

# Inputs: preprocessed pickle files (evidence corpus and the question sets).
preprocessed_folder = full_ir_project_drive_folder + '/no-pron/preprocessed'

preprocessed_wikipedia_evidence_file = preprocessed_folder + '/preprocessed_wikipedia_evidence_dict.pkl'
preprocessed_qa_wikipedia_verified_dev_filename = preprocessed_folder + '/qa/verified-wikipedia-dev.pkl'
preprocessed_qa_wikipedia_dev_filename = preprocessed_folder + '/qa/wikipedia-dev.pkl'
preprocessed_qa_wikipedia_test_without_answers_filename = preprocessed_folder + '/qa/wikipedia-test-without-answers.pkl'
preprocessed_qa_wikipedia_train_filename = preprocessed_folder + '/qa/wikipedia-train.pkl'

# Outputs: one pickle of retrieval results (documents with scores) per question set.
vsm_results_folder = full_ir_project_drive_folder + '/no-pron/vsm'

vsm_retrieval_wiki_dev_docs_scores = vsm_results_folder + "/retrieval_wiki_dev_docs_scores.pkl"
vsm_retrieval_wiki_dev_verified_docs_scores = vsm_results_folder + "/retrieval_wiki_dev_verified_docs_scores.pkl"
vsm_retrieval_wiki_test_docs_scores = vsm_results_folder + "/retrieval_wiki_test_docs_scores.pkl"
vsm_retrieval_wiki_train_docs_scores = vsm_results_folder + "/retrieval_wiki_train_docs_scores.pkl"

In [6]:
def save_as_pickle(obj, filename):
    """
    Save an object as a pickle dump, creating the target directory if needed.

    :param obj: object to dump
    :param filename: target file; either a bare file name (written to the
        current working directory) or a path including directories
    :return: None
    """
    directory = os.path.dirname(filename)
    # dirname is '' for a bare file name and os.makedirs('') raises, so only
    # create a directory when the path actually contains one. exist_ok=True
    # replaces the separate exists() check and its check-then-create race.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filename, 'wb') as file:
        pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(filename):
    """
    Read a pickle file and return the object stored in it.

    :param filename: path of the pickle file to read
    :return: the unpickled object
    """
    with open(filename, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

## Load Data

In [7]:
# Evidence corpus: preprocessed Wikipedia documents.
# Later cells iterate it as document name -> preprocessed text.

documents_dict_wiki = load_pickle(preprocessed_wikipedia_evidence_file)
# Sanity check: number of entries loaded.
print(len(documents_dict_wiki))


73859


In [8]:
# NOTE(review): despite the `_dict` suffix, query_retrieved_docs iterates these
# objects as collections of question records and reads the keys 'QuestionId'
# and 'Question_preprocessed' from each record - confirm the stored type.

# Query data: Wiki dev set (preprocessed)
qa_wiki_dev_dict = load_pickle(preprocessed_qa_wikipedia_dev_filename)

# Query data: Wiki dev set, verified (preprocessed)
qa_wiki_dev_verified_dict = load_pickle(preprocessed_qa_wikipedia_verified_dev_filename)

# Query data: Wiki train set (preprocessed)
qa_wiki_train_dict = load_pickle(preprocessed_qa_wikipedia_train_filename)

# Query data: Wiki test set, without answers (preprocessed)
qa_wiki_test_dict = load_pickle(preprocessed_qa_wikipedia_test_without_answers_filename)

# VSM


Function that counts how often each term occurs in a given text (document or query):

In [9]:
def term_counter_for_text(text):
  """
  Count how often each term occurs in a text.

  The text is split on single space characters only; empty tokens (caused by
  leading, trailing or repeated spaces) are not counted.

  :param text: text of a document or query
  :return: Counter mapping term -> number of occurrences
  """
  non_empty_tokens = (token for token in text.split(" ") if token)
  return Counter(non_empty_tokens)

Function that prints the approximate memory footprint of a dictionary:


In [10]:
import sys

def print_size_of_dict(name, dictionary):
  """
  Print a rough size estimate of a dictionary in whole megabytes.

  The estimate is shallow: sys.getsizeof of the dict itself plus
  sys.getsizeof of each value. Keys are not added to the total.

  :param name: label printed together with the size
  :param dictionary: dictionary to measure
  """
  total_bytes = sys.getsizeof(dictionary) + sum(
      sys.getsizeof(value) for value in dictionary.values())

  print('Size of dict', name, ':', int(total_bytes/1024/1024), 'MB')

Inverted index

In [11]:
inverted_index_wiki_dict = {} # postings lists: term -> list of documents containing the term
term_counters_by_document = {} # raw term counts: document -> Counter of its terms

# Iterate the dict view directly: documents_dict_wiki is not modified inside the
# loop, so copying all items into a list first is unnecessary.
for document, text in documents_dict_wiki.items():

  term_counter = term_counter_for_text(text)

  if not term_counter: # document without any terms: report its name and leave it out of the index
    print(document)
    continue

  term_counters_by_document[document] = term_counter

  # The counter has one key per distinct term, so every document is appended
  # to a given postings list at most once.
  for term in term_counter:
    inverted_index_wiki_dict.setdefault(term, []).append(document)

# The printed number is the count of distinct terms (keys of the inverted index).
print("Number of words in corpus: ", len(inverted_index_wiki_dict))

Number of words in corpus:  1079375


In [None]:
# Report the approximate size of the main in-memory structures.
print_size_of_dict('documents_dict', documents_dict_wiki)
print_size_of_dict('inverted_index', inverted_index_wiki_dict)
print_size_of_dict('term_counters_by_document', term_counters_by_document)

Size of dict documents_dict : 946 MB
Size of dict inverted_index : 495 MB
Size of dict term_counters_by_document : 1737 MB


Compute idf values for all terms:

In [None]:
# Inverse document frequency per term: idf(t) = log10(N / df(t)), where df(t)
# is the length of the term's postings list.
# NOTE(review): N is the size of documents_dict_wiki, so documents that were
# skipped as empty while building the index are still counted in N.
idfs = {}
number_of_documents = len(documents_dict_wiki)
for term, posting_list in inverted_index_wiki_dict.items():
  idfs[term] = np.log10(number_of_documents / len(posting_list))

print_size_of_dict('idfs', idfs)
# Cell output: the first ten (term, idf) pairs.
list(idfs.items())[:10]

Size of dict idfs : 72 MB


[('gb', 2.1336035937176927),
 ('reserv', 0.957405706995897),
 ('internet', 1.2253457395550869),
 ('countri', 0.414672393802246),
 ('code', 1.0100467180087664),
 ('top', 0.4927946710610235),
 ('level', 0.554852551973035),
 ('domain', 1.4988165325701954),
 ('cctld', 3.3121009225392526),
 ('unit', 0.2781301755107998)]

Define a function that computes the tf-idf weights for a document and the document norm needed for cosine ranking, then apply it to all documents:

In [None]:
tf_idfs_by_document = {}  # document -> {term: tf-idf weight}
document_norms = {}  # document -> norm of the document's tf-idf weights

def tfidfs_for_counter(counter):
  """
  Compute tf-idf weights and their Euclidean norm for one bag of terms.

  tf is log-scaled and normalised by the most frequent term:
  tf = (1 + log10(count)) / (1 + log10(max_count)).
  idf is looked up in the module-level ``idfs`` dict; a term without an
  entry there gets idf 0 and therefore weight 0.

  :param counter: Counter mapping term -> raw count (document or query)
  :return: tuple (norm, tf_idfs); tf_idfs maps term -> tf-idf weight and norm
      is the square root of the sum of the squared weights. An empty counter
      yields a norm of 0 and an empty dict.
  """
  tf_idfs = {}
  # max() with a default instead of most_common(1)[0]: the latter raises
  # IndexError for an empty counter (e.g. an empty query).
  max_count = max(counter.values(), default=1)
  tf_denominator = 1 + np.log10(max_count)  # same for every term, so computed once
  square_sum = 0

  for term, count in counter.items():
    tf = (1 + np.log10(count)) / tf_denominator
    tf_idf = tf * idfs.get(term, 0)
    tf_idfs[term] = tf_idf
    square_sum += tf_idf**2

  norm = np.sqrt(square_sum)

  return norm, tf_idfs


# Compute and store the tf-idf weights and the norm for every indexed document.
for document, counter in term_counters_by_document.items():
  norm, tf_idfs = tfidfs_for_counter(counter)

  tf_idfs_by_document[document] = tf_idfs
  document_norms[document] = norm


# Report the approximate size of the two new structures.
print_size_of_dict('tf_idfs_by_document', tf_idfs_by_document)
print_size_of_dict('document_norms', document_norms)

Size of dict tf_idfs_by_document : 1736 MB
Size of dict document_norms : 4 MB


In [None]:
# Drop the references to the raw term counts and the raw texts so their memory
# can be reclaimed; ranking below only reads the inverted index, the tf-idf
# weights, the norms and the idfs.
# NOTE: after this cell the index and idf cells above cannot be re-run without
# reloading the data, because they read documents_dict_wiki.
term_counters_by_document = None
documents_dict_wiki = None

Implement the cosine_rank function from the lecture: for a given query, it returns the top_k highest-ranked documents.

In [None]:
def cosine_rank(query, top_k):
  """
  Rank documents for a query by the dot product of query and document tf-idf
  weights, divided by the document norm.

  The score is not divided by the query norm; that factor is identical for
  all documents of one query and therefore does not affect the ordering.

  :param query: query text; terms are separated by single spaces
  :param top_k: maximum number of results to return
  :return: list of (document, score) tuples sorted by descending score,
      at most top_k entries; empty if the query contains no terms
  """
  scores = {}
  query_counter = term_counter_for_text(query)
  if not query_counter:
    # A query without terms cannot match anything; return early instead of
    # handing an empty counter to the weighting step.
    return []
  _, query_tf_idfs = tfidfs_for_counter(query_counter)

  # Accumulate the dot product term by term; only documents found in the
  # postings list of at least one query term receive a score.
  for term, query_weight in query_tf_idfs.items():
    posting_list = inverted_index_wiki_dict.get(term, [])
    for document in posting_list:
      scores[document] = scores.get(document, 0) + (query_weight * tf_idfs_by_document[document][term])

  normalized_scores = {}
  for document, score in scores.items():
    norm = document_norms[document]
    # A zero norm means every weight of the document is zero. Dividing by it
    # would give NaN for NumPy floats (or raise for plain floats), and NaN
    # values make the sort order below unreliable, so score it as 0 instead.
    normalized_scores[document] = score / norm if norm else 0.0

  sorted_scores = sorted(normalized_scores.items(), reverse=True, key=lambda item: item[1])
  return sorted_scores[:top_k]


In [None]:
# Example call to sanity-check the ranking: top 10 documents for one query.
cosine_rank("lloyd webber music premier 10th decemb 1993", 10)

[('Madeleine_Gurdon.txt', 0.7819336768107725),
 ('Romantic_Cello_Concertos.txt', 0.634674245007402),
 ('Gonna_Make_You_a_Star.txt', 0.6040957156827715),
 ('The_Music_of_the_Night.txt', 0.5933090167198797),
 ('Jiaxin_Cheng.txt', 0.582354207763073),
 ('Sing_(Gary_Barlow_song).txt', 0.5608383908228931),
 ('Evita.txt', 0.5422402714718866),
 ('All_I_Ask_of_You.txt', 0.523602732239101),
 ('Love_Changes_Everything_(song).txt', 0.5181813140982563),
 ('Memory_(song).txt', 0.510088600526465)]

In [None]:
import multiprocessing as mp
import datetime

def runVSM(idxKeyQuestion):
    """
    Rank documents for a single question; used as the worker function of the
    process pool in query_retrieved_docs.

    :param idxKeyQuestion: tuple (idx, (key, question)) with the running index
        of the question, its id and its preprocessed text
    :return: tuple (key, ranking) where ranking is the result of
        cosine_rank for the question with top_k = 500
    """
    position, key_and_question = idxKeyQuestion
    question_id, question_text = key_and_question

    # progress log with a timestamp for every 500th question
    if not position % 500:
        print(position, datetime.datetime.now())

    ranking = cosine_rank(question_text, 500)
    return question_id, ranking

def query_retrieved_docs(qas):
    """
    Run the VSM retrieval for a collection of questions in parallel.

    :param qas: collection of question records; each record provides the keys
        'QuestionId' and 'Question_preprocessed'
    :return: dict mapping question id -> ranking returned by runVSM
    """
    print("querying docs:", len(qas))

    with mp.Pool(mp.cpu_count()) as pool:
        # reduce every record to the (id, preprocessed question) pair the worker needs
        simple_qas = [(qa['QuestionId'], qa["Question_preprocessed"]) for qa in qas]
        ranked_pairs = pool.imap_unordered(runVSM, enumerate(simple_qas))
        # consume the lazy result iterator while the pool is still open
        result = {question_id: ranking for question_id, ranking in ranked_pairs}

    return result

In [None]:
## Retrieve Documents for dev set
wiki_dev_query_results = query_retrieved_docs(qa_wiki_dev_dict)
save_as_pickle(wiki_dev_query_results, vsm_retrieval_wiki_dev_docs_scores)
print_size_of_dict('wiki_dev_query_results', wiki_dev_query_results)

querying docs: 7993
0 2021-05-04 19:24:57.316562
500 2021-05-04 19:25:23.052765
1000 2021-05-04 19:25:45.585865
1500 2021-05-04 19:26:08.457848
2000 2021-05-04 19:26:32.897769
2500 2021-05-04 19:26:58.714072
3000 2021-05-04 19:27:23.553695
3500 2021-05-04 19:27:52.670824
4000 2021-05-04 19:28:16.625500
4500 2021-05-04 19:28:37.589670
5000 2021-05-04 19:28:59.725532
5500 2021-05-04 19:29:23.206697
6000 2021-05-04 19:29:48.144946
6500 2021-05-04 19:30:17.081328
7000 2021-05-04 19:30:42.355476
7500 2021-05-04 19:31:01.961133
Size of dict wiki_dev_query_results : 35 MB


In [None]:
## Retrieve Documents for verified dev set - Subset of DEV SET!!
wiki_dev_ver_query_results = query_retrieved_docs(qa_wiki_dev_verified_dict)
save_as_pickle(wiki_dev_ver_query_results, vsm_retrieval_wiki_dev_verified_docs_scores)
print_size_of_dict('wiki_dev_ver_query_results', wiki_dev_ver_query_results)

querying docs: 318
0 2021-05-04 20:16:10.639889
Size of dict wiki_dev_ver_query_results : 1 MB


In [None]:
## Retrieve Documents for train set
wiki_train_query_results = query_retrieved_docs(qa_wiki_train_dict)
save_as_pickle(wiki_train_query_results, vsm_retrieval_wiki_train_docs_scores)
print_size_of_dict('wiki_train_query_results', wiki_train_query_results)

querying docs: 61888
5000  1000 2021-05-04 21:01:11.190218
2021-05-04 21:01:11.1888522021-05-04 21:01:11.189533

1500 2021-05-04 21:01:38.604615
2000 2021-05-04 21:01:39.613225
2500 2021-05-04 21:02:03.416906
3000 2021-05-04 21:02:10.374110
3500 2021-05-04 21:02:16.968913
4000 2021-05-04 21:02:38.977365
4500 2021-05-04 21:02:42.381566
5000 2021-05-04 21:03:05.474806
5500 2021-05-04 21:03:08.864655
6000 2021-05-04 21:03:27.973895
6500 2021-05-04 21:03:35.112308
7000 2021-05-04 21:03:40.926265
7500 2021-05-04 21:04:01.828336
8000 2021-05-04 21:04:05.129817
8500 2021-05-04 21:04:27.280284
9000 2021-05-04 21:04:31.325813
9500 2021-05-04 21:04:44.152226
10000 2021-05-04 21:04:59.685698
10500 2021-05-04 21:05:10.003845
11000 2021-05-04 21:05:24.727076
11500 2021-05-04 21:05:28.260220
12000 2021-05-04 21:05:47.252979
12500 2021-05-04 21:05:52.190940
13000 2021-05-04 21:06:02.080534
13500 2021-05-04 21:06:16.490449
14000 2021-05-04 21:06:23.609551
14500 2021-05-04 21:06:49.075847
15000 2021-05

In [None]:
## Retrieve Documents for test set
wiki_test_query_results = query_retrieved_docs(qa_wiki_test_dict)
save_as_pickle(wiki_test_query_results, vsm_retrieval_wiki_test_docs_scores)
print_size_of_dict('wiki_test_query_results', wiki_test_query_results)

querying docs: 7701
0 2021-05-04 20:16:49.017808
500 2021-05-04 20:17:02.166108
1000 2021-05-04 20:17:13.505320
1500 2021-05-04 20:17:24.562705
2000 2021-05-04 20:17:37.230710
2500 2021-05-04 20:17:50.501456
3000 2021-05-04 20:18:05.085566
3500 2021-05-04 20:18:19.596811
4000 2021-05-04 20:18:30.338580
4500 2021-05-04 20:18:41.665075
5000 2021-05-04 20:18:53.262511
5500 2021-05-04 20:19:05.456959
6000 2021-05-04 20:19:19.797412
6500 2021-05-04 20:19:32.855823
7000 2021-05-04 20:19:44.839884
7500 2021-05-04 20:19:56.079928
Size of dict wiki_test_query_results : 34 MB
