In [45]:
import operator

from collections import defaultdict
from math import log

# Vector space model (VSM)

## Different instantiations


In [46]:
def format_results(results):
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    for doc_id, rank in sorted_results:
        print('\t{}: {}'.format(doc_id, rank))

def word_frequency(documents):
    w_freq = defaultdict(int)
    for document in documents:
        for word in document.split():
            w_freq[word.strip().lower()] += 1
    return dict(w_freq)

def doc_presence(documents):
    d_freq = defaultdict(int)
    for document in documents:
        words = set([word.lower() for word in document.split()])
        for word in words:
            d_freq[word.strip().lower()] += 1
    return dict(d_freq)

query = 'news about presidential campaign'
docs = {
    'd1': 'news about',
    'd2': 'news about organic food campaign',
    'd3': 'news of presidential campaign',
    'd4': 'news of presidential campaign presidential candidate ',
    'd5': 'news of organic food campaign campaign campaign campaign',
}
total_docs = len(docs)
doc_presences = doc_presence(docs.values())
print('doc_presences:')
format_results(doc_presences)

print('\ntotal_docs: {}'.format(total_docs))

doc_presences:
	news: 5
	campaign: 4
	of: 3
	organic: 2
	presidential: 2
	food: 2
	about: 2
	candidate: 1

total_docs: 5


### Simple VSM

In [47]:
def ranking_simple(query, document):
    query_freq = word_frequency([query])
    doc_freq = word_frequency([document])
    shared_w = set(query_freq.keys()).intersection(set(doc_freq.keys()))
    ranking = 0
    for word in shared_w:
        w_query_presence = presence(query_freq[word])
        w_doc_presence = presence(doc_freq[word])
        word_ranking = w_query_presence * w_doc_presence
        ranking += word_ranking
    return ranking

def presence(value):
    if value > 0:
        return 1
    else:
        return 0

In [48]:
results = {}
for doc_id in sorted(docs):
    results[doc_id] = ranking_simple(query, docs[doc_id])
format_results(results)

	d4: 3
	d3: 3
	d2: 3
	d5: 2
	d1: 2


### TF VSM

In [49]:
def ranking_tf(query, document, total_docs, doc_presences):
    query_freq = word_frequency([query])
    doc_freq = word_frequency([document])
    shared_w = set(query_freq.keys()).intersection(set(doc_freq.keys()))
    ranking = 0
    for word in shared_w:
        w_query_freq = query_freq[word]
        w_doc_freq = doc_freq[word]
        w_overall_freq = doc_presences[word]
        word_ranking = w_query_freq * w_doc_freq 
        ranking += word_ranking
    return ranking

In [50]:
results = {}
for doc_id in sorted(docs):
    results[doc_id] = ranking_tf(query, docs[doc_id], total_docs, doc_presences)
format_results(results)

	d5: 5
	d4: 4
	d3: 3
	d2: 3
	d1: 2


### TF-IDF VSM

In [51]:
def ranking_tfidf(query, document, total_docs, doc_presences):
    query_freq = word_frequency([query])
    doc_freq = word_frequency([document])
    shared_w = set(query_freq.keys()).intersection(set(doc_freq.keys()))
    ranking = 0
    for word in shared_w:
        w_query_freq = query_freq[word]
        w_doc_freq = doc_freq[word]
        w_overall_freq = doc_presences[word]
        idf_term = log((float(total_docs) + 1) / w_overall_freq)
        word_ranking = w_query_freq * w_doc_freq * idf_term
        ranking += word_ranking
    return ranking

In [52]:
results = {}
for doc_id in sorted(docs):
    results[doc_id] = ranking_tfidf(query, docs[doc_id], total_docs, doc_presences)
format_results(results)

	d4: 2.7850112422383386
	d5: 1.8041819892266122
	d3: 1.6863989535702286
	d2: 1.6863989535702286
	d1: 1.2809338454620645


### BM25 VSM

In [53]:
def ranking_bm25(query, document, total_docs, doc_presences, k=0.5):
    query_freq = word_frequency([query])
    doc_freq = word_frequency([document])
    shared_w = set(query_freq.keys()).intersection(set(doc_freq.keys()))
    ranking = 0
    for word in shared_w:
        w_query_freq = query_freq[word]
        w_doc_freq = doc_freq[word]
        w_overall_freq = doc_presences[word]
        idf_term = log((int(total_docs) + 1) / w_overall_freq)
        bm_term = ((k+1) * w_doc_freq)/ (w_doc_freq + k)
        word_ranking = w_query_freq * w_doc_freq * idf_term * bm_term
        ranking += word_ranking
    return ranking

In [54]:
results = {}
for doc_id in sorted(docs):
    results[doc_id] = ranking_bm25(query, docs[doc_id], total_docs, doc_presences)

format_results(results)

	d4: 3.2244561577055824
	d5: 2.344802133370831
	d3: 1.6863989535702286
	d2: 1.6863989535702286
	d1: 1.2809338454620645


### Pivoted length normalization VSM

In [55]:
def avg_doc_length(documents):
    tokens = 0
    for document in documents:
        tokens += len(document.split())
    return tokens / len(documents)

def ranking_pln(query, document, total_docs, doc_presences, avdl, b=0.25):
    query_freq = word_frequency([query])
    doc_freq = word_frequency([document])
    shared_w = set(query_freq.keys()).intersection(set(doc_freq.keys()))
    ranking = 0
    for word in shared_w:
        w_query_freq = query_freq[word]
        w_doc_freq = doc_freq[word]
        w_overall_freq = doc_presences[word]
        idf_term = log((int(total_docs) + 1) / w_overall_freq)
        pln_term = (log(1 + log(1 + w_doc_freq))) / (1 - b + (b * (len(document.split())/ avdl)))
        word_ranking = w_query_freq * idf_term * pln_term
        ranking += word_ranking
    return ranking

In [56]:
avdl = avg_doc_length(docs.values())
results = {}
for doc_id in sorted(docs):
    results[doc_id] = ranking_pln(query, docs[doc_id], total_docs, doc_presences, avdl)

format_results(results)

	d4: 1.0703783591674068
	d3: 0.9347781011933076
	d2: 0.8880391961336422
	d1: 0.7935596664445655
	d5: 0.421655864041908


### Okapi BM25 VSM

In [57]:
def ranking_okapi_bm25(query, document, total_docs, doc_presences, avdl, b=0.25, k=0.25):
    query_freq = word_frequency([query])
    doc_freq = word_frequency([document])
    shared_w = set(query_freq.keys()).intersection(set(doc_freq.keys()))
    ranking = 0
    for word in shared_w:
        w_query_freq = query_freq[word]
        w_doc_freq = doc_freq[word]
        w_overall_freq = doc_presences[word]
        idf_term = log((int(total_docs) + 1) / w_overall_freq)
        bm_term = ((k+1) * w_doc_freq)/ (w_doc_freq + (k * (1 - b + (b * (len(document.split())/ avdl)))))
        word_ranking = w_query_freq * idf_term * bm_term
        ranking += word_ranking
    return ranking

In [58]:
avdl = avg_doc_length(docs.values())
results = {}
for doc_id in sorted(docs):
    results[doc_id] = ranking_okapi_bm25(query, docs[doc_id], total_docs, doc_presences, avdl)
format_results(results)

	d4: 1.7959032255250897
	d3: 1.7034332864345747
	d2: 1.6863989535702286
	d1: 1.3205503561464584
	d5: 0.6498568273115293
