#### Import Data And Stop Words

In [204]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
docs = pd.read_csv('hw1_docs.csv')
queries = pd.read_csv('hw1_queries.csv')
qrels = pd.read_csv('hw1_qrels.csv')
with open('stop_words.txt','r') as f: stops = { w:1 for w in f.read().splitlines()}

In [205]:
# sanity check function
def check_results(query_index, ranking):
    print(f"[Query]: {queries.iloc[query_index]['query']}")
    RELATED_DOCS = ranking[query_index]
    print(f"[RELATED DOCS INDECES]: {RELATED_DOCS}")
    print(f"[RELATED DOCS IDS]: {[docs.iloc[i]['doc_id'] for i in RELATED_DOCS]}")
    print(f"TOP FIVE RESULT FOR 1'st QUERY")
    print(f"[{docs.iloc[RELATED_DOCS[0]]['doc_id']}]: {docs.iloc[RELATED_DOCS[0]]['document']}")
    print(f"[{docs.iloc[RELATED_DOCS[1]]['doc_id']}]: {docs.iloc[RELATED_DOCS[1]]['document']}")
    print(f"[{docs.iloc[RELATED_DOCS[2]]['doc_id']}]: {docs.iloc[RELATED_DOCS[2]]['document']}")
    print(f"[{docs.iloc[RELATED_DOCS[3]]['doc_id']}]: {docs.iloc[RELATED_DOCS[3]]['document']}")
    print(f"[{docs.iloc[RELATED_DOCS[4]]['doc_id']}]: {docs.iloc[RELATED_DOCS[4]]['document']}")
    print(f"[QREL]: {qrels[qrels['query_id']==(query_index+1)]}")

#### Preprocessing Document Dataset

In [206]:
# preprocessing : tokenization, lower case, stop-word removal
term_doc = {}
doc_term = {}
for index, row in docs.iterrows():
    term_doc[row['doc_id']] = []
    tokens_list = re.split(r'\s+', re.sub(r'[^\w\s]',' ',row['document'].lower()))
    for token in tokens_list:
        if token in doc_term: 
            doc_term[token] += [row['doc_id']]
        elif (((len(token)>=2) and any(not char.isdigit() for char in token)) or (len(token)>=4)) and (token not in stops): 
            doc_term[token] = [row['doc_id']]
        else: continue
        term_doc[row['doc_id']] += [token]

#### Term Frequency In Documents And Queries

In [207]:
# find most frequent terms
# constants
DIC_LENGTH = 10000
NUM_RELATD = 10
EPSILON = 1e-10

doc_term = dict(sorted(doc_term.items(), key=lambda item: len(item[1])))
# make dictionary of 1000 important words
dictionary = {term:index for index,term in enumerate(list(doc_term.keys())[0:DIC_LENGTH])}
# make dictionary-term tf-idf matrix for docs
# this operation will result the DT matrix of shape(1000,750), which is 750 doc vectors in dictionary space
TD = np.zeros((len(dictionary),len(term_doc)))
for index,(doc,terms) in enumerate(term_doc.items()):
    doc_in_dict = {}
    for term in terms:
        if term in dictionary: 
            if term in doc_in_dict:
                doc_in_dict[term] += 1
            else:
                doc_in_dict[term] = 1
    for term in doc_in_dict.keys():
        TD[dictionary[term],index] = (doc_in_dict[term]/len(doc_in_dict))
# make dictionary-term tf-idf matrix for queries
# this operation will result the DT matrix of shape(1000,50), which is 50 query vectors in dictionary space
query_term = {}
term_query = {}
for index, row in queries.iterrows():
    term_query[row['query_id']] = []
    tokens_list = re.split(r'\s+', re.sub(r'[^\w\s]',' ',row['query'].lower()))
    for token in tokens_list:
        if token in dictionary:
            if token in query_term:
                query_term[token] += [row['query_id']]
            else:
                query_term[token] = [row['query_id']]
            term_query[row['query_id']] += [token]

TQ = np.zeros((len(dictionary),queries.shape[0]))
for index,(query,terms) in enumerate(term_query.items()):
    query_in_dict = {}
    for term in terms:
        if term in query_in_dict:
            query_in_dict[term] += 1
        else:
            query_in_dict[term] = 1
    for term in query_in_dict.keys():
        TQ[dictionary[term],index] = (query_in_dict[term]/len(query_in_dict))

#### Unigram Model

In [208]:
# Jelinek-Mercer smoothing : given λ, will calculate and return smoothed TD vector
def JMS(TD=TD, λ=0.2): return λ*(TD/(np.sum(TD,axis=0)+EPSILON)) + (1-λ)*(np.sum(TD,axis=1)/np.sum(TD)).reshape(-1,1)
# Unigrams assumes independence of query words given document model
def Unigram(TD=TD,TQ=TQ, λ=0.2): return np.argsort(-np.exp(np.einsum("ij,ik->jk",np.log(JMS(TD=TD, λ=λ)),TQ/np.sum(TQ,axis=0))),axis=0).T[:, :NUM_RELATD]

In [210]:
UGM = [Unigram(λ=λ/10) for λ in range(1,10,1)]

check_results(0, UGM[1])

[Query]: what is the origin of COVID-19
[RELATED DOCS INDECES]: [637 690 668 389 425 557 435 748 609 278]
[RELATED DOCS IDS]: ['lj8t52yl', '2y452utz', '0cq5ee1i', '0xkz36bj', '22fc1qly', '0m5mc320', '41378qru', '105q161g', '1mjaycee', '1vkz0b0o']
TOP FIVE RESULT FOR 1'st QUERY
[lj8t52yl]: The origin of the SARS‐CoV‐2 virus remains enigmatic. It is likely to be a continuum resulting from inevitable mutations and recombination events. These genetic changes keep developing in the present epidemic. Mutations tending to deplete the genome in its cytosine content will progressively lead to attenuation as a consequence of Muller's ratchet, but this is counteracted by recombination when different mutants co‐infect the same host, in particular, in clusters of infection. Monitoring as a function of time the genome sequences in closely related cases is critical to anticipate the future of SARS‐CoV‐2 and hence of COVID‐19.
[2y452utz]: A novel coronavirus strain 2019-nCoV has caused a rapid global 

#### Bigram Model

##### Making a Reduced Dictionary for Bigram Model

In [202]:
# preprocessing : tokenization, lower case, stop-word removal
term_doc = {}
doc_term = {}
for index, row in docs.iterrows():
    term_doc[row['doc_id']] = []
    tokens_list = re.split(r'\s+', re.sub(r'[^\w\s]',' ',row['document'].lower()))
    for token in tokens_list:
        if token in doc_term: 
            doc_term[token] += [row['doc_id']]
        elif (((len(token)>=2) and any(not char.isdigit() for char in token)) or (len(token)>=4)) and (token not in stops): 
            doc_term[token] = [row['doc_id']]
        else: continue
        term_doc[row['doc_id']] += [token]
term_query = {}
query_term = {}
for index, row in queries.iterrows():
    term_query[row['query_id']] = []
    tokens_list = re.split(r'\s+', re.sub(r'[^\w\s]',' ',row['query'].lower()))
    for token in tokens_list:
        if token in query_term: 
            query_term[token] += [row['query_id']]
        elif (((len(token)>=2) and any(not char.isdigit() for char in token)) or (len(token)>=4)) and (token not in stops): 
            query_term[token] = [row['query_id']]
        else: continue
        term_query[row['query_id']] += [token]

In [203]:
# find most frequent terms
# constants
DIC_LENGTH = 700
NUM_RELATD = 10
EPSILON = 1e-10

doc_term = dict(sorted(doc_term.items(), key=lambda item: len(item[1])))
query_term = dict(sorted(query_term.items(), key=lambda item: len(item[1])))
# mergy two dictinaries
merged_dict = query_term.copy()  # Create a copy of dict1
for key, value in doc_term.items():
    if key in merged_dict:
        merged_dict[key] += value
    else:
        merged_dict[key] = value
# make dictionary of DIC_LENGTH important words
dictionary = {term:index for index,term in enumerate(list(merged_dict.keys())[0:DIC_LENGTH])}
# make dictionary-term tf-idf matrix for docs
# this operation will result the TD matrix of shape(500,750), which is 750 doc vectors in dictionary space
TD = np.zeros((len(dictionary),len(term_doc)))
# this operation will also result the TTD tensor of shape(500,500,750) which models the w_i,w_i-1,D term in bigram or equivalently doc vectors in term x prev_term space!
TTD = np.zeros((len(dictionary),len(dictionary),len(term_doc)))
for index,(doc,terms) in enumerate(term_doc.items()):
    doc_in_dict = {}
    doc_in_bi_dict = {}
    prev_term = None
    for term in terms:
        if term in dictionary: 
            # uni dicitonary
            if term in doc_in_dict:
                doc_in_dict[term] += 1
            else:
                doc_in_dict[term] = 1
            # bi dictionary
            if prev_term != None:
                if (term,prev_term) in doc_in_bi_dict:
                    doc_in_bi_dict[(term,prev_term)] += 1
                else:
                    doc_in_bi_dict[(term,prev_term)] = 1
            prev_term = term
        else:
            prev_term = None
    # assemble uni dictionary
    for term in doc_in_dict.keys():
        TD[dictionary[term],index] = (doc_in_dict[term]/len(doc_in_dict))
    # assemble bi dictionary
    for (term,prev_term) in doc_in_bi_dict.keys():
        TTD[dictionary[term],dictionary[prev_term],index] = (doc_in_bi_dict[(term,prev_term)]/doc_in_dict[prev_term])
# make dictionary-term tf-idf matrix for queries
# this operation will result the DT matrix of shape(1000,50), which is 50 query vectors in dictionary space
query_term = {}
term_query = {}
for index, row in queries.iterrows():
    term_query[row['query_id']] = []
    tokens_list = re.split(r'\s+', re.sub(r'[^\w\s]',' ',row['query'].lower()))
    for token in tokens_list:
        if token in dictionary:
            if token in query_term:
                query_term[token] += [row['query_id']]
            else:
                query_term[token] = [row['query_id']]
            term_query[row['query_id']] += [token]
# this operation will result the TQ matrix of shape(500,50), which is 50 query vectors in dictionary space
TQ = np.zeros((len(dictionary),queries.shape[0]))
# this operation will also result the TTQ tensor of shape(500,500,50) which models the w_i,w_i-1,Q term in bigram or equivalently query vectors in term x prev_term space!
TTQ = np.zeros((len(dictionary), len(dictionary), queries.shape[0]))
for index,(query,terms) in enumerate(term_query.items()):
    query_in_dict = {}
    query_in_bi_dict = {}
    prev_term = None
    for term in terms:
        # uni dictionary
        if term in query_in_dict:
            query_in_dict[term] += 1
        else:
            query_in_dict[term] = 1
        # bi dictionary
        if prev_term != None:
            if (term,prev_term) in query_in_bi_dict:
                query_in_bi_dict[(term,prev_term)] += 1
            else:
                query_in_bi_dict[(term,prev_term)] = 1
        prev_term = term
    # assemble uni dictionary
    for term in query_in_dict.keys():
        TQ[dictionary[term],index] = (query_in_dict[term]/len(query_in_dict))
    # assemble bi dictionary
    for (term,prev_term) in query_in_bi_dict.keys():
        TTQ[dictionary[term],dictionary[prev_term],index] = (query_in_bi_dict[(term,prev_term)]/query_in_dict[prev_term])

##### Bigram Module 

In [172]:
print(TQ.shape)
print(TTQ.shape)
print(TD.shape)
print(TTD.shape)

(700, 50)
(700, 700, 50)
(700, 750)
(700, 700, 750)


In [175]:
# Jelinek-Mercer smoothing : given λ_1, λ_2, will calculate and return smoothed TTD vector
def JMS(TTD=TTD, TD=TD, λ_1=0.2, λ_2=0.2): return λ_1*TTD + (λ_2*(TD/(np.sum(TD,axis=0)+EPSILON)) + (1-λ_1-λ_2)*(np.sum(TD,axis=1)/(np.sum(TD)+EPSILON)).reshape(-1,1)).reshape((DIC_LENGTH,1,-1))
# Bigrams assumes independence of query words given document model and one previous term if exists
def Bigram(TTD=TTD, TTQ=TTQ, TD=TD, TQ=TQ, λ_1=0.2, λ_2=0.2): return np.argsort(-np.einsum("ijd,ijq->dq",np.log(JMS(TTD=TTD, TD=TD, λ_1=λ_1, λ_2=λ_2)+1e-12),TTQ), axis=0).T[:, :NUM_RELATD]

In [None]:
BGM = [Bigram(λ_1=λ_1, λ_2=λ_2) for λ_1,λ_2 in [(0.6,0.1),(0.6,0.2),(0.6,0.3),(0.7,0.1),(0.7,0.2),(0.8,0.1),(0.9,0.05)]]

In [179]:
check_results(0, BGM[2])

[Query]: what is the origin of COVID-19
[RELATED DOCS INDECES]: [721 525 654 520 158 614 209 494 739 415]
[RELATED DOCS IDS]: ['1emlkii0', '0u00nhf2', '45bwzuqn', '0xqhm8a0', '04zbbyii', '2lebavgm', '16crg3k8', '1xj2sg4y', '42t0zriz', '4dnzjeyp']
TOP FIVE RESULT FOR 1'st QUERY
[1emlkii0]: COVID-19 is a bluff
[45bwzuqn]: OBJECTIVES: To investigate the clinical and chest CT characteristics of COVID-19 pneumonia and explore the radiological differences between COVID-19 and influenza. MATERIALS AND METHODS: A total of 122 patients (61 men and 61 women, 48 ± 15 years) confirmed with COVID-19 and 48 patients (23 men and 25 women, 47 ± 19 years) confirmed with influenza were enrolled in the study. Thin-section CT was performed. The clinical data and the chest CT findings were recorded. RESULTS: The most common symptoms of COVID-19 were fever (74%) and cough (63%), and 102 patients (83%) had Wuhan contact. Pneumonia in 50 patients with COVID-19 (45%) distributed in the peripheral regions of th

#### Word2Vec

In [192]:
# find most frequent terms
# constants
DIC_LENGTH = 10000
NUM_RELATD = 10
EPSILON = 1e-10

doc_term = dict(sorted(doc_term.items(), key=lambda item: len(item[1])))
query_term = dict(sorted(query_term.items(), key=lambda item: len(item[1])))
# mergy two dictinaries
merged_dict = query_term.copy()  # Create a copy of dict1
for key, value in doc_term.items():
    if key in merged_dict:
        merged_dict[key] += value
    else:
        merged_dict[key] = value
# make dictionary of DIC_LENGTH important words
dictionary = {term:index for index,term in enumerate(list(merged_dict.keys()))}
# make dictionary-term tf-idf matrix for docs
# this operation will result the DT matrix of shape(1000,750), which is 750 doc vectors in dictionary space
TD = np.zeros((len(dictionary),len(term_doc)))
for index,(doc,terms) in enumerate(term_doc.items()):
    doc_in_dict = {}
    for term in terms:
        if term in dictionary: 
            if term in doc_in_dict:
                doc_in_dict[term] += 1
            else:
                doc_in_dict[term] = 1
    for term in doc_in_dict.keys():
        TD[dictionary[term],index] = (doc_in_dict[term]/len(doc_in_dict)) * np.log10(len(term_doc)/len(doc_term[term])) 
# make dictionary-term tf-idf matrix for queries
# this operation will result the TQ matrix of shape(1000,50), which is 50 query vectors in dictionary space
query_term = {}
term_query = {}
for index, row in queries.iterrows():
    term_query[row['query_id']] = []
    tokens_list = re.split(r'\s+', re.sub(r'[^\w\s]',' ',row['query'].lower()))
    for token in tokens_list:
        if token in dictionary:
            if token in query_term:
                query_term[token] += [row['query_id']]
            else:
                query_term[token] = [row['query_id']]
            term_query[row['query_id']] += [token]

TQ = np.zeros((len(dictionary),queries.shape[0]))
for index,(query,terms) in enumerate(term_query.items()):
    query_in_dict = {}
    for term in terms:
        if term in query_in_dict:
            query_in_dict[term] += 1
        else:
            query_in_dict[term] = 1
    for term in query_in_dict.keys():
        TQ[dictionary[term],index] = (query_in_dict[term]/len(query_in_dict)) * np.log10(len(term_doc)/len(doc_term[term])) 

In [193]:
from gensim.models import Word2Vec
model = Word2Vec(list(term_doc.values())+list(term_query.values()), min_count=1, sg=1, vector_size=200, epochs=20)
# Arithmatic Mean vectors
doc_vec = np.array([np.array([model.wv[term] for term in terms]).mean(axis=0) for (doc,terms) in term_doc.items()])
query_vec = np.array([np.array([model.wv[term] for term in terms]).mean(axis=0) for (query,terms) in term_query.items()])

# Weighted Mean vectors
weighted_doc_vec = np.array([np.array([model.wv[term]*TD[dictionary[term],index] for term in terms]).mean(axis=0) for index,(doc,terms) in enumerate(term_doc.items())])
weighted_query_vec = np.array([np.array([model.wv[term]*TQ[dictionary[term],index] for term in terms]).mean(axis=0) for index,(query,terms) in enumerate(term_query.items())])

AW2V = np.argsort(-np.einsum("qi,di->qd",query_vec,doc_vec)/(np.linalg.norm(query_vec, axis=1).reshape((-1,1))*np.linalg.norm(doc_vec, axis=1).reshape((1,-1))), axis=1)[:, :NUM_RELATD]
WW2V = np.argsort(-np.einsum("qi,di->qd",weighted_query_vec,weighted_doc_vec)/(np.linalg.norm(weighted_query_vec, axis=1).reshape((-1,1))*np.linalg.norm(weighted_doc_vec, axis=1).reshape((1,-1))), axis=1)[:, :NUM_RELATD]

check_results(0, WW2V)

[Query]: what is the origin of COVID-19
[RELATED DOCS INDECES]: [396  20 690 748 425 736 490 122 217 701]
[RELATED DOCS IDS]: ['24yavi1w', '0t2a5500', '2y452utz', '105q161g', '22fc1qly', '0chuwvg6', '1sq2uvur', '1fxrmuzl', 'vk8s1f23', 'ig0rnbqb']
TOP FIVE RESULT FOR 1'st QUERY
[24yavi1w]: Mutation and adaptation have driven the co-evolution of coronaviruses (CoVs) and their hosts, including human beings, for thousands of years. Before 2003, two human CoVs (HCoVs) were known to cause mild illness, such as common cold. The outbreaks of severe acute respiratory syndrome (SARS) and the Middle East respiratory syndrome (MERS) have flipped the coin to reveal how devastating and life-threatening an HCoV infection could be. The emergence of SARS-CoV-2 in central China at the end of 2019 has thrusted CoVs into the spotlight again and surprised us with its high transmissibility but reduced pathogenicity compared to its sister SARS-CoV. HCoV infection is a zoonosis and understanding the zoonotic 

In [211]:
check_results(0, AW2V)

[Query]: what is the origin of COVID-19
[RELATED DOCS INDECES]: [276 721 425 690 609 435 420 131 351 683]
[RELATED DOCS IDS]: ['2lxs9laj', '1emlkii0', '22fc1qly', '2y452utz', '1mjaycee', '41378qru', '12sbikmx', '03pd9jtn', '0uvzy48c', '4r0t3q7j']
TOP FIVE RESULT FOR 1'st QUERY
[2lxs9laj]: Coronavirus disease 2019 (COVID-19), which causes serious respiratory illness such as pneumonia and lung failure, was first reported in Wuhan, the capital of Hubei, China. The etiological agent of COVID-19 has been confirmed as a novel coronavirus, now known as severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), which is most likely originated from zoonotic coronaviruses, like SARS-CoV, which emerged in 2002. Within a few months of the first report, SARS-CoV-2 had spread across China and worldwide, reaching a pandemic level. As COVID-19 has triggered enormous human casualties and serious economic loss posing global threat, an understanding of the ongoing situation and the development of stra

#### BERT

In [33]:
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
doc_ids = [torch.tensor([tokenizer.encode(sentence, add_special_tokens=True)])[:, :512] for sentence in list(docs['document'])]
query_ids = [torch.tensor([tokenizer.encode(sentence, add_special_tokens=True)])[:, :512] for sentence in list(queries['query'])]
with torch.no_grad():
    # results = []
    for doc_id in tqdm(doc_ids):
        try: results += [model(doc_id).last_hidden_state.mean(dim=1).squeeze()]
        except: print(f"[error] : {doc_id}")
    doc_vec = torch.stack(results)
    results = []
    for query_id in tqdm(query_ids):
        try: results += [model(query_id).last_hidden_state.mean(dim=1).squeeze()]
        except: print(f"[error] : {query_id}")
    query_vec = torch.stack(results)
import pickle
with open ("bert_vectors.pth", "wb") as f:
    pickle.dump((doc_vec, query_vec), f)

In [212]:
with open ("bert_vectors.pth", "rb") as f:
    (doc_vec, query_vec) = pickle.load(f)

In [197]:
DIC_LENGTH = 10000
NUM_RELATD = 10
EPSILON = 1e-10
BERTR = torch.argsort(-torch.einsum("qi,di->qd",query_vec,doc_vec)/(torch.norm(query_vec, dim=1).reshape((-1,1))*torch.norm(doc_vec, dim=1).reshape((1,-1))), axis=1)[:, :NUM_RELATD].numpy()
check_results(1, BERTR)

[Query]: how does the coronavirus respond to changes in the weather
[RELATED DOCS INDECES]: [602 104 261 637 717 285 701 559 139 476]
[RELATED DOCS IDS]: ['03s9spbi', '431ksdno', '15slu3kk', 'lj8t52yl', '0pujch9v', '04awj06g', 'ig0rnbqb', '6vln3erl', '0vlzwksu', '8r6u3e3i']
TOP FIVE RESULT FOR 1'st QUERY
[03s9spbi]: (1) Background: The virulence of coronavirus diseases due to viruses like SARS-CoV or MERS-CoV decreases in humid and hot weather. The putative temperature dependence of infectivity by the new coronavirus SARS-CoV-2 or covid-19 has a high predictive medical interest. (2) Methods: External temperature and new covid-19 cases in 21 countries and in the French administrative regions were collected from public data. Associations between epidemiological parameters of the new case dynamics and temperature were examined using an ARIMA model. (3) Results: We show that, in the first stages of the epidemic, the velocity of contagion decreases with country- or region-wise temperature. 

#### MRR Evaluation

In [198]:
def RR(rank, ground_truth):
    for i,doc_id in enumerate(rank):
        if doc_id in ground_truth : return 1/(i+1)
    return 0

BGλ = [(0.6,0.1),(0.6,0.2),(0.6,0.3),(0.7,0.1),(0.7,0.2),(0.8,0.1),(0.9,0.05)]

for ind,Rank in enumerate(UGM):
    print(f"[Unigram_λ={0.1*(ind+1):.1f}   MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(Rank)]).mean():.2f}")
for ind,Rank in enumerate(BGM):
    print(f"[Bigram, λ_1={BGλ[ind][0]:.1f}, λ_2={BGλ[ind][1]:.1f}   MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(Rank)]).mean():.2f}")
print(f"[Arithmatic W2V   MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(AW2V)]).mean():.2f}")
print(f"[Weighted W2V   MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(WW2V)]).mean():.2f}")
print(f"[BERT   MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BERTR)]).mean():.2f}")


[Unigram_λ=0.1   MRR]: 0.72
[Unigram_λ=0.2   MRR]: 0.74
[Unigram_λ=0.3   MRR]: 0.74
[Unigram_λ=0.4   MRR]: 0.74
[Unigram_λ=0.5   MRR]: 0.74
[Unigram_λ=0.6   MRR]: 0.73
[Unigram_λ=0.7   MRR]: 0.73
[Unigram_λ=0.8   MRR]: 0.73
[Unigram_λ=0.9   MRR]: 0.75
[Bigram, λ_1=0.6, λ_2=0.1   MRR]: 0.65
[Bigram, λ_1=0.6, λ_2=0.2   MRR]: 0.69
[Bigram, λ_1=0.6, λ_2=0.3   MRR]: 0.71
[Bigram, λ_1=0.7, λ_2=0.1   MRR]: 0.66
[Bigram, λ_1=0.7, λ_2=0.2   MRR]: 0.70
[Bigram, λ_1=0.8, λ_2=0.1   MRR]: 0.66
[Bigram, λ_1=0.9, λ_2=0.1   MRR]: 0.66
[Arithmatic W2V   MRR]: 0.69
[Weighted W2V   MRR]: 0.83
[BERT   MRR]: 0.42


#### MAP Evaluation

In [199]:
def PR(rank, ground_truth):
    score, correct = [], 1
    for i,doc_id in enumerate(rank):
        score += [(correct if doc_id in ground_truth else 0)/(i+1)]
        correct += 1
    return np.array(score).mean()

for ind,Rank in enumerate(UGM):
    print(f"[Unigram_λ={0.1*(ind+1):.1f}   MAP]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(Rank)]).mean():.2f}")
for ind,Rank in enumerate(BGM):
    print(f"[Bigram, λ_1={BGλ[ind][0]:.1f}, λ_2={BGλ[ind][1]:.1f}   MAP]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(Rank)]).mean():.2f}")
print(f"[Arithmatic W2V   MAP]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(AW2V)]).mean():.2f}")
print(f"[Weighted W2V   MAP]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(WW2V)]).mean():.2f}")
print(f"[BERT   MAP]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BERTR)]).mean():.2f}")


[Unigram_λ=0.1   MAP]: 0.48
[Unigram_λ=0.2   MAP]: 0.49
[Unigram_λ=0.3   MAP]: 0.48
[Unigram_λ=0.4   MAP]: 0.46
[Unigram_λ=0.5   MAP]: 0.45
[Unigram_λ=0.6   MAP]: 0.45
[Unigram_λ=0.7   MAP]: 0.45
[Unigram_λ=0.8   MAP]: 0.45
[Unigram_λ=0.9   MAP]: 0.45
[Bigram, λ_1=0.6, λ_2=0.1   MAP]: 0.39
[Bigram, λ_1=0.6, λ_2=0.2   MAP]: 0.40
[Bigram, λ_1=0.6, λ_2=0.3   MAP]: 0.40
[Bigram, λ_1=0.7, λ_2=0.1   MAP]: 0.39
[Bigram, λ_1=0.7, λ_2=0.2   MAP]: 0.40
[Bigram, λ_1=0.8, λ_2=0.1   MAP]: 0.39
[Bigram, λ_1=0.9, λ_2=0.1   MAP]: 0.38
[Arithmatic W2V   MAP]: 0.42
[Weighted W2V   MAP]: 0.48
[BERT   MAP]: 0.18


#### P@K Evaluation

In [200]:
def PK(rank, ground_truth, k):
    correct = 0
    for _,doc_id in enumerate(rank[:k]):
        if doc_id in ground_truth: correct += 1
    return correct/k

for ind,Rank in enumerate(UGM):
    print(f"[Unigram_λ={0.1*(ind+1):.1f}   P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(Rank)]).mean():.2f}")
for ind,Rank in enumerate(BGM):
    print(f"[Bigram, λ_1={BGλ[ind][0]:.1f}, λ_2={BGλ[ind][1]:.1f}   P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(Rank)]).mean():.2f}")
print(f"[Arithmatic W2V   P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(AW2V)]).mean():.2f}")
print(f"[Weighted W2V   P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(WW2V)]).mean():.2f}")
print(f"[BERT   P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(BERTR)]).mean():.2f}")


[Unigram_λ=0.1   P@5]: 0.55
[Unigram_λ=0.2   P@5]: 0.56
[Unigram_λ=0.3   P@5]: 0.56
[Unigram_λ=0.4   P@5]: 0.55
[Unigram_λ=0.5   P@5]: 0.55
[Unigram_λ=0.6   P@5]: 0.53
[Unigram_λ=0.7   P@5]: 0.53
[Unigram_λ=0.8   P@5]: 0.53
[Unigram_λ=0.9   P@5]: 0.53
[Bigram, λ_1=0.6, λ_2=0.1   P@5]: 0.44
[Bigram, λ_1=0.6, λ_2=0.2   P@5]: 0.48
[Bigram, λ_1=0.6, λ_2=0.3   P@5]: 0.51
[Bigram, λ_1=0.7, λ_2=0.1   P@5]: 0.45
[Bigram, λ_1=0.7, λ_2=0.2   P@5]: 0.49
[Bigram, λ_1=0.8, λ_2=0.1   P@5]: 0.45
[Bigram, λ_1=0.9, λ_2=0.1   P@5]: 0.44
[Arithmatic W2V   P@5]: 0.49
[Weighted W2V   P@5]: 0.62
[BERT   P@5]: 0.18


In [201]:
for ind,Rank in enumerate(UGM):
    print(f"[Unigram_λ={0.1*(ind+1):.1f}   P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(Rank)]).mean():.2f}")
for ind,Rank in enumerate(BGM):
    print(f"[Bigram, λ_1={BGλ[ind][0]:.1f}, λ_2={BGλ[ind][1]:.1f}   P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(Rank)]).mean():.2f}")
print(f"[Arithmatic W2V   P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(AW2V)]).mean():.2f}")
print(f"[Weighted W2V   P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(WW2V)]).mean():.2f}")
print(f"[BERT   P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(BERTR)]).mean():.2f}")

[Unigram_λ=0.1   P@10]: 0.48
[Unigram_λ=0.2   P@10]: 0.49
[Unigram_λ=0.3   P@10]: 0.48
[Unigram_λ=0.4   P@10]: 0.46
[Unigram_λ=0.5   P@10]: 0.45
[Unigram_λ=0.6   P@10]: 0.45
[Unigram_λ=0.7   P@10]: 0.45
[Unigram_λ=0.8   P@10]: 0.45
[Unigram_λ=0.9   P@10]: 0.45
[Bigram, λ_1=0.6, λ_2=0.1   P@10]: 0.39
[Bigram, λ_1=0.6, λ_2=0.2   P@10]: 0.40
[Bigram, λ_1=0.6, λ_2=0.3   P@10]: 0.40
[Bigram, λ_1=0.7, λ_2=0.1   P@10]: 0.39
[Bigram, λ_1=0.7, λ_2=0.2   P@10]: 0.40
[Bigram, λ_1=0.8, λ_2=0.1   P@10]: 0.39
[Bigram, λ_1=0.9, λ_2=0.1   P@10]: 0.38
[Arithmatic W2V   P@10]: 0.42
[Weighted W2V   P@10]: 0.48
[BERT   P@10]: 0.18
