#### Import Data And Stop Words

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
docs = pd.read_csv('hw1_docs.csv')
queries = pd.read_csv('hw1_queries.csv')
qrels = pd.read_csv('hw1_qrels.csv')
with open('stop_words.txt','r') as f: stops = { w:1 for w in f.read().splitlines()}

In [140]:
# sanity check function
def check_results(query_index, ranking):
    print(f"[Query]: {queries.iloc[query_index]['query']}")
    RELATED_DOCS = ranking[query_index]
    print(f"[RELATED DOCS INDECES]: {RELATED_DOCS}")
    print(f"[RELATED DOCS IDS]: {[docs.iloc[i]['doc_id'] for i in RELATED_DOCS]}")
    print(f"TOP FIVE RESULT FOR 1'st QUERY")
    print(f"[{docs.iloc[RELATED_DOCS[0]]['doc_id']}]: {docs.iloc[RELATED_DOCS[0]]['document']}")
    print(f"[{docs.iloc[RELATED_DOCS[1]]['doc_id']}]: {docs.iloc[RELATED_DOCS[1]]['document']}")
    print(f"[{docs.iloc[RELATED_DOCS[2]]['doc_id']}]: {docs.iloc[RELATED_DOCS[2]]['document']}")
    print(f"[{docs.iloc[RELATED_DOCS[3]]['doc_id']}]: {docs.iloc[RELATED_DOCS[3]]['document']}")
    print(f"[{docs.iloc[RELATED_DOCS[4]]['doc_id']}]: {docs.iloc[RELATED_DOCS[4]]['document']}")
    print(f"[QREL]: {qrels[qrels['query_id']==(query_index+1)]}")

#### Preprocessing Document Dataset

In [None]:
# preprocessing : tokenization, lower case, stop-word removal
term_doc = {}
doc_term = {}
for index, row in docs.iterrows():
    term_doc[row['doc_id']] = []
    tokens_list = re.split(r'\s+', re.sub(r'[^\w\s]',' ',row['document'].lower()))
    for token in tokens_list:
        if token in doc_term: 
            doc_term[token] += [row['doc_id']]
        elif (((len(token)>=2) and any(not char.isdigit() for char in token)) or (len(token)>=4)) and (token not in stops): 
            doc_term[token] = [row['doc_id']]
        else: continue
        term_doc[row['doc_id']] += [token]

#### Space Vector Model Information Retrival

In [178]:
# find most frequent terms
# constants
DIC_LENGTH = 10000
NUM_RELATD = 10
EPSILON = 1e-10

doc_term = dict(sorted(doc_term.items(), key=lambda item: len(item[1])))
# make dictionary of 1000 important words
dictionary = {term:index for index,term in enumerate(list(doc_term.keys())[0:DIC_LENGTH])}
# make dictionary-term tf-idf matrix for docs
# this operation will result the DT matrix of shape(1000,750), which is 750 doc vectors in dictionary space
TD = np.zeros((len(dictionary),len(term_doc)))
for index,(doc,terms) in enumerate(term_doc.items()):
    doc_in_dict = {}
    for term in terms:
        if term in dictionary: 
            if term in doc_in_dict:
                doc_in_dict[term] += 1
            else:
                doc_in_dict[term] = 1
    for term in doc_in_dict.keys():
        TD[dictionary[term],index] = (doc_in_dict[term]/len(doc_in_dict)) * np.log10(len(term_doc)/len(doc_term[term])) 
# make dictionary-term tf-idf matrix for queries
# this operation will result the DT matrix of shape(1000,50), which is 50 query vectors in dictionary space
query_term = {}
term_query = {}
for index, row in queries.iterrows():
    term_query[row['query_id']] = []
    tokens_list = re.split(r'\s+', re.sub(r'[^\w\s]',' ',row['query'].lower()))
    for token in tokens_list:
        if token in dictionary:
            if token in query_term:
                query_term[token] += [row['query_id']]
            else:
                query_term[token] = [row['query_id']]
            term_query[row['query_id']] += [token]

TQ = np.zeros((len(dictionary),queries.shape[0]))
for index,(query,terms) in enumerate(term_query.items()):
    query_in_dict = {}
    for term in terms:
        if term in query_in_dict:
            query_in_dict[term] += 1
        else:
            query_in_dict[term] = 1
    for term in query_in_dict.keys():
        TQ[dictionary[term],index] = (query_in_dict[term]/len(query_in_dict)) * np.log10(len(term_doc)/len(doc_term[term])) 
# some black magick to find 10 smallest cosine norms! result is a 50*10 matrix of all queries with appropriate doc index
Rank = np.argsort(np.einsum('ij,ik->kj',TD,TQ)/np.einsum('i,j->ji',1/(EPSILON+np.linalg.norm(TD,axis=0)),1/(EPSILON+np.linalg.norm(TQ,axis=0))), axis=1)[:, -NUM_RELATD:]
# sanity check! just to be sure results are persistant
check_results(0, Rank)

[Query]: what is the origin of COVID-19
[RELATED DOCS INDECES]: [597 278 381 557 490 748 389 668 690 637]
[RELATED DOCS IDS]: ['1abp6oom', '1vkz0b0o', '2nvk7glh', '0m5mc320', '1sq2uvur', '105q161g', '0xkz36bj', '0cq5ee1i', '2y452utz', 'lj8t52yl']
TOP FIVE RESULT FOR 1'st QUERY
[1abp6oom]: Abstract Taiwan experienced a large number of severe acute respiratory syndrome (SARS) viral infections between March and July 2003; by September of that year, 346 SARS cases were confirmed by RT-PCR or serological tests. In order to better understand evolutionary relationships among SARS coronaviruses (SCoVs) from different international regions, we performed phylogenetic comparisons of full-length genomic and protein sequences from 45 human SCoVs (including 12 from Taiwan) and two civet SCoVs. All the Taiwanese SARS-CoV strains which associated with nosocomial infection formed a monophyletic clade within the late phase of the SARS epidemic. This Taiwanese clade could be further divided into two epid

#### Binary Independence Model

In [138]:
# main Word IDF
def WIDF(p): return np.log10(p/(1-p))+np.log10(len(term_doc)*np.array([1/len(doc_term[term]) for term in list(dictionary.keys())]))
# find which words are in which documents and then sum up the idf of present terms for each doc
TDB = np.where(TD != 0, 1, TD)
TQB = np.where(TQ != 0, 1, TQ)
def BIM(p): return np.argsort(np.einsum("ijk,j->ik",np.einsum("ij,ik->kij",TDB,TQB),WIDF(p)), axis=1)[:, -NUM_RELATD:]
BIM03 = BIM(p=0.3)
BIM05 = BIM(p=0.5)
BIM07 = BIM(p=0.7)
# sanity check! just to be sure results are persistant
check_results(0, BIM03)

[Query]: what is the origin of COVID-19
[RELATED DOCS INDECES]: [153 435 328 609 597 607 425 690 353 511]
[RELATED DOCS IDS]: ['34ytd87a', '41378qru', '2zaxn6tq', '1mjaycee', '1abp6oom', '0gmtnkbh', '22fc1qly', '2y452utz', '10ecm4wi', '5pv11lfo']
TOP FIVE RESULT FOR 1'st QUERY
[34ytd87a]: In early December, pneumonia cases of unknown origin started to appear and, on the 7thof January 2020, these cases were declared to be caused by a novel beta-coronavirus according to viral genome sequencing on the 11thof February, 2020. Coronaviruses are enveloped, single strand RNA viruses that have been known to have the ability to mutate rapidly, alter tissue tropism and adjust to different epidemiological situations. As of the end of April 2020, 122,392 laboratory-confirmed cases of COVID-19 had been detected in Turkey, of whom 3,258 died. From the beginning of the COVID-19 epidemic, children seem to be less affected than adults. Therefore, there are limited data regarding the clinical features of

#### Best Match 25

In [187]:
# parameters
# k_l = 1.4
# b = 0.75
l_avg = np.mean(np.array([len(terms) for terms in term_doc.values()]))

# Term-Document matrix for BM25 algorithm
def TDBM(b,k_l):
    TDBM = np.zeros((len(dictionary),len(term_doc)))
    for index,(doc,terms) in enumerate(term_doc.items()):
        doc_in_dict = {}
        for term in terms:
            if term in dictionary: 
                if term in doc_in_dict:
                    doc_in_dict[term] += 1
                else:
                    doc_in_dict[term] = 1
        for term in doc_in_dict.keys():
            tf = (doc_in_dict[term]/len(doc_in_dict))
            TDBM[dictionary[term],index] = ((k_l+1)*tf*np.log10(len(term_doc)/len(doc_term[term])))/(k_l*(1-b+b*(len(doc_in_dict)/l_avg))+tf)
    return TDBM
TQB = np.where(TQ != 0, 1, TQ)
BM2501 = np.argsort(np.einsum("ij,ik->kj",TDBM(k_l=1.4,b=0.75),TQB), axis=1)[:, -NUM_RELATD:]
BM2502 = np.argsort(np.einsum("ij,ik->kj",TDBM(k_l=1.6,b=0.75),TQB), axis=1)[:, -NUM_RELATD:]
BM2503 = np.argsort(np.einsum("ij,ik->kj",TDBM(k_l=1.8,b=0.75),TQB), axis=1)[:, -NUM_RELATD:]
# sanity check! just to be sure results are persistant
check_results(0, BM2501)

[Query]: what is the origin of COVID-19
[RELATED DOCS INDECES]: [748 435 425 609 557 278 389 668 690 637]
[RELATED DOCS IDS]: ['105q161g', '41378qru', '22fc1qly', '1mjaycee', '0m5mc320', '1vkz0b0o', '0xkz36bj', '0cq5ee1i', '2y452utz', 'lj8t52yl']
TOP FIVE RESULT FOR 1'st QUERY
[105q161g]: A number of virological, epidemiological and ethnographic arguments suggest that COVID-19 has a zoonotic origin. The pangolin, a species threatened with extinction due to poaching for both culinary purposes and traditional Chinese pharmacopoeia, is now suspected of being the “missing link” in the transmission to humans of a virus that probably originated in a species of bat. Our predation of wild fauna and the reduction in their habitats have thus ended up creating new interfaces that favour the transmission of pathogens (mainly viruses) to humans. Domesticated animals and wild fauna thus constitute a reservoir for almost 80% of emerging human diseases (SARS-CoV, MERS-CoV, Ebola). These diseases are a

#### MRR Evaluation

In [188]:
def RR(rank, ground_truth):
    for i,doc_id in enumerate(rank):
        if doc_id in ground_truth : return 1/(i+1)
    return 0
# check average for the methods
print(f"[VS   MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(Rank)]).mean():.2f}")
print(f"[BIM03  MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BIM03)]).mean():.2f}")
print(f"[BIM05  MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BIM05)]).mean():.2f}")
print(f"[BIM07  MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BIM07)]).mean():.2f}")
print(f"[BM2501 MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BM2501)]).mean():.2f}")
print(f"[BM2502 MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BM2502)]).mean():.2f}")
print(f"[BM2503 MRR]: {np.array([RR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BM2503)]).mean():.2f}")

[VS   MRR]: 0.52
[BIM03  MRR]: 0.41
[BIM05  MRR]: 0.45
[BIM07  MRR]: 0.46
[BM2501 MRR]: 0.56
[BM2502 MRR]: 0.57
[BM2503 MRR]: 0.57


#### MAP Evaluation

In [189]:
def PR(rank, ground_truth):
    score, correct = [], 1
    for i,doc_id in enumerate(rank):
        score += [(correct if doc_id in ground_truth else 0)/(i+1)]
        correct += 1
    return np.array(score).mean()
# check average for the methods
print(f"[VS   MPR]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(Rank)]).mean():.2f}")
print(f"[BIM03  MPR]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BIM03)]).mean():.2f}")
print(f"[BIM05  MPR]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BIM05)]).mean():.2f}")
print(f"[BIM07  MPR]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BIM07)]).mean():.2f}")
print(f"[BM2501 MPR]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BM2501)]).mean():.2f}")
print(f"[BM2502 MPR]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BM2502)]).mean():.2f}")
print(f"[BM2503 MPR]: {np.array([PR(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list()) for i,rank in enumerate(BM2503)]).mean():.2f}")

[VS   MPR]: 0.46
[BIM03  MPR]: 0.36
[BIM05  MPR]: 0.37
[BIM07  MPR]: 0.37
[BM2501 MPR]: 0.45
[BM2502 MPR]: 0.46
[BM2503 MPR]: 0.46


#### P@K Evaluation

In [190]:
def PK(rank, ground_truth, k):
    correct = 0
    for _,doc_id in enumerate(rank[:k]):
        if doc_id in ground_truth: correct += 1
    return correct/k
# check average for the methods
print('[P@5 Results]')
print(f"[VS   P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(Rank)]).mean():.2f}")
print(f"[BIM03  P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(BIM03)]).mean():.2f}")
print(f"[BIM05  P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(BIM05)]).mean():.2f}")
print(f"[BIM07  P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(BIM07)]).mean():.2f}")
print(f"[BM2501 P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(BM2501)]).mean():.2f}")
print(f"[BM2502 P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(BM2502)]).mean():.2f}")
print(f"[BM2503 P@5]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=5) for i,rank in enumerate(BM2503)]).mean():.2f}")

# check average for the methods
print('\n[P@10 Results]')
print(f"[VS   P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(Rank)]).mean():.2f}")
print(f"[BIM03  P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(BIM03)]).mean():.2f}")
print(f"[BIM05  P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(BIM05)]).mean():.2f}")
print(f"[BIM07  P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(BIM07)]).mean():.2f}")
print(f"[BM2501 P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(BM2501)]).mean():.2f}")
print(f"[BM2502 P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(BM2502)]).mean():.2f}")
print(f"[BM2503 P@10]: {np.array([PK(docs.iloc[rank]['doc_id'].to_list(), qrels[qrels['query_id']==(i+1)]['doc_id'].to_list(), k=10) for i,rank in enumerate(BM2503)]).mean():.2f}")

[P@5 Results]
[VS   P@5]: 0.39
[BIM03  P@5]: 0.28
[BIM05  P@5]: 0.30
[BIM07  P@5]: 0.30
[BM2501 P@5]: 0.40
[BM2502 P@5]: 0.40
[BM2503 P@5]: 0.40

[P@10 Results]
[VS   P@10]: 0.46
[BIM03  P@10]: 0.36
[BIM05  P@10]: 0.37
[BIM07  P@10]: 0.37
[BM2501 P@10]: 0.45
[BM2502 P@10]: 0.46
[BM2503 P@10]: 0.46
