In [7]:
from pyserini.index import IndexReader 
from pyserini.search import SimpleSearcher
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader as api
import json
import torch
import numpy as np
import codecs
from bs4 import BeautifulSoup
from tqdm import tqdm
from utils import *

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

config = json.loads(open("config.json", "r").read())
index_path = config["index_path"]
topics_path = config["topics_path"]
qrels_path = config["qrels_path"]
index_path
device = torch.device('cpu')

In [8]:
# wiki_50 = api.load('glove-wiki-gigaword-50')
# wiki_300 = api.load('glove-wiki-gigaword-300')
# wiki_fast = api.load('fasttext-wiki-news-subwords-300')
# google = api.load('word2vec-google-news-300')

wiki_300 = KeyedVectors.load_word2vec_format('~/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.txt', binary=False)


In [9]:
wiki_300.most_similar(['international', 'organized', 'crime'])

[('organised', 0.6641563773155212),
 ('criminal', 0.5866537094116211),
 ('crimes', 0.5811451077461243),
 ('organizations', 0.5762122273445129),
 ('organization', 0.5691279172897339),
 ('trafficking', 0.5649696588516235),
 ('terrorism', 0.5468315482139587),
 ('activities', 0.5334997773170471),
 ('involved', 0.5092121958732605),
 ('corruption', 0.5076022744178772)]

In [10]:
topic = 'hubble space telescope'
et = expand_query(topic, wiki_300, len(topic.split()))

hits = Searcher.search(et, 20)
for h in hits:
    print(h.docid, h.score)


FT933-6678 27.772499084472656
FT934-5418 27.051700592041016
LA040190-0178 25.658899307250977
LA041090-0148 25.251100540161133
FBIS3-42547 25.09709930419922
FBIS4-46650 24.89459991455078
LA050390-0109 24.694799423217773
LA081090-0078 24.25629997253418
LA071490-0091 24.17329978942871
LA060890-0124 24.135099411010742
LA081790-0164 24.1112003326416
LA052890-0021 23.854400634765625
LA070390-0084 23.843700408935547
LA051590-0074 23.284799575805664
LA051490-0110 23.137399673461914
LA080990-0242 23.053499221801758
FT921-7107 22.52090072631836
FT933-6946 22.48889923095703
LA071090-0047 22.33839988708496
FT944-128 22.332000732421875


In [12]:
expanded_topic = expand_query(topic, wiki_300, len(topic.split()), 0)
expanded_topic

'hubble space telescope nasa spacecraft observatory'

# Results

In [9]:
topics = get_topics(topics_path)
#k = nro. results per topic
#n = nro. extra words
def make_results(model, n:int = 0, dynamic = False, k:int = 25, threshold = .7):
    results = ""
    for i in tqdm(topics):
        ranking = ""
        if dynamic:
            extra_words_count = 0;
            for word in topics[i].split():
                if word not in stopwords.words('english'):
                    extra_words_count+=1
            expanded_topic = expand_query(topics[i], model, extra_words_count, threshold)
            hits = Searcher.search(expanded_topic, k=k)
        elif n > 0 and not dynamic: 
            expanded_topic = expand_query(topics[i], model, n, threshold)
            hits = Searcher.search(expanded_topic, k=k)
        else:
            hits = Searcher.search(topics[i], k=k)
            
        for r, h in enumerate(hits):
            ranking += f"{i} 0 {h.docid} {r+1} {h.score} RUN1\n"
        results += ranking
        
    filename = ""
    filename = f'results/results_{n}_{k}_{threshold*100}.txt'
            
    f = open(filename, 'w')
    f.write(results)
    f.close()

make_results(wiki_300, 0, k=1000 ) # no expanded query
make_results(wiki_300, 'x', dynamic=True, k=1000)

100%|██████████| 250/250 [00:13<00:00, 18.61it/s]
100%|██████████| 250/250 [00:52<00:00,  4.77it/s]


In [10]:
'Poliomyelitis and Post-Polio polio measles diphtheria'.split()

['Poliomyelitis', 'and', 'Post-Polio', 'polio', 'measles', 'diphtheria']

In [17]:
metric_0 = 0
k=1000
labels_gen = query_labels_from_file(qrels_path, f'results/results_0_{k}_0.txt')
r = 0
for labels in labels_gen:
    metric_0+=NDCG(labels, k)
    r+=1
    
print(f"0 {k} {metric_0/r}")

0 1000 0.6708575925458083


In [16]:
metric = 0

labels_gen = query_labels_from_file(qrels_path, f'results/results_x_{k}_0.txt')
r = 0
for labels in labels_gen:
    metric+=NDCG(labels, k)
    r+=1
     
print(f"x {k} {metric/r}")

x 1000 0.5824161580572926


In [11]:
ks = [10, 25, 50] + list(range(100, 1100, 100))
ns = list(range(0, 11))
taus = [0.7, 0.8, 0.9]

print(ks)

for k in ks:
    for n in ns:
        for tau in taus:
            # run_query(k, n, tau)
            make_results(wiki_300, n,False,k,tau)

[10, 25, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]


100%|██████████| 250/250 [00:00<00:00, 492.12it/s]
100%|██████████| 250/250 [00:00<00:00, 525.74it/s]
100%|██████████| 250/250 [00:00<00:00, 570.28it/s]
  5%|▍         | 12/250 [00:00<00:06, 34.39it/s]


KeyError: "Key 'levitation-maglev' not present"

# Relevance feedback

In [15]:
def merge_doc_vectors(docs):
    tfs = dict()
    for d in docs:
        tf = IndexReader.reader.get_document_vector(d)
        for k in tf:
            if k in tfs:
                tfs[k] += tf[k]
            else:
                tfs[k] = tf[k]
    return tfs

def rank_words(doc_vectors, words):
    top_words = {}
    for w in words:
        if w in doc_vectors:
            if w in top_words:
                top_words[w] += doc_vectors[w]
            else:
                top_words[w] = doc_vectors[w]
    ordered_dict = dict(sorted(top_words.items(), key=lambda item: item[1], reverse=True))
    return list(ordered_dict.keys())

    
def expand_query_using_relevance_feedback(model, topic, n = None, top_docs = 10):
    if n is None:
        n = 0
        for word in topic.split():
            if word not in stopwords.words('english'):
                n += 1
        
    hits = Searcher.search(topic, top_docs)
    doc_ids = [h.docid for h in hits]
    doc_vector = merge_doc_vectors(doc_ids)
    top_words = []
    itr = 1
    while len(top_words) <= n:
        prev_len = len(top_words)
        new_words = expand_query(topic, model, 10*n*itr).split()[n:]
        potentials = rank_words(doc_vector, new_words)
        top_words += potentials[(itr-1)*n:itr*n]
        itr+=1
        if prev_len == len(top_words) or itr >= 10:
            break
        
    return topic + ' ' + ' '.join(top_words[:n])

expand_query_using_relevance_feedback(wiki_300, topic)

'hubble space telescope nasa mission launch'

In [18]:
#k = nro. results per topic
#n = nro. extra words
def make_results(model, n:int = 0, dynamic = False, k:int = 25, threshold = .7):
    topics = get_topics(topics_path)
    results = ""
    for i in tqdm(topics):
        ranking = ""
        if dynamic:
            extra_words_count = 0;
            for word in topics[i].split():
                if word not in stopwords.words('english'):
                    extra_words_count+=1
            expanded_topic = expand_query(topics[i], model, extra_words_count, threshold)
            hits = Searcher.search(expanded_topic, k=k)
        elif n > 0 and not dynamic: 
            expanded_topic = expand_query(topics[i], model, n, threshold)
            hits = Searcher.search(expanded_topic, k=k)
        else:
            hits = Searcher.search(topics[i], k=k)
            
        for r, h in enumerate(hits):
            ranking += f"{i} 0 {h.docid} {r+1} {h.score} RUN1\n"
        results += ranking
        
    filename = ""
    filename = f'results/results_{n}_{k}_{threshold*100}.txt'
            
    f = open(filename, 'w')
    f.write(results)
    f.close()

# make_results(wiki_300, 0, k=1000 ) # no expanded query
# make_results(wiki_300, 'x', dynamic=True, k=1000)
def make_relevancefeedback_results(model, k=25, top_docs=10):
    results = ""
    topics = get_topics(topics_path)
    for i in tqdm(topics):
        ranking = ""
        expanded_topic = expand_query_using_relevance_feedback(model, topics[i], top_docs=top_docs)
        hits = Searcher.search(expanded_topic, k=k)
        
        for r, h in enumerate(hits):
            ranking += f"{i} 0 {h.docid} {r+1} {h.score} RUN1\n"
        results += ranking
        
    filename = f"relevance_feedback_results/results_{top_docs}.txt"
    f = open(filename, 'w')
    f.write(results)
    f.close()
        
make_relevancefeedback_results(wiki_300, k=1000, top_docs=25) 

100%|██████████| 250/250 [04:14<00:00,  1.02s/it]


In [19]:
metric_0 = 0
k=1000
labels_gen = query_labels_from_file(qrels_path, f'results/results_0_{k}_0.txt')
r = 0
for labels in labels_gen:
    metric_0+=NDCG(labels, k)
    r+=1
    
print(f"0 {k} {metric_0/r}")

0 1000 0.6708575925458083


In [21]:
metric = 0

labels_gen = query_labels_from_file(qrels_path, f'results/results_x_{k}_0.txt')
r = 0
for labels in labels_gen:
    metric+=NDCG(labels, k)
    r+=1
     
print(f"x {k} {metric/r}")

x 1000 0.5824161580572926


In [20]:
metric_relevance_feedback = 0
p = 25
labels_gen = query_labels_from_file(qrels_path, f"relevance_feedback_results/results_{p}.txt")
r = 0
for labels in labels_gen:
    metric_relevance_feedback+= NDCG(labels, k)
    r+=1
    
print(f"relevance_feedback {k} {metric_relevance_feedback/r}")

relevance_feedback 1000 0.5983220912453281
