In [1]:
from pyserini.index import IndexReader 
from pyserini.search import SimpleSearcher, querybuilder
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader as api
import json
import torch
import numpy as np
import codecs
from bs4 import BeautifulSoup
from tqdm import tqdm
from utils import *
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

config = json.loads(open("config.json", "r").read())
index_path = config["index_path"]
topics_path = config["topics_path"]
qrels_path = config["qrels_path"]
index_path
device = torch.device('cpu')
regex = re.compile('[^a-zA-Z]')



In [30]:
# wiki_50 = api.load('glove-wiki-gigaword-50')
# wiki_300 = api.load('glove-wiki-gigaword-300')
# wiki_fast = api.load('fasttext-wiki-news-subwords-300')
# google = api.load('word2vec-google-news-300')

wiki_300 = KeyedVectors.load_word2vec_format('~/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.txt', binary=False)


In [3]:
wiki_300.most_similar(['international', 'organized', 'crime'])

[('organised', 0.6641563773155212),
 ('criminal', 0.5866537094116211),
 ('crimes', 0.5811451077461243),
 ('organizations', 0.5762122273445129),
 ('organization', 0.5691279172897339),
 ('trafficking', 0.5649696588516235),
 ('terrorism', 0.5468315482139587),
 ('activities', 0.5334997773170471),
 ('involved', 0.5092121958732605),
 ('corruption', 0.5076022744178772)]

In [4]:
topic = 'hubble space telescope'
et = expand_query(topic, wiki_300, len(topic.split()))
print(et)
hits = Searcher.search(et, 20)
for h in hits:
    print(h.docid, h.score)


hubble space telescope nasa spacecraft observatory
FT933-6678 27.772499084472656
FT934-5418 27.051700592041016
LA040190-0178 25.658899307250977
LA041090-0148 25.251100540161133
FBIS3-42547 25.09709930419922
FBIS4-46650 24.89459991455078
LA050390-0109 24.694799423217773
LA081090-0078 24.25629997253418
LA071490-0091 24.17329978942871
LA060890-0124 24.135099411010742
LA081790-0164 24.1112003326416
LA052890-0021 23.854400634765625
LA070390-0084 23.843700408935547
LA051590-0074 23.284799575805664
LA051490-0110 23.137399673461914
LA080990-0242 23.053499221801758
FT921-7107 22.52090072631836
FT933-6946 22.48889923095703
LA071090-0047 22.33839988708496
FT944-128 22.332000732421875


In [5]:
expanded_topic = expand_query(topic, wiki_300, len(topic.split()), 0)
expanded_topic

'hubble space telescope nasa spacecraft observatory'

# Results

In [6]:
hits = Searcher.weighted_search(expanded_topic, 10, len(topic.split()), (2.0, 1.0))
print(expanded_topic)
for h in hits:
    print(h.docid)

hubble space telescope nasa spacecraft observatory
FT934-5418
LA040190-0178
LA041090-0148
LA052890-0021
LA050390-0109
FT933-6678
LA081090-0078
FT921-7107
LA071090-0047
LA071490-0091


# Results

In [31]:
#k = nro. results per topic
#n = nro. extra words
def make_results(model, n:int = 0, dynamic = False, k:int = 25, threshold = .0):
    topics = get_topics(topics_path)
    results = ""
    for i in tqdm(topics):
        ranking = ""
        topic = topics[i].lower()
        topic = regex.sub('', topic)
        if dynamic:
            extra_words_count = 0;
            for word in topics[i].split():
                if word not in stopwords.words('english'):
                    extra_words_count+=1
            expanded_topic = expand_query(topics[i], model, extra_words_count, threshold)
            
            hits = Searcher.weighted_search(expanded_topic, k, len(expanded_topic.split())-extra_words_count, (1.0, 0.5))
        elif n > 0 and not dynamic: 
            expanded_topic = expand_query(topics[i], model, n, threshold)
            hits = Searcher.weighted_search(expanded_topic, k, len(expanded_topic.split())-n, (1.0, 0.5))
        else:
            hits = Searcher.search(topics[i], k=k)
            
        for r, h in enumerate(hits):
            ranking += f"{i} 0 {h.docid} {r+1} {h.score} RUN1\n"
        results += ranking
        
    filename = ""
    filename = f'results/results_{n}_{k}_{threshold*100}.txt'
            
    f = open(filename, 'w')
    f.write(results)
    f.close()

make_results(wiki_300, 0, k=1000 ) # no expanded query
make_results(wiki_300, 'x', dynamic=True, k=1000)

100%|██████████| 250/250 [00:16<00:00, 15.22it/s]
100%|██████████| 250/250 [07:09<00:00,  1.72s/it]


In [34]:
metric_0 = 0
k=1000
labels_gen = query_labels_from_file(qrels_path, f'results/results_0_{k}_0.0.txt')
r = 0
for labels in labels_gen:
    metric_0+=NDCG(labels, k)
    r+=1
    
print(f"0 {k} {metric_0/r}")

0 1000 0.671331913774708


In [35]:
metric = 0

labels_gen = query_labels_from_file(qrels_path, f'results/results_x_{k}_0.0.txt')
r = 0
for labels in labels_gen:
    metric+=NDCG(labels, k)
    r+=1
     
print(f"x {k} {metric/r}")

x 1000 0.6260634673142748


# Relevance feedback

In [10]:
expand_query_using_relevance_feedback(wiki_300, topic)

'hubble space telescope nasa mission launch'

In [36]:
#k = nro. results per topic
#n = nro. extra words




# make_results(wiki_300, 0, k=1000 ) # no expanded query
# make_results(wiki_300, 'x', dynamic=True, k=1000)
def make_relevancefeedback_results(model, k=25, top_docs=10):
    results = ""
    topics = get_topics(topics_path)
    for i in tqdm(topics):
        ranking = ""
        expanded_topic = expand_query_using_relevance_feedback(model, topics[i], top_docs=top_docs)
        # hits = Searcher.search(expanded_topic, k=k)
        extra_words_count = 0;
        for word in topics[i].split():
            if word not in stopwords.words('english'):
                extra_words_count+=1
        hits = Searcher.weighted_search(expanded_topic, k, len(expanded_topic.split())-extra_words_count, (1.0, 0.5))
        for r, h in enumerate(hits):
            ranking += f"{i} 0 {h.docid} {r+1} {h.score} RUN1\n"
        results += ranking
        
    filename = f"relevance_feedback_results/results_{top_docs}.txt"
    f = open(filename, 'w')
    f.write(results)
    f.close()
        
make_relevancefeedback_results(wiki_300, k=1000, top_docs=10) 

  2%|▏         | 4/250 [00:02<02:29,  1.64it/s]


KeyboardInterrupt: 

In [37]:
precision_0 = 0
ks = [1, 5, 10, 25, 100,200, 300]

for k in ks:
    ndcg_0 = 0
    recall_0 = 0
    labels_gen = query_labels_from_file(qrels_path, f'results/results_0_1000_0.0.txt')
    r = 0
    for labels in labels_gen:
        ndcg_0+=NDCG(labels, k)
        recall_0 += recall(labels, k)
        r+=1
    print(f"ndcg_0@{k}  {ndcg_0/r}")
    print(f"recall_0@{k} {recall_0/r}")
    print("-----------")

ndcg_0@1  0.552
recall_0@1 0.027064048526624005
-----------
ndcg_0@5  0.5130877014278695
recall_0@5 0.11534223207877335
-----------
ndcg_0@10  0.47710895495501693
recall_0@10 0.18188603903429035
-----------
ndcg_0@25  0.43750194853180036
recall_0@25 0.29842566152496647
-----------
ndcg_0@100  0.4881211795501935
recall_0@100 0.550466939579347
-----------
ndcg_0@200  0.546957195792057
recall_0@200 0.6875048288164866
-----------
ndcg_0@300  0.5831085484008359
recall_0@300 0.7723690801778619
-----------


In [38]:
precision_x = 0


for k in ks:
    ndcg_x = 0
    recall_x = 0
    labels_gen = query_labels_from_file(qrels_path, f'results/results_x_1000_0.0.txt')
    r = 0
    for labels in labels_gen:
        ndcg_x += NDCG(labels, k)
        recall_x += recall(labels, k)
        r+=1
     
    print(f"ndcg_x@{k} {ndcg_x/r}")
    print(f"recall_x@{k} {recall_x/r}")
    print("-----------")


ndcg_x@1 0.5
recall_x@1 0.02448819681811842
-----------
ndcg_x@5 0.4434274095660764
recall_x@5 0.0991372701455052
-----------
ndcg_x@10 0.4146531700022015
recall_x@10 0.16191717410800308
-----------
ndcg_x@25 0.38933405421848677
recall_x@25 0.2779996235177552
-----------
ndcg_x@100 0.4362074222107831
recall_x@100 0.5059767816469565
-----------
ndcg_x@200 0.4932198261456034
recall_x@200 0.6387332035018032
-----------
ndcg_x@300 0.5289282177717878
recall_x@300 0.7196643530145901
-----------


In [29]:
precision_relevance_feedback = 0

p = 10

for k in ks:
    ndcg_relevance_feedback = 0
    recall_relevance_feedback = 0
    labels_gen = query_labels_from_file(qrels_path, f"relevance_feedback_results/results_{p}.txt")
    r = 0
    for labels in labels_gen:
        ndcg_relevance_feedback += NDCG(labels, k)
        recall_relevance_feedback += recall(labels, k)
        r+=1
        
    print(f"ndcg_relevance_feedback@{k} {ndcg_relevance_feedback/r}")
    print(f"recall_relevance_feedback@{k} {recall_relevance_feedback/r}")
    print("-----------")



ndcg_relevance_feedback@1 0.52
recall_relevance_feedback@1 0.024944984636676354
-----------
ndcg_relevance_feedback@5 0.4663774167387816
recall_relevance_feedback@5 0.10550456951743935
-----------
ndcg_relevance_feedback@10 0.4313474558823501
recall_relevance_feedback@10 0.1624608094478695
-----------
ndcg_relevance_feedback@25 0.4005016355950077
recall_relevance_feedback@25 0.27547409630539477
-----------
ndcg_relevance_feedback@100 0.44708928845374113
recall_relevance_feedback@100 0.504578449162802
-----------
ndcg_relevance_feedback@200 0.5027170398696538
recall_relevance_feedback@200 0.6373877844122108
-----------
ndcg_relevance_feedback@300 0.5368036242116585
recall_relevance_feedback@300 0.7158612115260511
-----------
