In [2]:
import json
import sys
import random
import gc

sys.path.append("../")

from models.vector_model import vector_model
from tqdm import tqdm
from preprocessors.preprocessor import Preprocessor
from preprocessors.expander import Expander

import pandas as pd
import numpy as np

import IR_utils

random.seed(0)
np.random.seed(0)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/manoschatzakis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Preprocessor wired with an Expander instance; reused by the cells below.
preprocessor = Preprocessor(expander=Expander())

# Corpus used for retrieval.
docs = preprocessor.load_docs("../../data/dataset/tokenized_corpus.jsonl")

# Both task loaders read the same queries file, paired with a task-specific TSV.
queries_path = "../../data/dataset/queries.jsonl"

# Only the first element of each loader's return value is kept.
test_queries_t1 = IR_utils.load_test_queries_t1(
    queries_path,
    "../../data/task1_test.tsv",
)[0]
test_queries_t2 = IR_utils.load_test_queries_t2(
    queries_path,
    "../../data/task2_test.tsv",
)[0]

# Quick sanity check on the sizes of everything that was loaded.
for label, collection in (
    ("Number of docs:", docs),
    ("Number of queries (t1):", test_queries_t1),
    ("Number of queries (t2):", test_queries_t2),
):
    print(label, len(collection))

Number of docs: 1471406
Number of queries (t1): 7437
Number of queries (t2): 33


In [4]:
# Preprocess (with expansion enabled) the text of every query in both task
# sets and store the result on the query record under "query_terms".
# Each set gets its own progress bar.
for task_queries in (test_queries_t1, test_queries_t2):
    for query_data in tqdm(
        task_queries,
        desc="Query Preprocessing and Expansion",
        unit=" queries",
    ):
        query_data["query_terms"] = preprocessor.preprocess_query(
            query_data["text"], expand=True
        )

Query Preprocessing and Expansion: 100%|██████████| 7437/7437 [00:03<00:00, 1908.75 queries/s]
Query Preprocessing and Expansion: 100%|██████████| 33/33 [00:00<00:00, 8682.23 queries/s]


In [8]:
# Restore the vector model from its saved location on disk.
vm = vector_model.from_pretrained(
    "../../models/vm/mdf30000"
)

In [9]:
# Number of results requested from the model for each task-1 query.
TOP_K = 10

# Rows are collected in a list and joined once at the end; this avoids
# rebuilding an ever-growing string on every iteration of the loops below.
csv_rows = ["id,corpus-id,score\n"]

# Task 1: for each query, ask the model for the TOP_K most similar documents
# and record their ids. The score column is filled with the placeholder -1.
for query_data in tqdm(test_queries_t1, desc="Querying", unit=" queries"):
    query_index = query_data["id"]
    results = vm.find_similar(query_data["query_terms"], TOP_K)

    # The first element of each result is what gets written as the document id.
    doc_ids = [result[0] for result in results]
    csv_rows.append(f'{query_index},"{doc_ids}",-1\n')

# Task 2: score the documents listed under "relevant_doc_ids" for each query.
# The corpus-id column is filled with the placeholder -1.
for query_data in tqdm(test_queries_t2, desc="Querying", unit=" queries"):
    query_index = query_data["id"]

    document_scores = list(
        vm.get_document_scores(
            query_data["relevant_doc_ids"], query_data["query_terms"]
        )
    )

    csv_rows.append(f'{query_index},-1,"{document_scores}"\n')

# Kept under its original name in case later cells inspect the CSV text.
csv_string = "".join(csv_rows)

# The file is written only after every query succeeded, so a failure midway
# never leaves a partial submission behind. The encoding is pinned so the
# output does not depend on the platform's default locale.
with open("../../submissions/d2v_submission.csv", "w", encoding="utf-8") as f:
    f.write(csv_string)

Querying:   0%|          | 0/7437 [00:00<?, ? queries/s]

Querying:  28%|██▊       | 2106/7437 [01:34<04:35, 19.37 queries/s]