# Basic functions and usage of the Doc2Vec wrapper

In [4]:
import json
import sys
import random

sys.path.append("../")

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from models.doc2vec_model import Doc2VecModel, CompareBuildinAndCustomMostSimilar
from tqdm import tqdm
from preprocessors.preprocessor import Preprocessor
from preprocessors.expander import Expander

import pandas as pd
import numpy as np

import IR_utils

# Seed both the stdlib and the NumPy global generators with the same value
# so that runs of this notebook are repeatable.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manoschatzakis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manoschatzakis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/manoschatzakis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load the corpus data

In [5]:
# Build the preprocessor (configured with an Expander) and read the tokenized corpus.
preprocessor = Preprocessor(expander=Expander())
docs = preprocessor.load_docs("../../data/dataset/tokenized_corpus.jsonl")

# Both loaders read the same query file; only the first element of each
# loader's result is kept.
queries_path = "../../data/dataset/queries.jsonl"
test_queries_t1 = IR_utils.load_test_queries_t1(queries_path, "../../data/task1_test.tsv")[0]
test_queries_t2 = IR_utils.load_test_queries_t2(queries_path, "../../data/task2_test.tsv")[0]

# Report the dataset sizes as a quick sanity check.
print(f"Number of docs: {len(docs)}")
print(f"Number of queries (t1): {len(test_queries_t1)}")
print(f"Number of queries (t2): {len(test_queries_t2)}")

Number of docs: 1471406
Number of queries (t1): 7437
Number of queries (t2): 33


## Train or Load the Model

In [6]:
# Settings that identify the saved doc2vec model. This cell only loads an
# existing model: the values below (together with the corpus size) are used to
# reconstruct its file name. `workers` is not referenced in this cell.
vector_size = 30
window = 10
min_count = 60
workers = 16
epochs = 150

num_docs = len(docs)

# File name encodes corpus size and the settings above.
model_name = (
    f"doc2vec.docs{num_docs}"
    f".vs{vector_size}.win{window}.min{min_count}.ep{epochs}.model"
)
path = "../../models/d2v/" + model_name

d2v = Doc2VecModel.from_pretrained(path)

In [8]:
# Attach preprocessed, expanded terms to every test query of both tasks:
# each query record gets a "query_terms" entry computed from its "text".
# One progress bar is shown per task.
for queries in (test_queries_t1, test_queries_t2):
    for query_data in tqdm(queries, desc="Query Preprocessing and Expansion", unit=" queries"):
        query_data["query_terms"] = preprocessor.preprocess_query(query_data["text"], expand=True)
    

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/manoschatzakis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Query Preprocessing and Expansion: 100%|██████████| 7437/7437 [00:00<00:00, 9186.10 queries/s]
Query Preprocessing and Expansion: 100%|██████████| 33/33 [00:00<00:00, 10086.14 queries/s]


In [None]:
# Assemble the submission file: a header row, then one row per task-1 query
# and one row per task-2 query. Rows are collected in a list and joined once.
csv_rows = ["id,corpus-id,score\n"]

# Task 1: ask the model for 10 similar documents per query and keep the first
# element of each result (used as the document id); the score column is -1.
for query_data in tqdm(test_queries_t1, desc="Querying", unit=" queries"):
    query_id = query_data["id"]
    top_matches = d2v.find_similar(query_data["query_terms"], 10)
    doc_ids = [match[0] for match in top_matches]
    csv_rows.append(f"{query_id},\"{doc_ids}\",-1\n")

# Task 2: score the documents listed under "relevant_doc_ids" for each query;
# the corpus-id column is -1.
for query_data in tqdm(test_queries_t2, desc="Querying", unit=" queries"):
    query_id = query_data["id"]
    document_scores = list(
        d2v.get_document_scores(query_data["relevant_doc_ids"], query_data["query_terms"])
    )
    csv_rows.append(f"{query_id},-1,\"{document_scores}\"\n")

csv_string = "".join(csv_rows)

with open("../../submissions/d2v_submission.csv", "w") as f:
    f.write(csv_string)