# Basic functions and usage of the Doc2Vec wrapper

In [2]:
import json
import sys
import random

sys.path.append("../")

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from models.doc2vec_model import Doc2VecModel, CompareBuildinAndCustomMostSimilar
from tqdm import tqdm

import pandas as pd
import numpy as np

import IR_utils

# Fix the seeds of both Python's `random` module and NumPy's global RNG so that
# random choices made later in the notebook (e.g. `random.sample`) are repeatable.
random.seed(0)
np.random.seed(0)

In [None]:
# Load IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell is executed, so edits to the local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Load the corpus data

In [None]:
# Read the corpus from a JSON Lines file into `docs`, a dict that maps each
# record's "_id" field to its "text" field.
data_path = "../../data/dataset/corpus.jsonl"
max_docs = -1  # a value <= 0 disables the cap, so the whole file is loaded
docs = {}
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying
# on the platform's locale default, which differs between operating systems.
with open(data_path, "r", encoding="utf-8") as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the
        # file), which would otherwise make json.loads raise.
        if not line.strip():
            continue

        data = json.loads(line)
        docs[data["_id"]] = data["text"]

        # Stop early once the optional document cap has been reached.
        if max_docs > 0 and len(docs) == max_docs:
            break

print(f"Number of documents in corpus: {len(docs)}")

## Preprocess the documents

In [None]:
# TODO: implement document preprocessing.

## Train or Load the Model

In [None]:
# Either load a previously saved doc2vec model or train a new one and save it.
# The hyperparameters below are passed to training and are also embedded in
# the model's file name, so they identify which saved model gets loaded.

vector_size = 30
window = 10
min_count = 50
workers = 16
epochs = 100

# Set to True to train from `docs`; leave False to load an existing model file.
train_model = False

if not train_model:
    # Loading: the file name is derived from the size of the loaded corpus.
    num_docs = len(docs)

    path = f"../../models/doc2vec.docs{num_docs}.vs{vector_size}.win{window}.min{min_count}.ep{epochs}.model"

    d2v = Doc2VecModel.from_pretrained(path)
else:
    # Training: build the wrapper around the corpus, fit it, then persist it.
    d2v = Doc2VecModel.create_model(
        documents=docs,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        epochs=epochs,
    )

    d2v.fit(progress_bar=False)

    # The saved file name uses the number of document vectors held by the
    # trained model rather than len(docs).
    path = f"../../models/doc2vec.docs{len(d2v.model.dv)}.vs{vector_size}.win{window}.min{min_count}.ep{epochs}.model"
    d2v.save(path)

## Load the queries

In [None]:
# Load the query data from a JSON Lines file into `raw_queries`, a dict that
# maps each record's "_id" (converted to int) to its "text" field.
query_data_path = "../../data/dataset/queries.jsonl"
raw_queries = {}
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying
# on the platform's locale default, which differs between operating systems.
with open(query_data_path, "r", encoding="utf-8") as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the
        # file), which would otherwise make json.loads raise.
        if not line.strip():
            continue

        data = json.loads(line)
        raw_queries[int(data["_id"])] = data["text"]

print(f"Number of queries: {len(raw_queries)}")

In [None]:
# Read the training relevance judgements (tab-separated) and group the rows by
# query, so each group holds every judged document for one query.
query_ids_df = pd.read_csv("../../data/task1_train.tsv", delimiter="\t")
grouped_queries = query_ids_df.groupby("query-id")

# For every judged query, collect its text together with the ids and scores of
# its judged documents. Queries without a row in the TSV are not included.
queries = {
    query_id: {
        "text": raw_queries[query_id],
        "relevant_doc_ids": group["corpus-id"].tolist(),
        "relevant_doc_scores": group["score"].tolist(),
    }
    for query_id, group in grouped_queries
}

print("Number of queries: {}".format(len(queries)))

In [None]:
# Retrieve the TOP_K most similar documents for every query and store the
# results plus precision/recall at TOP_K back into the `queries` dict.
TOP_K = 10

query_ids = list(queries)
# Random subset of query ids, handy for quick experiments; the loop below runs
# over the full `query_ids`. The sample size is capped so that fewer than 10
# queries does not raise ValueError.
query_ids_sample = random.sample(query_ids, min(10, len(query_ids)))

# Derive the metric keys and the cut-off from TOP_K so that the stored labels
# always match the number of documents actually retrieved.
precision_key = f"precision@{TOP_K}"
recall_key = f"recall@{TOP_K}"

for query_id in tqdm(query_ids):
    query = queries[query_id]

    # The result is consumed below as a sequence of (doc_id, score) pairs.
    d2v_query_answers = d2v.find_similar(query["text"], TOP_K)

    retrieved_doc_ids = [doc_id for doc_id, _ in d2v_query_answers]
    retrieved_doc_scores = [score for _, score in d2v_query_answers]

    query["retrieved_doc_ids"] = retrieved_doc_ids
    query["retrieved_doc_scores"] = retrieved_doc_scores

    query[precision_key] = IR_utils.precision_K(
        retrieved_docs=retrieved_doc_ids,
        relevant_docs=query["relevant_doc_ids"],
        K=TOP_K,
    )

    query[recall_key] = IR_utils.recall_K(
        retrieved_docs=retrieved_doc_ids,
        relevant_docs=query["relevant_doc_ids"],
        K=TOP_K,
    )

    # Debugging aid, disabled by default: compare against a second call to
    # find_similar with an extra `True` argument.
    # NOTE(review): presumably that flag selects the library's built-in
    # similarity search - confirm against Doc2VecModel.find_similar.
    #d2v_query_answers_buildin = d2v.find_similar(queries[query_id]["text"], TOP_K, True)
    #d2v_query_answers_buildin_ids = [x[0] for x in d2v_query_answers_buildin]
    #print(d2v_query_answers_buildin_ids)
    #print(retrieved_doc_ids)
    #print()

    #if queries[query_id]["precision@10"] > 0.0:
    #    print(queries[query_id]["precision@10"], queries[query_id]["recall@10"])