# Basic functions and usage of Doc2Vec wrapper

In [78]:
import json
import sys

sys.path.append("../")

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from models.doc2vec_model import Doc2VecModel
from tqdm import tqdm

import pandas as pd
import numpy as np

import IR_utils


In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the corpus data

In [21]:
# Read the corpus into memory as a mapping {document "_id": document "text"}.
data_path = "../../data/dataset/corpus.jsonl"
max_docs = -1  # values <= 0 load everything; a positive value stops after that many documents
docs = {}
# The encoding is given explicitly: JSON Lines files are UTF-8, whereas open()
# without an encoding falls back to the platform's locale encoding.
with open(data_path, "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines instead of failing inside json.loads.
            continue

        data = json.loads(line)
        docs[data["_id"]] = data["text"]

        if max_docs > 0 and len(docs) == max_docs:
            break

print("Number of documents in corpus: {}".format(len(docs)))

Number of documents in corpus: 1471406


## Preprocess the documents

In [22]:
# TODO: implement document preprocessing (nothing is done in this cell yet).

## Train or Load the Model

### Training (disabled)

In [23]:
# Training is expensive, so it is switched off by default.
# Set TRAIN_MODEL to True to fit a new model on `docs` and save it to disk.
TRAIN_MODEL = False

if TRAIN_MODEL:
    # NOTE(review): vector_size=10 / window=5 differ from the values the loading
    # cell uses to build its path (vector_size=20 / window=10), so a model trained
    # here is saved under a different filename than the one loaded there.
    # Confirm which configuration is intended.
    vector_size = 10
    window = 5
    min_count = 60
    workers = 16
    epochs = 20
    d2v = Doc2VecModel(
        docs,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        epochs=epochs,
    )

    d2v.fit()

    # The filename records len(d2v.model.dv) and the hyperparameters above
    # (workers is not part of the name).
    d2v.save(
        f"../../models/doc2vec.docs{len(d2v.model.dv)}.vs{vector_size}.win{window}.min{min_count}.ep{epochs}.model"
    )

### Load the model from disk

In [24]:
# The model filename is built from the corpus size and the hyperparameters below;
# `workers` is assigned here but is not part of the filename.
num_docs = len(docs)
vector_size, window = 20, 10
min_count, workers = 60, 16
epochs = 20

path = f"../../models/doc2vec.docs{num_docs}.vs{vector_size}.win{window}.min{min_count}.ep{epochs}.model"

# Construct the wrapper from the model path.
d2v = Doc2VecModel(path)

## Load the queries

In [33]:
# Load the query data as a mapping {integer query id: query text}.
query_data_path = "../../data/dataset/queries.jsonl"
raw_queries = {}
# The encoding is given explicitly: JSON Lines files are UTF-8, whereas open()
# without an encoding falls back to the platform's locale encoding.
with open(query_data_path, "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines instead of failing inside json.loads.
            continue

        data = json.loads(line)
        # Ids are converted to int so they can be looked up with the query ids
        # read from the TSV file.
        raw_queries[int(data["_id"])] = data["text"]

print("Number of queries: {}".format(len(raw_queries)))

Number of queries: 509962


In [72]:
# Relevance judgements; the columns used here are "query-id", "corpus-id" and "score".
query_ids_df = pd.read_csv("../../data/task1_train.tsv", sep="\t")
grouped_queries = query_ids_df.groupby("query-id")

# For every judged query, store its text together with the ids and scores of
# the documents listed for it.
queries = {}
for query_id, group in grouped_queries:
    queries[query_id] = {
        "text": raw_queries[query_id],
        "relevant_doc_ids": group["corpus-id"].tolist(),
        "relevant_doc_scores": group["score"].tolist(),
    }

print("Number of queries: {}".format(len(queries)))

Number of queries: 502939


In [87]:
# Single cut-off shared by retrieval and by both metrics.
TOP_K = 10

# Metric keys are derived from TOP_K so they stay accurate if the cut-off changes
# (with TOP_K = 10 they are "precision@10" and "recall@10").
precision_key = f"precision@{TOP_K}"
recall_key = f"recall@{TOP_K}"

for query_id, query in tqdm(queries.items()):
    # find_similar yields (doc_id, score) pairs; materialise them once so the
    # result can be split into two parallel lists.
    d2v_query_answers = list(d2v.find_similar(query["text"], TOP_K))

    retrieved_doc_ids = [doc_id for doc_id, _ in d2v_query_answers]
    retrieved_doc_scores = [score for _, score in d2v_query_answers]

    # Only existing entries are updated, so the dict can be modified while iterating.
    query["retrieved_doc_ids"] = retrieved_doc_ids
    query["retrieved_doc_scores"] = retrieved_doc_scores

    query[precision_key] = IR_utils.precision_K(
        retrieved_docs=retrieved_doc_ids,
        relevant_docs=query["relevant_doc_ids"],
        K=TOP_K,
    )

    query[recall_key] = IR_utils.recall_K(
        retrieved_docs=retrieved_doc_ids,
        relevant_docs=query["relevant_doc_ids"],
        K=TOP_K,
    )

    # Report only the queries for which at least one relevant document was retrieved.
    if query[precision_key] > 0.0:
        print(query[precision_key], query[recall_key])

  0%|          | 1804/502939 [00:41<3:28:17, 40.10it/s]

0.1 1.0


  1%|▏         | 7317/502939 [02:32<2:42:04, 50.97it/s]

0.1 1.0


  2%|▏         | 9637/502939 [03:17<2:48:53, 48.68it/s]


KeyboardInterrupt: 