In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
from gensim.models.wrappers import FastText
from gensim.models import KeyedVectors

from tqdm.notebook import tqdm
tqdm.pandas()

from evaluation import *
from preprocessing import BasicPreprocessing, StopWordPreprocessor, Corpus
from word_embedding_retrieval import *

  from pandas import Panel


In [2]:
# Build the paper corpus from the CSV export, passing the preprocessing
# steps listed in `p` to Corpus.
# NOTE(review): load_from_cache=True presumably reuses a previously
# preprocessed corpus instead of recomputing it — confirm in preprocessing.Corpus.
base_file = "../data/kit_expert_2017_papers.csv"
p = [
    BasicPreprocessing(),
    StopWordPreprocessor(),
]
papers_basic_nostopwords = Corpus(
    base_file,
    p,
    load_from_cache=True,
    n_jobs=1,
)

In [3]:
import json

# Cap on the number of specific keywords evaluated: only the first
# MAX_SPECIFIC_KEYWORDS matching entries (in file order) are kept.
MAX_SPECIFIC_KEYWORDS = 5000

# JSON text is UTF-8; state the encoding explicitly so the read does not
# depend on the platform's locale default.
with open("../data/kit_expert_2017_keywords.json", "r", encoding="utf-8") as keyword_file:
    keywords = json.load(keyword_file)

# Each keyword set is a (label, entries) pair, split on the entry's "level" field:
# level <= 1 goes to the general set, level >= 2 to the specific set.
# NOTE(review): the two labels are inconsistent (space vs. underscore) but are
# kept unchanged because they appear to be used as result column names —
# confirm in evaluation.train_evaluate_models before normalising them.
general_keywords = (
    "general keywords",
    [k for k in keywords if k["level"] <= 1],
)
specific_keywords = (
    "specific_keywords",
    [k for k in keywords if k["level"] >= 2][:MAX_SPECIFIC_KEYWORDS],
)

In [5]:
# Three (label, partial) entries comparing the from_new_embedding,
# from_pretrained_embedding and from_finetuned_embedding constructors.
# Every partial binds the same corpus and AverageSentenceEmbedding; only the
# constructor and its extra keyword arguments differ per entry.
# NOTE(review): pretrained_model_path is an empty string for both pretrained
# variants — presumably a placeholder; confirm the real model path before running.
pretrained_models = [
    (
        label,
        partial(
            factory,
            corpus=papers_basic_nostopwords,
            sentence_embedder=AverageSentenceEmbedding,
            **extra_kwargs,
        ),
    )
    for label, factory, extra_kwargs in (
        (
            "Fasttext from scratch w=5",
            WordEmbeddingRetrieval.from_new_embedding,
            {"window_size": 5},
        ),
        (
            "Fasttext pretrained wiki",
            WordEmbeddingRetrieval.from_pretrained_embedding,
            {"pretrained_model_path": ""},
        ),
        (
            "Fasttext finetuned wiki",
            WordEmbeddingRetrieval.from_finetuned_embedding,
            {"pretrained_model_path": ""},
        ),
    )
]

In [None]:
# Run train_evaluate_models over the pretrained-comparison model list with
# both keyword sets.
# NOTE(review): n_jobs=3 presumably means one parallel worker per model —
# confirm the n_jobs semantics in evaluation.train_evaluate_models.
pretrained_results = train_evaluate_models(
    pretrained_models,
    [general_keywords, specific_keywords],
    n_jobs=3,
)

In [None]:
# Print a caption, then leave the results object as the cell's last
# expression so the notebook renders it.
print("mAP scores for pretrained models:")
pretrained_results

In [None]:
# Window-size sweep: one (label, partial) entry per window size, each binding
# from_new_embedding to the same corpus and AverageSentenceEmbedding.
# The value is bound when the partial is created, so each entry keeps its own
# window size.
search_window_size_models = [
    (
        f"Fasttext from scratch w={w}",
        partial(
            WordEmbeddingRetrieval.from_new_embedding,
            corpus=papers_basic_nostopwords,
            sentence_embedder=AverageSentenceEmbedding,
            window_size=w,
        ),
    )
    for w in (3, 5, 7, 9, 19, 29, 39, 49, 59, 69)
]

In [None]:
# Run train_evaluate_models over the window-size sweep with both keyword sets,
# using n_jobs=5.
search_window_size_results = train_evaluate_models(
    search_window_size_models,
    [general_keywords, specific_keywords],
    n_jobs=5,
)

In [None]:
# Print a caption, then leave the results object as the cell's last
# expression so the notebook renders it.
print("mAP scores for window size search models:")
search_window_size_results

In [None]:
# Embedding-size sweep: one (label, partial) entry per vector size, with the
# window size fixed at 5 and the same corpus and AverageSentenceEmbedding
# bound into every partial.
search_vec_size_models = [
    (
        f"Fasttext from scratch s={dim}",
        partial(
            WordEmbeddingRetrieval.from_new_embedding,
            corpus=papers_basic_nostopwords,
            sentence_embedder=AverageSentenceEmbedding,
            window_size=5,
            embedding_size=dim,
        ),
    )
    for dim in (100, 200, 300, 400, 500, 600, 700, 800)
]

In [None]:
# Run train_evaluate_models over the embedding-size sweep with both keyword
# sets, using n_jobs=4.
search_vec_size_results = train_evaluate_models(
    search_vec_size_models,
    [general_keywords, specific_keywords],
    n_jobs=4,
)

In [None]:
# Print a caption, then leave the results object as the cell's last
# expression so the notebook renders it.
print("mAP scores for embedding vector size search models:")
search_vec_size_results

In [6]:
# Sentence-embedder comparison: the same from_new_embedding configuration
# (window_size=5, embedding_size=300) paired with three different sentence
# embedder classes. Each entry is a (label, partial) pair.
# NOTE(review): the "idf" label is paired with TfidfSentenceEmbedding —
# confirm the label matches what that class actually computes.
sentence_embedding_models = [
    (
        label,
        partial(
            WordEmbeddingRetrieval.from_new_embedding,
            corpus=papers_basic_nostopwords,
            sentence_embedder=embedder,
            window_size=5,
            embedding_size=300,
        ),
    )
    for label, embedder in (
        ("Fasttext from scratch average embedding", AverageSentenceEmbedding),
        ("Fasttext from scratch idf embedding", TfidfSentenceEmbedding),
        ("Fasttext from scratch sif embedding", SifSentenceEmbedding),
    )
]

In [None]:
# Run train_evaluate_models over the sentence-embedder comparison with both
# keyword sets, using n_jobs=4.
sentence_embedding_results = train_evaluate_models(
    sentence_embedding_models,
    [general_keywords, specific_keywords],
    n_jobs=4,
)

In [None]:
# Print a caption, then leave the results object as the cell's last
# expression so the notebook renders it.
print("mAP scores for sentence embedding models:")
sentence_embedding_results

In [19]:
# NOTE(review): `results3` is not assigned in any cell visible in this part of
# the notebook, so this display only works if the name is still live in the
# kernel. The saved output below lists w=51/71/91, which are not among the
# window sizes swept in the window-size search cell, so it appears to be stale.
# Confirm where `results3` comes from (or replace it with the matching
# *_results variable) so the notebook survives Restart & Run All.
results3

Unnamed: 0,general keywords,specific_keywords
Fasttext from scratch w=51,0.122398,0.2753
Fasttext from scratch w=71,0.120828,0.27588
Fasttext from scratch w=91,0.121188,0.276131
