In [1]:
%load_ext autoreload
%autoreload 2
import json
import numpy as np
from gensim.models.wrappers import FastText
from gensim.models import KeyedVectors

from tqdm.notebook import tqdm
tqdm.pandas()

import sys
import logging
logging.basicConfig(level=logging.INFO, filename='embedding_test_all.log')

from evaluation import *
from preprocessing import BasicPreprocessing, StopWordPreprocessor, Corpus
from word_embedding_retrieval import *

  from pandas import Panel


In [2]:
# Input CSVs: the KIT expert papers (later passed as `doc_corpus`) and the
# general-papers collection (later passed as `corpus`).
base_file = "../data/kit_expert_2017_papers.csv"
all_papers_file = "../data/general_papers.csv"

# Preprocessing steps shared by both corpora.
p = [BasicPreprocessing(), StopWordPreprocessor()]

# NOTE(review): the two corpora are built with different n_jobs values (1 vs 4);
# presumably a worker count - confirm against Corpus.
papers_basic_nostopwords = Corpus(
    base_file,
    p,
    load_from_cache=True,
    n_jobs=1,
)
all_papers_basic_nostopwords = Corpus(
    all_papers_file,
    p,
    load_from_cache=True,
    n_jobs=4,
)

In [3]:
# Load the keyword list. JSON is UTF-8 by specification, so decode it
# explicitly instead of depending on the platform's default locale encoding.
with open("../data/kit_expert_2017_keywords.json", "r", encoding="utf-8") as file:
    keywords = json.load(file)

# Each keyword set is a (label, keywords) pair; the labels show up as the
# column headers of the result tables in this notebook.
# General: keywords with level <= 1.
general_keywords = ("general keywords", [k for k in keywords if k["level"]<=1 ])
# Specific: keywords with level >= 2, truncated to the first 5000 entries.
specific_keywords = ("specific_keywords", [k for k in keywords if k["level"]>=2 ][:5000])

In [4]:
# Candidate models as (label, factory) pairs. Each factory is a `partial` that
# binds the constructor arguments so the model can be built later.
pretrained_models = [
    (
        "Fasttext from scratch w=5",
        partial(
            WordEmbeddingRetrieval.from_new_embedding,
            corpus=all_papers_basic_nostopwords,
            doc_corpus=papers_basic_nostopwords,
            sentence_embedder=AverageSentenceEmbedding,
            window_size=5,
        ),
    ),
    # Disabled variant: pretrained wiki vectors without fine-tuning.
#     ("Fasttext pretrained wiki", partial(
#         WordEmbeddingRetrieval.from_pretrained_embedding,
#         corpus = papers_basic_nostopwords,
#         sentence_embedder = AverageSentenceEmbedding,
#         pretrained_model_path = "../data/models/fasttext/wiki.en.bin")
#     ),
    (
        "Fasttext finetuned wiki",
        partial(
            WordEmbeddingRetrieval.from_finetuned_embedding,
            corpus=all_papers_basic_nostopwords,
            doc_corpus=papers_basic_nostopwords,
            sentence_embedder=AverageSentenceEmbedding,
            pretrained_model_path="../data/models/fasttext/wiki.en.bin",
        ),
    ),
]

In [None]:
# Run the project's train/evaluate helper over the candidate models for both
# keyword sets.
pretrained_results = train_evaluate_models(
    pretrained_models,
    [general_keywords, specific_keywords],
    n_jobs=3,
)

from_new_embedding
from_finetuned_embedding


In [10]:
# Print a caption, then let the bare expression render the results table.
print("mAP scores for pretrained models:")
pretrained_results

mAP scores for pretrained models:


Unnamed: 0,general keywords,specific_keywords
Fasttext from scratch w=5,0.119104,0.191036
Fasttext finetuned wiki,0.120579,0.181598


In [4]:
# Window-size sweep: one from-scratch model per window size, everything else
# held fixed.
# NOTE(review): the stored table captioned "mAP scores for window size search
# models" lists w=3..69, but only 9, 19 and 29 are built here - confirm which
# sweep the downstream window_size choices are based on.
search_window_size_models = [
    (
        f"Fasttext from scratch w={window_size}",
        partial(
            WordEmbeddingRetrieval.from_new_embedding,
            corpus=all_papers_basic_nostopwords,
            doc_corpus=papers_basic_nostopwords,
            sentence_embedder=AverageSentenceEmbedding,
            window_size=window_size,
        ),
    )
    for window_size in [9, 19, 29]
]

In [None]:
# Evaluate every window-size variant on both keyword sets.
search_window_size_results = train_evaluate_models(
    search_window_size_models,
    [general_keywords, specific_keywords],
    n_jobs=3,
)

from_new_embedding
from_new_embedding
from_new_embedding


In [7]:
# Render the window-size sweep results as this cell's output.
search_window_size_results

Unnamed: 0,general keywords,specific_keywords
Fasttext from scratch w=9,0.118987,0.157548
Fasttext from scratch w=19,0.125979,0.157121
Fasttext from scratch w=29,0.128977,0.154507


In [16]:
# Print a caption, then let the bare expression render the results table.
# NOTE(review): the stored output of this cell lists w=3..69, while the sweep
# cell in this notebook only builds w in [9, 19, 29] - the output looks stale;
# re-run to refresh.
print("mAP scores for window size search models:")
search_window_size_results

mAP scores for window size search models:


Unnamed: 0,general keywords,specific_keywords
Fasttext from scratch w=3,0.10072,0.194378
Fasttext from scratch w=5,0.109446,0.214477
Fasttext from scratch w=7,0.111308,0.229064
Fasttext from scratch w=9,0.115127,0.239992
Fasttext from scratch w=19,0.118319,0.265147
Fasttext from scratch w=29,0.122126,0.273782
Fasttext from scratch w=39,0.120768,0.28163
Fasttext from scratch w=49,0.120709,0.280478
Fasttext from scratch w=59,0.119116,0.278808
Fasttext from scratch w=69,0.119178,0.279651


In [None]:
# Embedding-size sweep: one from-scratch model per vector size, with the
# window fixed at 5.
search_vec_size_models = [
    (
        f"Fasttext from scratch s={embedding_size}",
        partial(
            WordEmbeddingRetrieval.from_new_embedding,
            corpus=all_papers_basic_nostopwords,
            doc_corpus=papers_basic_nostopwords,
            sentence_embedder=AverageSentenceEmbedding,
            window_size=5,
            embedding_size=embedding_size,
        ),
    )
    for embedding_size in [100, 200, 300, 400, 500, 600, 700, 800]
]

In [None]:
# Evaluate every embedding-size variant on both keyword sets.
search_vec_size_results = train_evaluate_models(
    search_vec_size_models,
    [general_keywords, specific_keywords],
    n_jobs=4,
)

In [17]:
# Print a caption, then let the bare expression render the results table.
print("mAP scores for embedding vector size search models:")
search_vec_size_results

mAP scores for embedding vector size search models:


Unnamed: 0,general keywords,specific_keywords
Fasttext from scratch s=100,0.103092,0.15737
Fasttext from scratch s=200,0.107897,0.198498
Fasttext from scratch s=300,0.10847,0.216062
Fasttext from scratch s=400,0.110393,0.224441
Fasttext from scratch s=500,0.108084,0.229331
Fasttext from scratch s=600,0.107626,0.233991
Fasttext from scratch s=700,0.10757,0.234928
Fasttext from scratch s=800,0.106576,0.235087


In [None]:
# Sentence-embedding comparison: identical word-embedding settings (window 5,
# size 300); only the sentence embedder class varies between entries.
sentence_embedding_models = [
    (
        f"Fasttext from scratch {tag} embedding",
        partial(
            WordEmbeddingRetrieval.from_new_embedding,
            corpus=all_papers_basic_nostopwords,
            doc_corpus=papers_basic_nostopwords,
            sentence_embedder=embedder,
            window_size=5,
            embedding_size=300,
        ),
    )
    for tag, embedder in [
        ("average", AverageSentenceEmbedding),
        ("idf", TfidfSentenceEmbedding),
        ("sif", SifSentenceEmbedding),
    ]
]

In [None]:
# Evaluate every sentence-embedding variant on both keyword sets.
sentence_embedding_results = train_evaluate_models(
    sentence_embedding_models,
    [general_keywords, specific_keywords],
    n_jobs=4,
)

In [18]:
# Print a caption, then let the bare expression render the results table.
print("mAP scores for sentence embedding models:")
sentence_embedding_results

mAP scores for sentence embedding models:


Unnamed: 0,general keywords,specific_keywords
Fasttext from scratch average embedding,0.10889,0.214719
Fasttext from scratch idf embedding,0.115,0.258512
Fasttext from scratch sif embedding,0.110788,0.240554


In [19]:
# Two combined configurations, both using the tf-idf sentence embedder:
#   - "general":  window 29, embedding size 400
#   - "specific": window 39, embedding size 800
# NOTE(review): these values appear to be picked from the stored sweep tables
# in this notebook - confirm after re-running the sweeps.
best_model = [
    (
        label,
        partial(
            WordEmbeddingRetrieval.from_new_embedding,
            corpus=all_papers_basic_nostopwords,
            doc_corpus=papers_basic_nostopwords,
            sentence_embedder=TfidfSentenceEmbedding,
            window_size=window,
            embedding_size=size,
        ),
    )
    for label, window, size in [
        ("Fasttext best for general", 29, 400),
        ("Fasttext best for specific", 39, 800),
    ]
]

In [None]:
# Evaluate the two combined configurations on both keyword sets.
best_results = train_evaluate_models(
    best_model,
    [general_keywords, specific_keywords],
    n_jobs=4,
)

In [22]:
# Print a caption, then let the bare expression render the results table.
print("mAP scores for best model:")
best_results

mAP scores for best model:


Unnamed: 0,general keywords,specific_keywords
Fasttext best for general,0.126743,0.342433
Fasttext best for specific,0.122381,0.368905


In [9]:
from query_expansion_retrieval import QueryExpansionRetrieval

In [10]:
# Load the keyword hierarchy used for query expansion. JSON is UTF-8 by
# specification, so decode it explicitly instead of depending on the
# platform's default locale encoding.
with open("../data/keyword_hierarchy.json", 'r', encoding="utf-8") as file:
    keyword_hierarchy = json.load(file)

In [None]:
# Build the base retrieval model that the query-expansion variants wrap:
# tf-idf sentence embedder, window 29, embedding size 400.
fasttext_model = WordEmbeddingRetrieval.from_new_embedding(
    corpus=all_papers_basic_nostopwords,
    doc_corpus=papers_basic_nostopwords,
    sentence_embedder=TfidfSentenceEmbedding,
    window_size=29,
    embedding_size=400,
)

from_new_embedding


In [21]:
# Query-expansion variants around the same wrapped model: every combination of
# the `only_expand_once` and `separate_weighting` flags.
query_expansion_models = [
    (
        label,
        partial(
            QueryExpansionRetrieval,
            wrapped_model=fasttext_model,
            expansion_hierarchy=keyword_hierarchy,
            only_expand_once=expand_once,
            separate_weighting=separate,
        ),
    )
    for label, expand_once, separate in [
        ("qe fasttext expand all", False, False),
        ("qe fasttext expand first", True, False),
        ("qe fasttext expand all separate weighting", False, True),
        ("qe fasttext expand first separate weighting", True, True),
    ]
]

In [22]:
# Evaluate every query-expansion variant on both keyword sets (single job).
query_expansion_results = train_evaluate_models(
    query_expansion_models,
    [general_keywords, specific_keywords],
    n_jobs=1,
)

HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=5293), HTML(value='')), layout=Layout(display…




In [20]:
# Print a caption, then let the bare expression render the results table.
print("mAP scores for query expansion models:")
query_expansion_results

mAP scores for query expansion models:


Unnamed: 0,general keywords,specific_keywords
qe fasttext expand all,0.166336,0.315127
qe fasttext expand first,0.19174,0.320181
qe fasttext expand all separate weighting,0.157512,0.344919
qe fasttext expand first separate weighting,0.156098,0.345124


In [None]:
# Train the model that gets persisted with the same configuration that was
# evaluated in this notebook (tf-idf, window 29, size 400): `corpus` is the
# general-papers corpus and `doc_corpus` the KIT papers. Previously this cell
# passed only the KIT papers as `corpus`, so the saved artifact did not match
# the model behind the reported scores.
# NOTE(review): if training on the KIT papers alone was deliberate for the
# deployed model, revert the two corpus arguments and rename the output file.
best_fasttext_model = WordEmbeddingRetrieval.from_new_embedding(
        corpus = all_papers_basic_nostopwords,
        doc_corpus = papers_basic_nostopwords,
        sentence_embedder = TfidfSentenceEmbedding,
        window_size=29,
        embedding_size=400)
# Persist the trained model under a name that encodes its hyper-parameters.
best_fasttext_model.save("../data/models/fasttext/fasttext_w29_s400_tfidf.model")

from_new_embedding


In [12]:
# Wrap the persisted fasttext model with query expansion using the
# "expand first" settings (expand once, no separate weighting), then save it.
best_qe_model = QueryExpansionRetrieval(
    wrapped_model=best_fasttext_model,
    expansion_hierarchy=keyword_hierarchy,
    only_expand_once=True,
    separate_weighting=False,
)
best_qe_model.save("../data/models/fasttext/qe_best_fasttext_exponce.model")