In [1]:
%load_ext autoreload
%autoreload 2
import json
import sys
import os
import pickle
import logging
# Write INFO-and-above log records to fasttext.log instead of the notebook output,
# and only let WARNING-and-above through from gensim's word2vec logger.
logging.basicConfig(filename="fasttext.log", level=logging.INFO)
logging.getLogger("gensim.models.word2vec").setLevel(level=logging.WARNING)

import pandas as pd
# Show every column, never wrap wide frames across lines, and allow long cell text.
# Every option is addressed by its full dotted key: shorthand keys such as
# 'max_colwidth' are resolved by pattern matching and can become ambiguous when
# pandas adds options with similar names.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', 1000)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


from tqdm.notebook import tqdm
import warnings
# Enable tqdm's pandas integration; warnings raised while it is being registered
# are suppressed, and the previous warning filters are restored afterwards.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    tqdm.pandas()

# Put the parent directory on the import path (once), so that the project-local
# modules imported right after this -- presumably located there -- can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from evaluation import *
from preprocessing import Corpus, BasicPreprocessing, BigramPreprocessor, SpacyPreprocessor, StopWordPreprocessor
from retrieval_algorithms.word_vector_retrieval_algorithm import *

In [2]:
# Build the paper corpus from the CSV export using only the basic preprocessing step.
# load_from_cache=True / n_jobs=4 are passed straight to Corpus; presumably they reuse
# a previously preprocessed copy and parallelise the work -- TODO confirm in Corpus.
base_file = "../../data/kit_expert_2019_all_papers.csv"
p = [BasicPreprocessing()]
papers_basic = Corpus(
    base_file,
    p,
    load_from_cache=True,
    n_jobs=4,
)

In [3]:
# Load the keyword records. Each record is read below for its "level" and
# "paper_ids" fields. The encoding is given explicitly so that non-ASCII keywords
# decode identically on every platform instead of depending on the locale default.
with open("../../data/kit_expert_2019_all_keywords.json", "r", encoding="utf-8") as file:
    keywords = json.load(file)

# Two evaluation sets, each a (label, keyword records) pair:
#   - general:  every keyword with level <= 1
#   - specific: keywords with level >= 2 that list at least 10 paper ids,
#               truncated to the first 5000 such records in file order
general_keywords = (
    "general keywords",
    [k for k in keywords if k["level"] <= 1],
)
specific_keywords = (
    "specific_keywords",
    [k for k in keywords if k["level"] >= 2 and len(k["paper_ids"]) >= 10][:5000],
)

In [4]:
# Candidate embeddings to compare, each as a (label, retrieval algorithm, corpus) triple.
# All four average the word vectors into a sentence embedding and use the basic corpus.
#
# The two from-scratch entries differ only in `sg`, so the label carries that value;
# with identical labels the two result rows could not be told apart (or one could
# overwrite the other, depending on how train_evaluate_models keys its results).
pretrained_models = [
    ("Fasttext from scratch w=5 sg=False",
     WordVectorRetrievalAlgorithm(
        NewlyTrainedEmbedding(window_size=5, embedding_size=300, sg=False, negative=10),
        AverageSentenceEmbedding),
     papers_basic
    ),
    ("Fasttext from scratch w=5 sg=True",
     WordVectorRetrievalAlgorithm(
        NewlyTrainedEmbedding(window_size=5, embedding_size=300, sg=True, negative=10),
        AverageSentenceEmbedding),
     papers_basic
    ),
    # Same pretrained wiki.en.bin file, used as-is ...
    ("Fasttext pretrained wiki",
     WordVectorRetrievalAlgorithm(
        PreTrainedEmbedding("../../data/models/fasttext/wiki.en.bin"),
        AverageSentenceEmbedding),
     papers_basic
    ),
    # ... and as the starting point for FineTunedEmbedding.
    ("Fasttext finetuned wiki",
     WordVectorRetrievalAlgorithm(
        FineTunedEmbedding("../../data/models/fasttext/wiki.en.bin"),
        AverageSentenceEmbedding),
     papers_basic
    ),
]

In [None]:
# Evaluate every candidate on both keyword sets, one worker per candidate.
pretrained_results = train_evaluate_models(
    pretrained_models,
    [general_keywords, specific_keywords],
    n_jobs=len(pretrained_models),
)

In [None]:
# Print a caption; the bare expression on the last line makes the results
# object the cell's displayed output.
print("mAP scores for pretrained models:")
pretrained_results

In [None]:
# Grid over the context window size; everything else is held fixed
# (embedding_size=300, sg=True, negative=10, averaged sentence embedding, basic corpus).
search_window_size_models = [
    (
        f"Fasttext from scratch w={w}",
        WordVectorRetrievalAlgorithm(
            NewlyTrainedEmbedding(window_size=w, embedding_size=300, sg=True, negative=10),
            AverageSentenceEmbedding,
        ),
        papers_basic,
    )
    for w in (3, 5, 7, 10, 15, 20, 30, 40, 50, 60, 70, 80)
]

In [None]:
# Evaluate the window-size grid on both keyword sets with 8 parallel jobs.
search_window_size_results = train_evaluate_models(
    search_window_size_models,
    [general_keywords, specific_keywords],
    n_jobs=8,
)

In [None]:
# Print a caption; the bare expression on the last line makes the results
# object the cell's displayed output.
print("mAP scores for window size search models:")
search_window_size_results

In [None]:
# Grid over the embedding dimensionality with the window size fixed at 60
# (sg=True, negative=10, averaged sentence embedding, basic corpus).
search_vec_size_models = [
    (
        f"Fasttext from scratch s={dim}",
        WordVectorRetrievalAlgorithm(
            NewlyTrainedEmbedding(window_size=60, embedding_size=dim, sg=True, negative=10),
            AverageSentenceEmbedding,
        ),
        papers_basic,
    )
    for dim in (100, 200, 300, 400, 500, 600, 700, 800)
]

In [None]:
# Evaluate the embedding-size grid on both keyword sets with 4 parallel jobs.
search_vec_size_results = train_evaluate_models(
    search_vec_size_models,
    [general_keywords, specific_keywords],
    n_jobs=4,
)

In [None]:
# Print a caption; the bare expression on the last line makes the results
# object the cell's displayed output.
print("mAP scores for embedding vector size search models:")
search_vec_size_results

In [None]:
# Compare sentence-embedding strategies on otherwise identical from-scratch models
# (window_size=5, embedding_size=300). Each entry is a (label, factory) pair.
#
# NOTE(review): `partial`, `WordEmbeddingRetrieval`, `TfidfSentenceEmbedding`,
# `SifSentenceEmbedding` and `papers_basic_nostopwords` are not defined by any visible
# cell -- they must come from the star imports or leftover kernel state. The earlier
# model lists are (label, algorithm, corpus) triples, not pairs. Confirm this cell
# still works after Restart & Run All.
sentence_embedding_models = [
    (
        label,
        partial(
            WordEmbeddingRetrieval.from_new_embedding,
            corpus=papers_basic_nostopwords,
            sentence_embedder=embedder,
            window_size=5,
            embedding_size=300,
        ),
    )
    for label, embedder in [
        ("Fasttext from scratch average embedding", AverageSentenceEmbedding),
        ("Fasttext from scratch idf embedding", TfidfSentenceEmbedding),
        ("Fasttext from scratch sif embedding", SifSentenceEmbedding),
    ]
]

In [None]:
# Evaluate the sentence-embedding variants on both keyword sets with 4 parallel jobs.
sentence_embedding_results = train_evaluate_models(
    sentence_embedding_models,
    [general_keywords, specific_keywords],
    n_jobs=4,
)

In [None]:
# Print a caption; the bare expression on the last line makes the results
# object the cell's displayed output.
print("mAP scores for sentence embedding models:")
sentence_embedding_results

In [None]:
# The two configurations labelled best for general and for specific keywords; both
# use the tf-idf sentence embedding and differ in window size and embedding size.
#
# NOTE(review): window sizes 29 and 39 are not among the values in the window-size
# grid cell (which has 30 and 40) -- confirm where they come from.
# NOTE(review): `partial`, `WordEmbeddingRetrieval`, `TfidfSentenceEmbedding` and
# `papers_basic_nostopwords` are not defined by any visible cell; confirm this cell
# still works after Restart & Run All.
best_model = [
    (
        label,
        partial(
            WordEmbeddingRetrieval.from_new_embedding,
            corpus=papers_basic_nostopwords,
            sentence_embedder=TfidfSentenceEmbedding,
            window_size=window,
            embedding_size=dim,
        ),
    )
    for label, window, dim in [
        ("Fasttext best for general", 29, 400),
        ("Fasttext best for specific", 39, 800),
    ]
]

In [None]:
# Evaluate the two selected configurations on both keyword sets with 4 parallel jobs.
best_results = train_evaluate_models(
    best_model,
    [general_keywords, specific_keywords],
    n_jobs=4,
)

In [26]:
# Print a caption; the bare expression on the last line makes the results
# object the cell's displayed output.
print("mAP scores for best model:")
best_results

mAP scores for best model:


Unnamed: 0,general keywords,specific_keywords
Fasttext best for general,0.101718,0.200006
Fasttext best for specific,0.099551,0.222662


In [4]:
from query_expansion_retrieval import QueryExpansionRetrieval

In [5]:
# Load the keyword hierarchy that is later handed to QueryExpansionRetrieval as
# `expansion_hierarchy`. The encoding is given explicitly so that non-ASCII keywords
# decode identically on every platform instead of depending on the locale default.
#
# NOTE(review): this path starts at ../data, while the paper and keyword files are
# read from ../../data -- confirm which directory is the intended one.
with open("../data/keyword_hierarchy.json", 'r', encoding="utf-8") as file:
    keyword_hierarchy = json.load(file)

In [None]:
# Base retrieval model that the query-expansion variants wrap: from-scratch embedding
# with window_size=29, embedding_size=400 and the tf-idf sentence embedding.
#
# NOTE(review): `WordEmbeddingRetrieval`, `TfidfSentenceEmbedding` and
# `papers_basic_nostopwords` are not defined by any visible cell; confirm this cell
# still works after Restart & Run All.
fasttext_model = WordEmbeddingRetrieval.from_new_embedding(
    corpus=papers_basic_nostopwords,
    sentence_embedder=TfidfSentenceEmbedding,
    window_size=29,
    embedding_size=400,
)

In [None]:
# 2x2 grid of query-expansion settings around the same wrapped model and hierarchy:
# `only_expand_once` (False = "expand all", True = "expand first") crossed with
# `separate_weighting`. Each entry is a (label, factory) pair.
#
# NOTE(review): `partial` is not imported in any visible cell; presumably it is
# functools.partial arriving through a star import -- confirm.
query_expansion_models = [
    (
        label,
        partial(
            QueryExpansionRetrieval,
            wrapped_model=fasttext_model,
            expansion_hierarchy=keyword_hierarchy,
            only_expand_once=expand_once,
            separate_weighting=separate,
        ),
    )
    for label, expand_once, separate in [
        ("qe fasttext expand all", False, False),
        ("qe fasttext expand first", True, False),
        ("qe fasttext expand all separate weighting", False, True),
        ("qe fasttext expand first separate weighting", True, True),
    ]
]

In [None]:
# Evaluate the query-expansion variants on both keyword sets in a single job.
query_expansion_results = train_evaluate_models(
    query_expansion_models,
    [general_keywords, specific_keywords],
    n_jobs=1,
)

In [27]:
# Print a caption; the bare expression on the last line makes the results
# object the cell's displayed output.
print("mAP scores for query expansion models:")
query_expansion_results

mAP scores for query expansion models:


Unnamed: 0,general keywords,specific_keywords
qe fasttext expand all,0.125196,0.186611
qe fasttext expand first,0.150841,0.190594
qe fasttext expand all separate weighting,0.137301,0.199407
qe fasttext expand first separate weighting,0.145328,0.1998


In [None]:
# Build the selected configuration (window_size=29, embedding_size=400, tf-idf
# sentence embedding) and save it.
#
# NOTE(review): these are the same arguments used for `fasttext_model`, so a second
# model is built here and the saved one is not the instance that was evaluated.
# NOTE(review): the save path starts at ../data, while the pretrained wiki.en.bin is
# read from ../../data/models/fasttext -- confirm the intended directory.
best_fasttext_model = WordEmbeddingRetrieval.from_new_embedding(
    corpus=papers_basic_nostopwords,
    sentence_embedder=TfidfSentenceEmbedding,
    window_size=29,
    embedding_size=400,
)
best_fasttext_model.save("../data/models/fasttext/fasttext_w29_s400_tfidf.model")

from_new_embedding


In [None]:
# Wrap the saved fasttext model with query expansion, using only_expand_once=True and
# separate_weighting=False (the "qe fasttext expand first" setting), and save it.
best_qe_model = QueryExpansionRetrieval(
    wrapped_model=best_fasttext_model,
    expansion_hierarchy=keyword_hierarchy,
    only_expand_once=True,
    separate_weighting=False,
)
best_qe_model.save("../data/models/fasttext/qe_best_fasttext_exponce.model")