In [9]:
%load_ext autoreload
%autoreload 2
import numpy as np
from gensim.models.wrappers import FastText
from gensim.models import KeyedVectors

from tqdm.notebook import tqdm
tqdm.pandas()

from evaluation import *
from preprocessing import BasicPreprocessing, StopWordPreprocessor, Corpus
from word_embedding_retrieval import WordEmbeddingRetrieval

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# CSV file the paper corpus is built from.
base_file = "../data/kit_expert_2017_papers.csv"

# Preprocessors handed to Corpus, listed in this order.
p = [
    BasicPreprocessing(),
    StopWordPreprocessor(),
]

# NOTE(review): load_from_cache=True presumably reuses an earlier preprocessing
# result instead of recomputing it — confirm against Corpus.
papers_basic_nostopwords = Corpus(
    base_file,
    p,
    load_from_cache=True,
    n_jobs=1,
)

In [11]:
import json

# Pin the encoding: without it open() falls back to the platform locale, so
# the same JSON file could decode differently (or fail) on another machine.
with open("../data/kit_expert_2017_keywords.json", "r", encoding="utf-8") as file:
    keywords = json.load(file)

# Each evaluation set is a (label, list of keyword entries) pair.
# Entries with level <= 1 form the "general" set.
general_keywords = ("general keywords", [k for k in keywords if k["level"] <= 1])
# Entries with level >= 2 form the "specific" set, truncated to the first 5000.
specific_keywords = ("specific_keywords", [k for k in keywords if k["level"] >= 2][:5000])

In [12]:
# `partial` was only reachable through `from evaluation import *`; import it
# explicitly so this cell does not depend on what that module happens to export.
from functools import partial

# Shared by the pretrained and the finetuned variant.
PRETRAINED_FASTTEXT_PATH = "../data/models/fasttext/wiki.en.bin"
# Context window sizes for the embeddings trained from scratch.
SCRATCH_WINDOW_SIZES = (3, 5, 7, 9)

# Each entry is (display name, partial with corpus and model options pre-bound).
# NOTE(review): presumably train_evaluate_models calls each partial to build
# the model — confirm against the evaluation module.
models = [
    ("Fasttext pretrained wiki", partial(
        WordEmbeddingRetrieval.from_pretrained_embedding,
        corpus=papers_basic_nostopwords,
        pretrained_model_path=PRETRAINED_FASTTEXT_PATH)
    ),
    ("Fasttext finetuned wiki", partial(
        WordEmbeddingRetrieval.from_finetuned_embedding,
        corpus=papers_basic_nostopwords,
        pretrained_model_path=PRETRAINED_FASTTEXT_PATH)
    ),
] + [
    # One from-scratch model per window size; names match the previous
    # hand-written entries ("Fasttext from scratch w=3", ...).
    ("Fasttext from scratch w=%d" % window_size, partial(
        WordEmbeddingRetrieval.from_new_embedding,
        corpus=papers_basic_nostopwords,
        window_size=window_size)
    )
    for window_size in SCRATCH_WINDOW_SIZES
]

In [None]:
# Keyword sets every model in `models` is evaluated against.
keyword_sets = [general_keywords, specific_keywords]
results = train_evaluate_models(models, keyword_sets)

In [None]:
# Print a heading, then leave `results` as the last expression so the notebook
# renders it as the cell output.
print("mAP scores for models:")
results

In [8]:
# NOTE(review): `embedding_model` is not assigned in any cell visible here —
# confirm where it comes from before relying on a fresh-kernel run.
query = "artificial neural network"

# Paper ids annotated with the query keyword; the first matching entry is used
# (raises IndexError if the keyword is absent).
matching_entries = [entry for entry in keywords if entry["keyword"] == query]
nn_papers = matching_entries[0]["paper_ids"]

# Rank the documents for the query and flag which hits are annotated papers.
ranking = embedding_model.get_ranked_documents(query)
ranking["correct"] = ranking["id"].isin(nn_papers)
ranking.head(10)

Unnamed: 0,id,score,correct
0,2620607813,0.789564,True
1,2766738555,0.786661,True
2,2097775371,0.781397,False
3,2397534874,0.780992,True
4,2081802021,0.776456,True
5,2233968761,0.773032,True
6,1022989747,0.772094,True
7,2966648307,0.770548,True
8,2891889422,0.77043,True
9,1561147972,0.762731,False


In [9]:
from scipy.spatial.distance import cosine

# scipy's `cosine` is a distance (1 - cosine similarity), so smaller is closer.
paper_vector = embedding_model.document_lookup.get_vector(2738221500)
# Mean over the vectors of the listed query tokens (only "artificial" here).
query_vector = embedding_model.model[["artificial"]].mean(axis=0)
cosine(paper_vector, query_vector)

0.44634393717335463

In [10]:
# Same distance check as for paper 2738221500, for a second paper id.
# Relies on `cosine` having been imported by an earlier cell.
paper_vector = embedding_model.document_lookup.get_vector(2938948885)
query_vector = embedding_model.model[["artificial"]].mean(axis=0)
cosine(paper_vector, query_vector)

0.4576468073089517

In [None]:
# NOTE(review): this cell has no execution count, so it was never run.
# It passes a single argument to `similarity`; if `embedding_model.model` is a
# gensim KeyedVectors-style object, `similarity` expects two keys and this call
# would raise. It also indexes the word model with a paper id, whereas the
# cells above fetch paper vectors via `document_lookup.get_vector`.
# Confirm the intent or delete the cell.
embedding_model.model.similarity(embedding_model.model[2938948885])

In [11]:
# Evaluate on the single "artificial neural network" keyword only, reported
# under the label "test".
nn_keyword_entries = [
    entry for entry in keywords
    if entry["keyword"] == "artificial neural network"
]
evaluate_model(embedding_model, [("test", nn_keyword_entries)])

HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=1), HTML(value='')), layout=Layout(display='i…




{'test': 0.2962911761327195}

In [12]:
# Score the current `embedding_model` on both keyword sets; the returned dict
# maps each set's label to its score.
evaluate_model(
    embedding_model,
    [general_keywords, specific_keywords],
)

HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3293), HTML(value='')), layout=Layout(display…




{'general keywords': 0.09990206430413857,
 'specific_keywords': 0.21546136783117614}

In [80]:
# NOTE(review): same call as the previous evaluation cell, but executed at
# count 80 (out of order) and showing different scores — presumably
# `embedding_model` held a different model at that point. Confirm which run is
# the intended one and drop the other.
evaluate_model(
    embedding_model,
    [general_keywords, specific_keywords],
)

HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3293), HTML(value='')), layout=Layout(display…




{'general keywords': 0.07690513903082963,
 'specific_keywords': 0.20399510064914594}