## Lexicon Expansion with Active Learning

In [24]:
from IPython.display import clear_output
from gensim.models.fasttext import FastText
from gensim.models.word2vec import Word2Vec
from scipy.spatial.distance import cosine
from modAL.models import ActiveLearner
from sklearn.svm import SVC
from tqdm import tqdm_notebook
import logging
import numpy as np
# Route timestamped INFO-level records to the cell output so that library
# progress messages (such as the model-loading log) are visible.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# Name of the gensim embedding class to load; it also determines the file
# extension of the saved model.
model_type = "Word2Vec" # FastText
path = "./models/kranten_pd_1875-6_model.{}".format(model_type.lower())

# Resolve the class through an explicit lookup table rather than exec():
# no formatted string is ever evaluated as code, and an unsupported
# model_type fails immediately with a readable message.
_MODEL_CLASSES = {"Word2Vec": Word2Vec, "FastText": FastText}
if model_type not in _MODEL_CLASSES:
    raise ValueError(
        "Unsupported model_type {!r}; expected one of {}".format(
            model_type, sorted(_MODEL_CLASSES)
        )
    )
model = _MODEL_CLASSES[model_type].load(path)
print(model)

2019-08-09 15:56:43,908 : INFO : loading Word2Vec object from ./models/kranten_pd_1875-6_model.word2vec
2019-08-09 15:56:44,398 : INFO : loading wv recursively from ./models/kranten_pd_1875-6_model.word2vec.wv.* with mmap=None
2019-08-09 15:56:44,399 : INFO : loading vectors from ./models/kranten_pd_1875-6_model.word2vec.wv.vectors.npy with mmap=None
2019-08-09 15:56:44,461 : INFO : setting ignored attribute vectors_norm to None
2019-08-09 15:56:44,462 : INFO : loading vocabulary recursively from ./models/kranten_pd_1875-6_model.word2vec.vocabulary.* with mmap=None
2019-08-09 15:56:44,463 : INFO : loading trainables recursively from ./models/kranten_pd_1875-6_model.word2vec.trainables.* with mmap=None
2019-08-09 15:56:44,463 : INFO : loading syn1neg from ./models/kranten_pd_1875-6_model.word2vec.trainables.syn1neg.npy with mmap=None
2019-08-09 15:56:44,529 : INFO : setting ignored attribute cum_table to None
2019-08-09 15:56:44,530 : INFO : loaded ./models/kranten_pd_1875-6_model.word2

Word2Vec(vocab=219374, size=100, alpha=0.025)


In [109]:
from modAL.uncertainty import entropy_sampling,margin_sampling

# Seed set for the learner: `examples` and `y_training` are aligned
# position by position (first three words -> 0, remaining five -> 1).
examples = [
    "kennen", "wapen", "bloem",
    "vrouw", "moeder", "dochter", "zuster", "echtgenoote",
]
y_training = np.array([0] * 3 + [1] * 5)
X_training = np.array([model.wv[seed_word] for seed_word in examples])
print(X_training.shape,y_training.shape)

# Pool of candidates: number every vocabulary word once, then stack the
# embeddings in that same order so that row i of X belongs to ids2word[i].
ids2word = dict(enumerate(model.wv.vocab))
X = np.array([model.wv[ids2word[row]] for row in range(len(ids2word))])
print(X.shape)

(8, 100) (8,)
(219374, 100)


In [110]:
# Initialise the active learner from the seed examples.
#
# probability=True makes the SVC expose predict_proba, which the notebook
# relies on for entropy-based querying and for the final scoring step.
# scikit-learn fits those probabilities with an internally shuffled
# cross-validation, so random_state is pinned: without it the probabilities
# (and hence the order in which words are queried) differ from run to run.
learner = ActiveLearner(
    estimator=SVC(probability=True, kernel='linear', random_state=42),
    query_strategy=entropy_sampling,
    X_training=X_training, y_training=y_training
)



In [112]:
def _ask_label(word):
    """Ask the annotator to label `word` and return the answer as 0 or 1.

    The prompt is repeated until the answer is exactly "0" or "1" (surrounding
    whitespace is ignored). An empty or mistyped answer therefore no longer
    raises ValueError, and other integers can no longer slip in as an
    unintended extra class.
    """
    prompt = f'Target word="{word}"\n(Options: pos=1,neg=0)'
    while True:
        answer = input(prompt).strip()
        if answer in ("0", "1"):
            return int(answer)
        print(f'Unrecognised answer "{answer}"; please enter 1 or 0.')


# Interactive labelling session: up to 100 rounds of query -> label -> teach.
for i in tqdm_notebook(range(100)):
    # Replace the previous round's prompt instead of stacking outputs.
    clear_output(wait=True)
    # Let the learner's query strategy choose the next word from the pool.
    query_idx, query_inst = learner.query(X)
    try:
        label = _ask_label(ids2word[query_idx[0]])
    except KeyboardInterrupt:
        # Interrupting the kernel at the prompt ends the session early while
        # keeping everything the learner has been taught so far.
        print(f"Annotation stopped after {i} labelled word(s).")
        break
    y_new = np.array([label], dtype=int)
    learner.teach(query_inst, y_new)


Target word="tante"
(Options: pos=1,neg=0)1



In [113]:
# Score every row of the candidate pool with the learner's current classifier.
probs = learner.predict_proba(X)

In [114]:
# Preview the first ten rows of the per-class probability matrix.
probs[:10, :]

array([[9.99748758e-01, 2.51241561e-04],
       [9.98875673e-01, 1.12432733e-03],
       [9.99963249e-01, 3.67507681e-05],
       [9.99406772e-01, 5.93227993e-04],
       [9.96841248e-01, 3.15875185e-03],
       [9.99692614e-01, 3.07386239e-04],
       [9.98362767e-01, 1.63723339e-03],
       [9.99876112e-01, 1.23888110e-04],
       [9.99874675e-01, 1.25325364e-04],
       [9.85670134e-01, 1.43298660e-02]])