Notebook provis an interactive workflow for lexicon expansion inspired by SemAxis.
Requires as input
 - list of target words
 - two lists of target words, each representing one end of the dimension

In [52]:
from IPython.display import clear_output
from gensim.models.fasttext import FastText
from gensim.models.word2vec import Word2Vec
from scipy.spatial.distance import cosine
from tqdm import tqdm_notebook
import logging
import numpy as np
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [6]:
model_type = "Word2Vec" # FastText
path = "./models/kranten_pd_1875-6_model.{}".format(model_type.lower())
exec("model = {}.load(path)".format(model_type))
print(model)

2019-08-09 12:05:23,239 : INFO : loading Word2Vec object from ./models/kranten_pd_1875-6_model.word2vec
2019-08-09 12:05:23,840 : INFO : loading wv recursively from ./models/kranten_pd_1875-6_model.word2vec.wv.* with mmap=None
2019-08-09 12:05:23,841 : INFO : loading vectors from ./models/kranten_pd_1875-6_model.word2vec.wv.vectors.npy with mmap=None
2019-08-09 12:05:23,897 : INFO : setting ignored attribute vectors_norm to None
2019-08-09 12:05:23,898 : INFO : loading vocabulary recursively from ./models/kranten_pd_1875-6_model.word2vec.vocabulary.* with mmap=None
2019-08-09 12:05:23,899 : INFO : loading trainables recursively from ./models/kranten_pd_1875-6_model.word2vec.trainables.* with mmap=None
2019-08-09 12:05:23,899 : INFO : loading syn1neg from ./models/kranten_pd_1875-6_model.word2vec.trainables.syn1neg.npy with mmap=None
2019-08-09 12:05:23,954 : INFO : setting ignored attribute cum_table to None
2019-08-09 12:05:23,955 : INFO : loaded ./models/kranten_pd_1875-6_model.word2

Word2Vec(vocab=219374, size=100, alpha=0.025)


In [67]:
model.wv.most_similar(positive=['vrouw','moeder'],negative=['kind','kinderen'],)

[('meesteres', 0.5029301047325134),
 ('doorl', 0.4935253858566284),
 ('buurvrouw', 0.49145278334617615),
 ('stiefmoeder', 0.4887046813964844),
 ('betoovering', 0.48440021276474),
 ('vasallen', 0.47766774892807007),
 ('maitresse', 0.47253796458244324),
 ('stoute', 0.47128427028656006),
 ('minnares', 0.46352386474609375),
 ('eminentie', 0.4629545211791992)]

In [71]:
model.wv.most_similar(negative='moeder')

[('veneti', 0.3971819281578064),
 ('genua', 0.39455491304397583),
 ('triest', 0.3806389570236206),
 ('telegrafische', 0.37828975915908813),
 ('vecchia', 0.36860817670822144),
 ('aangezien', 0.3608497679233551),
 ('regtstreeks', 0.3408551812171936),
 ('alexandrie', 0.3406909704208374),
 ('messina', 0.3399659991264343),
 ('belgie', 0.3399370312690735)]

## Unidirectional (one-versus-all)

In [25]:
def obtain_negatives(positives):
    return [model.wv.most_similar(negative=p)[0][0] for p in positives]

In [80]:
def create_axis(pole1,pole2):
    v1 = np.mean([model.wv[w] for w in pole1],axis=0)
    v2 = np.mean([model.wv[w] for w in pole2],axis=0)
    return v1 - v2

In [82]:
def sort_vocab_by_axis(axis,model=model):
    def project_word(w):
        return 1 - cosine(model.wv[w],axis)

    return {w : project_word(w) for w in tqdm_notebook(model.wv.vocab)}
        

In [81]:
def sort_scores(scores,topn=-1,ascending=False):
    return sorted(scores.items(),key = lambda x: x[1],reverse=not ascending)[:topn]

In [83]:
v1 = ['vrouw','moeder','dochter']
v2 = ['man','vader','zoon']
v = create_axis(v1,v2)
sorted_vocab = sort_vocab_by_axis(v)

HBox(children=(IntProgress(value=0, max=219374), HTML(value='')))




In [64]:
def top_new(sorted_vocab,seen,topn=10):
    i = 0
    candidates = []
    while len(candidates) < topn:
        if sorted_vocab[i][0] not in seen:
            candidates.append(sorted_vocab[i][0])
        i+=1  
    return candidates

Initiate

In [70]:
rounds = 0
pole = ['vrouw','moeder','zus']
negatives = obtain_negatives(pole)
while True:
    try:
        clear_output(wait=True)
        rounds+=1
        print(f"At round {rounds}")
        print("Lexicon of positives: "+ ' '.join(pole))
        print("Negatives: "+ ' '.join(negatives))
        axis = create_axis(pole,negatives)
        sorted_vocab = sort_scores(sort_vocab_by_axis(axis))
        seen = set(pole).union(negatives)
        candidates = [w for w in top_new(sorted_vocab,seen)]
        print(len(candidates))
        annotations = [(w,int(input(f'Target word="{w}"\n(Options: pos=1,neg=0)'))) for w in candidates]
        
        pole.extend([w for w,t in annotations if t])
        negatives.extend([w for w,t in annotations if not t]) 
        #negatives = obtain_negatives(pole)
    except KeyboardInterrupt:
        print(f"Leaving after {rounds} annotation rounds.")
        break

At round 2
Lexicon of positives: vrouw moeder zus vronw moedor grootmoeder viouw stiefmoeder vriendin tante buurvrouw schoonmoeder
Negatives: veneti veneti betreflende kinderen


HBox(children=(IntProgress(value=0, max=219374), HTML(value='')))


10
Target word="legerstede"
(Options: pos=1,neg=0)0
Target word="waardin"
(Options: pos=1,neg=0)1
Target word="doohter"
(Options: pos=1,neg=0)1
Leaving after 2 annotation rounds.


Keep running the cells below to repeat procedure (wait until to press CTRL + ENTER until after print statements)

## Contrastive