## SemAxis

Code for Lexicon Expansion, using the [SemAxis algorithm](https://arxiv.org/abs/1806.05521)

In [55]:
from gensim.models.fasttext import FastText
from gensim.models.word2vec import Word2Vec
from scipy.spatial.distance import cosine
from tqdm import tqdm_notebook
import logging
import numpy as np
# Emit INFO-level log records (e.g. the model-loading progress messages) with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [56]:
# Which embedding implementation to load; must be a key of `model_classes` below.
model_type = "Word2Vec" # FastText
path = "./models/kranten_pd_1875-6_model.{}".format(model_type.lower())

# Resolve the class through an explicit lookup instead of exec()-ing a formatted
# string: exec would run arbitrary code for an unexpected `model_type` and hides
# the `model` assignment from readers and static tools.
model_classes = {"Word2Vec": Word2Vec, "FastText": FastText}
if model_type not in model_classes:
    raise ValueError(
        "Unknown model_type {!r}; expected one of {}".format(model_type, sorted(model_classes))
    )
model = model_classes[model_type].load(path)
print(model)

2019-07-19 15:51:02,720 : INFO : loading Word2Vec object from ./models/kranten_pd_1875-6_model.word2vec
2019-07-19 15:51:03,401 : INFO : loading wv recursively from ./models/kranten_pd_1875-6_model.word2vec.wv.* with mmap=None
2019-07-19 15:51:03,402 : INFO : loading vectors from ./models/kranten_pd_1875-6_model.word2vec.wv.vectors.npy with mmap=None
2019-07-19 15:51:03,427 : INFO : setting ignored attribute vectors_norm to None
2019-07-19 15:51:03,428 : INFO : loading vocabulary recursively from ./models/kranten_pd_1875-6_model.word2vec.vocabulary.* with mmap=None
2019-07-19 15:51:03,428 : INFO : loading trainables recursively from ./models/kranten_pd_1875-6_model.word2vec.trainables.* with mmap=None
2019-07-19 15:51:03,429 : INFO : loading syn1neg from ./models/kranten_pd_1875-6_model.word2vec.trainables.syn1neg.npy with mmap=None
2019-07-19 15:51:03,458 : INFO : setting ignored attribute cum_table to None
2019-07-19 15:51:03,459 : INFO : loaded ./models/kranten_pd_1875-6_model.word2

Word2Vec(vocab=258224, size=100, alpha=0.025)


In [57]:
# Seed words (Dutch) anchoring the two opposite ends of the semantic axis.
pole1 = [
    'vrouw',   # woman / wife
    'moeder',  # mother
    'zus',     # sister
]
pole2 = [
    'man',     # man / husband
    'vader',   # father
    'broer',   # brother
]

In [58]:
# Represent each pole by the centroid (element-wise mean) of its seed-word vectors.
v1 = np.asarray([model.wv[word] for word in pole1]).mean(axis=0)
v2 = np.asarray([model.wv[word] for word in pole2]).mean(axis=0)
print(v1.shape,v2.shape)

(100,) (100,)


In [59]:
# The semantic axis is the difference of the two pole centroids (pole1 minus pole2),
# so it points from the pole2 centroid towards the pole1 centroid.
v = np.subtract(v1, v2)
print(v.shape)

(100,)


In [60]:
def sort_vocab_by_axis(axis,model=model):
    """Score every vocabulary word by its cosine similarity to a semantic axis.

    Despite the name, nothing is sorted here: the result is a plain
    word -> score mapping that still has to be ranked by the caller.

    Parameters
    ----------
    axis : array-like
        The axis vector each word vector is compared against.
    model : object, optional
        Object exposing its word vectors as ``model.wv``. The default is the
        global ``model`` as it was bound when this ``def`` was executed, so
        re-run this cell (or pass ``model`` explicitly) after reloading.

    Returns
    -------
    dict
        Maps each vocabulary word to ``1 - cosine(word_vector, axis)``.
    """
    word_vectors = model.wv

    # gensim >= 4.0 removed ``wv.vocab`` in favour of ``wv.key_to_index``;
    # prefer the new attribute and fall back to the old one so the function
    # works with either gensim generation.
    vocabulary = getattr(word_vectors, "key_to_index", None)
    if vocabulary is None:
        vocabulary = word_vectors.vocab

    semaxis_score = {}
    for w in tqdm_notebook(vocabulary):
        # scipy's ``cosine`` is a distance, so 1 - distance is the similarity.
        semaxis_score[w] = 1 - cosine(word_vectors[w], axis)

    return semaxis_score

In [61]:
# Score the whole vocabulary against the axis `v` (one pass over every word).
semaxis_scores = sort_vocab_by_axis(axis=v, model=model)

HBox(children=(IntProgress(value=0, max=258224), HTML(value='')))




In [62]:
def sort_scores(scores,topn=10,ascending=True):
    """Rank a word -> score mapping and return its first ``topn`` entries.

    Parameters
    ----------
    scores : dict
        Mapping whose values are used as the sort key.
    topn : int, optional
        Upper bound of the slice taken from the ranked list.
    ascending : bool, optional
        If true, lowest scores come first; otherwise highest scores come first.

    Returns
    -------
    list
        ``(key, score)`` tuples in ranked order.
    """
    def score_of(entry):
        return entry[1]

    ranked = list(scores.items())
    ranked.sort(key=score_of, reverse=not ascending)
    return ranked[:topn]

In [64]:
# Show the 100 highest-scoring words, i.e. those most aligned with the pole1 end of the axis.
sort_scores(semaxis_scores, topn=100, ascending=False)

[('bemanteling', 0.5075244903564453),
 ('doorbrengende', 0.4933483898639679),
 ('KOUWEN', 0.4880736768245697),
 ('vergaderkamer', 0.48656100034713745),
 ('meesteressen', 0.4767831265926361),
 ('gezelschapsdame', 0.4707792401313782),
 ('COLLECTE', 0.45863980054855347),
 ('koperslagerij', 0.4516730010509491),
 ('gedaanteverwisseling', 0.45041441917419434),
 ('stiefmoeder', 0.44748789072036743),
 ('aankleving', 0.43607035279273987),
 ('Lijnoliekokerij', 0.43444928526878357),
 ('meesteres', 0.434261679649353),
 ('verschrikkelijkheid', 0.43407416343688965),
 ('Juniveiling', 0.43368491530418396),
 ('echtverbintenis', 0.43330690264701843),
 ('bevalling', 0.43189534544944763),
 ('Kindaren', 0.42979010939598083),
 ('wouw', 0.42974257469177246),
 ('bewaarplaats', 0.42924046516418457),
 ('tamilie', 0.42809420824050903),
 ('modiste', 0.4277687966823578),
 ('dienstbode', 0.4256649613380432),
 ('iurichting', 0.4256543517112732),
 ('kleurloosheid', 0.4213293492794037),
 ('kansrekening', 0.41878932714