In [1]:
#from nltk.corpus import wordnet as wn
import wn
en = wn.Wordnet('oewn:2021')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words('english'))
from tqdm import tqdm
import re
import numpy as np
import scipy.sparse as sprs
import math
from multiprocessing import Pool
import os

In [2]:
import sys
sys.path.append("../../imports/")
import saver as sv

In [3]:
# Materialise every synset of the lexicon opened above as `en`; the position of a
# synset in this list is the integer index used for it by the cells below.
vocab = list(en.synsets())
# Variant for the NLTK import that is commented out in the first cell:
#vocab = list(wn.all_synsets())

In [4]:
# Two-way lookup between a synset id and its position in `vocab`.
# (The NLTK variant of this notebook keyed these maps by ss.name() instead of ss.id.)
index2synset = {position: entry.id for position, entry in enumerate(vocab)}
# Inverting in position order means a repeated id keeps its last position.
synset2index = {synset_id: position for position, synset_id in index2synset.items()}

# matrix

In [5]:
# for soft computation
SOFT = True
# Maps a synset's matrix index to the set of content-word lemmas of its definition.
synset_descs = dict()
for entry in tqdm(vocab,position=0):
    # Lower-case the definition and collapse every run of non-word characters to one space.
    gloss = re.sub(r'\W+', ' ', entry.definition().lower())
    # Lemmatise the tokens that are not stopwords; the set keeps each lemma once.
    lemmas = {lemmatizer.lemmatize(token) for token in gloss.split() if token not in stops}
    # Second pass: drop lemmas that are themselves stopwords.
    synset_descs[synset2index[entry.id]] = {lemma for lemma in lemmas if lemma not in stops}

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 120039/120039 [00:15<00:00, 7975.54it/s]


In [None]:
from parallelbar import progress_imap
import workers
# Compute one row per synset index in worker processes. Each worker is initialised
# with the index map, the definition word sets and the SOFT flag.
if __name__ == "__main__":
    result = progress_imap(
        workers.getMvector,
        range(len(vocab)),
        n_cpu=20,
        initializer=workers.init_ss,
        initargs=(synset2index, synset_descs, SOFT),
    )
#M = np.asarray(result,dtype=np.float16)
# NOTE(review): this assumes csr_matrix() accepts whatever workers.getMvector returns
# per row; if the rows are themselves sparse matrices, confirm this stacks them as intended.
M = sprs.csr_matrix(result)

In [None]:
# Drop the list of per-synset rows; M was built from it in the previous cell.
del result

In [None]:
# Persist M under the name 'M2' through the project-local `saver` module
# (storage location and format are decided by that module).
sv.save(M,'M2')

In [None]:
# Reload the matrix stored as 'M2', so the cells below can start from the saved
# artefact instead of recomputing it.
M = sv.load('M2')

In [None]:
# alpha weights the propagation term (alpha*M*P_t) in get_vec; (1-alpha) weights the
# restart term taken from P_0.
alpha = 0.85
# Iteration count handed to the parallel workers. get_vec has its own `iters`
# parameter (default 30) and does not read this global.
iters = 30

In [None]:
# P_0: dense identity matrix; get_vec reads row `index` of it as the one-hot restart term.
P_0 = np.eye(M.shape[0],dtype=np.int8)
# P: dense all-ones matrix with M's shape; get_vec reads row `index` of it as the start vector.
P = np.ones(M.shape,dtype=np.int8)
# NOTE(review): both arrays are dense n x n int8, i.e. n*n bytes each. If M has one
# row per synset (120039 according to the progress bar above) that is about 14 GB
# per array. Confirm this is intended; each row could instead be built on demand.

# verbose

In [None]:
def display(target):
    '''
    Prints, for every sense of a word, its lemmas, definition and related entries.

    parameters:
    -----------
    target: str, the word form whose senses are looked up with wn.synsets()

    returns:
    --------
    None; everything is written to stdout.

    The body uses NLTK-style accessors (lemma objects exposing name() and
    synset(), Lemma.derivationally_related_forms(), Lemma.pertainyms()).
    NOTE(review): the first cell imports the standalone `wn` package and leaves the
    NLTK import commented out; confirm which object `wn` is bound to here.
    NOTE(review): the function name shadows IPython's built-in display() helper.
    '''
    # A separate loop variable keeps the `target` argument intact.
    for sense in wn.synsets(target):
        print(sense)
        #print(sense.lexname())
        print([(l.name(),l.synset()) for l in sense.lemmas()])
        print(sense.definition())
        print('Hyponyms:',[(lemma.name(),lemma.synset()) for hypo in sense.hyponyms() for lemma in hypo.lemmas()])
        print('Hypernyms:',[(lemma.name(),lemma.synset()) for hyper in sense.hypernyms() for lemma in hyper.lemmas()])
        print('Holonyms:',sense.member_holonyms())
        # Show the related forms themselves; previously the source lemma was printed
        # once per related form, so the related entries never appeared.
        print('Derivation:',[(related.name(),related.synset()) for lemma in sense.lemmas() for related in lemma.derivationally_related_forms()])
        print('Pertainyms:',[(related.name(),related.synset()) for lemma in sense.lemmas() for related in lemma.pertainyms()])
        print('___________')

In [None]:
# Print every sense of 'webbed' with its relations (manual inspection only).
display('webbed')

In [None]:
def get_pairs(index):
    '''
    Finds the synsets of a definition's words that are directly related to one another.

    parameters:
    -----------
    index: int, key into synset_descs (the matrix index of a synset)

    returns:
    --------
    pairs: set of synsets. Despite the name this is a flat set, not a set of
           2-tuples: it holds every synset that has another synset of the same
           definition among its hypernyms or hyponyms, plus those related synsets.

    The definition's word set is printed as a side effect.
    '''
    desc = synset_descs[index]
    print(desc)
    synsets = set()
    for word in desc:
        synsets.update(wn.synsets(word))
    synsets = list(synsets)
    pairs = set()
    for i, sa in enumerate(synsets):
        # Look the relations up once per synset rather than once per (i, j) pair.
        related = set(sa.hypernyms())
        related.update(sa.hyponyms())
        if not related:
            continue
        linked = [sb for j, sb in enumerate(synsets) if j != i and sb in related]
        if linked:
            pairs.add(sa)
            pairs.update(linked)
    return pairs

In [None]:
# NOTE(review): in the visible notebook `index` is first assigned in a later cell
# (index = 35053), so on a fresh top-to-bottom run this call raises NameError.
get_pairs(index)

In [None]:
from collections import Counter

In [None]:
def mcom(worda,wordb,verbose=False):
    '''
    Returns the lowest common hypernym shared most often by the senses of two words.

    parameters:
    -----------
    worda, wordb: str, the two word forms; their senses come from wn.synsets()
    verbose: bool, when True prints the common hypernyms, the two sense names and
             the similarity for every sense pair that is counted

    returns:
    --------
    The most frequent lowest common hypernym over all counted sense pairs, or
    None when no pair is counted. A pair is skipped when both senses have the
    same name or when wup_similarity(..., simulate_root=False) is falsy.
    '''
    senses_a = wn.synsets(worda)
    senses_b = wn.synsets(wordb)
    tally = Counter()
    for left in senses_a:
        for right in senses_b:
            if left.name() == right.name():
                continue
            score = left.wup_similarity(right, simulate_root=False)
            if not score:
                continue
            shared = left.lowest_common_hypernyms(right)
            tally.update(shared)
            if verbose:
                print(shared)
                print(left.name(),right.name(),score)
    best = tally.most_common(1)
    return best[0][0] if best else None

In [None]:
# Keep only the definition words that graph_sim() links to at least one other word
# of the same definition.
# NOTE(review): `graph_sim` is not defined anywhere in the visible notebook, and
# `index` is first assigned in a later cell. As written, this cell fails on a
# fresh top-to-bottom run. Confirm where graph_sim comes from (mcom above has the
# same two-word signature).
desc = synset_descs[index]
print(desc)
_desc = set()
for worda in desc:
    for wordb in desc:
        if worda == wordb:
            continue
        # Every ordered pair is visited, so each unordered pair is tested twice.
        if graph_sim(worda,wordb):
            _desc.update({worda,wordb})
print(_desc)

In [None]:
def getMvector(index,soft=False):
    '''
    Builds the normalised connection row for one synset.

    parameters:
    -----------
    index: int, position of the synset in `vocab`
    soft: bool, when True synsets whose key starts with "<word>." for a word of
          this synset's definition (synset_descs[index]) are added as weaker
          connections

    returns:
    --------
    vector: scipy.sparse.csr_matrix of shape (1, len(synset2index)); direct
            connections get weight 2, soft-only connections weight 1, and all
            weights are divided by their sum. The row is all zeros when the
            synset has no connections.

    Direct connections are the synsets reached through the lemmas of the synset
    itself, of its hyponyms and of its hypernyms, plus its member holonyms.
    Root hypernyms, derivationally related forms, pertainyms and antonyms were
    tried as further relations and are currently disabled.

    NOTE(review): connections are looked up by `.name()` and by a "<word>."
    prefix, which needs synset2index to be keyed by names such as 'dog.n.01';
    the index-building cell of this notebook keys it by `ss.id`. Confirm which
    WordNet API this function is meant for.
    '''
    synset = vocab[index]

    def names_via_lemmas(synsets):
        # One name per lemma, so a synset without lemmas contributes nothing.
        return [lemma.synset().name() for ss in synsets for lemma in ss.lemmas()]

    names = names_via_lemmas([synset])
    names.extend(names_via_lemmas(synset.hyponyms()))
    names.extend(names_via_lemmas(synset.hypernyms()))
    names.extend(s.name() for s in synset.member_holonyms())
    direct = {synset2index[name] for name in names}
    # discard(), not remove(): a synset without lemmas never lists itself.
    direct.discard(index)
    weights = dict.fromkeys(direct, 2)
    if soft:
        # str.startswith accepts a tuple, so one pass over the keys covers every
        # definition word; an empty tuple matches nothing.
        prefixes = tuple(word+'.' for word in synset_descs[index])
        for key, position in synset2index.items():
            if position != index and key.startswith(prefixes):
                # Direct connections keep their weight of 2.
                weights.setdefault(position, 1)
    vector = np.zeros(len(synset2index))
    total = sum(weights.values())
    for position, weight in weights.items():
        vector[position] = weight/total
    return sprs.csr_matrix(vector)
    

In [None]:
# Synset index inspected by the exploratory cells; the trailing comment on each
# line names the synset that index stood for when the line was written.
#index = 52392 # 'digit.n.03'
#index = 14380 # 'webbed.a.01'
index = 35053 # 'tarsier.n.01'

In [None]:
# Spot check of a single row; `soft` keeps its default of False here, so only the
# direct relations contribute.
getMvector(15229)

In [None]:
def get_vec(index,iters=30):
    '''
    Runs the damped update `iters` times for one start node and returns the result.

    parameters:
    -----------
    index: int, row of P used as the start vector and row of P_0 used as the
           restart term
    iters: int, number of update steps (this parameter, not the global `iters`)

    returns:
    --------
    The vector after the last step, rescaled to sum to 1; with iters=0 the
    unmodified row P[index].

    Reads the globals P, P_0, M and alpha.
    '''
    current = P[index]
    for _step in range(iters):
        restart_part = (1-alpha)*P_0[index]
        spread_part = alpha*M*current
        blended = restart_part+spread_part
        # Rescale after every step so the entries keep summing to 1.
        current = blended/blended.sum()
    return current

In [None]:
# Single-node run with 200 update steps instead of the default 30.
vec = get_vec(index,200)

In [None]:
# Sanity check: list the 27 highest-scoring entries of row `index` of PPR.
# NOTE(review): PPR is first created or loaded in later cells, so on a fresh
# top-to-bottom run this cell raises NameError; `vec` from the previous cell may
# be what was meant. Confirm.

for i in np.flip(np.argsort(PPR[index]))[:27]:
    print(i,index2synset[i])

In [None]:
# Print each stored entry of row `index` of M as "<synset id> <weight>".
# Relies on M exposing `.indices` and `.data` for a row slice, as a CSR matrix does.
for indexes,score in zip(M[index].indices,M[index].data):
    print(index2synset[indexes],score)

In [None]:
def get_argmax(vec,k):
    '''
    Prints the k+1 highest-scoring positions of `vec`, best first, one per line
    as "<position> <synset id>".

    parameters:
    -----------
    vec: 1-D array of scores, one per synset index
    k: int; the slice is [:k+1], so k+1 lines are printed
    '''
    ascending = np.argsort(vec)
    best_first = np.flip(ascending)
    for position in best_first[:k+1]:
        print(position,index2synset[position])

In [None]:
# Show the top entries of `vec`; get_argmax slices [:k+1], so 28 lines are printed.
get_argmax(vec,27)

# pre-made

In [None]:
def pagerank(A, p=0.85, personalize=None, reverse=False):
    """ Calculates PageRank scores with one sparse linear solve.

    Inputs:
    -------

    A: a square scipy sparse matrix (csr) describing the graph.
    p: damping factor
    personalize: if not None, a numpy array with one entry per node holding the
                 restart distribution. It is normalized automatically.
    reverse: if True, the transposed graph is used (reversed PageRank)

    outputs
    -------

    1-D array of PageRank scores for the nodes, normalized to sum to 1.

    """
    # In Moler's algorithm, $A_{ij}$ represents the existence of an edge
    # from node $j$ to $i$, while this code assumes the opposite.
    graph = A.T if reverse else A

    size, _ = graph.shape

    # Row sums; only rows with a non-zero sum get an entry in the inverse
    # diagonal, so rows that sum to zero are left as zero.
    row_sums = np.asarray(graph.sum(axis=1)).reshape(-1)
    active = row_sums.nonzero()[0]
    inv_diag = sprs.csr_matrix((1 / row_sums[active], (active, active)), shape=(size, size))

    if personalize is None:
        personalize = np.ones(size)
    restart = personalize.reshape(size, 1)
    restart = (restart / restart.sum()) * size

    identity = sprs.eye(size)
    # Solve (I - p * A^T * D^-1) x = s directly instead of iterating.
    scores = sprs.linalg.spsolve((identity - p * graph.T @ inv_diag), restart)
    return scores / scores.sum()

In [None]:
# Personalized run restarting at node 0 (P_0[0] is the first row of the identity matrix).
# NOTE(review): this reuses the name `result`, which other cells use for the
# parallel worker output and delete afterwards.
result = pagerank(M, personalize=P_0[0])

# organic compute

In [None]:
def get_vec(index,iters=30):
    '''
    Runs the damped update `iters` times for one start node and returns the result.

    This cell redefines get_vec with the same logic as the definition in the
    exploratory section earlier in the notebook.

    parameters:
    -----------
    index: int, row of P used as the start vector and row of P_0 used as the
           restart term
    iters: int, number of update steps (this parameter, not the global `iters`)

    returns:
    --------
    The vector after the last step, rescaled to sum to 1; with iters=0 the
    unmodified row P[index].

    Reads the globals P, P_0, M and alpha.
    '''
    current = P[index]
    for _step in range(iters):
        restart_part = (1-alpha)*P_0[index]
        spread_part = alpha*M*current
        blended = restart_part+spread_part
        # Rescale after every step so the entries keep summing to 1.
        current = blended/blended.sum()
    return current

In [None]:
# Sequential computation: one get_vec run per start node, written into row i of PPR.
# The cell that follows computes the same rows with parallel workers.
# NOTE(review): np.zeros defaults to float64, so this allocates a dense array of
# 8 bytes per entry with M's full shape. Confirm that fits in memory.
PPR = np.zeros(M.shape)
for i in tqdm(range(P.shape[0]),position=0):
    P_t = get_vec(i)
    PPR[i]=P_t

In [None]:
from parallelbar import progress_imap
import workers
# One workers.getVec call per start node. Every worker is initialised with the
# damping factor, the iteration count and the three matrices.
if __name__ == "__main__":
    result = progress_imap(
        workers.getVec,
        range(P.shape[0]),
        initializer=workers.init_worker,
        initargs=(alpha, iters, P, P_0, M),
    )

In [None]:
# Release the two dense helper matrices. get_vec and the pagerank example cell
# cannot be re-run after this without recreating them.
del P_0, P

In [None]:
# Collect the worker outputs into one array stored as float16.
# Presumably `result` is a list of equal-length vectors, one per start node;
# verify against workers.getVec.
PPR = np.asarray(result,dtype=np.float16)

In [None]:
# Drop the worker output list; PPR was built from it in the previous cell.
del result

In [None]:
# Persist the score matrix under the name 'PPR2' through the project-local `saver` module.
sv.save(PPR,'PPR2')

# load PPR

In [None]:
# NOTE(review): this loads 'PPR', while the save cell above writes 'PPR2' (and M is
# saved and loaded as 'M2'). Confirm the older artefact is really the one wanted here.
PPR = sv.load('PPR')

# word embs loading

In [None]:
from gensim.models import KeyedVectors

# NOTE(review): absolute, user-specific path; consider a configurable directory.
w2v = '/home/manni/embs/w2v.model'
# Load the saved word vectors; get_vector below uses this object for all word lookups.
model = KeyedVectors.load(w2v)

# embeddings computations

In [None]:
def get_lambdas(k=25,lamda=0.2):
    '''
    Returns the list of k rank weights.

    The weight for rank i (i = 1..k) is 1/(exp(lamda*i)*k), so the weights
    decay exponentially with the rank and are scaled by 1/k. k=0 gives [].
    '''
    return [1/(math.exp(lamda*rank)*k) for rank in range(1,k+1)]

# Module-level table of 100 weights; get_vector slices off the prefix it needs.
lamdas = get_lambdas(k=100)

def get_vector(i,k=25,beta=1):
    '''
    Builds the embedding of synset i from its own lemmas and from the definition
    words of its highest-scoring PPR neighbours.

    parameters:
    -----------
    i: int, synset index (row of PPR, key of index2synset)
    k: int, number of neighbours taken from PPR[i] after skipping the top entry
    beta: weight applied to the summed vectors of the synset's own lemmas

    returns:
    --------
    numpy vector of length model.vector_size:
    (beta * sum of lemma vectors + sum of lamda_j * word_j vectors)
    / (beta + sum of the lamdas used).
    Words and lemmas missing from `model` add nothing to the numerator; the
    denominator still counts the lamda of a skipped word.

    NOTE(review): `top_k[:k+1]` keeps k+1 words, not k; confirm the intended limit.
    NOTE(review): synset_descs values are sets, so the order of words inside one
    description (and therefore which words survive the cut) follows set iteration
    order, which for strings can differ between interpreter runs.
    '''
    neighbours = np.flip(np.argsort(PPR[i]))[1:k+1]
    #words = [l.name() for _i in neighbours for l in wn.synset(index2synset[_i]).lemmas()]
    words = [w for _i in neighbours for w in synset_descs[_i]] # original suggested version
    # Order-preserving de-duplication in one pass, then the fixed limit.
    top_k = list(dict.fromkeys(words))[:k+1]
    targets = [l.name() for l in wn.synset(index2synset[i]).lemmas()]
    V_si = np.zeros(model.vector_size)
    for target in targets:
        # `in model` works on gensim 3.x and 4.x; `model.vocab` was removed in 4.0.
        if target in model:
            V_si = np.add(V_si,model.get_vector(target))
    V_si = beta*V_si
    V_sum = np.zeros(model.vector_size)
    #lamdas = get_lambdas(len(top_k))
    _lamdas = lamdas[:len(top_k)]
    for word,lamda in zip(top_k,_lamdas):
        if word not in model:
            continue
        V_sum = np.add(V_sum,lamda*model.get_vector(word))
    num = V_si+V_sum # numerator
    den = beta + sum(_lamdas)
    return num/den

In [None]:
# Embedding per synset id; synsets whose vector is entirely zero (no lemma and no
# neighbour word found in the word-vector model) are left out.
vectors = dict()
for i,synset in tqdm(index2synset.items(),position=0):
    vec = get_vector(i)
    # np.any tests "all components are zero" directly. The previous sum(vec) == 0
    # also dropped non-zero vectors whose components happen to cancel.
    if not np.any(vec):
        continue
    vectors[synset]=vec

In [None]:
# Imported in this cell because gzip is not among the notebook's top-level imports.
import gzip

# Write the embeddings in word2vec text format: a "<count> <dim>" header, then one
# "<synset id> <v1> <v2> ..." line per synset.
# NOTE(review): the file is gzip-compressed although its name ends in .txt, so
# readers must open it with gzip. The path is absolute and user-specific.
emb_file = '/home/manni/embs/dconf2_1.txt'
# 'wt' opens the archive in text mode; plain 'w' is binary for gzip.open and
# rejects the `encoding` argument.
with gzip.open(emb_file, 'wt', encoding='utf-8') as f:
    # Take the dimension from the model instead of hard-coding 300.
    f.write('%d %d\n' % (len(vectors), model.vector_size))
    for word,vector in tqdm(vectors.items(), position=0):
        f.write('%s %s\n' % (word, ' '.join([str(v) for v in vector])))

In [None]:
# lemmatize 