In [1]:
import logging
import os

import wiki as w # changed wiki to include '[]'
import traceback
from tqdm import tqdm
import numpy as np
from nltk.corpus import wordnet as wn
import re
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()
from multiprocessing import Pool
import workers

In [2]:
# Show which file the `wiki` module (imported as `w`) was loaded from.
print(w.__file__)

/home/manni/ner-s2s/word_embedding/wiki.py


In [5]:
# Wikipedia dump handed to w.WikiSentences below; the commented-out line is an alternative dump.
# NOTE(review): absolute, user-specific path -- consider making it configurable.
#WIKIXML = '/home/manni/data/wiki/enwiki-latest-pages-articles-multistream1.xml-p1p41242.bz2'
WIKIXML = '/home/manni/data/wiki/enwiki-20221020-pages-articles9.xml-p2936261p4045402.bz2'

In [3]:
import sys
sys.path.append("../../imports/")
import saver as sv

In [4]:
# Timestamped INFO-level logging, and make sure the local data/ directory exists.
logging.basicConfig(format='[%(asctime)s] %(message)s', level=logging.INFO)
os.makedirs('data/', exist_ok=True)

## Build sentences from the dump (skip if `sentences_temp` has already been saved)

In [None]:
# Build the sentence source from the dump; 'en' and lower=True are passed through to WikiSentences.
wiki_sentences = w.WikiSentences(WIKIXML, 'en',lower=True)

In [None]:
# Peek at the first sentence only (its first 100 items), then stop iterating.
for sent in wiki_sentences:
    print(sent[:100])
    break

In [None]:
# Persist the sentence object under the name 'sentences_temp' via the local saver module.
sv.save(wiki_sentences,'sentences_temp')

## load sentences

In [None]:
# Reload the object saved as 'sentences_temp'; every later cell works on `sentences`.
sentences = sv.load('sentences_temp')

In [None]:
from collections import Counter

In [None]:
# WINDOW is passed to workers.getTuples / workers.getTriples below.
WINDOW = 5
# TOTAL is only used as the `total` of the tqdm progress bars.
TOTAL = sentences.wiki.length

In [None]:
#https://gist.github.com/nwjlyons/621fabfc0d4c1119b2ad338f615ce4ef
def chunks(generator, chunk_size):
    """Group the items of an iterable into consecutive lists.

    For ``chunk_size >= 1`` every yielded list holds ``chunk_size`` items,
    except the last one, which may be shorter. Nothing is yielded for an
    empty iterable.
    """
    batch = []

    for element in generator:
        if len(batch) < chunk_size:
            batch.append(element)
            continue
        # The batch is full: hand it out and start the next one with this element.
        yield batch
        batch = [element]

    # Flush the trailing, possibly shorter, batch.
    if len(batch) > 0:
        yield batch

## Pair (tuple) frequencies

In [None]:
# Fan the per-sentence work out to a process pool: one task is submitted per sentence.
results = list()
if __name__ == "__main__":
    with Pool(processes=40) as pool:
        pending = [
            pool.apply_async(workers.getTuples, args=(sent, WINDOW))
            for sent in tqdm(sentences, position=0, total=TOTAL)
        ]
        # Collect the worker outputs, keeping only the truthy ones.
        results = list(filter(None, (job.get() for job in pending)))

In [None]:
# Flatten the per-sentence lists into one flat list.
# NOTE(review): this rebinds `results` from itself, so running the cell twice flattens one level further.
results = [item for sublist in results for item in sublist]

In [None]:
# Count how often each item of `results` occurs and save the counts as a plain dict named 'PairFreqs'.
c = Counter(results)
sv.save(dict(c),'PairFreqs')

In [None]:
# Reload the saved pair counts; get_pair_score reads this module-level name.
pairfreqs = sv.load('PairFreqs')

## Triple frequencies

In [None]:
# Same pipeline as for pairs, but with workers.getTriples: submit, collect, flatten, count, save.
results = list()
if __name__ == "__main__":
    with Pool(processes=40) as pool:
        pending = [
            pool.apply_async(workers.getTriples, args=(sent, WINDOW))
            for sent in tqdm(sentences, position=0, total=TOTAL)
        ]
        # Keep only the truthy worker outputs.
        results = list(filter(None, (job.get() for job in pending)))

# Flatten the per-sentence lists into one list before counting.
flattened = []
for sublist in results:
    flattened.extend(sublist)
results = flattened
c = Counter(results)
sv.save(dict(c),'TripleFreqs')

In [None]:
# Reload the saved triple counts; _get_triple_score reads this module-level name.
triplefreqs = sv.load('TripleFreqs')

## Analysis

In [None]:
# Load the object saved as 'word_desc'; get_output iterates word_desc[term] as (words, onto) pairs.
word_desc = sv.load('word_desc')

In [None]:
# Peek at the first ten triple keys.
list(triplefreqs.keys())[:10]

In [None]:
# Peek at the first ten pair keys.
list(pairfreqs.keys())[:10]

In [None]:
# Direct lookup of one pair key (no sorting or missing-key handling here, unlike get_pair_score).
pairfreqs[('david', 'queensland')]

In [None]:
def get_pair_score(w1,w2,sentences=None):
    """Return the stored count of the unordered word pair (w1, w2).

    The two words are sorted before the lookup, so the argument order does
    not matter.

    Args:
        w1, w2: the two words of the pair.
        sentences: unused by the lookup. It is optional so that both the
            two-argument and the three-argument call styles used in this
            notebook work.

    Returns:
        The value stored in the module-level ``pairfreqs`` for the sorted pair.

    Raises:
        KeyError: if ``pairfreqs`` (saved as a plain dict) has no entry for the pair.
    """
    return pairfreqs[tuple(sorted((w1, w2)))]
def _get_triple_score(w1,w2,w3):
    """Look up the stored count of the unordered word triple (w1, w2, w3).

    The key is the sorted tuple of the three words; the value comes from the
    module-level ``triplefreqs``.
    """
    key = tuple(sorted([w1, w2, w3]))
    return triplefreqs[key]
def get_cfs(word,sentences):
    """Return the ``cfs`` entry for ``word`` in ``sentences.wiki.dictionary``.

    The word is mapped to an id through ``token2id`` and that id indexes
    ``cfs``. Words missing from ``token2id`` yield 0.
    """
    # presumably `cfs` holds collection (total occurrence) frequencies -- confirm against wiki.py
    vocab = sentences.wiki.dictionary
    if word not in vocab.token2id:
        return 0
    return vocab.cfs[vocab.token2id[word]]
def get_dfs(word,sentences):
    """Return the ``dfs`` entry for ``word`` in ``sentences.wiki.dictionary``.

    The word is mapped to an id through ``token2id`` and that id indexes
    ``dfs``. Words missing from ``token2id`` yield 0.
    """
    # presumably `dfs` holds document frequencies -- confirm against wiki.py
    vocab = sentences.wiki.dictionary
    if word not in vocab.token2id:
        return 0
    return vocab.dfs[vocab.token2id[word]]

In [None]:
# Spot check of get_cfs on a single word.
get_cfs('stagg',sentences)

In [None]:
def get_score(w1,w2,sentences):
    """Pair count of (w1, w2) divided by the larger of the two words' ``cfs`` values.

    Args:
        w1, w2: the two words.
        sentences: object passed on to ``get_cfs`` and ``get_pair_score``.

    Returns:
        ``pair_count / max(cfs(w1), cfs(w2))``, or 0 when the pair has no
        stored count or when both frequencies are 0.
    """
    den = max(get_cfs(w1,sentences), get_cfs(w2,sentences))
    if den == 0:
        return 0
    try:
        # Pass `sentences` explicitly: the two-argument call raised a TypeError
        # that the former bare `except` turned into a silent 0 for every pair.
        num = get_pair_score(w1, w2, sentences)
    except KeyError:
        # The pair was never counted.
        return 0
    return num/den
def get_triple_score(w1,w2,w3,sentences):
    """Triple count of (w1, w2, w3) divided by the largest of the three words' ``cfs`` values.

    Mirrors ``get_score``: an unseen triple or an all-zero denominator yields
    0 instead of raising.

    Args:
        w1, w2, w3: the three words.
        sentences: object passed on to ``get_cfs``.

    Returns:
        ``triple_count / max(cfs(w1), cfs(w2), cfs(w3))``, or 0 when the triple
        has no stored count or all three frequencies are 0.
    """
    den = max(get_cfs(word,sentences) for word in (w1, w2, w3))
    if den == 0:
        return 0
    try:
        num = _get_triple_score(w1, w2, w3)
    except KeyError:
        # The triple was never counted.
        return 0
    return num/den

In [None]:
# Normalised pair score for one example pair.
get_score('river','current',sentences)

In [None]:
# Normalised pair score for a second example pair, for comparison.
get_score('river','part',sentences)

In [None]:
# Frequency of 'river' (one of the denominators used by get_score above).
get_cfs('river',sentences)

In [None]:
# Frequency of 'current'.
get_cfs('current',sentences)

In [None]:
# Frequency of 'part'.
get_cfs('part',sentences)

In [None]:
# Raw pair count. `sentences` is passed because get_pair_score declares it as a
# parameter; the two-argument call raised a TypeError.
get_pair_score('river','current',sentences)

In [None]:
# Raw pair count. `sentences` is passed because get_pair_score declares it as a
# parameter; the two-argument call raised a TypeError.
get_pair_score('river','part',sentences)

In [None]:
# Token list taken from the first entry stored for 'rapid'.
# NOTE(review): `word2desc` is not defined in any visible cell (only `word_desc` is loaded) -- confirm where it comes from.
tokens = list(word2desc['rapid'][0])

In [None]:
# Print the raw pair count for every unordered pair of tokens (0 when the pair was never counted).
total = len(tokens)
for i in range(total):
    for j in range(i+1,total):
        a = tokens[i]
        b = tokens[j]
        try:
            # `sentences` is passed because get_pair_score declares it; without it
            # the call raised a TypeError that the old bare `except` reported as 0.
            score = get_pair_score(a,b,sentences)
        except KeyError:
            score = 0
        print(a,b,score)

In [None]:
# Print each token next to its get_cfs value.
for token in tokens:
    print(token,get_cfs(token,sentences))

In [None]:
# Print the normalised score for every unordered pair of tokens.
total = len(tokens)
for i in range(total):
    for j in range(i+1,total):
        a = tokens[i]
        b = tokens[j]
        try:
            score = get_score(a,b,sentences)
        except ZeroDivisionError:
            # get_score divides by the larger frequency; treat a zero denominator as score 0.
            score = 0
        print(a,b,score)

In [None]:
# Print the raw triple count for every unordered triple of tokens.
total = len(tokens)
for i in range(total):
    for j in range(i+1,total):
        for k in range(j+1,total):
            a = tokens[i]
            b = tokens[j]
            c = tokens[k]
            try:
                print(a,b,c,_get_triple_score(a,b,c))
            except KeyError:
                # Only report 'NOT FOUND' for triples that have no stored count
                # (it used to be printed for every triple).
                print('NOT FOUND:',a,b,c)

In [None]:
# Reset `tokens` to the first entry stored for 'rapid' (same statement as in an earlier cell).
# NOTE(review): `word2desc` is not defined in any visible cell -- confirm where it comes from.
tokens = list(word2desc['rapid'][0])

In [None]:
# Switch `tokens` to the second entry stored for 'rapid' and show it.
# NOTE(review): `word2desc` is not defined in any visible cell -- confirm where it comes from.
tokens = list(word2desc['rapid'][1])
print(tokens)

In [None]:
# NOTE(review): `odd_ones` is defined again in the next cell; that later definition is the one in effect.
def odd_ones(tokens,sentences,verbose=False):
    """Print, per token, its summed pair count with the other tokens divided by its get_cfs value.

    Args:
        tokens: the words to compare with each other.
        sentences: passed on to get_pair_score and get_cfs.
        verbose: when true, also print every individual pair and its count.

    Returns:
        None; all results are printed.
    """
    for token in tokens:
        num = 0
        for _token in tokens: 
            if token!=_token:
                try:
                    score = get_pair_score(token,_token,sentences)
                    if verbose:
                        print(token,_token,score)
                    num += score
                # NOTE(review): bare except -- any failure, not just a missing pair, is counted as 0.
                except:
                    if verbose:
                        print(token,_token,0)
                    continue
        den = get_cfs(token,sentences)
        # A token with a zero frequency is reported as 0 instead of dividing.
        if den == 0:
            print(token,0)
        else:
            print(token,num/den)    
        print('___________')

In [None]:
def odd_ones(tokens,sentences,verbose=False):
    """Print, per token, its summed pair count with the other tokens divided by its get_cfs value.

    For each token the pair counts with every other (different) token are
    added up; pairs without a stored count contribute 0. The sum is divided by
    the token's ``get_cfs`` value, and 0 is printed when that value is 0.

    Args:
        tokens: the words to compare with each other.
        sentences: passed on to get_pair_score and get_cfs.
        verbose: when true, also print every individual pair and its count.

    Returns:
        None; all results are printed.
    """
    for token in tokens:
        num = 0
        for _token in tokens:
            if token == _token:
                continue
            try:
                score = get_pair_score(token,_token,sentences)
            except KeyError:
                # Only a missing pair counts as 0; other errors are real bugs
                # and are no longer hidden by a bare `except`.
                score = 0
            if verbose:
                print(token,_token,score)
            num += score
        den = get_cfs(token,sentences)
        print(token, num/den if den else 0)
        print('___________')

In [None]:
# Run the per-token report on the current token list (non-verbose).
odd_ones(tokens,sentences)

# wordnet exploration

In [None]:
# List all WordNet synsets for 'rapid'.
wn.synsets('rapid')

In [None]:
# Keep the first synset of 'rapid' for the cells below.
sense = wn.synsets('rapid')[0]

In [None]:
# Lemma names of that synset.
[str(lemma.name()) for lemma in sense.lemmas()]

In [None]:
def display(target):
    """Print a WordNet summary for every synset of ``target``.

    For each synset this prints: the synset, its lemmas, its definition, the
    lemmas of its hyponyms and hypernyms, its member holonyms, and the lemmas
    that are derivationally related to, or pertainyms of, its own lemmas.
    Lemmas are shown as ``(name, synset)`` pairs.

    Args:
        target: the word to look up with ``wn.synsets``.

    Returns:
        None; everything is printed.
    """
    # NOTE(review): this name shadows IPython's built-in `display` in the notebook namespace.
    for sense in wn.synsets(target):
        print(sense)
        print([(l.name(),l.synset()) for l in sense.lemmas()])
        print(sense.definition())
        print('Hyponyms:',[(lemma.name(),lemma.synset()) for child in sense.hyponyms() for lemma in child.lemmas()])
        print('Hypernyms:',[(lemma.name(),lemma.synset()) for parent in sense.hypernyms() for lemma in parent.lemmas()])
        print('Holonyms:',sense.member_holonyms())
        # Report the related lemma itself; the source lemma used to be repeated
        # once per related form instead.
        print('Derivation:',[(related.name(),related.synset()) for lemma in sense.lemmas() for related in lemma.derivationally_related_forms()])
        print('Pertainyms:',[(related.name(),related.synset()) for lemma in sense.lemmas() for related in lemma.pertainyms()])
        print('___________')
        

In [None]:
# Print the WordNet summary for every synset of 'digit'.
display('digit')

In [None]:
# Path similarity between `sense` and the first synset of 'river' ...
print(sense.path_similarity(wn.synsets('river')[0]))
# ... and between the 4th synset of 'fast' and the 3rd synset of 'rapid'.
print(wn.synsets('fast')[3].path_similarity(wn.synsets('rapid')[2]))

In [None]:
# dconf
target = 'digit'
sense_a = ['dactyl', 'finger', 'toe', 'thumb', 'pollex', 'body_part', 'nail', 'minimus', 'tarsier', 'webbed', 'extremity', 'appendage']
sense_b = ['figure', 'cardinal_number', 'cardinal', 'integer', 'whole_number', 'numeration_system', 'number_system', 'system_of_numeration', 'large_integer', 'constituent', 'element', 'digital']

found_a = ['dactyl','finger', 'minimus', 'toe','extremity', 'appendage', 'member','thumb','pollex']



In [None]:
# NOTE(review): the loop variable `word` is never used, so the same 'fast'/'rapid'
# similarity is printed once per entry of sense_a -- confirm what was meant to be compared.
for word in sense_a:
    print(wn.synsets('fast')[3].path_similarity(wn.synsets('rapid')[2]))

In [None]:
# Reference synset for the comparison. It is bound here because no earlier cell
# sets `a` to a synset; the following cell binds the same synset.
a = wn.synset('digit.n.03')
# Print the Wu-Palmer similarity to `a` for every synset whose definition mentions 'digit'.
for b in wn.all_synsets():
    defi=b.definition()
    if 'digit' in defi:
        print(a.wup_similarity(b),b,defi)

In [None]:
# Collect the synsets considered related to 'digit.n.03': the synsets of its own
# lemmas, of its direct hypernyms' and hyponyms' lemmas, and its hyponym closure
# down to depth 3.
a = wn.synset('digit.n.03')
a_hyper = [lemma.synset() for parent in a.hypernyms() for lemma in parent.lemmas()]
a_hypo = [lemma.synset() for child in a.hyponyms() for lemma in child.lemmas()]
hypo = lambda s: s.hyponyms()
a_hypo_ = list(a.closure(hypo, depth=3))
a_nodes = set([lemma.synset() for lemma in a.lemmas()] + a_hypo + a_hyper + a_hypo_)

In [None]:
# Show the collected synsets.
a_nodes

In [None]:
import spacy
# Load the small English spaCy pipeline; used below for part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Collect the synsets whose definition mentions 'digit' and contains a noun
# (other than 'digit' itself) with at least one sense whose Wu-Palmer similarity
# to `a` exceeds 0.5.
bias_words = set()
for b in tqdm(list(wn.all_synsets())):
    text=b.definition()
    if 'digit' not in text:
        continue
    doc = nlp(text)
    candidates = set()
    for word in doc:
        if word.pos_ != 'NOUN' or 'digit' in word.text:
            continue
        synsets = wn.synsets(word.text)
        for _b in synsets:
            sim=a.wup_similarity(_b)
            # wup_similarity can return None when no connecting path exists;
            # comparing None with a float raises a TypeError.
            if sim is not None and sim>0.5:
                candidates.add(word.text)
    if len(candidates)>0:
        bias_words.add(b)

In [None]:
# Collect the synsets whose definition mentions 'digit' and contains a noun
# (other than 'digit' itself) with a sense whose direct hypernym/hyponym lemma
# synsets intersect a_nodes.
bias_words = set()
for b in tqdm(list(wn.all_synsets())):
    text = b.definition()
    if 'digit' in text:
        doc = nlp(text)
        candidates = set()
        for word in doc:
            if word.pos_ != 'NOUN' or 'digit' in word.text:
                continue
            synsets = wn.synsets(word.text)
            for _b in synsets:
                b_hyper = [lemma.synset() for parent in _b.hypernyms() for lemma in parent.lemmas()]
                b_hypo = [lemma.synset() for child in _b.hyponyms() for lemma in child.lemmas()]
                b_nodes = set(b_hyper + b_hypo)
                # Any shared synset makes this noun a candidate.
                if a_nodes & b_nodes:
                    candidates.add(word.text)
        if candidates:
            bias_words.add(b)

In [None]:
# Show the collected synsets.
bias_words

In [None]:
# Merge the bias_words synsets into a_nodes (in-place mutation of the set).
a_nodes.update(bias_words)

In [None]:
# Replace the synsets in a_nodes by their lemma names, then append the lemma names of a's hypernyms.
# NOTE(review): a_nodes changes from a set of synsets to a list of strings here, so this
# cell cannot be re-run without first rebuilding a_nodes.
a_nodes = [lemma.name() for ss in a_nodes for lemma in ss.lemmas()]
a_hyper = [l.name() for h in a.hypernyms() for l in h.lemmas()]
a_nodes+=a_hyper
print(a_nodes)

In [None]:
# Print the WordNet summary for every synset of 'able'.
display('able')

# sentences

In [None]:
def get_sent(word,sentences,window=5):
    """Return the context window around the first occurrence of ``word``.

    Args:
        word: the token to search for.
        sentences: iterable of token sequences.
        window: number of tokens kept on each side of the match (default 5).

    Returns:
        The slice of the first sentence containing ``word``, covering up to
        ``window`` tokens before and after the match, or None when no
        sentence contains the word.
    """
    for sent in sentences:
        for i,term in enumerate(sent):
            if term == word:
                # Clamp the start at 0: a negative start would slice from the
                # end of the sentence and return a wrong or empty window.
                return sent[max(0, i-window):i+window+1]
    return None

In [None]:
# Context window around the first occurrence of 'rapid'.
terms = get_sent('rapid',sentences)

In [None]:
# Show the window.
terms

In [None]:
def get_output(terms):
    """Print the verbose odd_ones report for every entry of every known term.

    Terms that are not keys of the module-level ``word_desc`` are skipped; a
    separator line is printed after each term either way.

    Args:
        terms: iterable of words to look up in ``word_desc``.

    Returns:
        None; everything is printed.
    """
    for term in terms:
        if term in word_desc:
            print(term,':')
            print('***********')
            for words,onto in word_desc[term]:
                print(words,onto)
                odd_ones(words,sentences,True)
                print('-------------')
        # A stray `return` used to sit here: it ended the function after the
        # first term and made this separator unreachable.
        print('============')

In [None]:
# Run the report for the current token list.
get_output(tokens)

In [None]:
# Show the current token list.
tokens

In [None]:
# Show the entries stored for 'fast'.
# NOTE(review): `word2desc` is not defined in any visible cell -- confirm where it comes from.
word2desc['fast']

# word2vec

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Load the saved vectors with gensim's KeyedVectors.load.
# NOTE(review): absolute, user-specific path -- consider making it configurable.
model_path = '/home/manni/embs/w2v.model'
w2v_model = KeyedVectors.load(model_path)

In [None]:
# Ask the model which of the tokens does not match the others.
w2v_model.doesnt_match(tokens)

In [None]:
# One row per token: fill a (len(tokens), vector_size) float32 matrix with the token vectors.
vecs = np.zeros((len(tokens),w2v_model.vector_size),dtype=np.float32)
for i,word in enumerate(tokens):
    vecs[i]=w2v_model.get_vector(word)

In [None]:
# For every token: print its cosine similarity to each other token, then to the
# sum of all the other tokens' vectors.
for i,vec in enumerate(vecs):
    avec = np.zeros(w2v_model.vector_size)
    for j,_vec in enumerate(vecs):
        if i==j:
            continue
        print(tokens[i],tokens[j],w2v_model.cosine_similarities(vec,[_vec]))
        avec = np.add(avec,_vec)
    print(tokens[i],w2v_model.cosine_similarities(vec,[avec]))   
    print('--------')
    # The blocking input() pause was removed so the notebook can run unattended
    # (e.g. Restart & Run All).

In [None]:
# Position of 'current' in the token list.
tokens.index('current')

In [None]:
# Cosine similarity between row 2 of vecs and the sum of rows 0 and 1.
# NOTE(review): presumably 2 is the index of 'current' found in the previous cell -- confirm.
vec = vecs[2]
avec = np.add(vecs[0],vecs[1])
w2v_model.cosine_similarities(vec,[avec])

In [None]:
# Show the current token list.
tokens

# word2vec-based synset embeddings

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Load the saved vectors (same statements as in the word2vec section above).
# NOTE(review): absolute, user-specific path -- consider making it configurable.
model_path = '/home/manni/embs/w2v.model'
w2v_model = KeyedVectors.load(model_path)

In [None]:
# Map every synset name to the set of lemmatised, non-stop-word tokens of its
# definition (punctuation stripped first).
# `stopwords` is a list; test membership against a set instead of scanning the
# list for every word of every definition.
stopword_set = set(stopwords)
syndef = dict()
for synset in tqdm(list(wn.all_synsets()),position=0):
    text = re.sub(r'[^\w\s]', '', synset.definition())
    words = [word for word in text.split() if word not in stopword_set]
    words = set([lemmatizer.lemmatize(word) for word in words])
    syndef[synset.name()]=words

In [None]:
# Persist the definition-word sets under the name 'syndef'.
sv.save(syndef,'syndef')

In [14]:
# Reload the saved definition-word sets; get_blist reads this module-level name.
syndef = sv.load('syndef')

In [15]:
def get_blist():
    """Build, for every WordNet synset, a set of related lemma names.

    Per synset the set starts from its own and its direct hyponyms' lemma
    names (lemmatised). It is then extended with the lemma names of every
    ``syndef`` entry whose definition words overlap the set, provided one of
    the entry's remaining definition words has a sense sharing a lowest common
    hypernym with the synset. The direct hypernyms' lemma names are added last.

    Returns:
        dict mapping ``synset.name()`` to that set of names.
    """
    bias_list = dict()
    for synset in tqdm(list(wn.all_synsets()),position=0):
        nodes = [l.name() for l in synset.lemmas()]
        hypo = [l.name() for h in synset.hyponyms() for l in h.lemmas()]
        nodes = nodes+hypo
        #hyper = lambda s: s.hypernyms()
        #hyper = list(synset.closure(hyper, depth=3))
        #if hyper:
        #    hyper = [l.name() for h in hyper for l in h.lemmas()]
        #    nodes = nodes+hyper
        nodes = set([lemmatizer.lemmatize(node) for node in nodes])
        #text=synset.definition()
        #name = synset.name().split('.')[0]
        #if name not in text:
        #    continue
        # NOTE(review): every synset scans all of syndef, so the total work grows
        # with (number of synsets) x (number of syndef entries).
        for synset_,words in syndef.items(): 
            overlap = words & nodes
            if overlap:
                # Definition words not already in `nodes`, expanded to all their senses.
                rem = words-overlap
                ss = [wn.synsets(word) for word in rem]
                ss = [_s for s in ss for _s in s]
                for s in ss:
                    if s.lowest_common_hypernyms(synset):
                    #if synset in ss:
                        names = set([lemma.name() for lemma in wn.synset(synset_).lemmas()])
                        # NOTE(review): `nodes` grows while syndef is still being scanned, so
                        # later overlap tests see names added here and the result depends on
                        # syndef's iteration order.
                        nodes.update(names)
        hyper = [l.name() for h in synset.hypernyms() for l in h.lemmas()]
        nodes.update(set(hyper))
        bias_list[synset.name()]=nodes
    return bias_list

In [None]:
# Build the per-synset name sets in-process; the recorded progress bar below shows this is slow.
bias_list = get_blist()

 13%|██████████████████▉                                                                                                                                    | 14735/117659 [11:55<1:05:11, 26.31it/s]

In [None]:
# Parallel variant: one workers.getBlist task per synset, collected in submission order.
results = list()
if __name__ == "__main__":
    with Pool(processes=10) as pool:
        # `args` must be a tuple: `(synset)` is just `synset`, which apply_async
        # would try to unpack as the argument list.
        pending = [pool.apply_async(workers.getBlist, args=(synset,))
                   for synset in tqdm(list(wn.all_synsets()),position=0)]
        # Keep every result, including empty ones: the next cell pairs results
        # with synsets by position, so dropping entries would misalign them.
        results = [job.get() for job in pending]

In [None]:
# Map each synset name to its worker result, pairing the two by position.
all_synsets = list(wn.all_synsets())
if len(results) != len(all_synsets):
    # Positional pairing is only valid with exactly one result per synset.
    raise ValueError(
        'expected one result per synset: got %d results for %d synsets'
        % (len(results), len(all_synsets))
    )
bias_dict = dict()
for synset, blist in zip(all_synsets, results):
    bias_dict[synset.name()]=blist

In [None]:
def sum_vec(lst):
    """Sum the word2vec vectors of the words in ``lst`` that the model knows.

    Args:
        lst: iterable of words.

    Returns:
        A numpy vector of length ``w2v_model.vector_size``; all zeros when no
        word of ``lst`` is in the model.
    """
    # Size the accumulator from the model instead of assuming 300 dimensions.
    vec = np.zeros(w2v_model.vector_size)
    for word in lst:
        # Use the model's own membership test rather than the `.vocab`
        # attribute, which is not available in every gensim version.
        if word in w2v_model:
            vec = np.add(vec,w2v_model.get_vector(word))
    return vec

In [None]:
# One summed vector per bias_list key (the synset name), built from that key's name set.
vecs = {word: sum_vec(lst) for word, lst in bias_list.items()}

In [None]:
# NOTE(review): stray literal; looks like a leftover scratch cell.
0

In [None]:
# Persist the per-synset vectors under the name "wnet_vecs".
sv.save(vecs,"wnet_vecs")

In [None]:
# Print the WordNet summary for every synset of 'able'.
display('able')

In [None]:
# Lowest common hypernyms of an adjective synset and a noun synset.
wn.synset('aerobic.a.02').lowest_common_hypernyms(wn.synset('principle.n.01'))

In [None]:
from pprint import pprint
# Pretty-print the result of Synset.mst over the also_sees relation, starting at 'aerobic.a.02'.
pprint(wn.synset('aerobic.a.02').mst(lambda s:s.also_sees()))

In [None]:
# Topic domains recorded for 'aerobic.a.02'.
wn.synset('aerobic.a.02').topic_domains()