# Babelnet

In [35]:
import sys
sys.path.append("../../imports/")
import saver as sv

In [471]:
import babelnet as bn
from babelnet import Language
from babelnet.data.source import BabelSenseSource
from babelnet.resources import BabelSynsetID
from tqdm import tqdm
import gzip
import numpy as np

# testsets

In [491]:
# Directory that holds every word-similarity benchmark file; change this one
# constant to relocate all test sets at once.
WORDSIM_DIR = '/home/manni/data/wordsim/'

ws353A = WORDSIM_DIR + 'EN-WS353.out'
ws353R = WORDSIM_DIR + 'EN-WSR353.out'
ws353S = WORDSIM_DIR + 'EN-WSS353.out'
rw = WORDSIM_DIR + 'rw.out'
sim999 = WORDSIM_DIR + 'EN-SIM999.out'
# NOTE(review): "TRUK" looks like a typo of "TURK" - confirm the on-disk filename before renaming.
turk = WORDSIM_DIR + 'EN_TRUK.txt'
mturk = WORDSIM_DIR + 'MTURK-771.out'
rg = WORDSIM_DIR + 'EN-RG-65.txt'
men = WORDSIM_DIR + 'EN-MEN-LEM.out'

In [492]:
# Benchmarks whose word pairs make up the evaluation vocabulary.
datasets = [
    ws353A,
    sim999,
    rg,
    men,
]

In [493]:
# Collect the lower-cased words of every selected benchmark file.
vocab = set()
for ds in datasets:
    with open(ds) as fin:
        for line in fin.readlines():
            line = line.split()
            # Only rows with exactly three whitespace-separated fields are
            # used; the first two fields are taken as the words.
            if len(line) == 3:
                vocab.update(field.lower() for field in line[:2])

# extract synsets

In [5]:
def get_syn_data(synset):
    '''
    Collect the WordNet-linked data of one synset.

    Returns an empty list when the synset has no WordNet offsets, or when
    none of its offsets contains 'wn'. Otherwise returns a list containing,
    in this order:
    0. wn offset (str): the part after the first ':' of the first offset
       whose string form contains 'wn'
    1. translations (set of str): the full lemma of every sense in the synset
    2. gloss (str): str() of synset.main_gloss()
    3. wiki title (str): the full lemma of the last sense whose string form
       contains both 'WIKI' and 'EN', or '' when there is none
    4. languages (list of str)
    '''
    if not synset.wordnet_offsets:
        return list()

    wn_offsets = [str(l).split(':')[1] for l in synset.wordnet_offsets if 'wn' in str(l)]
    if not wn_offsets:
        # Offsets exist but none is a 'wn' one: report "no data" instead of
        # failing with IndexError on the [0] lookup.
        return list()

    wt = ''
    trans = set()
    for sense in synset:
        sense_repr = str(sense)
        # A later matching sense overrides an earlier one.
        if 'WIKI' in sense_repr and 'EN' in sense_repr:
            wt = str(sense.full_lemma)
        trans.add(str(sense.full_lemma))

    langs = [str(l) for l in synset.languages]
    gloss = str(synset.main_gloss())
    return [wn_offsets[0], trans, gloss, wt, langs]

In [None]:
# Accumulators filled by the BabelNet retrieval loop.
data = {}          # first element of each get_syn_data() result -> list of the remaining elements
retrieved = set()  # words whose synsets have already been fetched

In [494]:
# Test-set vocabulary as a list. Sorted so that the order (and therefore any
# positional slicing or progress reporting over it) is the same on every run;
# plain list(set) order changes between kernel sessions.
vocab = sorted(vocab) #testset vocab

In [26]:
# Fetch the synsets of every test-set word and group their data by the first
# element returned by get_syn_data().
#
# The cell is resumable: words already present in `retrieved` are skipped, so
# an interrupted run can simply be re-executed. This replaces a hard-coded
# `vocab[20:]` slice, which silently skipped 20 words on a fresh kernel.
pending = [word for word in vocab if word not in retrieved]
for word in tqdm(pending,position=0,leave=True):
    # NOTE(review): the lookup is split into three calls of three languages
    # each - presumably to respect an API limit; confirm before merging them.
    synsets_a = bn.get_synsets(word, from_langs=[Language.EN,Language.FR,Language.IT])
    synsets_b = bn.get_synsets(word, from_langs=[Language.DE,Language.FA,Language.ES])
    synsets_c = bn.get_synsets(word, from_langs=[Language.PT,Language.EU,Language.RU])
    synsets = synsets_a + synsets_b + synsets_c
    for synset in synsets:
        dat = get_syn_data(synset)
        if dat:
            # dat[0] is the grouping key; the remaining fields are appended as one record.
            data.setdefault(dat[0], []).append(dat[1:])
    # Mark the word as done only after all of its synsets were processed.
    retrieved.add(word)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1807/1807 [3:09:16<00:00,  6.28s/it]


In [None]:
# Replace the in-memory `data` with a previously stored copy named 'syndata'.
# NOTE(review): `load` is not defined or imported anywhere in the visible
# notebook (only `import saver as sv` is) - presumably a helper from that
# module was intended; confirm, otherwise this raises NameError on a fresh kernel.
# NOTE(review): no visible cell stores anything under the name 'syndata'.
data = load('syndata')

In [44]:
# Merge all records stored under one key of `data` into four sets:
# [words, glosses, wiki titles, languages].
syn_dat = {}
for k, v in data.items():
    words, gloss, title, langs = set(), set(), set(), set()
    for dat in v:
        words |= set(dat[0])
        gloss.add(dat[1])
        title.add(dat[2])
        langs |= set(dat[3])
    syn_dat[k] = [words, gloss, title, langs]

In [48]:
# Spot-check one merged entry: [words, glosses, wiki titles, languages].
syn_dat['02351010v']

[{'avaliar',
  'balioztatu',
  'baloratu',
  'cotar',
  'cotizar',
  'fissare_il_prezzo_di',
  'fixer_le_prix',
  'prezzare',
  'price',
  'valorar',
  'valorizar',
  'قیمت_گذاشتن'},
 {'Determine the price of'},
 {''},
 {'EN', 'ES', 'EU', 'FA', 'FR', 'IT', 'PT'}]

In [49]:
# Store the merged data under the name 'syn_dat' via the project-local saver module.
# NOTE(review): the load cell earlier in this notebook reads 'syndata', a different name.
sv.save(syn_dat,'syn_dat')

Saved the data


# filter sents

In [102]:
import nltk
from nltk import pos_tag
from nltk import RegexpParser

In [529]:
# Sample token lists for trying the chunk grammar by hand; the commented-out
# lines are alternative inputs. `text` is not referenced by any other cell
# shown in this notebook.
#text ="learn php from guru99 and make study easy".split()
#text ="to be star".split()
#text ="black , wide or narrow".split()
text = "relating to a recently developed fashion or style".split()

In [531]:
# Words whose POS tag is replaced by the upper-cased word itself in
# gloss_clean(), so the chunk grammar can refer to them as <TO>, <BE>,
# <RELATE>, <RELATING>, <RELATED> and <OR>.
WORD_TAGS = ['to', 'be', 'relate', 'relating', 'related','or']

In [524]:
# Leftover draft rule, not part of the grammar below:
#chunk_a: {<TO> <VB> <NN+>}

# Chunk grammar passed to RegexpParser. The labels (nn / vb / jj) are the ones
# gloss_clean() looks for; tags such as <TO>, <BE>, <OR> and <RELATE> are the
# upper-cased word tags that gloss_clean() substitutes for words in WORD_TAGS.
# NOTE(review): the single-<JJ> rule is listed before the multi-word jj rules;
# if rules are applied in listing order, the later jj rules may never get a
# chance to match - verify against the NLTK RegexpParser documentation.
# NOTE(review): 'or' is re-tagged as OR by gloss_clean(), so the <CC> rules
# presumably no longer match an "x or y" pattern - confirm that is intended.
grammar = r"""
  nn: {<TO> <BE> <NN>+}        
  nn: {<NN><CC><NN>}
  nn: {<DT>+<.*>*<NN>+}
  nn: {(<RELATE>|<RELATING>|<RELATED>) <TO> <.*>* <NN>}
  vb: {<TO> <BE> <VB>}
  vb: {^<VB>}
  vb: {<VB> ((<CC>|,) <VB>)+}
  jj: {<JJ>}
  jj: {<JJ> <CC> <JJ>}
  jj: {<JJ><,><JJ><OR><JJ>}
  """

In [525]:
# Build the chunk parser once; gloss_clean() reuses it for every gloss.
chunker = RegexpParser(grammar)

In [574]:
def gloss_clean(words):
    """Return the content words of a tokenised gloss selected by the chunk grammar.

    The tokens are POS-tagged; any token listed in WORD_TAGS gets its own
    upper-cased form as its tag. The tagged sequence is parsed with
    `chunker`, and from every top-level chunk labelled 'nn', 'vb' or 'jj'
    the words tagged exactly 'NN', 'VB' or 'JJ' respectively are collected,
    in order of appearance.

    words: list of str tokens.
    Returns: list of str.
    """
    tagged = []
    for token, tag in pos_tag(words):
        if token in WORD_TAGS:
            tagged.append((token, token.upper()))
        else:
            tagged.append((token, tag))

    # Chunk label -> the only tag that is kept from chunks with that label.
    wanted_tag = {'nn': 'NN', 'vb': 'VB', 'jj': 'JJ'}

    kept = list()
    for node in chunker.parse(tagged):
        # Plain (word, tag) tuples are tokens that no rule chunked.
        if type(node) is tuple:
            continue
        target = wanted_tag.get(node.label())
        if target is None:
            continue
        kept.extend(leaf[0] for leaf in node if leaf[-1] == target)
    return kept

In [402]:
def wt_clean(word):
    """Return `word` with every '(', ')' and ',' character removed."""
    # Translation table that maps the three punctuation characters to "delete".
    strip_table = str.maketrans('', '', '(),')
    return word.translate(strip_table)

# process_data

In [423]:
# Build one entry per (lemma, occurrence): key 'lemma#N', where N is the
# 1-based count of how often that lemma has been seen so far, and value
# [lemmas of the synset, cleaned gloss, cleaned wiki title].
lemmas_list = list()   # every lemma occurrence, in processing order (reused later for l_vocab)
lemma_counts = dict()  # lemma -> occurrences so far; avoids a list.count() scan per lemma
lemma_bias = dict()
for _,v in tqdm(syn_dat.items()):
    lemmas = v[0]
    # v[1] and v[2] are sets; one (arbitrary) member of each is used.
    gloss = wt_clean(list(v[1])[0].lower())
    wt = wt_clean(list(v[2])[0].lower())
    for lemma in lemmas:
        lemmas_list.append(lemma)
        lemma_counts[lemma] = lemma_counts.get(lemma, 0) + 1
        lemma_bias[lemma+'#'+str(lemma_counts[lemma])]=[lemmas,gloss,wt]

 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 12632/13487 [1:17:04<22:34,  1.58s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# load embeddings

In [432]:
from gensim.models import KeyedVectors

2023-02-01 08:07:17,971 [gensim.summarization.textcleaner] INFO: 'pattern' package not found; tag filters are not available for English


In [426]:
# Path of the gzipped text embedding file that is loaded into `vecs`.
emb = '/home/manni/embs/numberbatch-19.08.txt.gz'

In [434]:
# Load the embeddings. Per the log recorded under this cell, the load took
# about 37 minutes (08:09:44 -> 08:46:36) and produced a (9161912, 300) matrix.
vecs = KeyedVectors.load_word2vec_format(emb)

2023-02-01 08:09:44,704 [gensim.models.utils_any2vec] INFO: loading projection weights from /home/manni/embs/numberbatch-19.08.txt.gz
2023-02-01 08:46:36,624 [gensim.models.utils_any2vec] INFO: loaded (9161912, 300) matrix from /home/manni/embs/numberbatch-19.08.txt.gz


# embeddings computations

In [488]:
# Index the embedding keys by their last '/'-separated segment, so that a bare
# word maps to every full key that ends in it.
# NOTE(review): presumably the keys look like '/c/<lang>/<word>' - confirm
# against the embedding file.
filtered_map = {}
for word in vecs.vocab:
    _word = word.split('/')[-1]
    filtered_map.setdefault(_word, []).append(word)

In [498]:
# Distinct lemmas collected in lemmas_list.
l_vocab = set(lemmas_list)

In [499]:
# expand l_vocab with parts of WT
for v in lemma_bias.values():
    if v[2]:
        continue
    wt = v[2].replace(' ','_')
    l_vocab.add(wt)
    for word in v[2].split():
        l_vocab.add(word)
    words = v[2].split()
    for i in range(len(words)-1):
        word = '_'.join(word[i+1:])
        l_vocab.add(word)

In [576]:
# get vectors 
vectors = dict()
for k,v in tqdm(lemma_bias.items()):
    unfound = True
    words = list() # for WT processing (else case.)
    vector = np.zeros(vecs.vector_size)
    for word in v[0]: # lemmas
        if word in filtered_map:
            for _word in filtered_map[word]:
                vector = np.add(vecs.get_vector(_word),vector)
    filtered_gloss = gloss_clean(v[1].split()) #grammar rules applied
    for word in filtered_gloss:
        if word in filtered_map:
            for _word in filtered_map[word]:
                vector = np.add(vecs.get_vector(_word),vector)
    if not wt:
        vectors[k]=vector
        continue
    wt = v[2].replace(' ','_')
    if wt in filtered_map:
        for _word in filtered_map[word]:
                vector = np.add(vecs.get_vector(_word),vector)
        vectors[k]=vector
        continue
    else:
        words = v[2].split()
        for i in range(len(words)-1):
            word = '_'.join(word[i+1:])
            if word in filtered_map:
                for _word in filtered_map[word]:
                    vector = np.add(vecs.get_vector(_word),vector)
                vectors[k]=vector
                unfound = False 
                break
    if unfound:
        word = words[-1]
        if word in filtered_map:
            for _word in filtered_map[word]:
                vector = np.add(vecs.get_vector(_word),vector)
            vectors[k]=vector        

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 463308/463308 [17:17<00:00, 446.35it/s]


In [577]:
# Write the computed vectors as a gzipped text file: a "<count> <dimension>"
# header line followed by one "<key> <v1> <v2> ..." line per vector.
emb_file = '/home/manni/embs/llx.txt.gz'
with gzip.open(emb_file, 'wt', encoding='utf-8') as f:
    # Take the dimension from the source embeddings (the vectors are built
    # with np.zeros(vecs.vector_size)) instead of hard-coding 300.
    f.write('%d %d\n' % (len(vectors), vecs.vector_size))
    for word,vector in tqdm(vectors.items(), position=0):
        f.write('%s %s\n' % (word, ' '.join(map(str, vector))))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 463308/463308 [04:10<00:00, 1847.49it/s]
