In [1]:
from gensim.models import KeyedVectors
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import re
import numpy as np

In [4]:
#spxEM = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_em.txt'
spxEM = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_em_w5_2.txt'
spxEM_model = KeyedVectors.load_word2vec_format(spxEM, binary=False)
#w2v = '/home/manni/embs/en_wiki_w2v_mc100_epoch5_300.txt'
w2v = '/home/manni/embs/en_wiki_w2v_mc100_epoch5_300_w5_2.txt'
w2v_model = KeyedVectors.load_word2vec_format(w2v, binary=False)
spx = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300.txt'
spx_model = KeyedVectors.load_word2vec_format(spx, binary=False)

In [5]:
models = [w2v_model,spx_model,spxEM_model]
model_names = ['Word2vec','EWEM-MIX','EWEM']

In [None]:
spxEM = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_em_w1.txt'
spxEM_model = KeyedVectors.load_word2vec_format(spxEM, binary=False)
w2v = '/home/manni/embs/en_wiki_w2v_mc100_epoch5_300_w1.txt'
w2v_model = KeyedVectors.load_word2vec_format(w2v, binary=False)

In [None]:
models = [w2v_model,spxEM_model]
model_names = ['Word2vec','EWEM']

# SCWS

In [None]:
scws_file = '/home/manni/data/wordsim/SCWS/ratings.txt'

In [None]:
data = list()
with open(scws_file) as fin:
    lines = fin.readlines()
    for l in lines:
        l = l.lower().split('\t')
        w1 = l[1]
        w2 = l[3]
        c1 = l[5]
        c2 = l[6]
        score = float(l[7])
        data.append([w1,w2,c1,c2,score])

In [None]:
data[6]

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_sent(sent):
    '''
    parameters:
        sent: str
    returns:
        tokens: list(str)     
    '''
    return [word for word in sent.split() if word not in stop_words]

def sent_words(sent):
    '''
    parameters:
        sent: str
    returns:
        tokens: list(str) 
    '''
    sent = re.sub(r'[^\w\s]', '', sent)
    left = sent.split('<b>')[0]
    right = sent.split('</b>')[-1]
    sentA = clean_sent(left)[-10:]
    sentB = clean_sent(right)[:10]
    return sentA+sentB

def reducer(wrd,lst,model):
    out = set()
    for word in lst:
        top = [tup[0] for tup in model.most_similar(word,topn=100)]
        out.update(set(top)&set(lst))
    out.add(wrd)
    if wrd+'#E' in model.vocab:
        out.add(wrd+'#E')
    return out

def avgsim(w1,w2,c1,c2,score,model,reduce=False,verbose=False):
    '''
    Computes average similarity score.

    parameters:
        w1 : str
        w2 : str
        c1 : str
        c2 : str
        model : gensim.keyedvectors
    returns:
        score : float
    '''
    if w1 not in model.vocab or w2 not in model.vocab:
        return None
    a = [w1] + sent_words(c1)
    _a = [word+'#E' for word in a] 
    a+=_a
    b = [w2] + sent_words(c2)
    _b = [word+'#E' for word in b] 
    b+=_b
    a = [word for word in a if word in model.vocab]
    b = [word for word in b if word in model.vocab]
    if reduce:
        a = reducer(w1,a,model)
        b = reducer(w2,b,model)    
    if verbose:
        print(a)
        print(b)
    div = len(a)*len(b)
    sims = 0
    for i in a:
        for j in b:
            sim = model.similarity(i,j)
            if verbose:
                print(i,j,sim)
            sims+= sim
    return (sims/div,score)

def _avgvec(lst,model):
    vec = np.zeros(model.vector_size)
    for word in lst:
        if word in model:
            vec = np.add(vec,model.get_vector(word))
    return vec  
    
def avgvec(w1,w2,c1,c2,score,model,reduce=False,verbose=False):
    if w1 not in model.vocab or w2 not in model.vocab:
        return None
    a = [w1] + sent_words(c1)
    _a = [word+'#E' for word in a] 
    a+=_a
    b = [w2] + sent_words(c2)
    _b = [word+'#E' for word in b] 
    b+=_b
    a = [word for word in a if word in model.vocab]
    b = [word for word in b if word in model.vocab]
    if reduce:
        a = reducer(w1,a,model)
        b = reducer(w2,b,model) 
    if verbose:
        print(a)
        print(b) 
    v1 = _avgvec(a,model)
    v2 = _avgvec(b,model)
    return (model.cosine_similarities(v1,[v2])[0],score)

In [None]:
def get_corr(model,method='asim',reduce=False):
    sims = list()
    scores = list()
    for dat in tqdm(data,position=0):
        if method == 'asim':
            sim = avgsim(dat[0],dat[1],dat[2],dat[3],dat[4],model,reduce=reduce)
        if method == 'avg':
            sim = avgvec(dat[0],dat[1],dat[2],dat[3],dat[4],model,reduce=reduce)
        if sim:
            sims.append(sim[0])
            scores.append(sim[1])
    corr, _ = pearsonr(sims, scores)
    print('%.2f &' % (corr*100),end=' ')
    corr, _ = spearmanr(sims, scores)
    print('%.2f' % (corr*100))   

In [None]:
get_corr(spxEM_model)
get_corr(spx_model)
get_corr(w2v_model)

In [None]:
get_corr(spxEM_model,method='avg')
get_corr(spx_model,method='avg')
get_corr(w2v_model,method='avg')

In [None]:
from multiprocessing import Pool

def get_corr_parallel(model,method='asim',reduce=False):
    if __name__=="__main__":
        pool = Pool(processes=40)
        sims = list()
        scores = list()
        for dat in tqdm(data,position=0):
            if method == 'asim':
                sim = pool.apply_async(func=avgsim, args=(dat[0],dat[1],dat[2],dat[3],dat[4],model,reduce)) 
            if method == 'avg':
                sim = pool.apply_async(func=avgvec, args=(dat[0],dat[1],dat[2],dat[3],dat[4],model,reduce)) 
            sim = sim.get()
            if sim:
                sims.append(sim[0])
                scores.append(sim[1])
        pool.close()
        pool.join()
        corr, _ = pearsonr(sims, scores)
        print('Pearsons correlation: %.2f' % (corr*100))
        corr, _ = spearmanr(sims, scores)
        print('Spearmans correlation: %.2f' % (corr*100)) 

In [None]:
get_corr_parallel(spxEM_model,reduce=True)
get_corr_parallel(spx_model,reduce=True)
get_corr_parallel(w2v_model,reduce=True)

In [None]:
get_corr(spxEM_model,reduce=True)
get_corr(spx_model,reduce=True)
get_corr(w2v_model,reduce=True)

In [None]:
get_corr(spxEM_model,method='avg',reduce=True)
get_corr(spx_model,method='avg',reduce=True)
get_corr(w2v_model,method='avg',reduce=True)

In [None]:
print(w2v_model.similarity('brazil','nut'))
print(spx_model.similarity('brazil','nut'))
print(spxEM_model.similarity('brazil','nut'))
print(spxEM_model.similarity('brazil#E','nut#E'))

In [None]:
print(spx_model.similarity('new','york'))
print(w2v_model.similarity('new','york'))
print(spxEM_model.similarity('new','york'))
print(spxEM_model.similarity('new#E','york#E'))

In [None]:
avgsim(data[0][0],data[0][1],data[0][2],data[0][3],w2v_model,verbose=True)

In [None]:
x = ['brazil', 'bahia', 'instance', 'never', 'gap', 'income', 'blacks', 'nonwhites', 'relatively', 'brazil#E', 'bahia#E', 'instance#E', 'never#E', 'gap#E', 'income#E', 'blacks#E', 'relatively#E']
y = ['nut', 'player', 'turns', 'change', 'string', 'tension', 'neck', 'bridge', 'pickups', 'features', 'found', 'nut#E', 'player#E', 'turns#E', 'change#E', 'string#E', 'tension#E', 'neck#E', 'bridge#E', 'pickups#E', 'features#E', 'found#E']

In [None]:
x = ['brazil', 'population', 'black', 'politicians', 'city', 'salvador', 'bahia', 'instance', 'never', 'gap', 'income', 'blacks', 'nonwhites', 'relatively', 'small', 'compared', 'large', 'gap', 'whites', 'brazil#E', 'population#E', 'black#E', 'politicians#E', 'city#E', 'salvador#E', 'bahia#E', 'instance#E', 'never#E', 'gap#E', 'income#E', 'blacks#E', 'relatively#E', 'small#E', 'compared#E', 'large#E', 'gap#E', 'whites#E']
y = ['nut', 'machine', 'heads', 'worm', 'gears', 'player', 'turns', 'change', 'string', 'tension', 'neck', 'bridge', 'pickups', 'features', 'found', 'almost', 'every', 'guitar', 'photo', 'shows', 'nut#E', 'machine#E', 'heads#E', 'worm#E', 'gears#E', 'player#E', 'turns#E', 'change#E', 'string#E', 'tension#E', 'neck#E', 'bridge#E', 'pickups#E', 'features#E', 'found#E', 'almost#E', 'every#E', 'guitar#E', 'photo#E', 'shows#E']

In [None]:
x = ['israel', 'case', 'incorporates', 'fiber', 'optics', 'list', 'pathogens', 'attached', 'silver', 'gold', 'nanowires', 'israel#E', 'case#E', 'fiber#E', 'optics#E', 'list#E', 'pathogens#E', 'attached#E', 'silver#E', 'gold#E']
y = ['israeli', 'policy', 'territories', 'song', 'banned', 'radio', 'israeli', 'folk', 'singer', 'miri', 'israeli#E', 'policy#E', 'territories#E', 'song#E', 'banned#E', 'radio#E', 'israeli#E', 'folk#E', 'singer#E', 'miri#E']

In [None]:
x = ['israel', 'similar', 'widely', 'employed', 'immunological', 'technique', 'case', 'incorporates', 'fiber', 'optics', 'list', 'pathogens', 'attached', 'silver', 'gold', 'nanowires', 'netherlands', 'company', 'tno', 'designed', 'israel#E', 'similar#E', 'widely#E', 'employed#E', 'immunological#E', 'technique#E', 'case#E', 'fiber#E', 'optics#E', 'list#E', 'pathogens#E', 'attached#E', 'silver#E', 'gold#E', 'netherlands#E', 'company#E', 'tno#E', 'designed#E']
y = ['israeli', 'written', 'shalom', 'hanoch', 'protest', 'israeli', 'policy', 'territories', 'song', 'banned', 'radio', 'israeli', 'folk', 'singer', 'miri', 'aloni', 'sang', 'israeli', 'pop', 'song', 'israeli#E', 'written#E', 'shalom#E', 'hanoch#E', 'protest#E', 'israeli#E', 'policy#E', 'territories#E', 'song#E', 'banned#E', 'radio#E', 'israeli#E', 'folk#E', 'singer#E', 'miri#E', 'aloni#E', 'sang#E', 'israeli#E', 'pop#E', 'song#E']

In [None]:
v1 = get_avgvec(x,w2v_model)
v2 = get_avgvec(y,w2v_model)

w2v_model.cosine_similarities(v1,[v2])

In [None]:
v1 = get_avgvec(x,spx_model)
v2 = get_avgvec(y,spx_model)

spx_model.cosine_similarities(v1,[v2])

In [None]:
v1 = get_avgvec(x,spxEM_model)
v2 = get_avgvec(y,spxEM_model)

spxEM_model.cosine_similarities(v1,[v2])

In [None]:
avgsim(data[0][0],data[0][1],data[0][2],data[0][3],spx_model,verbose=True)

In [None]:
avgsim(data[0][0],data[0][1],data[0][2],data[0][3],spxEM_model,verbose=True)

In [None]:
avgsim(data[6][0],data[6][1],data[6][2],data[6][3],spxEM_model,verbose=True)

In [None]:
for w in model.vocab:
    if '#' in w:
        print(w)
        break

In [None]:
import wiki as w
import sys
sys.path.append("../../imports/")
import saver as sv

In [None]:
sentences = sv.load("wiki_sentences_spx")

In [None]:
for sent in sentences:
    print(sent[:105])
    break

In [None]:
print(data[0][0])
print(sent_words(data[0][2]))
print(data[0][1])
print(sent_words(data[0][3]))

In [None]:
spxEM_model.similarity('brazil','nut')

In [None]:
spxEM_model.similarity('brazil#E','nut#E')

In [None]:
w2v_model.similarity('brazil','nut')

In [None]:
sent_words(c1)

# non-context similarities

In [7]:
ws353A = '/home/manni/data/wordsim/EN-WS353.txt'
ws353R = '/home/manni/data/wordsim/EN-WSR353.txt'
ws353S = '/home/manni/data/wordsim/EN-WSS353.txt'
rw = '/home/manni/data/wordsim/rw.txt'
sim999 = '/home/manni/data/wordsim/EN-SIM999.txt'
turk = '/home/manni/data/wordsim/EN_TRUK.txt'
mturk = '/home/manni/data/wordsim/MTURK-771.csv'
rg = '/home/manni/data/wordsim/EN-RG-65.txt'
men = '/home/manni/data/wordsim/EN-MEN-LEM.txt'

In [8]:
with open(sim999) as fin:
    lines = fin.readlines()
    with open(sim999+'_new','w') as fout:
        for line in lines:
            line = line.split('\t')
            try:
                float(line[3])
                fout.write("{} {} {} \n".format(line[0],line[1],line[3])) 
            except:
                continue   
sim999 = sim999+'_new'

In [9]:
with open(mturk) as fin:
    lines = fin.readlines()
    with open(mturk+'_new','w') as fout:
        for line in lines:
            line = line.split(',')
            try:
                fout.write("{} {} {} \n".format(line[0],line[1],line[2])) 
            except:
                continue   
mturk = mturk+'_new'

In [10]:
with open(men) as fin:
    lines = fin.readlines()
    with open(men+'_new','w') as fout:
        for line in lines:
            line = line.split()
            try:
                fout.write("{} {} {} \n".format(line[0].split('-')[0],line[1].split('-')[0],line[2])) 
            except:
                continue   
men = men+'_new'

In [11]:
# debug
with open(mturk) as fin:
    lines = fin.readlines()
    for line in lines:
        line= line.split()
        print(line)
        continue
        try:
            if line[0] in model.vocab and line[1] in model.vocab:
                continue
        except:
            print(line)

['access', 'gateway', '3.791666667']
[]
['account', 'explanation', '2']
[]
['account', 'invoice', '3.75']
[]
['account', 'statement', '3.681818182']
[]
['acoustic', 'remedy', '1.227272727']
[]
['acrylic', 'cloth', '2.739130435']
[]
['action', 'adjustment', '2']
[]
['action', 'entrance', '1.583333333']
[]
['activity', 'event', '4.083333333']
[]
['activity', 'music', '2.681818182']
[]
['activity', 'skiing', '3.45']
[]
['addition', 'segment', '2.5']
[]
['adhesive', 'glue', '4.608695652']
[]
['adult', 'dentist', '2.47826087']
[]
['adult', 'doctor', '2.782608696']
[]
['afternoon', 'substance', '1']
[]
['age', 'childhood', '3.782608696']
[]
['agency', 'army', '2.916666667']
[]
['agency', 'office', '3.857142857']
[]
['agency', 'police', '3.19047619']
[]
['agent', 'spy', '4']
[]
['agreement', 'contract', '4.476190476']
[]
['aim', 'purpose', '4.363636364']
[]
['aircraft', 'balloon', '2.869565217']
[]
['aircraft', 'yacht', '2.434782609']
[]
['alarm', 'horn', '3.458333333']
[]
['alarm', 'press', 

In [13]:
datasets = [ws353A,ws353R,ws353S,rw,sim999,turk,mturk,rg,men]

In [14]:
def avg_sim(w1,w2,model):
    s1 = [w1,w1+'#E']
    s2 = [w2,w2+'#E']
    a = [w for w in s1 if w in model.vocab]
    b = [w for w in s2 if w in model.vocab]
    div = len(a)*len(b)
    sims = 0
    for i in a:
        for j in b:
            sims+=model.similarity(i,j)
    return sims/div

In [15]:

for ds in datasets:
    name = ds.split('/')[-1].split('.')[0]
    print(r'\begin{subsection}{'+name+r'}')
    print(r'\begin{table}[!h]')
    print(r'\begin{tabular}{|l|c|c|}')
    print('\hline')
    print("Model & Pearsons & Spearmans"+r"\\")
    print('\hline')
    with open(ds) as fin:
        lines = fin.readlines()
        for i, model in enumerate(models):
            print(model_names[i],end=' & ')
            sims = list()
            scores = list()
            for line in lines:
                line = line.split()
                if not line:
                    continue
                if line[0] in model.vocab and line[1] in model.vocab:
                    sim = avg_sim(line[0],line[1],model) 
                else:
                    continue
                if sim:
                    score = float(line[2])
                    sims.append(sim)
                    scores.append(score)
            corr, _ = pearsonr(sims, scores)
            print('%.2f' % (corr*100),end=' & ')
            corr, _ = spearmanr(sims, scores)
            print('%.2f' % (corr*100),end=r'\\') 
            print()
    print('\hline')
    print('\end{tabular}')
    print('\end{table}')
    print(r'\end{subsection}')
    print()

\begin{subsection}{EN-WS353}
\begin{table}[!h]
\begin{tabular}{|l|c|c|}
\hline
Model & Pearsons & Spearmans\\
\hline
Word2vec & 64.58 & 67.37\\
EWEM-MIX & 64.70 & 67.64\\
EWEM & 63.29 & 67.39\\
\hline
\end{tabular}
\end{table}
\end{subsection}

\begin{subsection}{EN-WSR353}
\begin{table}[!h]
\begin{tabular}{|l|c|c|}
\hline
Model & Pearsons & Spearmans\\
\hline
Word2vec & 59.58 & 60.29\\
EWEM-MIX & 60.36 & 61.06\\
EWEM & 59.96 & 61.92\\
\hline
\end{tabular}
\end{table}
\end{subsection}

\begin{subsection}{EN-WSS353}
\begin{table}[!h]
\begin{tabular}{|l|c|c|}
\hline
Model & Pearsons & Spearmans\\
\hline
Word2vec & 74.56 & 76.01\\
EWEM-MIX & 74.49 & 76.49\\
EWEM & 72.38 & 75.11\\
\hline
\end{tabular}
\end{table}
\end{subsection}

\begin{subsection}{rw}
\begin{table}[!h]
\begin{tabular}{|l|c|c|}
\hline
Model & Pearsons & Spearmans\\
\hline
Word2vec & 44.23 & 46.11\\
EWEM-MIX & 40.82 & 41.83\\
EWEM & 33.44 & 34.85\\
\hline
\end{tabular}
\end{table}
\end{subsection}

\begin{subsection}{EN-SI