In [1]:
from gensim.models import KeyedVectors
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import re

In [3]:
spxEM = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_em.txt'
spxEM_model = KeyedVectors.load_word2vec_format(spxEM, binary=False)
w2v = '/home/manni/embs/en_wiki_w2v_mc100_epoch5_300.txt'
w2v_model = KeyedVectors.load_word2vec_format(w2v, binary=False)
spx = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300.txt'
spx_model = KeyedVectors.load_word2vec_format(spx, binary=False)

In [6]:
models = [w2v_model,spx_model,spxEM_model]
model_names = ['Word2vec','EWEM-MIX','EWEM']

In [None]:
spxEM = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_em_w1.txt'
spxEM_model = KeyedVectors.load_word2vec_format(spxEM, binary=False)
w2v = '/home/manni/embs/en_wiki_w2v_mc100_epoch5_300_w1.txt'
w2v_model = KeyedVectors.load_word2vec_format(w2v, binary=False)

In [None]:
models = [w2v_model,spxEM_model]
model_names = ['Word2vec','EWEM']

# SCWS

In [8]:
scws_file = '/home/manni/data/wordsim/SCWS/ratings.txt'

In [9]:
data = list()
with open(scws_file) as fin:
    lines = fin.readlines()
    for l in lines:
        l = l.lower().split('\t')
        w1 = l[1]
        w2 = l[3]
        c1 = l[5]
        c2 = l[6]
        score = float(l[7])
        data.append([w1,w2,c1,c2,score])

In [10]:
data[0]

['brazil',
 'nut',
 'gap in income between blacks and other non-whites is relatively small compared to the large gap between whites and non-whites . other factors such as illiteracy and education level show the same patterns . unlike in the us where african americans were united in the civil rights struggle , in <b> brazil </b> the philosophy of whitening has helped divide blacks from other non-whites and prevented a more active civil rights movement . though afro-brazilians make up half the population there are very few black politicians . the city of salvador , bahia for instance is 80 % afro-brazilian but has never',
 'of the neck , bridge , and pickups , there are features which are found in almost every guitar . the photo below shows the different parts of an electric guitar . the headstock ( 1 ) contains the metal machine heads , which are used for tuning ; the <b> nut </b> ( 1.4 ) , a thin fret-like strip of metal , plastic , graphite or bone which the strings pass over as they 

In [11]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_sent(sent):
    '''
    parameters:
        sent: str
    returns:
        tokens: list(str)     
    '''
    return [word for word in sent.split() if word not in stop_words]

def sent_words(sent):
    '''
    parameters:
        sent: str
    returns:
        tokens: list(str) 
    '''
    sent = re.sub(r'[^\w\s]', '', sent)
    left = sent.split('<b>')[0]
    right = sent.split('</b>')[-1]
    sentA = clean_sent(left)[-5:]
    sentB = clean_sent(right)[:5]
    return sentA+sentB

def avgsim(w1,w2,c1,c2,model,verbose=False):
    '''
    Computes average similarity score.

    parameters:
        w1 : str
        w2 : str
        c1 : str
        c2 : str
        model : gensim.keyedvectors
    returns:
        score : float
    '''
    if w1 not in model.vocab or w2 not in model.vocab:
        return None
    a = [w1] + sent_words(c1)
    _a = [word+'#E' for word in a] 
    a+=_a
    b = [w2] + sent_words(c2)
    _b = [word+'#E' for word in b] 
    b+=_b
    a = [word for word in a if word in model.vocab]
    b = [word for word in b if word in model.vocab]
    if verbose:
        print(a)
        print(b)
    div = len(a)*len(b)
    sims = 0
    for i in a:
        for j in b:
            sim = model.similarity(i,j)
            if verbose:
                print(i,j,sim)
            sims+= sim
    return sims/div

In [12]:
def get_corr(model):
    sims = list()
    scores = list()
    for dat in tqdm(data,position=0):
        sim = avgsim(dat[0],dat[1],dat[2],dat[3],model)
        if sim:
            score = dat[4]
            sims.append(sim)
            scores.append(score)
    corr, _ = pearsonr(sims, scores)
    print('Pearsons correlation: %.2f' % (corr*100))
    corr, _ = spearmanr(sims, scores)
    print('Spearmans correlation: %.2f' % (corr*100))   

In [None]:
get_corr(spxEM_model)


In [None]:
get_corr(spx_model)

In [None]:
get_corr(w2v_model)

In [13]:
print(w2v_model.similarity('brazil','nut'))
print(spx_model.similarity('brazil','nut'))
print(spxEM_model.similarity('brazil','nut'))
print(spxEM_model.similarity('brazil#E','nut#E'))

0.19745755
0.15671067
0.15111805
0.18348204


In [14]:
print(spx_model.similarity('new','york'))
print(w2v_model.similarity('new','york'))
print(spxEM_model.similarity('new','york'))
print(spxEM_model.similarity('new#E','york#E'))

0.5839992
0.60688496
0.5302377
0.862124


In [15]:
avgsim(data[0][0],data[0][1],data[0][2],data[0][3],w2v_model,verbose=True)

['brazil', 'bahia', 'instance', 'never', 'gap', 'income', 'blacks', 'nonwhites', 'relatively']
['nut', 'player', 'turns', 'change', 'string', 'tension', 'neck', 'bridge', 'pickups', 'features', 'found']
brazil nut 0.19745755
brazil player 0.130856
brazil turns 0.0463257
brazil change 0.16854647
brazil string 0.073782526
brazil tension 0.053863127
brazil neck 0.06356639
brazil bridge 0.07224046
brazil pickups 0.043330412
brazil features 0.1246864
brazil found 0.25188786
bahia nut 0.18450944
bahia player 0.11996905
bahia turns 0.008881014
bahia change 0.12484312
bahia string 0.08254954
bahia tension 0.013918953
bahia neck 0.04056377
bahia bridge 0.076342374
bahia pickups 0.0072754477
bahia features 0.12120576
bahia found 0.142302
instance nut 0.101189315
instance player 0.12255697
instance turns 0.15928455
instance change 0.22907887
instance string 0.26618966
instance tension 0.14873882
instance neck 0.027811175
instance bridge 0.077633396
instance pickups 0.06914475
instance features 0.

0.12569543208005002

In [None]:
avgsim(data[0][0],data[0][1],data[0][2],data[0][3],spx_model)

In [None]:
avgsim(data[0][0],data[0][1],data[0][2],data[0][3],spxEM_model)

In [None]:
for w in model.vocab:
    if '#' in w:
        print(w)
        break

In [None]:
import wiki as w
import sys
sys.path.append("../../imports/")
import saver as sv

In [None]:
sentences = sv.load("wiki_sentences_spx")

In [None]:
for sent in sentences:
    print(sent[:105])
    break

In [None]:
print(data[0][0])
print(sent_words(data[0][2]))
print(data[0][1])
print(sent_words(data[0][3]))

In [None]:
spxEM_model.similarity('brazil','nut')

In [None]:
spxEM_model.similarity('brazil#E','nut#E')

In [None]:
w2v_model.similarity('brazil','nut')

In [None]:
sent_words(c1)

# non-context similarities

In [None]:
ws353A = '/home/manni/data/wordsim/EN-WS353.txt'
ws353R = '/home/manni/data/wordsim/EN-WSR353.txt'
ws353S = '/home/manni/data/wordsim/EN-WSS353.txt'
rw = '/home/manni/data/wordsim/rw.txt'
sim999 = '/home/manni/data/wordsim/EN-SIM999.txt'
turk = '/home/manni/data/wordsim/EN_TRUK.txt'
mturk = '/home/manni/data/wordsim/MTURK-771.csv'
rg = '/home/manni/data/wordsim/EN-RG-65.txt'
men = '/home/manni/data/wordsim/EN-MEN-LEM.txt'

In [None]:
with open(sim999) as fin:
    lines = fin.readlines()
    with open(sim999+'_new','w') as fout:
        for line in lines:
            line = line.split('\t')
            try:
                float(line[3])
                fout.write("{} {} {} \n".format(line[0],line[1],line[3])) 
            except:
                continue   
sim999 = sim999+'_new'

In [None]:
with open(mturk) as fin:
    lines = fin.readlines()
    with open(mturk+'_new','w') as fout:
        for line in lines:
            line = line.split(',')
            try:
                fout.write("{} {} {} \n".format(line[0],line[1],line[2])) 
            except:
                continue   
mturk = mturk+'_new'

In [None]:
with open(men) as fin:
    lines = fin.readlines()
    with open(men+'_new','w') as fout:
        for line in lines:
            line = line.split()
            try:
                fout.write("{} {} {} \n".format(line[0].split('-')[0],line[1].split('-')[0],line[2])) 
            except:
                continue   
men = men+'_new'

In [None]:
# debug
with open(mturk) as fin:
    lines = fin.readlines()
    for line in lines:
        line= line.split()
        print(line)
        continue
        try:
            if line[0] in model.vocab and line[1] in model.vocab:
                continue
        except:
            print(line)

In [None]:
datasets = [ws353A,ws353R,ws353S,rw,sim999,turk,mturk,rg,men]

In [None]:
def avg_sim(w1,w2,model):
    s1 = [w1,w1+'#E']
    s2 = [w2,w2+'#E']
    a = [w for w in s1 if w in model.vocab]
    b = [w for w in s2 if w in model.vocab]
    div = len(a)*len(b)
    sims = 0
    for i in a:
        for j in b:
            sims+=model.similarity(i,j)
    return sims/div

In [None]:

for ds in datasets:
    name = ds.split('/')[-1].split('.')[0]
    print(r'\begin{subsection}{'+name+r'}')
    print(r'\begin{table}[!h]')
    print(r'\begin{tabular}{|l|c|c|}')
    print('\hline')
    print("Model & Pearsons & Spearmans"+r"\\")
    print('\hline')
    with open(ds) as fin:
        lines = fin.readlines()
        for i, model in enumerate(models):
            print(model_names[i],end=' & ')
            sims = list()
            scores = list()
            for line in lines:
                line = line.split()
                if not line:
                    continue
                if line[0] in model.vocab and line[1] in model.vocab:
                    sim = avg_sim(line[0],line[1],model) 
                else:
                    continue
                if sim:
                    score = float(line[2])
                    sims.append(sim)
                    scores.append(score)
            corr, _ = pearsonr(sims, scores)
            print('%.2f' % (corr*100),end=' & ')
            corr, _ = spearmanr(sims, scores)
            print('%.2f' % (corr*100),end=r'\\') 
            print()
    print('\hline')
    print('\end{tabular}')
    print('\end{table}')
    print(r'\end{subsection}')
    print()