In [1]:
from gensim.models import KeyedVectors
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import re
import numpy as np
import pandas as pd

In [None]:
# Load every embedding file into its own gensim KeyedVectors object.
# All files are read in the plain-text word2vec format (binary=False).
# NOTE(review): the paths are absolute and machine-specific; the commented-out
# lines are alternative files for the same variable.
spxEM = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_em.txt'
#spxEM = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_em_w5_2.txt'
spxEM_model = KeyedVectors.load_word2vec_format(spxEM, binary=False)
w2v = '/home/manni/embs/en_wiki_w2v_mc100_epoch5_300.txt'
#w2v = '/home/manni/embs/en_wiki_w2v_mc100_epoch5_300_w5_2.txt'
w2v_model = KeyedVectors.load_word2vec_format(w2v, binary=False)
spx = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300.txt'
spx_model = KeyedVectors.load_word2vec_format(spx, binary=False)
spxPEM = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_pem.txt'
spxPEM_model = KeyedVectors.load_word2vec_format(spxPEM, binary=False)
spxNEM = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_nem.txt'
spxNEM_model = KeyedVectors.load_word2vec_format(spxNEM, binary=False)

In [None]:
# Parallel lists: model_names[i] is the label printed for models[i] in the
# result tables further down.
models = [w2v_model,spx_model,spxEM_model,spxPEM_model,spxNEM_model]
model_names = ['Word2vec','EWEM-MIX','EWEM','EWEM-3',"EWEM-NULL"]

In [None]:
# Rebind spxEM_model and w2v_model to the '_w1' embedding files.
# NOTE(review): any `models` list built before this cell still holds the
# previously loaded objects until it is rebuilt.
spxEM = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_em_w1.txt'
spxEM_model = KeyedVectors.load_word2vec_format(spxEM, binary=False)
w2v = '/home/manni/embs/en_wiki_w2v_mc100_epoch5_300_w1.txt'
w2v_model = KeyedVectors.load_word2vec_format(w2v, binary=False)

In [None]:
# Restrict the comparison to two models; this overwrites any earlier
# `models` / `model_names` lists.
models = [w2v_model,spxEM_model]
model_names = ['Word2vec','EWEM']

# SCWS

In [None]:
scws_file = '/home/manni/data/wordsim/SCWS/ratings.txt'

In [None]:
# Parse the SCWS ratings file into rows of [w1, w2, c1, c2, score].
data = list()
with open(scws_file) as fin:
    lines = fin.readlines()
    for l in lines:
        # Lower-case the whole record, then split it into tab-separated columns.
        l = l.lower().split('\t')
        w1 = l[1]
        w2 = l[3]
        c1 = l[5]
        c2 = l[6]
        # Column 7 is parsed as the numeric rating for the pair.
        score = float(l[7])
        data.append([w1,w2,c1,c2,score])
# NOTE(review): w1, w2, c1, c2 and score stay bound to the last record after
# the loop; a later cell calls sent_words(c1) and relies on that.

In [None]:
data[6]

In [None]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_sent(sent):
    '''
    Split a sentence on whitespace and drop every token found in stop_words.

    parameters:
        sent: str
    returns:
        tokens: list(str), in their original order
    '''
    kept = []
    for token in sent.split():
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def sent_words(sent):
    '''
    Extract the context window around the target word of a marked-up context.

    The target word is delimited by '<b>' and '</b>'. The text before the
    opening marker and the text after the closing marker are cleaned
    separately (punctuation removed, stop words dropped); the last 10 tokens
    of the left side and the first 10 tokens of the right side are returned.

    parameters:
        sent: str
    returns:
        tokens: list(str), left-window tokens followed by right-window tokens
    '''
    # Split on the markers BEFORE stripping punctuation: the punctuation regex
    # removes '<', '>' and '/', so stripping first would destroy the markers
    # and both "sides" would silently become the whole sentence.
    left = sent.split('<b>')[0]
    right = sent.split('</b>')[-1]
    left = re.sub(r'[^\w\s]', '', left)
    right = re.sub(r'[^\w\s]', '', right)
    sentA = clean_sent(left)[-10:]
    sentB = clean_sent(right)[:10]
    return sentA+sentB

def reducer(wrd,lst,model):
    '''
    Keep only the context words that are close neighbours of another context word.

    A word of lst is kept when it appears among the 100 most similar entries
    of some word of lst. The target word wrd is always kept, together with
    wrd+'#E' when that entry is in the model vocabulary.

    parameters:
        wrd : str
        lst : list(str)
        model : gensim.keyedvectors
    returns:
        words : set(str)
    '''
    candidates = set(lst)
    kept = set()
    for token in lst:
        neighbours = set(hit[0] for hit in model.most_similar(token,topn=100))
        kept |= neighbours & candidates
    kept.add(wrd)
    entity_form = wrd+'#E'
    if entity_form in model.vocab:
        kept.add(entity_form)
    return kept

def avgsim(w1,w2,c1,c2,score,model,reduce=False,verbose=False):
    '''
    Average pairwise similarity between the two context word sets.

    Each side consists of the target word plus its context window
    (sent_words), every token also tried with a '#E' suffix, filtered down to
    tokens present in the model vocabulary. The result is the mean of
    model.similarity over all cross pairs.

    parameters:
        w1 : str
        w2 : str
        c1 : str
        c2 : str
        score : float, passed through unchanged
        model : gensim.keyedvectors
        reduce : bool, filter each side with reducer() first
        verbose : bool, print both word sets and every pairwise similarity
    returns:
        (similarity, score) : tuple, or None when w1 or w2 is out of vocabulary
    '''
    if not (w1 in model.vocab and w2 in model.vocab):
        return None
    left = [w1] + sent_words(c1)
    left = left + [tok+'#E' for tok in left]
    right = [w2] + sent_words(c2)
    right = right + [tok+'#E' for tok in right]
    left = [tok for tok in left if tok in model.vocab]
    right = [tok for tok in right if tok in model.vocab]
    if reduce:
        left = reducer(w1,left,model)
        right = reducer(w2,right,model)
    if verbose:
        print(left)
        print(right)
    n_pairs = len(left)*len(right)
    total = 0
    for u in left:
        for v in right:
            pair_sim = model.similarity(u,v)
            if verbose:
                print(u,v,pair_sim)
            total += pair_sim
    return (total/n_pairs,score)

def _avgvec(lst,model):
    vec = np.zeros(model.vector_size)
    for word in lst:
        if word in model:
            vec = np.add(vec,model.get_vector(word))
    return vec  
    
def avgvec(w1,w2,c1,c2,score,model,reduce=False,verbose=False):
    '''
    Cosine similarity between the summed context vectors of two words.

    Each side is built like in avgsim: target word plus context window, each
    token also tried with a '#E' suffix, restricted to the model vocabulary.
    The vectors of each side are summed with _avgvec and compared with
    model.cosine_similarities.

    parameters:
        w1 : str
        w2 : str
        c1 : str
        c2 : str
        score : float, passed through unchanged
        model : gensim.keyedvectors
        reduce : bool, filter each side with reducer() first
        verbose : bool, print both word sets
    returns:
        (similarity, score) : tuple, or None when w1 or w2 is out of vocabulary
    '''
    if not (w1 in model.vocab and w2 in model.vocab):
        return None
    first = [w1] + sent_words(c1)
    first = first + [tok+'#E' for tok in first]
    second = [w2] + sent_words(c2)
    second = second + [tok+'#E' for tok in second]
    first = [tok for tok in first if tok in model.vocab]
    second = [tok for tok in second if tok in model.vocab]
    if reduce:
        first = reducer(w1,first,model)
        second = reducer(w2,second,model)
    if verbose:
        print(first)
        print(second)
    cosine = model.cosine_similarities(_avgvec(first,model),[_avgvec(second,model)])[0]
    return (cosine,score)

In [None]:
def get_corr(model,method='asim',reduce=False):
    '''
    Print Pearson and Spearman correlation (x100) between model similarities
    and the human scores of every row in the module-level `data` list.

    parameters:
        model : gensim.keyedvectors
        method : 'asim' (avgsim) or 'avg' (avgvec)
        reduce : bool, forwarded to the similarity function
    raises:
        ValueError : if method is neither 'asim' nor 'avg'
    '''
    # Validate first: previously an unknown method left `sim` unbound on the
    # first row (NameError) or silently reused a stale value.
    if method not in ('asim','avg'):
        raise ValueError("method must be 'asim' or 'avg', got %r" % (method,))
    scorer = avgsim if method == 'asim' else avgvec
    sims = list()
    scores = list()
    for dat in tqdm(data,position=0):
        sim = scorer(dat[0],dat[1],dat[2],dat[3],dat[4],model,reduce=reduce)
        # The scorers return None for pairs with an out-of-vocabulary target word.
        if sim is None:
            continue
        sims.append(sim[0])
        scores.append(sim[1])
    corr, _ = pearsonr(sims, scores)
    print('%.2f &' % (corr*100),end=' ')
    corr, _ = spearmanr(sims, scores)
    print('%.2f' % (corr*100))

In [None]:
get_corr(spxPEM_model)
get_corr(spxEM_model)
get_corr(spx_model)
get_corr(w2v_model)

In [None]:
get_corr(spxPEM_model,method='avg')
get_corr(spxEM_model,method='avg')
get_corr(spx_model,method='avg')
get_corr(w2v_model,method='avg')

In [None]:
from multiprocessing import Pool

def get_corr_parallel(model,method='asim',reduce=False,processes=40):
    '''
    Multi-process variant of get_corr: score every row of the module-level
    `data` list in a worker pool and print both correlations (x100).

    parameters:
        model : gensim.keyedvectors (sent to the workers with every task)
        method : 'asim' (avgsim) or 'avg' (avgvec)
        reduce : bool, forwarded to the similarity function
        processes : int, size of the worker pool
    raises:
        ValueError : if method is neither 'asim' nor 'avg'
    '''
    if method not in ('asim','avg'):
        raise ValueError("method must be 'asim' or 'avg', got %r" % (method,))
    # Only the main module may start worker processes.
    if __name__!="__main__":
        return
    scorer = avgsim if method == 'asim' else avgvec
    sims = list()
    scores = list()
    # The context manager tears the pool down even if a task raises.
    with Pool(processes=processes) as pool:
        # Submit everything first. Calling .get() right after each apply_async
        # (as before) blocks on every task and makes the run serial.
        pending = [
            pool.apply_async(func=scorer, args=(dat[0],dat[1],dat[2],dat[3],dat[4],model,reduce))
            for dat in data
        ]
        for job in tqdm(pending,position=0):
            sim = job.get()
            if sim is None:
                continue
            sims.append(sim[0])
            scores.append(sim[1])
    corr, _ = pearsonr(sims, scores)
    print('Pearsons correlation: %.2f' % (corr*100))
    corr, _ = spearmanr(sims, scores)
    print('Spearmans correlation: %.2f' % (corr*100))

In [None]:
get_corr_parallel(spxEM_model,reduce=True)
get_corr_parallel(spx_model,reduce=True)
get_corr_parallel(w2v_model,reduce=True)

In [None]:
get_corr(spxEM_model,reduce=True)
get_corr(spx_model,reduce=True)
get_corr(w2v_model,reduce=True)

In [None]:
get_corr(spxEM_model,method='avg',reduce=True)
get_corr(spx_model,method='avg',reduce=True)
get_corr(w2v_model,method='avg',reduce=True)

In [None]:
print(w2v_model.similarity('brazil','nut'))
print(spx_model.similarity('brazil','nut'))
print(spxEM_model.similarity('brazil','nut'))
print(spxEM_model.similarity('brazil#E','nut#E'))

In [None]:
print(spx_model.similarity('new','york'))
print(w2v_model.similarity('new','york'))
print(spxEM_model.similarity('new','york'))
print(spxEM_model.similarity('new#E','york#E'))

In [None]:
# avgsim takes (w1, w2, c1, c2, score, model); the score (row index 4) was
# missing, which shifted the model into the score slot and raised TypeError.
avgsim(data[0][0],data[0][1],data[0][2],data[0][3],data[0][4],w2v_model,verbose=True)

In [None]:
x = ['brazil', 'bahia', 'instance', 'never', 'gap', 'income', 'blacks', 'nonwhites', 'relatively', 'brazil#E', 'bahia#E', 'instance#E', 'never#E', 'gap#E', 'income#E', 'blacks#E', 'relatively#E']
y = ['nut', 'player', 'turns', 'change', 'string', 'tension', 'neck', 'bridge', 'pickups', 'features', 'found', 'nut#E', 'player#E', 'turns#E', 'change#E', 'string#E', 'tension#E', 'neck#E', 'bridge#E', 'pickups#E', 'features#E', 'found#E']

In [None]:
x = ['brazil', 'population', 'black', 'politicians', 'city', 'salvador', 'bahia', 'instance', 'never', 'gap', 'income', 'blacks', 'nonwhites', 'relatively', 'small', 'compared', 'large', 'gap', 'whites', 'brazil#E', 'population#E', 'black#E', 'politicians#E', 'city#E', 'salvador#E', 'bahia#E', 'instance#E', 'never#E', 'gap#E', 'income#E', 'blacks#E', 'relatively#E', 'small#E', 'compared#E', 'large#E', 'gap#E', 'whites#E']
y = ['nut', 'machine', 'heads', 'worm', 'gears', 'player', 'turns', 'change', 'string', 'tension', 'neck', 'bridge', 'pickups', 'features', 'found', 'almost', 'every', 'guitar', 'photo', 'shows', 'nut#E', 'machine#E', 'heads#E', 'worm#E', 'gears#E', 'player#E', 'turns#E', 'change#E', 'string#E', 'tension#E', 'neck#E', 'bridge#E', 'pickups#E', 'features#E', 'found#E', 'almost#E', 'every#E', 'guitar#E', 'photo#E', 'shows#E']

In [None]:
x = ['israel', 'case', 'incorporates', 'fiber', 'optics', 'list', 'pathogens', 'attached', 'silver', 'gold', 'nanowires', 'israel#E', 'case#E', 'fiber#E', 'optics#E', 'list#E', 'pathogens#E', 'attached#E', 'silver#E', 'gold#E']
y = ['israeli', 'policy', 'territories', 'song', 'banned', 'radio', 'israeli', 'folk', 'singer', 'miri', 'israeli#E', 'policy#E', 'territories#E', 'song#E', 'banned#E', 'radio#E', 'israeli#E', 'folk#E', 'singer#E', 'miri#E']

In [None]:
x = ['israel', 'similar', 'widely', 'employed', 'immunological', 'technique', 'case', 'incorporates', 'fiber', 'optics', 'list', 'pathogens', 'attached', 'silver', 'gold', 'nanowires', 'netherlands', 'company', 'tno', 'designed', 'israel#E', 'similar#E', 'widely#E', 'employed#E', 'immunological#E', 'technique#E', 'case#E', 'fiber#E', 'optics#E', 'list#E', 'pathogens#E', 'attached#E', 'silver#E', 'gold#E', 'netherlands#E', 'company#E', 'tno#E', 'designed#E']
y = ['israeli', 'written', 'shalom', 'hanoch', 'protest', 'israeli', 'policy', 'territories', 'song', 'banned', 'radio', 'israeli', 'folk', 'singer', 'miri', 'aloni', 'sang', 'israeli', 'pop', 'song', 'israeli#E', 'written#E', 'shalom#E', 'hanoch#E', 'protest#E', 'israeli#E', 'policy#E', 'territories#E', 'song#E', 'banned#E', 'radio#E', 'israeli#E', 'folk#E', 'singer#E', 'miri#E', 'aloni#E', 'sang#E', 'israeli#E', 'pop#E', 'song#E']

In [None]:
# The summing helper defined in this notebook is _avgvec; get_avgvec does not exist.
v1 = _avgvec(x,w2v_model)
v2 = _avgvec(y,w2v_model)

w2v_model.cosine_similarities(v1,[v2])

In [None]:
# The summing helper defined in this notebook is _avgvec; get_avgvec does not exist.
v1 = _avgvec(x,spx_model)
v2 = _avgvec(y,spx_model)

spx_model.cosine_similarities(v1,[v2])

In [None]:
# The summing helper defined in this notebook is _avgvec; get_avgvec does not exist.
v1 = _avgvec(x,spxEM_model)
v2 = _avgvec(y,spxEM_model)

spxEM_model.cosine_similarities(v1,[v2])

In [None]:
# avgsim takes (w1, w2, c1, c2, score, model); the score (row index 4) was
# missing, which shifted the model into the score slot and raised TypeError.
avgsim(data[0][0],data[0][1],data[0][2],data[0][3],data[0][4],spx_model,verbose=True)

In [None]:
# avgsim takes (w1, w2, c1, c2, score, model); the score (row index 4) was
# missing, which shifted the model into the score slot and raised TypeError.
avgsim(data[0][0],data[0][1],data[0][2],data[0][3],data[0][4],spxEM_model,verbose=True)

In [None]:
# avgsim takes (w1, w2, c1, c2, score, model); the score (row index 4) was
# missing, which shifted the model into the score slot and raised TypeError.
avgsim(data[6][0],data[6][1],data[6][2],data[6][3],data[6][4],spxEM_model,verbose=True)

In [None]:
# Print the first vocabulary entry that contains a '#' marker, then stop.
# NOTE(review): no cell visible above this one assigns a module-level `model`;
# it must already be bound in the kernel — confirm which model is intended.
for w in model.vocab:
    if '#' in w:
        print(w)
        break

In [None]:
import wiki as w
import sys
sys.path.append("../../imports/")
import saver as sv

In [None]:
sentences = sv.load("wiki_sentences_spx")

In [None]:
for sent in sentences:
    print(sent[:105])
    break

In [None]:
print(data[0][0])
print(sent_words(data[0][2]))
print(data[0][1])
print(sent_words(data[0][3]))

In [None]:
spxEM_model.similarity('brazil','nut')

In [None]:
spxEM_model.similarity('brazil#E','nut#E')

In [None]:
w2v_model.similarity('brazil','nut')

In [None]:
# NOTE(review): c1 is the module-level name left bound by the SCWS parsing
# loop, i.e. the first context of the last record read.
sent_words(c1)

# non-context similarities

In [None]:
# Paths of the context-free word-similarity datasets evaluated below.
# NOTE(review): absolute, machine-specific paths.
ws353A = '/home/manni/data/wordsim/EN-WS353.txt'
ws353R = '/home/manni/data/wordsim/EN-WSR353.txt'
ws353S = '/home/manni/data/wordsim/EN-WSS353.txt'
rw = '/home/manni/data/wordsim/rw.txt'
sim999 = '/home/manni/data/wordsim/EN-SIM999.txt'
# NOTE(review): 'EN_TRUK' may be a typo for 'EN_TURK' — confirm against the file on disk.
turk = '/home/manni/data/wordsim/EN_TRUK.txt'
mturk = '/home/manni/data/wordsim/MTURK-771.csv'
rg = '/home/manni/data/wordsim/EN-RG-65.txt'
men = '/home/manni/data/wordsim/EN-MEN-LEM.txt'

In [None]:
# Rewrite the tab-separated sim999 file as "word1 word2 score" lines (score is
# column 3) in a sibling file with a '_new' suffix, then point sim999 at it.
# Deriving the source path this way keeps the cell idempotent: re-running it
# re-converts the original file instead of appending '_new' a second time.
sim999_src = sim999[:-len('_new')] if sim999.endswith('_new') else sim999
sim999_out = sim999_src+'_new'
with open(sim999_src) as fin, open(sim999_out,'w') as fout:
    for line in fin:
        fields = line.split('\t')
        try:
            float(fields[3])
        except (IndexError, ValueError):
            # Too few columns or a non-numeric column 3 (e.g. a header line): skip.
            continue
        fout.write("{} {} {} \n".format(fields[0],fields[1],fields[3]))
sim999 = sim999_out

In [None]:
# Rewrite the comma-separated mturk file as "word1 word2 score" lines in a
# sibling file with a '_new' suffix, then point mturk at it.
# Deriving the source path this way keeps the cell idempotent: re-running it
# re-converts the original file instead of appending '_new' a second time.
mturk_src = mturk[:-len('_new')] if mturk.endswith('_new') else mturk
mturk_out = mturk_src+'_new'
with open(mturk_src) as fin, open(mturk_out,'w') as fout:
    for line in fin:
        # Strip the line ending first so it does not end up inside the score field.
        fields = line.strip().split(',')
        if len(fields) < 3:
            # Blank or malformed line: skip it.
            continue
        fout.write("{} {} {} \n".format(fields[0],fields[1],fields[2]))
mturk = mturk_out

In [None]:
# Rewrite the men file as "word1 word2 score" lines, keeping only the part of
# each word before the first '-', in a sibling file with a '_new' suffix; then
# point men at it.
# Deriving the source path this way keeps the cell idempotent: re-running it
# re-converts the original file instead of appending '_new' a second time.
men_src = men[:-len('_new')] if men.endswith('_new') else men
men_out = men_src+'_new'
with open(men_src) as fin, open(men_out,'w') as fout:
    for line in fin:
        fields = line.split()
        if len(fields) < 3:
            # Blank or malformed line: skip it.
            continue
        fout.write("{} {} {} \n".format(fields[0].split('-')[0],fields[1].split('-')[0],fields[2]))
men = men_out

In [None]:
# debug: show how every line of the mturk file splits on whitespace.
# (The vocabulary check that used to follow sat after an unconditional
# `continue` and could never run, so it has been removed.)
with open(mturk) as fin:
    for line in fin:
        print(line.split())

In [None]:
datasets = [ws353A,ws353R,ws353S,rw,sim999,turk,mturk,rg,men]

In [None]:
def avg_sim(w1,w2,model):
    '''
    Mean similarity between all in-vocabulary variants of two words.

    The variants of a word are the plain form and the forms with a '#E' and a
    '#P' suffix; only variants present in model.vocab are used.

    parameters:
        w1 : str
        w2 : str
        model : gensim.keyedvectors
    returns:
        similarity : float, or None when either word has no variant in the vocabulary
    '''
    suffixes = ('', '#E', '#P')
    # Both lists derive from their OWN word; the second list used to contain
    # w1+'#P' instead of w2+'#P' (copy-paste slip).
    a = [w1+s for s in suffixes if w1+s in model.vocab]
    b = [w2+s for s in suffixes if w2+s in model.vocab]
    if not a or not b:
        # Nothing to compare; avoids a division by zero.
        return None
    sims = 0
    for i in a:
        for j in b:
            sims+=model.similarity(i,j)
    return sims/(len(a)*len(b))

In [None]:
import numpy as np
import scipy.stats


def mean_confidence_interval(data, confidence=0.95):
    '''
    Mean of data and the half-width of its Student-t confidence interval.

    parameters:
        data : sequence of numbers
        confidence : float, two-sided confidence level
    returns:
        (mean, half_width) : the interval is mean +/- half_width
    '''
    samples = np.array(data) * 1.0
    dof = len(samples) - 1
    centre = np.mean(samples)
    std_err = scipy.stats.sem(samples)
    half_width = std_err * scipy.stats.t.ppf((1 + confidence) / 2., dof)
    return centre, half_width

In [None]:
def _pair_scores(lines, model):
    '''Return (sims, scores) for every "w1 w2 score" line whose two words are in model.vocab.'''
    sims = list()
    scores = list()
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        if fields[0] not in model.vocab or fields[1] not in model.vocab:
            continue
        sim = avg_sim(fields[0],fields[1],model)
        if sim is None:
            continue
        sims.append(sim)
        scores.append(float(fields[2]))
    return sims, scores


def get_corrs(lines,bs=False):
    '''
    Correlate model similarities with the gold scores of a word-pair dataset.

    The model is read from the module-level name `model` (callers bind it
    before calling, e.g. as a loop variable).

    parameters:
        lines : list(str), each "word1 word2 score" (whitespace separated)
        bs : bool, when True run 100 bootstrap resamples of lines
    returns:
        bs=False : (pearson, spearman) as '%.2f' strings of correlation x100
        bs=True  : ((mean, half_width), (mean, half_width)) from
                   mean_confidence_interval for Pearson and Spearman
    '''
    if not bs:
        sims, scores = _pair_scores(lines, model)
        pcorr, _ = pearsonr(sims, scores)
        scorr, _ = spearmanr(sims, scores)
        return '%.2f' % (pcorr*100),'%.2f' % (scorr*100)

    # Always resample from the ORIGINAL lines. Previously `lines` was rebound
    # to the resample inside the loop, so each round resampled the previous
    # round's sample and diversity collapsed across iterations.
    population = list(lines)
    n_lines = len(population)
    p_corrs = list()
    s_corrs = list()
    for _ in range(100):
        picks = np.random.choice(n_lines,replace=True,size=n_lines)
        resample = [population[i] for i in picks]
        sims, scores = _pair_scores(resample, model)
        corr, _ = pearsonr(sims, scores)
        p_corrs.append(corr)
        corr, _ = spearmanr(sims, scores)
        s_corrs.append(corr)
    return mean_confidence_interval(p_corrs),mean_confidence_interval(s_corrs)

In [None]:
# Print one LaTeX table per dataset with the Pearson/Spearman score of every model.
# All LaTeX fragments are raw strings: '\h' and '\e' are invalid escape
# sequences in normal string literals and trigger warnings on recent Pythons.
for ds in datasets:
    name = ds.split('/')[-1].split('.')[0]
    print(r'\begin{subsection}{'+name+r'}')
    print(r'\begin{table}[!h]')
    print(r'\begin{tabular}{|l|c|c|}')
    print(r'\hline')
    print("Model & Pearsons & Spearmans"+r"\\")
    print(r'\hline')
    # Read the file up front so it is not held open while the models are scored.
    with open(ds) as fin:
        lines = fin.readlines()
    b_size = len(lines)
    idx = list(range(b_size))
    # The loop variable must be called `model`: get_corrs reads the
    # module-level name `model`.
    for model_name, model in zip(model_names, models):
        print(model_name,end=' & ')
        pcorr,scorr=get_corrs(lines)
        print(pcorr,end=' & ')
        print(scorr,end=r'\\')
        print()
    print(r'\hline')
    print(r'\end{tabular}')
    print(r'\end{table}')
    print(r'\end{subsection}')
    print()

In [None]:
# Print a LaTeX table with the similarity of the two words of each phrase, for
# the plain forms and for every marker variant present in the model vocabulary.
# All LaTeX fragments are raw strings: '\h' and '\e' are invalid escape
# sequences in normal string literals and trigger warnings on recent Pythons.
phrases = ['new york','new zealand']
markers = ['#E','#P']
print(r'\begin{table}[!h]')
print(r'\begin{tabular}{|l|c|c|}')
print(r'\hline')
print("Model & Phrase & Similarity"+r"\\")
print(r'\hline')
for phrase in phrases:
    words = phrase.split()
    # One word list per marker, e.g. ['new#E', 'york#E'].
    senses = [[word+marker for word in words] for marker in markers]
    for model_name, model in zip(model_names, models):
        sim = model.similarity(words[0],words[1])
        print(model_name,end=' & ')
        print(phrase,end=' & ')
        print('%.2f' % (sim),end=r'\\')
        print()
        for sense in senses:
            if sense[0] in model.vocab and sense[1] in model.vocab:
                sim = model.similarity(sense[0],sense[1])
                print(model_name,end=' & ')
                print(' '.join(sense),end=' & ')
                print('%.2f' % (sim),end=r'\\')
                print()
    print(r'\hline')
print(r'\end{tabular}')
print(r'\end{table}')

In [None]:
import scipy
print(scipy.__version__)

In [None]:
from scipy.stats import bootstrap

In [None]:
def viterbi_algorithm(observations, states, start_p, trans_p, emit_p):
    '''
    Dictionary-based Viterbi decoder: prints the dynamic-programming table and
    the most likely state sequence for the observations.

    parameters:
        observations : sequence of observation keys
        states : sequence of state names (str, they are joined for printing)
        start_p : dict state -> start weight
        trans_p : dict state -> dict state -> transition weight
        emit_p : dict state -> dict observation -> emission weight
    returns:
        (path, prob) : list of states and the weight of that path
    '''
    # V[t][state] = best weight of any path ending in `state` at step t, plus
    # the predecessor state on that path.
    V = [{}]
    for st in states:
        V[0][st] = {"prob": start_p[st] * emit_p[st][observations[0]], "prev": None}

    for t in range(1, len(observations)):
        V.append({})
        for st in states:
            # max() returns the first maximal element, matching a strict '>' scan.
            prev_st_selected = max(states, key=lambda prev_st: V[t - 1][prev_st]["prob"] * trans_p[prev_st][st])
            max_tr_prob = V[t - 1][prev_st_selected]["prob"] * trans_p[prev_st_selected][st]
            max_prob = max_tr_prob * emit_p[st][observations[t]]
            V[t][st] = {"prob": max_prob, "prev": prev_st_selected}
    for line in dptable(V):
        print(line)

    # Pick the best final state directly from the table. Starting the scan
    # from max_prob = 0.0 with '>' (as before) left best_st as None when every
    # final weight was 0, which crashed the backtracking below.
    best_st = max(V[-1], key=lambda st: V[-1][st]["prob"])
    max_prob = V[-1][best_st]["prob"]

    # Follow the back-pointers from the last step to the first.
    opt = [best_st]
    previous = best_st
    for t in range(len(V) - 2, -1, -1):
        previous = V[t + 1][previous]["prev"]
        opt.append(previous)
    opt.reverse()

    print ("The steps of states are " + " ".join(opt) + " with highest probability of %s" % max_prob)
    return opt, max_prob

def dptable(V):
    '''
    Yield the Viterbi table V as text lines: a header with the step indices,
    then one line per state with its weight at every step (each value and
    state name truncated to 7 characters).
    '''
    header = " ".join("%12d" % step for step in range(len(V)))
    yield header
    for state in V[0]:
        cells = []
        for column in V:
            cells.append("%.7s" % ("%f" % column[state]["prob"]))
        yield "%.7s: " % state + " ".join(cells)

In [None]:
# Toy inputs for viterbi_algorithm.
observations = ("normal", "cold", "dizzy")
states = ("Healthy", "Fever")
# NOTE(review): start_p and trans_p are all 1 rather than normalised
# probabilities, so only the emission weights influence the result —
# presumably deliberate; confirm.
start_p = {"Healthy": 1, "Fever": 1}
trans_p = {
    "Healthy": {"Healthy": 1, "Fever": 1},
    "Fever": {"Healthy": 1, "Fever": 1},
}
emit_p = {
    "Healthy": {"normal": 0.5, "cold": 0.4, "dizzy": 0.1},
    "Fever": {"normal": 0.1, "cold": 0.3, "dizzy": 0.6},
}

In [None]:
viterbi_algorithm(observations, states, start_p, trans_p, emit_p)

In [None]:
model = spxEM_model

In [None]:
P = list()
w = list()
mat = {}

        

In [None]:
import numpy as np

def viterbi(y, A, B, initial_probs = None):
    '''
    Array-based Viterbi decoder.

    parameters:
        y : sequence of int observation indices (used as column indices into B)
        A : (K, K) array, A[k, j] is the weight of moving from state k to state j
        B : (K, M) array, B[j, o] is the weight of state j emitting observation o
        initial_probs : length-K array of start weights; uniform 1/K when None
    returns:
        x : length-T integer array, the most likely state index at every step
        T1 : (K, T) array, T1[j, i] is the best path weight ending in state j at step i
        T2 : (K, T) integer array of back-pointers (best predecessor of state j at step i)
    '''
    K = A.shape[0]
    initial_probs = initial_probs if initial_probs is not None else np.full(K, 1 / K)
    T = len(y)
    T1 = np.empty((K, T), 'd')
    # Index dtype: the former uint8 ('B') arrays silently wrapped state
    # indices above 255, corrupting the path for models with more than 256 states.
    T2 = np.empty((K, T), np.intp)
    T1[:, 0] = initial_probs * B[:, y[0]]
    T2[:, 0] = 0

    for i in range(1, T):
        # step[j, k] = weight of being in state k at step i-1 and moving to state j.
        step = T1[:, i - 1] * A.T
        T1[:, i] = np.max(step * B[np.newaxis, :, y[i]].T, 1)
        T2[:, i] = np.argmax(step, 1)

    x = np.empty(T, np.intp)
    x[-1] = np.argmax(T1[:, T - 1])

    # Walk the back-pointers from the last step to the first.
    for i in reversed(range(1, T)):
        x[i - 1] = T2[x[i], i]

    return x, T1, T2


In [None]:
print(spxEM_model.similarity('new#E','york#E'))
print(spxEM_model.similarity('new#E','york'))
print(spxEM_model.similarity('new','york#E'))
print(spxEM_model.similarity('new','york'))

In [None]:
print(spxPEM_model.similarity('new#E','york#E'))
print(spxPEM_model.similarity('new#E','york'))
print(spxPEM_model.similarity('new','york#E'))
print(spxPEM_model.similarity('new','york'))

# HMM

In [53]:
model = spxEM_model
#model = w2v_model

In [18]:
sentence = 'new york city is composed of five boroughs'
sentence = sentence.split()
sentence_E = [word+'#E' for word in sentence]
sentence_P = [word+'#P' for word in sentence]
#sentences = [sentence,sentence_E,sentence_P]
sentences = [sentence,sentence_E]

In [34]:
# Greedy pass over adjacent positions: for every position i, score each
# combination of variants (one row of `sentences` for word i, one row for
# word i+1) with model.similarity and keep the highest-scoring pair.
best_tuples = list()
z = 1
path = list()  # not used within this cell
for i in range(len(sentence)-1):
    scores = list()
    candidates = list()
    for j in range(len(sentences)):
        for k in range(len(sentences)):
            edge = model.similarity(sentences[j][i],sentences[k][i+1])
            scores.append(edge)
            candidates.append([sentences[j][i],sentences[k][i+1]])
    z = max(scores)  # best score at this position; not read again in this cell
    _best = tuple(candidates[np.argmax(scores)])
    print(scores,_best)
    print()
    best_tuples.append(_best)

[0.5302377, 0.28975484, 0.49186322, 0.862124] ('new#E', 'york#E')

[0.38330087, 0.330131, 0.20029005, 0.5829354] ('york#E', 'city#E')

[0.21510811, 0.112545684, 0.16010503, 0.24233986] ('city#E', 'is#E')

[0.1830385, 0.07866019, 0.20058198, 0.2881142] ('is#E', 'composed#E')

[0.1950385, 0.16179606, 0.08870192, 0.18681721] ('composed', 'of')

[0.37493145, 0.18115063, 0.24392094, 0.35853893] ('of', 'five')

[0.15485162, 0.19954851, 0.17588222, 0.22568594] ('five#E', 'boroughs#E')



In [None]:
# NOTE(review): the only visible assignment of `mat` binds it to an empty dict,
# which has no .shape attribute — this raises AttributeError unless `mat` was
# rebound to an array elsewhere.
mat.shape

In [26]:
# Exhaustive search over every 0/1 assignment per word position: seq[j]
# selects which row of `sentences` supplies word j. The score of an assignment
# is the sum of similarities of adjacent words; each new best is printed.
import itertools
sequences = [seq for seq in itertools.product([0,1], repeat=len(sentence))]
best = None
m_score = 0  # only assignments scoring above 0 can become `best`
for seq in sequences:
    sent = list()
    score = 0
    for i,j in zip(seq,range(len(sentence))):
        sent.append(sentences[i][j])
    for i in range(len(sent)-1): 
        score+=model.similarity(sent[i],sent[i+1])
    if score>m_score:
        m_score=score
        best=sent
        print(best,score)

['new', 'york', 'city', 'is', 'composed', 'of', 'five', 'boroughs'] 2.036506727337837
['new', 'york', 'city', 'is', 'composed', 'of', 'five', 'boroughs#E'] 2.0812036246061325
['new', 'york', 'city', 'is#E', 'composed#E', 'of#E', 'five#E', 'boroughs#E'] 2.0852404981851578
['new', 'york', 'city#E', 'is#E', 'composed#E', 'of#E', 'five#E', 'boroughs'] 2.112061083316803
['new', 'york', 'city#E', 'is#E', 'composed#E', 'of#E', 'five#E', 'boroughs#E'] 2.1618648022413254
['new', 'york#E', 'city#E', 'is#E', 'composed#E', 'of#E', 'five#E', 'boroughs#E'] 2.1741863638162613
['new#E', 'york#E', 'city', 'is', 'composed', 'of', 'five', 'boroughs'] 2.1853822618722916
['new#E', 'york#E', 'city', 'is', 'composed', 'of', 'five', 'boroughs#E'] 2.230079159140587
['new#E', 'york#E', 'city', 'is#E', 'composed#E', 'of#E', 'five#E', 'boroughs#E'] 2.234116032719612
['new#E', 'york#E', 'city#E', 'is', 'composed', 'of', 'five', 'boroughs'] 2.5130245238542557
['new#E', 'york#E', 'city#E', 'is', 'composed', 'of', 'f

In [43]:
#Binary sequence
# Same exhaustive enumeration as the similarity-sum search, but an assignment
# scores one point for every adjacent pair that appears in best_tuples.

import itertools
sequences = [seq for seq in itertools.product([0,1], repeat=len(sentence))]
best = None
m_score = 0
for seq in sequences:
    sent = list()
    score = 0
    for i,j in zip(seq,range(len(sentence))):
        sent.append(sentences[i][j])
    # Forward pass: every adjacent pair (sent[i], sent[i+1]).
    for i in range(len(sent)-1):
        if (sent[i],sent[i+1]) in best_tuples:
            score+=1
    # Backward pass: i runs from the last index down to 2, so every adjacent
    # pair except (sent[0], sent[1]) is counted a second time.
    # NOTE(review): confirm that this double counting is intended.
    for i in range(len(sent)-1,1,-1):
        if (sent[i-1],sent[i]) in best_tuples:
            score+=1
    if score>m_score:
        m_score=score
        best=sent
        print(best,score)

['new', 'york', 'city', 'is', 'composed', 'of', 'five', 'boroughs'] 4
['new', 'york', 'city#E', 'is#E', 'composed', 'of', 'five', 'boroughs'] 6
['new', 'york#E', 'city#E', 'is#E', 'composed', 'of', 'five', 'boroughs'] 8
['new#E', 'york#E', 'city#E', 'is#E', 'composed', 'of', 'five', 'boroughs'] 9


In [36]:
best

['new#E', 'york#E', 'city#E', 'is#E', 'composed', 'of', 'five', 'boroughs']

In [11]:
truth = ['new#E', 'york#E', 'city#E', 'is', 'composed', 'of', 'five', 'boroughs#E']
score = 0
for i in range(len(truth)-1): 
    score+=model.similarity(truth[i],truth[i+1])
print(score)

2.557721421122551


In [27]:
import pickle

a = dict()

for sent in sentences:
    for word in sent:
        if word in model.vocab:
            a[word]=model.get_vector(word)

with open('spx.pkl', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read back the word -> vector dictionary stored in spx.pkl and print it.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on files this notebook wrote itself.
with open('spx.pkl', 'rb') as handle:
    b = pickle.load(handle)
    print(b)

In [54]:
# Rows of [word_a, word_b, attribute, label] for the attribute comparison below.
# NOTE(review): this rebinds `data`, which the SCWS cells and get_corr use for
# a different row layout; re-running those afterwards will not work as before.
data = [['bookcase','fridge','wood',1],['bucket','mug','round',0],['angle','curve','sharp',1],\
        ['pelican','turtle','water',0],['wire','coil','metal',0],['apple','banana','red',1]]

In [57]:
# For each row compare both words with the attribute word (items[2]):
# first with the plain forms ('NE:'), then with the '#E' forms ('E:').
model = spxEM_model

for items in data:
    s1 = model.similarity(items[0],items[2])
    s2 = model.similarity(items[1],items[2])
    print('NE:',s1,s2,items[3])
    # A similarity of 0 stands for "'#E' form missing from the vocabulary".
    if items[0]+'#E' in model.vocab and items[2]+'#E' in model.vocab:
        s1 = model.similarity(items[0]+'#E',items[2]+'#E')
    else:
        s1 = 0
    if items[1]+'#E' in model.vocab and items[2]+'#E' in model.vocab:
        s2 = model.similarity(items[1]+'#E',items[2]+'#E')
    else:
        s2 = 0
    print('E:',s1,s2,items[3])
    print()

NE: 0.3058291 0.18865623 1
E: 0 0.29928586 1

NE: 0.11041794 0.0500729 0
E: 0.11949739 0.12153117 0

NE: 0.40792596 0.39381152 1
E: 0.19719705 0.28119615 1

NE: 0.21382351 0.32552668 0
E: 0.30226216 0.39326108 0

NE: 0.50295186 0.38081402 0
E: 0.42521927 0.45205674 0

NE: 0.26313502 0.26002082 1
E: 0.23832269 0.27724728 1



In [58]:
# Same comparison as for spxEM_model, run on w2v_model.
# NOTE(review): this is a copy of the previous cell except that the second
# print has no 'E:' label — consider one shared function.
model = w2v_model

for items in data:
    s1 = model.similarity(items[0],items[2])
    s2 = model.similarity(items[1],items[2])
    print('NE:',s1,s2,items[3])
    # A similarity of 0 stands for "'#E' form missing from the vocabulary".
    if items[0]+'#E' in model.vocab and items[2]+'#E' in model.vocab:
        s1 = model.similarity(items[0]+'#E',items[2]+'#E')
    else:
        s1 = 0
    if items[1]+'#E' in model.vocab and items[2]+'#E' in model.vocab:
        s2 = model.similarity(items[1]+'#E',items[2]+'#E')
    else:
        s2 = 0
    print(s1,s2,items[3])
    print()

NE: 0.25981387 0.15935534 1
0 0 1

NE: 0.11115408 0.13547464 0
0 0 0

NE: 0.3331629 0.37625045 1
0 0 1

NE: 0.20982693 0.2429899 0
0 0 0

NE: 0.47389236 0.33841693 0
0 0 0

NE: 0.16752617 0.17206126 1
0 0 1

