In [24]:
from gensim.models import KeyedVectors
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [3]:
# Location of the embedding file to evaluate.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
spx = '/home/manni/embs/en_wiki_spx_mc100_epoch5_300_em.txt'
# Load the embeddings from word2vec *text* format (binary=False) into a
# KeyedVectors table; `model` is used by the evaluation cells below.
model = KeyedVectors.load_word2vec_format(spx, binary=False)

# SCWS

In [6]:
# Tab-separated SCWS ratings file that is parsed in the next cell.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
scws_file = '/home/manni/data/SCWS/ratings.txt'

In [14]:
# Parse the SCWS ratings file into [word1, word2, context1, context2, score]
# records, one per non-empty line.
data = list()
with open(scws_file) as fin:
    # Stream the file line by line; there is no need to materialise every
    # line up front with readlines().
    for line in fin:
        # A blank line (e.g. a trailing newline at EOF) carries no record and
        # would otherwise fail with IndexError when the columns are indexed.
        if not line.strip():
            continue
        # Lower-case the whole row once so both the target words and their
        # contexts are normalised the same way before the vocabulary lookup.
        fields = line.lower().split('\t')
        w1 = fields[1]            # first target word
        w2 = fields[3]            # second target word
        c1 = fields[5]            # context of w1 (target marked with <b> ... </b>)
        c2 = fields[6]            # context of w2 (target marked with <b> ... </b>)
        score = float(fields[7])  # numeric rating used as the gold score
        data.append([w1, w2, c1, c2, score])

In [15]:
# Peek at the first parsed record: [word1, word2, context1, context2, score].
data[0]

['brazil',
 'nut',
 'gap in income between blacks and other non-whites is relatively small compared to the large gap between whites and non-whites . other factors such as illiteracy and education level show the same patterns . unlike in the us where african americans were united in the civil rights struggle , in <b> brazil </b> the philosophy of whitening has helped divide blacks from other non-whites and prevented a more active civil rights movement . though afro-brazilians make up half the population there are very few black politicians . the city of salvador , bahia for instance is 80 % afro-brazilian but has never',
 'of the neck , bridge , and pickups , there are features which are found in almost every guitar . the photo below shows the different parts of an electric guitar . the headstock ( 1 ) contains the metal machine heads , which are used for tuning ; the <b> nut </b> ( 1.4 ) , a thin fret-like strip of metal , plastic , graphite or bone which the strings pass over as they 

In [37]:
from nltk.corpus import stopwords
# English stop words as a set for O(1) membership tests in clean_sent().
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally — confirm before running on a fresh environment.
stop_words = set(stopwords.words('english'))

def clean_sent(sent):
    '''
    Split a sentence on whitespace and drop every token that appears in the
    module-level `stop_words` set. Token order is preserved.

    parameters:
        sent: str
    returns:
        tokens: list(str)
    '''
    kept = []
    for token in sent.split():
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def sent_words(sent, window=5):
    '''
    Collect the context tokens around the <b> ... </b>-marked target word.

    The text before the first '<b>' and the text after the last '</b>' are
    each cleaned with clean_sent(); up to `window` tokens closest to the
    marker are kept from each side. The marked word itself is not returned.

    If a marker is missing, the corresponding side falls back to the whole
    sentence (same behaviour as before).

    parameters:
        sent: str
        window: int, maximum number of tokens kept on each side
                (default 5, the value that used to be hard-coded)
    returns:
        tokens: list(str), left-context tokens followed by right-context tokens
    raises:
        ValueError: if window is negative
    '''
    if window < 0:
        raise ValueError('window must be non-negative, got %r' % (window,))
    # Guard explicitly: tokens[-0:] would return the *entire* list, not an
    # empty one, so a zero-width window must short-circuit.
    if window == 0:
        return []
    left = sent.partition('<b>')[0]      # everything before the first opening tag
    right = sent.rpartition('</b>')[2]   # everything after the last closing tag
    before = clean_sent(left)[-window:]
    after = clean_sent(right)[:window]
    return before + after

def avgsim(w1,w2,c1,c2,model):
    '''
    Computes average similarity score.

    Each target word is pooled with the in-vocabulary context tokens returned
    by sent_words() for its context; the score is the mean of
    model.similarity(i, j) over every pair (i, j) drawn from the two pools.

    parameters:
        w1 : str
        w2 : str
        c1 : str, context of w1 with the target marked by <b> ... </b>
        c2 : str, context of w2 with the target marked by <b> ... </b>
        model : gensim.keyedvectors
    returns:
        score : float, or None when w1 or w2 is out of vocabulary
    '''
    # gensim >= 4 removed KeyedVectors.vocab in favour of key_to_index;
    # resolve whichever mapping this model exposes so both versions work.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab
    if w1 not in vocab or w2 not in vocab:
        return None
    # Both pools contain at least the target word itself, so the divisor
    # below can never be zero.
    a = [word for word in [w1] + sent_words(c1) if word in vocab]
    b = [word for word in [w2] + sent_words(c2) if word in vocab]
    total = sum(model.similarity(i, j) for i in a for j in b)
    return total / (len(a) * len(b))

In [38]:
# Score every parsed SCWS pair with avgsim() and keep the gold rating next to
# it; `sims` and `scores` stay index-aligned for the correlation cells.
sims = list()
scores = list()
for dat in tqdm(data,position=0):
    sim = avgsim(dat[0],dat[1],dat[2],dat[3],model)
    # avgsim() signals an out-of-vocabulary target word with None. Compare
    # against None explicitly: a plain truthiness test would also discard a
    # legitimate similarity of exactly 0.0 and silently shrink the sample.
    if sim is not None:
        score = dat[4]
        sims.append(sim)
        scores.append(score)

100%|██████████| 2003/2003 [00:02<00:00, 836.01it/s]


In [39]:
# Pearson correlation between the model similarities and the gold scores;
# the second return value is discarded.
corr, _ = pearsonr(sims, scores)
# Reported scaled by 100 and rounded to two decimals.
print(round(corr*100,2))

44.56


In [None]:
# Spearman rank correlation between the model similarities and the gold
# scores; the second return value is discarded.
corr, _ = spearmanr(sims, scores)
# NOTE(review): printed as the raw coefficient with two decimals, whereas the
# Pearson cell prints the value scaled by 100 — the two are not on the same scale.
print('Spearmans correlation: %.2f' % corr)

In [52]:
# Probe the vocabulary for a token containing '#' and show the first match;
# nothing is printed when no such token exists.
hash_token = next((token for token in model.vocab if '#' in token), None)
if hash_token is not None:
    print(hash_token)

In [54]:
import sys

# Extend the module search path BEFORE importing the helper modules, so the
# cell also works on a fresh kernel, and skip the append when the entry is
# already present so re-running the cell does not pile up duplicates.
# NOTE(review): `wiki` and `saver` are presumably project-local modules living
# in this directory — confirm.
_IMPORTS_DIR = "../../imports/"
if _IMPORTS_DIR not in sys.path:
    sys.path.append(_IMPORTS_DIR)

import wiki as w
import saver as sv

In [58]:
# Load the object stored under the name "wiki_sentences_spx" via the project's
# saver helper.
# NOTE(review): presumably an iterable of token lists (see the preview in the
# next cell) — the exact type returned by sv.load is not visible here.
sentences = sv.load("wiki_sentences_spx")

In [59]:
# Preview only the first sentence: print its first 105 tokens and stop
# iterating immediately.
for sent in sentences:
    print(sent[:105])
    break

['anarchism', 'is', 'a', '[', 'political', 'philosophy', ']', 'and', '[', 'movement', ']', 'that', 'is', 'sceptical', 'of', '[', 'authority', ']', 'and', 'rejects', 'all', 'involuntary', 'coercive', 'forms', 'of', '[', 'hierarchy', ']', 'anarchism', 'calls', 'for', 'the', 'abolition', 'of', 'the', '[', 'state', ']', 'which', 'it', 'holds', 'to', 'be', 'unnecessary', 'undesirable', 'and', 'harmful', 'as', 'a', 'historically', '[', 'left', 'wing', ']', 'movement', 'placed', 'on', 'the', 'farthest', 'left', 'of', 'the', '[', 'political', 'spectrum', ']', 'it', 'is', 'usually', 'described', 'alongside', '[', 'libertarian', 'marxism', ']', 'as', 'the', '[', 'libertarian', ']', 'wing', '[', 'libertarian', 'socialism', ']', 'of', 'the', '[', 'socialist', 'movement', ']', 'and', 'has', 'a', 'strong', 'historical', 'association', 'with', '[', 'anti', 'capitalism', ']', 'and', '[', 'socialism']
