In [1]:
import sys
from scipy.sparse.linalg import eigsh
from scipy.sparse.linalg.eigen import arpack
import numpy as np
from scipy.spatial.distance import cosine
from scipy.sparse import csc_matrix, lil_matrix, coo_matrix, csr_matrix
from collections import Counter
import matplotlib.pyplot as plt 
from collections import OrderedDict
from itertools import chain
import operator
import pickle
import pandas as pd
import gensim
from sklearn.metrics.pairwise import cosine_similarity



import warnings
import tqdm
warnings.filterwarnings("ignore")
%matplotlib inline




In [11]:
def calc_mapak(frame, score_col = 'cosine', word_col = 'word1', rel_col = 'SimLex'):
    count = 0
    mapak = 0.0
    for word, df in frame.groupby(word_col):

        median_ = df[rel_col].median()

        K = (1 * (df[rel_col] >= median_)).sum()
        if K == 0:
            print(df)
        df_sorted = df.sort_values(by = score_col, ascending = False)
        apak = 0.0
        for k_ in range(K):
            k = k_+1
            pak = 1.0 * (df_sorted.iloc[:k][rel_col] >= median_).sum()/float(k)
            apak += pak
        apak /= float(K)

        mapak += apak
        count += 1
    mapak /= float(count)
    return mapak

def calc_cosines(frame, model):
    cosines = []
    for ind, row in frame.iterrows():
        cosines.append(cosine_similarity([model.wv[row.word1], model.wv[row.word2]])[0][1])
    return cosines

def calc_V(X, k = 100):
    eig_vals, eig_vecs = eigsh(X, k, which = 'LA')
    V = np.zeros((X.shape[0], k))
    for i in range(k):
        V[:, i] = np.sqrt(eig_vals[-1 - i]) * eig_vecs[:,-1 - i]
    return V

def calc_V(X, k = 100, which_ = 'LA'):
    eig_vals, eig_vecs = eigsh(X, k, which = which_)
    V = np.zeros((X.shape[0], k))
    if which_ == 'LM':
        order = np.argsort(np.abs(eig_vals))[::-1]
        for i in range(k):
            ind = order[i]
            V[:, i] = np.sign(eig_vals[ind]) * np.sqrt(np.abs(eig_vals[ind])) * eig_vecs[:,ind]
    elif which_ == 'LA':
        for i in range(k):
            V[:, i] = np.sqrt(eig_vals[-1 - i]) * eig_vecs[:,-1 - i]
    return V

def calc_cosines_V(frame, V, word_to_num):
    cosines = []
    for ind, row in frame.iterrows():

        cosines.append(cosine_similarity([V[word_to_num[row.word1], :], V[word_to_num[row.word2], :]])[0][1])
    return cosines

def test_models(k, X, word_to_num, frame, col_name, W2V_ = True, which_ = 'LA'):
    print('Vocab size: ', X.shape[0], ' factorization type: ', which_)
    size_ = int(k/2)
    if X.shape[0] == 10000:
        min_count_ = 175
    elif X.shape[0] == 100000:
        min_count_ = 5
    elif X.shape[0] == 50000:
        min_count_ = 15
    else:
        min_count_ = 1
        
    V = calc_V(X, k, which_)
    frame['V'+ col_name] = calc_cosines_V(frame, V, word_to_num)
    if W2V_:
        model = gensim.models.Word2Vec(split_lines, size=size_, window=2, min_count=min_count_)
        frame['w2v_'+ col_name] = calc_cosines(frame, model)

        print('comp =', k,'min_c =',min_count_,'\t',round(calc_mapak(frame, 'w2v_'+ col_name),4), '\t\t', round(calc_mapak(frame, 'V'+ col_name),4))
    else:
        print('comp =', k,'\t\t', round(calc_mapak(frame, 'V'+ col_name),4))

def calc_X_short(X, word_counts, word_list, vocab_size = 100000, stop_words_num = 29):

    stop_inds = [x[0] for x in word_counts.most_common(stop_words_num)]
    #print(stop_inds)
    stop_words = [word_list[x] for x in stop_inds]

    vocab_inds = sorted([x[0] for x in word_counts.most_common(vocab_size + stop_words_num) if not x[0] in stop_inds])
    vocab = [word_list[x] for x in vocab_inds]
    word_freqs_short = [word_counts[x] for x in vocab_inds]
    word_to_num_short = {}
    word_list_short = {}
    for i, word in enumerate(vocab):
        word_to_num_short[word] = i
        word_list_short[i] = word

    X_short = X[vocab_inds, :]
    X_short = X_short[:, vocab_inds]
    return X_short, word_list_short, word_to_num_short, word_freqs_short

def calc_norm_X(X, word_freqs):
    inv_word_freqs = [1.0/float(x) for x in word_freqs]
    N = coo_matrix((np.sqrt(inv_word_freqs), (range(len(inv_word_freqs)), range(len(inv_word_freqs)))), shape=(len(inv_word_freqs), len(inv_word_freqs)))
    N_csr = N.tocsr()
    X_norm = N_csr.dot(X.dot(N_csr))
    return X_norm

In [3]:
lines = [line.rstrip('\n') for line in open('parsed_simple_wiki.txt', encoding = 'utf-8')]
split_lines = [line.split(' ') for line in lines]

In [4]:
simlex_doubled = pickle.load(open('simlex_doubled_cleaned.pkl','rb'))
vocab = pickle.load(open('vocab_10000','rb'))
word_counts = Counter(simlex_doubled['word1'])
word_set3 = set()
for word in simlex_doubled.word1.unique():
    if word_counts[word] > 2:
        word_set3.add(word)

simlex_fr3 = simlex_doubled[simlex_doubled.word1.isin(word_set3)]
simlex_fr3 = simlex_fr3[simlex_fr3.word1.isin(vocab)]
simlex_fr3 = simlex_fr3[simlex_fr3.word2.isin(vocab)]
print(len(word_set3), simlex_fr3.shape[0])
simlex_fr3.head()

228 645


Unnamed: 0,word1,word2,SimLex
1,new,old,1.58
4,hard,difficult,8.77
5,difficult,hard,8.77
8,hard,easy,0.95
9,easy,hard,0.95


In [5]:
X = pickle.load(open('X_maxtrix5_end.pkl','rb'))
for i in tqdm.tqdm(range(X.shape[0])):
    X[i, i] = 0

100%|███████████████████████████████████████████████████████████████████████| 468473/468473 [00:28<00:00, 16179.96it/s]


In [6]:
word_set = pickle.load( open('word_set_small_wiki.pkl', 'rb'))
s0 = pickle.load(open('s0_end.pkl','rb'))
#468473, 468473
X.shape[0], len(word_set)
word_to_num = {}
num_to_word = {}
n = 0
for i, word in enumerate(word_set):
    word_to_num[word] = i
    num_to_word[i] = word
    n += 1
word_list = list(word_set)
word_counts = Counter(s0)

In [7]:
X_short, word_list_short, word_to_num_short, word_freqs_short = calc_X_short(X, 
                                        word_counts, word_list, 100000, 29)
X_norm = calc_norm_X(X_short, word_freqs_short)
X_short.shape, X_norm.shape

((100000, 100000), (100000, 100000))

In [10]:
#На полном SimLex
test_models(100, X_short, word_to_num_short, simlex_fr3, '_short100')
test_models(100, X_short, word_to_num_short, simlex_fr3, '_short100', W2V_ = False, which_ = 'LM')
test_models(100, X_norm, word_to_num_short, simlex_fr3, '_short100', W2V_ = False)
test_models(100, X_norm, word_to_num_short, simlex_fr3, '_short100', W2V_ = False, which_ = 'LM')


Vocab size:  100000  factorization type:  LA
LA path
comp = 100 min_c = 5 	 0.6231 		 0.6788
Vocab size:  100000  factorization type:  LM
LM path
comp = 100 		 0.6496
Vocab size:  100000  factorization type:  LA
LA path
comp = 100 		 0.5977
Vocab size:  100000  factorization type:  LM
LM path
comp = 100 		 0.5977


In [12]:
#На полном SimLex
test_models(200, X_short, word_to_num_short, simlex_fr3, '_short200')
test_models(200, X_short, word_to_num_short, simlex_fr3, '_short200', W2V_ = False, which_ = 'LM')
test_models(200, X_norm, word_to_num_short, simlex_fr3, '_short200', W2V_ = False)
test_models(200, X_norm, word_to_num_short, simlex_fr3, '_short200', W2V_ = False, which_ = 'LM')


Vocab size:  100000  factorization type:  LA
comp = 200 min_c = 5 	 0.6451 		 0.6891
Vocab size:  100000  factorization type:  LM
comp = 200 		 0.6707
Vocab size:  100000  factorization type:  LA
comp = 200 		 0.6197
Vocab size:  100000  factorization type:  LM
comp = 200 		 0.6197


In [13]:
del X_short, X_norm

In [14]:
X_short, word_list_short, word_to_num_short, word_freqs_short = calc_X_short(X, 
                                        word_counts, word_list, 10000, 29)
X_norm = calc_norm_X(X_short, word_freqs_short)
X_short.shape, X_norm.shape

((10000, 10000), (10000, 10000))

In [15]:
#На полном SimLex
test_models(100, X_short, word_to_num_short, simlex_fr3, '_short100')
test_models(100, X_short, word_to_num_short, simlex_fr3, '_short100', W2V_ = False, which_ = 'LM')
test_models(100, X_norm, word_to_num_short, simlex_fr3, '_short100', W2V_ = False)
test_models(100, X_norm, word_to_num_short, simlex_fr3, '_short100', W2V_ = False, which_ = 'LM')


Vocab size:  10000  factorization type:  LA
comp = 100 min_c = 175 	 0.6381 		 0.6793
Vocab size:  10000  factorization type:  LM
comp = 100 		 0.6496
Vocab size:  10000  factorization type:  LA
comp = 100 		 0.6128
Vocab size:  10000  factorization type:  LM
comp = 100 		 0.6109


In [16]:
#На полном SimLex
test_models(200, X_short, word_to_num_short, simlex_fr3, '_short200')
test_models(200, X_short, word_to_num_short, simlex_fr3, '_short200', W2V_ = False, which_ = 'LM')
test_models(200, X_norm, word_to_num_short, simlex_fr3, '_short200', W2V_ = False)
test_models(200, X_norm, word_to_num_short, simlex_fr3, '_short200', W2V_ = False, which_ = 'LM')


Vocab size:  10000  factorization type:  LA
comp = 200 min_c = 175 	 0.6553 		 0.6891
Vocab size:  10000  factorization type:  LM
comp = 200 		 0.677
Vocab size:  10000  factorization type:  LA
comp = 200 		 0.6452
Vocab size:  10000  factorization type:  LM
comp = 200 		 0.6211


In [17]:
X_short, word_list_short, word_to_num_short, word_freqs_short = calc_X_short(X, 
                                        word_counts, word_list, 50000, 29)
X_norm = calc_norm_X(X_short, word_freqs_short)
X_short.shape, X_norm.shape

((50000, 50000), (50000, 50000))

In [18]:
#На полном SimLex
test_models(100, X_short, word_to_num_short, simlex_fr3, '_short100')
test_models(100, X_short, word_to_num_short, simlex_fr3, '_short100', W2V_ = False, which_ = 'LM')
test_models(100, X_norm, word_to_num_short, simlex_fr3, '_short100', W2V_ = False)
test_models(100, X_norm, word_to_num_short, simlex_fr3, '_short100', W2V_ = False, which_ = 'LM')


Vocab size:  50000  factorization type:  LA
comp = 100 min_c = 15 	 0.6285 		 0.6788
Vocab size:  50000  factorization type:  LM
comp = 100 		 0.6496
Vocab size:  50000  factorization type:  LA
comp = 100 		 0.6108
Vocab size:  50000  factorization type:  LM
comp = 100 		 0.6108


In [19]:
#На полном SimLex
test_models(200, X_short, word_to_num_short, simlex_fr3, '_short100')
test_models(200, X_short, word_to_num_short, simlex_fr3, '_short100', W2V_ = False, which_ = 'LM')
test_models(200, X_norm, word_to_num_short, simlex_fr3, '_short100', W2V_ = False)
test_models(200, X_norm, word_to_num_short, simlex_fr3, '_short100', W2V_ = False, which_ = 'LM')


Vocab size:  50000  factorization type:  LA
comp = 200 min_c = 15 	 0.6512 		 0.6891
Vocab size:  50000  factorization type:  LM
comp = 200 		 0.6707
Vocab size:  50000  factorization type:  LA
comp = 200 		 0.6258
Vocab size:  50000  factorization type:  LM
comp = 200 		 0.613
