In [15]:
import evaluate
import process
import numpy as np
text = 'data/brown.txt'

In [16]:
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [17]:
class MySentences(object):
    def __init__(self, filename):
        self.filename = filename
 
    def __iter__(self):
        for line in open(self.filename, 'r'):
            yield [word.lower() for word in tokenizer.tokenize(line)]
#sentence = MySentences(text)

In [18]:
from collections import Counter
# count unigrams
def vocab_index(sentences):
    word_counts = Counter()
    for x in sentences:
        word_counts.update(x)
    print(len(word_counts))
    print(word_counts.most_common(5))
    print(word_counts.most_common()[-5:])
    
    # generate index for words in vocabulary
    index_dict = dict()
    ii = 0
    for i in word_counts:
        index_dict[i] = ii
        ii += 1
    
    return index_dict

#index_dict = vocab_index(sentence)

In [19]:
# count co-occurance as a dictionary "joint"
def cooc_dict(sentences, iwin):
    joint = Counter()
    for isen in sentences:
        for j, word in enumerate(isen):
            index_min = max(0, j-iwin)
            index_max = min(len(isen), j+iwin+1)
            index = [ii for ii in range(index_min, index_max) if ii!=j]
            for iin in index:
                joint[(word, isen[iin])] += 1
    return joint

# joint = cooc_dict(sentence, 2)
# print(len(joint))
# print(joint.most_common(5))

In [20]:
# transform count joint to sparse matrix
from scipy import sparse
def cooc_matrix(joint_dict, index_dict):
    row_index = []
    col_index = []
    values = []
    for (wi,wj), count in joint.items():
        row = index_dict[wi]
        col = index_dict[wj]
        value = count
        row_index.append(row)
        col_index.append(col)
        values.append(value)
    return sparse.csr_matrix((values, (row_index, col_index)))
#joint_matrix = cooc_matrix(joint, index_dict)

In [21]:
import math
def ppmi_matrix(joint_matrix):
   # calculate the column sum, row sum and total sum of the co-occur matrix.
    sum_a0 = joint_matrix.sum(axis=0)
    print(np.shape(sum_a0))
    sum_a1 = joint_matrix.sum(axis=1)
    print(np.shape(sum_a1))
    sum_total = joint_matrix.sum()
    print(sum_total) 
   
    # find the non-zero elements in the sparse matrix
    nonzero_index = joint_matrix.nonzero()
    num_nonzero = np.shape(nonzero_index)[1]
    print(num_nonzero)
    
    # calculate values for non-zero ppmi
    ppmi_values = []
    for i in range(num_nonzero):
        row = nonzero_index[0][i]
        col = nonzero_index[1][i]
        pwc_scaled = joint_matrix[row, col]
        pwpc_scaled = sum_a1[row,0]*sum_a0[0,col]/sum_total
        if pwc_scaled > pwpc_scaled:
            value = math.log(pwc_scaled/pwpc_scaled)
        else:
            value = 0
        ppmi_values.append(value)
    
    return sparse.csr_matrix((ppmi_values, nonzero_index)) 

#ppmi = ppmi_matrix(joint_matrix)

In [22]:
from scipy.sparse import linalg
# truncate with svd
def ppmi_svd(ppmi, idim):
    uu,ss,vv = linalg.svds(ppmi, idim)
    print(np.shape(uu),np.shape(ss),np.shape(vv)) 
  
    sigma_sr = np.diag([x**0.5 for x in ss])
    return np.matmul(uu,sigma_sr)

#word_vecs = ppmi_svd(ppmi, 100)

In [23]:
import os
def save_wv(word_vecs, index_dict, iwin, idim):
    keys = list(index_dict.keys())
    print(np.shape(keys))
    
    f1 = open('../../NLPdata/hw3/savedModel/wv_svd_win%d_dim%d.txt' %(iwin, idim),'w')
    for i in range(len(keys)):
        f1.write(keys[i]+' '+' '.join(str(x) for x in word_vecs[i,:]))
        f1.write('\r\n')
    f1.close()    
    
#save_wv(word_vecs, index_dict, 2,100)

In [24]:
def svd_paras(corpus, iwin, idim):
    index_dict = vocab_index(corpus)
    joint = cooc_dict(corpus, iwin)
    joint_matrix = cooc_matrix(joint, index_dict)
    ppmi = ppmi_matrix(joint_matrix)
    word_vecs = ppmi_svd(ppmi, idim)
    save_wv(word_vecs, index_dict, iwin, idim)
    
#svd_paras(sentence,3,100)

# test final win10, dim1000

In [25]:
iwin = 10
idim = 1000
index_dict = vocab_index(sentence)
joint = cooc_dict(sentence, iwin)
joint_matrix = cooc_matrix(joint, index_dict)
ppmi = ppmi_matrix(joint_matrix)
nonzero_index = joint_matrix.nonzero()

42432
[('the', 70003), ('of', 36473), ('and', 28935), ('to', 26247), ('a', 23502)]
[('perelman', 1), ('exhaling', 1), ('aviary', 1), ('boucle', 1), ('stupefying', 1)]
(1, 42432)
(42432, 1)
14851962
4815005


In [26]:
np.shape(nonzero_index)

(2, 4815005)

In [14]:
windowList = [2,5,10]
dimList = [100,300,1000]

sentence = MySentences(text)
for iwin in windowList:
    for idim in dimList:
        svd_paras(sentence, iwin, idim)

42432
[('the', 70003), ('of', 36473), ('and', 28935), ('to', 26247), ('a', 23502)]
[('perelman', 1), ('exhaling', 1), ('aviary', 1), ('boucle', 1), ('stupefying', 1)]
(1, 42432)
(42432, 1)
3794604
1468689
(42432, 100) (100,) (100, 42432)
(42432,)
42432
[('the', 70003), ('of', 36473), ('and', 28935), ('to', 26247), ('a', 23502)]
[('perelman', 1), ('exhaling', 1), ('aviary', 1), ('boucle', 1), ('stupefying', 1)]
(1, 42432)
(42432, 1)
3794604
1468689
(42432, 300) (300,) (300, 42432)
(42432,)
42432
[('the', 70003), ('of', 36473), ('and', 28935), ('to', 26247), ('a', 23502)]
[('perelman', 1), ('exhaling', 1), ('aviary', 1), ('boucle', 1), ('stupefying', 1)]
(1, 42432)
(42432, 1)
3794604
1468689
(42432, 1000) (1000,) (1000, 42432)
(42432,)
42432
[('the', 70003), ('of', 36473), ('and', 28935), ('to', 26247), ('a', 23502)]
[('perelman', 1), ('exhaling', 1), ('aviary', 1), ('boucle', 1), ('stupefying', 1)]
(1, 42432)
(42432, 1)
3794604
1468689
(42432, 100) (100,) (100, 42432)
(42432,)
42432
[('

# following are test

In [69]:
# test cos similarity with ppmi
from sklearn.metrics.pairwise import cosine_similarity
def ww_sim(word, mat, topn=10):
    indx = index_dict[word]
    if isinstance(mat, sparse.csr_matrix):
        v1 = mat.getrow(indx)
    else:
        v1 = mat[indx:indx+1, :]
    sims = cosine_similarity(mat, v1).flatten()
    sindxs = np.argsort(-sims)
    sim_word_scores = [(word_dict[sindx], sims[sindx]) for sindx in sindxs[0:topn]]
    return sim_word_scores

ww_sim('uncle', ppmi)

[('uncle', 0.9999999999999993),
 ('randolph', 0.21424651831115082),
 ('ffortescue', 0.15364112842611377),
 ('conspires', 0.14258384785514255),
 ('dragooned', 0.14046243950105333),
 ('countrey', 0.1396034035433304),
 ('stowe', 0.1294718674981437),
 ('replanted', 0.12937519810706688),
 ('grandparents', 0.11413975147356778),
 ('lorde', 0.11341216492243941)]

In [71]:
ww_sim("female",word_vecs)

[('female', 1.0),
 ('idal', 0.7426855705101922),
 ('dissenters', 0.6487736746520316),
 ('matriarchal', 0.5460884319403518),
 ('amazons', 0.5286572527913542),
 ('significantly', 0.5125824483005444),
 ('psithyrus', 0.5083448354841025),
 ('colombian', 0.5068891562074264),
 ('predictable', 0.5040026593735853),
 ('male', 0.4945138048954064)]

In [72]:
ww_sim("uncle",word_vecs)

[('uncle', 0.9999999999999997),
 ('countrey', 0.8359475206229168),
 ('ffortescue', 0.8255886328699187),
 ('dragooned', 0.7734844249562545),
 ('remus', 0.7041995914120744),
 ('morse', 0.6642162573971708),
 ('indisposition', 0.6404643774887171),
 ('farnworth', 0.6180976883988671),
 ('beecher', 0.5971136029647994),
 ('linda', 0.5843139670256834)]

In [76]:
ww_sim("first",word_vecs)

[('first', 1.0),
 ('mubarak', 0.8043438743016806),
 ('tuxedoed', 0.8033108086162087),
 ('nighters', 0.7977749772268158),
 ('platitudinous', 0.7950816094711803),
 ('gracias', 0.7943228326127885),
 ('angered', 0.7770089916923083),
 ('tullio', 0.7756776667479051),
 ('prettiness', 0.7750187800324218),
 ('foote', 0.7739161161800336)]

In [75]:
ww_sim("uncle",word_vecs2)

[('uncle', 1.0),
 ('countrey', 0.8951414851006021),
 ('ffortescue', 0.8706983548734593),
 ('dragooned', 0.8555405718071133),
 ('remus', 0.7458469743586553),
 ('beecher', 0.7003451258521559),
 ('indisposition', 0.6668285423281214),
 ('morse', 0.6442708218591424),
 ('farnworth', 0.6238947231908696),
 ('manly', 0.5652279520513261)]

In [73]:
ww_sim("female",uu)

[('female', 0.9999999999999997),
 ('idal', 0.7472648119196135),
 ('dissenters', 0.6602121619841566),
 ('matriarchal', 0.5355959892322617),
 ('amazons', 0.5137198417249681),
 ('colombian', 0.5111274085873134),
 ('psithyrus', 0.506383123747987),
 ('digs', 0.49014346941055265),
 ('significantly', 0.4886866635444046),
 ('andrenas', 0.4753720225751541)]

In [74]:
unorm = uu / np.sqrt(np.sum(uu*uu, axis=1, keepdims=True))
vnorm = vv / np.sqrt(np.sum(vv*vv, axis=0, keepdims=True))
#word_vecs = unorm
#word_vecs = vnorm.T
word_vecs2 = uu + vv.T
word_vecs_norm = word_vecs / np.sqrt(np.sum(word_vecs*word_vecs, axis=1, keepdims=True))

  
  


In [64]:
np.diag([x**0.5 for x in ss])

array([[10.3880257 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , 10.39186573,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        , 10.39906501, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., 14.7839714 ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        16.21563346,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 24.37001025]])

In [66]:
t = np.diag(ss)
uu*t*vv == ppmi

ValueError: operands could not be broadcast together with shapes (42281,100) (100,100) 

In [53]:
sum_a0[0,2],sum_a1[2,0]

(178, 178)

In [68]:
np.shape(np.matmul(np.matmul(uu,t),vv))

KeyboardInterrupt: 

In [48]:
nonzero_index

(array([    0,     0,     0, ..., 42279, 42280, 42280], dtype=int32),
 array([    1,     2,     3, ..., 42280,  6523, 42279], dtype=int32))

In [49]:
joint_matrix[nonzero_index[0][0],nonzero_index[1][0]]

6

In [42]:
text = "Sentences iterable can be simply a list, but for larger corpora, consider a generator that streams the sentences directly from disk/network"

In [86]:
print(ppmi[0,:])

  (0, 1)	0.0
  (0, 2)	0.0
  (0, 3)	0.0
  (0, 7)	0.0
  (0, 29)	0.0
  (0, 30)	0.0
  (0, 31)	0.0
  (0, 58)	0.0
  (0, 59)	0.0
  (0, 60)	0.0
  (0, 64)	0.0
  (0, 66)	0.0
  (0, 67)	0.0
  (0, 90)	0.0
  (0, 106)	0.0
  (0, 119)	0.0
  (0, 129)	0.0
  (0, 130)	0.0
  (0, 131)	0.0
  (0, 138)	0.0
  (0, 139)	0.0
  (0, 149)	0.0
  (0, 150)	0.0
  (0, 151)	0.0
  (0, 156)	0.0
  (0, 157)	0.0
  (0, 158)	0.0
  (0, 210)	0.0
  (0, 211)	0.0
  (0, 215)	0.0
  (0, 222)	0.0
  (0, 235)	0.0
  (0, 236)	0.0
  (0, 237)	0.0
  (0, 241)	0.6394438532335271
  (0, 242)	0.0
  (0, 243)	0.0
  (0, 244)	0.0
  (0, 246)	0.6394438532335271
  (0, 252)	0.0
  (0, 304)	0.0
  (0, 1051)	0.0
  (0, 1212)	0.0
  (0, 2258)	0.0
  (0, 4037)	0.0
  (0, 4165)	0.0
  (0, 6212)	0.0
  (0, 6789)	0.0


In [123]:
word_vecs.save("../../NLPdata/hw3/savedModel/test")

AttributeError: 'numpy.ndarray' object has no attribute 'save'

In [124]:
np.shape(word_vecs)

(13891, 100)

In [125]:
word_vecs[0]

array([ 1.63849412e-04, -9.70924308e-06, -3.89196601e-04, -1.99606045e-04,
       -1.52572737e-04,  3.92952168e-04,  4.11227720e-16,  6.34080014e-04,
       -8.64653808e-05, -1.56360244e-03, -5.49293412e-15, -2.56142763e-17,
        1.16559588e-04, -4.07202049e-05,  4.32493179e-05,  2.21068619e-04,
        3.71345924e-04,  4.73482216e-04,  8.14642407e-17, -4.92336396e-04,
       -1.26622883e-04,  1.42008801e-15,  6.22602646e-04,  9.23833035e-05,
       -1.18927200e-04,  9.85675300e-17,  2.16759825e-05, -4.91414635e-16,
       -3.35825277e-04, -3.18863859e-16, -1.83852395e-04, -8.15787470e-05,
        2.11504127e-17, -8.56977114e-17, -6.59427894e-05,  2.85409917e-04,
       -1.09294355e-16,  6.32557204e-05,  5.77771367e-05,  8.97643609e-05,
       -4.61098228e-05,  2.48408268e-05, -3.19926818e-04, -9.76574519e-05,
       -5.61873316e-05,  6.93308960e-04,  3.10089953e-15, -1.51924772e-04,
       -2.84483907e-05,  1.10950504e-04, -5.57999895e-17,  1.62489884e-05,
       -1.04200530e-04, -