In [1]:
import evaluate
import process
text = 'data/brown.txt'

In [4]:
import nltk
from nltk.corpus import brown
from nltk.tokenize import casual

import json

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import numpy as np
import argparse
import numpy as np
from scipy.stats import spearmanr

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import cosine
from numpy.linalg import norm

import random
import os

from process import load_model, load_msr

In [5]:
windowList = [2,5,10]
dimList = [100,300,1000]
EPOCHS = 5
iwin = 2
idim = 100

In [6]:
class MySentences(object):
    def __init__(self, filename):
        self.filename = filename
 
    def __iter__(self):
        for line in open(self.filename, 'r'):
            yield [word.lower() for word in line.split()]
sentence = MySentences(text)

In [7]:
# count unigrams
from collections import Counter
word_counts = Counter()
for x in sentence:
    word_counts.update(x)

In [8]:
print(len(word_counts))
print(word_counts.most_common(5))
print(word_counts.most_common()[-5:])

49815
[('the', 69971), (',', 58334), ('.', 49346), ('of', 36412), ('and', 28853)]
[('aviary', 1), ('olive-flushed', 1), ('coral-colored', 1), ('boucle', 1), ('stupefying', 1)]


In [55]:
# minimum word count = 5
word_counts2 = Counter()
for key,value in word_counts.items():
    if value >= 5:
        word_counts2[key] = value
print(len(word_counts2))
print(word_counts2.most_common(5))
print(word_counts2.most_common()[-5:])

13891
[('one', 3504), ('would', 2719), ('said', 1961), ('time', 1695), ('new', 1646)]
[('furrow', 5), ('richert', 5), ('kafka', 5), ('poitrine', 5), ('quasimodo', 5)]


In [9]:
# count co-occurance
joint = Counter()
for isen in sentence:
    for j, word in enumerate(isen):
        if word not in word_counts:
            continue
        index_min = max(0, j-iwin)
        index_max = min(len(isen), j+iwin+1)
        index = [ii for ii in range(index_min, index_max) if ii!=j]
        for iin in index:
            #if isen[iin] in word_counts:
            joint[(word, isen[iin])] += 1
print(len(joint))
print(joint.most_common(5))

1504865
[(('of', 'the'), 20851), (('the', 'of'), 20851), (('the', ','), 11580), ((',', 'the'), 11580), ((',', 'and'), 9296)]


In [82]:
word_counts2['new']

1646

In [10]:
# generate index for words in vocabulary
index_dict = dict()
word_dict = dict()
ii = 0
for i in word_counts:
    index_dict[i] = ii
    word_dict[ii] = i
    ii += 1
    
# transform count joint to sparse matrix
import math
row_index = []
col_index = []
values = []
for (wi,wj), count in joint.items():
    row = index_dict[wi]
    col = index_dict[wj]
    prod = word_counts[wi]*word_counts[wj]*len(joint)/(len(word_counts)**2)
    if count > prod:
        value = math.log(count/prod)
    else:
        value = 0
    row_index.append(row)
    col_index.append(col)
    values.append(value)

from scipy import sparse
ppmi = sparse.csr_matrix((values, (row_index, col_index)))

In [11]:
ppmi.nonzero()

(array([    1,     1,     1, ..., 49813, 49813, 49814], dtype=int32),
 array([    2,     3,    41, ...,  7178, 49812,  7178], dtype=int32))

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
def ww_sim(word, mat, topn=10):
    indx = index_dict[word]
    if isinstance(mat, sparse.csr_matrix):
        v1 = mat.getrow(indx)
    else:
        v1 = mat[indx:indx+1, :]
    sims = cosine_similarity(mat, v1).flatten()
    sindxs = np.argsort(-sims)
#     print(sindxs[0:topn])
#     print(index_dict[9254])
#     print(sims[9254])
    sim_word_scores = [(word_dict[sindx], sims[sindx]) for sindx in sindxs[0:topn]]
    return sim_word_scores

In [13]:
ww_sim('uncle', ppmi)

[('uncle', 0.9999999999999998),
 ("stowe's", 0.2231759231137599),
 ('lyman', 0.18067858267842332),
 ('chuckled', 0.17889973368974996),
 ('harriet', 0.17816836465527214),
 ('morse', 0.17554553236411702),
 ("tom's", 0.170997917706415),
 ('underlay', 0.1659886873587491),
 ('beecher', 0.14489940818841424),
 ('shortest', 0.1431663593894243)]

In [14]:
from scipy.sparse import linalg
uu,ss,vv = linalg.svds(ppmi, idim)
print(np.shape(uu),np.shape(ss),np.shape(vv))

(49815, 100) (100,) (100, 49815)


In [15]:
unorm = uu / np.sqrt(np.sum(uu*uu, axis=1, keepdims=True))
vnorm = vv / np.sqrt(np.sum(vv*vv, axis=0, keepdims=True))
#word_vecs = unorm
#word_vecs = vnorm.T
word_vecs = uu + vv.T
word_vecs_norm = word_vecs / np.sqrt(np.sum(word_vecs*word_vecs, axis=1, keepdims=True))

  
  


In [16]:
ww_sim("female",word_vecs)

[('female', 1.0000000000000002),
 ('19-foot', 0.9980361675974996),
 ('vocalists', 0.9965653314649788),
 ('bumblebee', 0.9599715105301654),
 ('parasite', 0.9287603772810248),
 ('andrena', 0.9229003957172038),
 ('dissenters', 0.9217030872903869),
 ('cowbirds', 0.9069719691806799),
 ('spate', 0.8845950933788866),
 ('bobbed', 0.8840057140235706)]

In [17]:
ww_sim("strike",word_vecs)

[('strike', 0.9999999999999998),
 ('shrinking', 0.9156697744209977),
 ('heaven', 0.8904639794587017),
 ('slang', 0.8888747215148729),
 ("t'ien", 0.8875081627802461),
 ("smugglers'", 0.8875081627802459),
 ('tollgate', 0.8875081627802458),
 ('unnerving', 0.8875081627802456),
 ('haint', 0.8872022500803394),
 ('piously', 0.8799832390289005)]

In [18]:
ww_sim("uncle",word_vecs)

[('uncle', 1.0000000000000004),
 ('remus', 0.9993300025849559),
 ("elaine's", 0.9993300025849557),
 ('ffortescue', 0.9993300025849556),
 ("dan'l", 0.9985238437324462),
 ('dragooned', 0.993848203422079),
 ('chuckled', 0.9910543209000767),
 ('randolph', 0.9852530055504984),
 ('izaak', 0.9770375458670217),
 ("tom's", 0.9746761864136466)]

In [42]:
text = "Sentences iterable can be simply a list, but for larger corpora, consider a generator that streams the sentences directly from disk/network"

In [86]:
print(ppmi[0,:])

  (0, 1)	0.0
  (0, 2)	0.0
  (0, 3)	0.0
  (0, 7)	0.0
  (0, 29)	0.0
  (0, 30)	0.0
  (0, 31)	0.0
  (0, 58)	0.0
  (0, 59)	0.0
  (0, 60)	0.0
  (0, 64)	0.0
  (0, 66)	0.0
  (0, 67)	0.0
  (0, 90)	0.0
  (0, 106)	0.0
  (0, 119)	0.0
  (0, 129)	0.0
  (0, 130)	0.0
  (0, 131)	0.0
  (0, 138)	0.0
  (0, 139)	0.0
  (0, 149)	0.0
  (0, 150)	0.0
  (0, 151)	0.0
  (0, 156)	0.0
  (0, 157)	0.0
  (0, 158)	0.0
  (0, 210)	0.0
  (0, 211)	0.0
  (0, 215)	0.0
  (0, 222)	0.0
  (0, 235)	0.0
  (0, 236)	0.0
  (0, 237)	0.0
  (0, 241)	0.6394438532335271
  (0, 242)	0.0
  (0, 243)	0.0
  (0, 244)	0.0
  (0, 246)	0.6394438532335271
  (0, 252)	0.0
  (0, 304)	0.0
  (0, 1051)	0.0
  (0, 1212)	0.0
  (0, 2258)	0.0
  (0, 4037)	0.0
  (0, 4165)	0.0
  (0, 6212)	0.0
  (0, 6789)	0.0


In [123]:
word_vecs.save("../../NLPdata/hw3/savedModel/test")

AttributeError: 'numpy.ndarray' object has no attribute 'save'

In [124]:
np.shape(word_vecs)

(13891, 100)

In [125]:
word_vecs[0]

array([ 1.63849412e-04, -9.70924308e-06, -3.89196601e-04, -1.99606045e-04,
       -1.52572737e-04,  3.92952168e-04,  4.11227720e-16,  6.34080014e-04,
       -8.64653808e-05, -1.56360244e-03, -5.49293412e-15, -2.56142763e-17,
        1.16559588e-04, -4.07202049e-05,  4.32493179e-05,  2.21068619e-04,
        3.71345924e-04,  4.73482216e-04,  8.14642407e-17, -4.92336396e-04,
       -1.26622883e-04,  1.42008801e-15,  6.22602646e-04,  9.23833035e-05,
       -1.18927200e-04,  9.85675300e-17,  2.16759825e-05, -4.91414635e-16,
       -3.35825277e-04, -3.18863859e-16, -1.83852395e-04, -8.15787470e-05,
        2.11504127e-17, -8.56977114e-17, -6.59427894e-05,  2.85409917e-04,
       -1.09294355e-16,  6.32557204e-05,  5.77771367e-05,  8.97643609e-05,
       -4.61098228e-05,  2.48408268e-05, -3.19926818e-04, -9.76574519e-05,
       -5.61873316e-05,  6.93308960e-04,  3.10089953e-15, -1.51924772e-04,
       -2.84483907e-05,  1.10950504e-04, -5.57999895e-17,  1.62489884e-05,
       -1.04200530e-04, -