In [8]:
import string
import numpy as np
import random
import nltk
import csv
import time
from sklearn.decomposition import TruncatedSVD

In [9]:
import scipy.stats

In [10]:
def strip_punctuation(x):
    x = [i for i in x if i not in string.punctuation]
    x = ''.join(x)
    return x

In [11]:
def sel_second(x):
    return x[1]

In [12]:
def normalize_vec(vec):
    if np.sum(vec**2) > 0:
        sq_vec = vec**2
        sum_sq = np.sum(sq_vec)
        mag = np.sqrt(sum_sq)
        normed_vec = vec / mag
    else:
        normed_vec = vec.copy()
    return normed_vec

In [13]:
def normalize_matrix(mat):
    sq_mat = mat**2
    sum_sq = np.sum(sq_mat, axis=1)
    mag = np.sqrt(sum_sq)
    normed_mat = np.transpose((np.transpose(mat) / mag))
    return normed_mat

In [14]:
def find_neighbors(word, dic, mat, n=20):
    cos_sim = normalize_vec(mat[dic[word]]) @ np.transpose(normalize_matrix(mat))
    cos_sim = list(zip([i for i in dic], cos_sim))
    cos_sim.sort(key = sel_second, reverse=True)
    return cos_sim[1:(n+1)]

Train LSA

In [108]:
import string
import numpy as np
import time
from sklearn.decomposition import TruncatedSVD

In [109]:
with open('tasaSentDocs.txt') as f:
    df = f.read()

In [110]:
doc_df = df.split(" \n\n")

In [111]:
# get documents as lists of words
cleaned_df = []
all_words = []
for i in doc_df:
    cleaned_doc = i.replace('\n', '')
    cleaned_doc = cleaned_doc.split(' ')
    cleaned_doc = [strip_punctuation(i) for i in cleaned_doc]
    cleaned_doc = [i.lower() for i in cleaned_doc if i!='']
    if len(cleaned_doc)>0:
        cleaned_df.append(cleaned_doc)
        all_words.extend(cleaned_doc)

In [112]:
len(cleaned_df)

37639

In [113]:
# find all unique words
all_words = list(set(all_words))

In [114]:
# dictionary to index rows of matrix based on the word
word_dic = {}
for i in range(0, len(all_words)):
    word_dic[all_words[i]] = i

In [115]:
# matrix of zeros
word_matrix = np.zeros(shape=(len(all_words), len(cleaned_df)), dtype=np.float32)

In [116]:
# loop to count words in each document (constructs raw count matrix)
for i in range(0, len(cleaned_df)):
    for x in cleaned_df[i]:
        word_matrix[word_dic[x]][i] += 1

In [117]:
# calculate entropy weights for each row
entropy_vec = []
for i in range(0, len(word_matrix)):
    p_vec = word_matrix[i] / np.sum(word_matrix[i])
    p_vec = p_vec[p_vec > 0]
    log_entropy = 1 + np.sum((p_vec * np.log2(p_vec)) / np.log2(len(word_matrix[i])))
    entropy_vec.append(log_entropy)

In [118]:
# apply log transform and entropy weight
for i in range(0, len(word_matrix)):
    word_matrix[i] = np.log(word_matrix[i] + 1) * entropy_vec[i]

In [42]:
# code for svd
svd = TruncatedSVD(n_components=300, algorithm='arpack')

In [43]:
# compute svd
start = time.time()
word_matrix = svd.fit_transform(word_matrix)
end = time.time()
end-start

2626.506085395813

In [44]:
normed_matrix = normalize_matrix(word_matrix)

Example from Martin and Berry

In [81]:
doc_df = ['music rock roll', 'demonstration drum roll', 'composition drum', 'music rock', 'composition music', 'bread demonstration roll', 'ingredients roll', 'bread dough recipe', 'dough ingredients recipe']

In [82]:
# get documents as lists of words
cleaned_df = []
all_words = []
for i in doc_df:
    cleaned_doc = i.replace('\n', '')
    cleaned_doc = cleaned_doc.split(' ')
    cleaned_doc = [strip_punctuation(i) for i in cleaned_doc]
    cleaned_doc = [i.lower() for i in cleaned_doc if i!='']
    if len(cleaned_doc)>0:
        cleaned_df.append(cleaned_doc)
        all_words.extend(cleaned_doc)

In [83]:
# find all unique words
all_words = list(set(all_words))

In [84]:
all_words.sort()

In [85]:
# dictionary to index rows of matrix based on the word
word_dic = {}
for i in range(0, len(all_words)):
    word_dic[all_words[i]] = i

In [86]:
# matrix of zeros
word_matrix = np.zeros(shape=(len(all_words), len(cleaned_df)), dtype=np.float32)

In [87]:
# loop to count words in each document (constructs raw count matrix)
for i in range(0, len(cleaned_df)):
    for x in cleaned_df[i]:
        word_matrix[word_dic[x]][i] += 1

In [88]:
# Produces matrix from Martin and Berry p. 38
word_matrix

array([[0., 0., 0., 0., 0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 1., 1., 0., 0.]], dtype=float32)

In [89]:
# calculate entropy weights for each row
entropy_vec = []
for i in range(0, len(word_matrix)):
    p_vec = word_matrix[i] / np.sum(word_matrix[i])
    p_vec = p_vec[p_vec > 0]
    log_entropy = 1 + np.sum((p_vec * np.log2(p_vec)) / np.log2(len(word_matrix[i])))
    entropy_vec.append(log_entropy)

In [90]:
# apply log transform and entropy weight
for i in range(0, len(word_matrix)):
    word_matrix[i] = np.log(word_matrix[i] + 1) * entropy_vec[i]

In [91]:
# Produces weighted matrix from Martin and Berry p. 39
word_matrix

array([[0.       , 0.       , 0.       , 0.       , 0.       , 0.4744836,
        0.       , 0.4744836, 0.       ],
       [0.       , 0.       , 0.4744836, 0.       , 0.4744836, 0.       ,
        0.       , 0.       , 0.       ],
       [0.       , 0.4744836, 0.       , 0.       , 0.       , 0.4744836,
        0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.4744836, 0.4744836],
       [0.       , 0.4744836, 0.4744836, 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.4744836, 0.       , 0.4744836],
       [0.3465736, 0.       , 0.       , 0.3465736, 0.3465736, 0.       ,
        0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.4744836, 0.4744836],
       [0.4744836, 0.       , 0.       , 0.4744836, 0.       , 0.       

In [92]:
# code for svd
svd = TruncatedSVD(n_components=2, algorithm='arpack')

In [93]:
# compute svd short way (directly outputs reduced matrix)
start = time.time()
reduced_mat_short = svd.fit_transform(word_matrix)
end = time.time()
end-start

0.0009722709655761719

In [94]:
reduced_mat_short

array([[ 0.4576494 ,  0.09059695],
       [ 0.04290586,  0.3276323 ],
       [ 0.22849026,  0.42746598],
       [ 0.60990256, -0.20935944],
       [ 0.11050479,  0.44521576],
       [ 0.3902528 , -0.11250804],
       [ 0.04785607,  0.3345153 ],
       [ 0.60990256, -0.20935944],
       [ 0.05107342,  0.31862834],
       [ 0.19148433,  0.33743194]], dtype=float32)

In [95]:
from scipy.linalg import svd

In [96]:
# compute svd long way
start = time.time()
U, s, VT = svd(word_matrix)
end = time.time()
end-start

0.000997781753540039

In [97]:
# produces U-type vectors from Martin and Berry p. 44 (different signs for some values)
U[:, :9]

array([[-0.41543984,  0.09424412,  0.20058942,  0.32680166,  0.47745055,
        -0.33016074, -0.45998228, -0.21067558, -0.28028417],
       [-0.0389486 ,  0.34082147, -0.08689152, -0.6655946 ,  0.2791631 ,
        -0.4347095 , -0.02107358, -0.05959602,  0.40107042],
       [-0.20741646,  0.4446748 ,  0.42424983,  0.29156008, -0.09129842,
        -0.01730594,  0.5953961 , -0.29498896,  0.20999365],
       [-0.5536504 , -0.21778789, -0.097966  , -0.11393587,  0.11713022,
         0.22560152,  0.15349817,  0.14912319,  0.11421926],
       [-0.10031281,  0.46313906,  0.28562775, -0.41349456, -0.11180256,
         0.5527629 , -0.26333505, -0.0201866 , -0.36753118],
       [-0.35425913, -0.11703755, -0.13027358, -0.16629784, -0.7206371 ,
        -0.34740764, -0.10309803, -0.36770603, -0.17415482],
       [-0.04344256,  0.34798062, -0.5435819 ,  0.02730213,  0.12394086,
        -0.15960842,  0.408657  ,  0.18090957, -0.5832927 ],
       [-0.5536504 , -0.21778789, -0.097966  , -0.11393587,  0

In [98]:
s_mat = np.zeros((9, 10))
np.fill_diagonal(s_mat, s, wrap=False)

In [99]:
# produces singular values from Martin and Berry p. 44
s_mat

array([[1.10160184, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.96130133, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.85962135, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.763798  , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.6581133 ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.47367984, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.27326408, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.       

In [100]:
# produces V-document vectors from Martin and Berry p.44 (different signs)
np.transpose(VT)

array([[-0.07400344,  0.38246807, -0.5326035 ,  0.27086723, -0.08024473,
         0.1156444 , -0.19694234,  0.50221187,  0.42472392],
       [-0.17291169,  0.5414949 ,  0.40777543,  0.0047267 , -0.27573416,
         0.43357262,  0.34217477,  0.22189537, -0.28327692],
       [-0.05998285,  0.39682254,  0.10969584, -0.6703478 ,  0.12066288,
         0.11825373, -0.49383453, -0.22893037,  0.22958563],
       [-0.03363708,  0.28905606, -0.54854923,  0.19039299,  0.04905889,
         0.21843734,  0.03745957, -0.62405676, -0.37038845],
       [-0.03044353,  0.2936798 , -0.26711744, -0.40108994,  0.26653904,
        -0.5522268 ,  0.48169783,  0.20816092, -0.17099503],
       [-0.3086438 ,  0.35941407,  0.36083716,  0.46461082,  0.14910246,
        -0.45084912,  0.0007252 , -0.3246995 ,  0.31395403],
       [-0.19295326,  0.03564385, -0.05596112, -0.02283265, -0.6488651 ,
        -0.45079023, -0.41341648,  0.07116263, -0.39702702],
       [-0.65587693, -0.16847597,  0.00257059,  0.06145695,  0

In [101]:
# reduce matrix to 2 dimensions
reduced_mat_long = U[:, :2] @ s_mat[:2, :2]

In [102]:
# reduced matrix produced long way
reduced_mat_long

array([[-0.4576493 ,  0.090597  ],
       [-0.04290584,  0.32763214],
       [-0.22849035,  0.42746647],
       [-0.60990228, -0.20935979],
       [-0.11050477,  0.44521619],
       [-0.39025251, -0.11250835],
       [-0.04785641,  0.33451423],
       [-0.60990228, -0.20935979],
       [-0.05107374,  0.31862715],
       [-0.19148447,  0.33743193]])

In [103]:
# reduced matrix produced short way
reduced_mat_short

array([[ 0.4576494 ,  0.09059695],
       [ 0.04290586,  0.3276323 ],
       [ 0.22849026,  0.42746598],
       [ 0.60990256, -0.20935944],
       [ 0.11050479,  0.44521576],
       [ 0.3902528 , -0.11250804],
       [ 0.04785607,  0.3345153 ],
       [ 0.60990256, -0.20935944],
       [ 0.05107342,  0.31862834],
       [ 0.19148433,  0.33743194]], dtype=float32)

In [104]:
normalized_long = normalize_matrix(reduced_mat_long)

In [105]:
normalized_short = normalize_matrix(reduced_mat_short)

In [106]:
#cosine matrix long svd version
normalized_long @ np.transpose(normalized_long)

array([[ 1.        ,  0.3199258 ,  0.63369264,  0.86477231,  0.42478386,
         0.88877995,  0.33116032,  0.86477231,  0.34700508,  0.65304338],
       [ 0.3199258 ,  1.        ,  0.93566194, -0.19910876,  0.99361418,
        -0.14990166,  0.99992941, -0.19910876,  0.99958747,  0.92644313],
       [ 0.63369264,  0.93566194,  1.        ,  0.15953321,  0.96950482,
         0.20865292,  0.93978887,  0.15953321,  0.94541144,  0.99968051],
       [ 0.86477231, -0.19910876,  0.15953321,  1.        , -0.0872653 ,
         0.99875128, -0.18745104,  1.        , -0.17088094,  0.1844346 ],
       [ 0.42478386,  0.99361418,  0.96950482, -0.0872653 ,  1.        ,
        -0.03738815,  0.99488465, -0.0872653 ,  0.99644488,  0.9630006 ],
       [ 0.88877995, -0.14990166,  0.20865292,  0.99875128, -0.03738815,
         1.        , -0.13814377,  0.99875128, -0.1214436 ,  0.23330601],
       [ 0.33116032,  0.99992941,  0.93978887, -0.18745104,  0.99488465,
        -0.13814377,  1.        , -0.18745104

In [107]:
#cosine matrix short svd version
normalized_short @ np.transpose(normalized_short)

array([[ 1.        ,  0.31992564,  0.6336928 ,  0.8647727 ,  0.42478395,
         0.8887805 ,  0.33115882,  0.8647727 ,  0.34700346,  0.65304303],
       [ 0.31992564,  1.        ,  0.9356618 , -0.19910811,  0.99361414,
        -0.14990078,  0.9999294 , -0.19910811,  0.9995875 ,  0.9264433 ],
       [ 0.6336928 ,  0.9356618 ,  1.        ,  0.15953416,  0.96950483,
         0.20865415,  0.9397883 ,  0.15953416,  0.94541085,  0.9996806 ],
       [ 0.8647727 , -0.19910811,  0.15953416,  0.9999999 , -0.08726436,
         0.9987513 , -0.18745178,  0.9999999 , -0.17088184,  0.18443495],
       [ 0.42478395,  0.99361414,  0.96950483, -0.08726436,  1.        ,
        -0.03738695,  0.9948845 , -0.08726436,  0.9964447 ,  0.96300083],
       [ 0.8887805 , -0.14990078,  0.20865415,  0.9987513 , -0.03738695,
         1.0000001 , -0.13814425,  0.9987513 , -0.12144426,  0.23330662],
       [ 0.33115882,  0.9999294 ,  0.9397883 , -0.18745178,  0.9948845 ,
        -0.13814425,  0.99999994, -0.18745178