In [69]:
import string
import numpy as np
import random
import csv
import time
import pickle
from collections import Counter
from sklearn.decomposition import TruncatedSVD

In [70]:
def sel_second(x):
    """Return the element at index 1 of ``x``.

    Used as a ``key`` function when sorting ``(item, score)`` pairs by score.
    """
    second = x[1]
    return second

In [71]:
def normalize_vec(vec):
    """Scale ``vec`` to unit Euclidean length.

    A vector whose sum of squares is not greater than zero (e.g. all zeros)
    cannot be scaled, so an unchanged copy of it is returned instead.
    """
    squared_norm = np.sum(vec**2)
    if not squared_norm > 0:
        # Nothing to scale by; hand back a copy rather than the input itself.
        return vec.copy()
    return vec / np.sqrt(squared_norm)

In [72]:
def normalize_matrix(mat):
    """Scale every row of a 2-D array to unit Euclidean length.

    Rows whose magnitude is zero are returned as zeros instead of being
    divided by zero (which would fill them with NaN), matching how
    ``normalize_vec`` treats a zero vector.

    Parameters
    ----------
    mat : numpy.ndarray
        2-D array; each row is treated as one vector.

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``mat``; ``mat`` is not modified.
    """
    mag = np.sqrt(np.sum(mat**2, axis=1))
    # Dividing a zero row by 1 leaves it as zeros instead of producing NaN.
    mag[mag == 0] = 1
    # Broadcast the per-row magnitudes across the columns.
    return mat / mag[:, np.newaxis]

In [73]:
def find_neighbors(word, dic, mat, n=20):
    """Return the ``n`` words whose vectors are most cosine-similar to ``word``.

    Parameters
    ----------
    word : hashable
        Query word; must be a key of ``dic``.
    dic : dict
        Maps each word to the index of its row in ``mat``.
    mat : numpy.ndarray
        2-D array with one row vector per word.
    n : int, optional
        Maximum number of neighbors to return (default 20).

    Returns
    -------
    list of (word, similarity) tuples
        Sorted from most to least similar. The query word itself is never
        included.

    Raises
    ------
    KeyError
        If ``word`` is not a key of ``dic``.
    """
    cos_sim = normalize_vec(mat[dic[word]]) @ np.transpose(normalize_matrix(mat))
    # Look up each word's similarity through its own row index instead of
    # assuming the dict's iteration order matches the row order, and drop the
    # query word by name instead of assuming it sorts into first place (a word
    # with an identical vector, or a zero query vector, breaks that assumption).
    scored = [(other, cos_sim[row]) for other, row in dic.items() if other != word]
    scored.sort(key=sel_second, reverse=True)
    return scored[:max(n, 0)]

In [74]:
def cosine(word1, word2, dic=None, mat=None):
    """Return the cosine similarity between the vectors of two words.

    Parameters
    ----------
    word1, word2 : hashable
        Words to compare; both must be keys of ``dic``.
    dic : dict, optional
        Maps each word to the index of its row in ``mat``. Defaults to the
        notebook-level ``word_dic``.
    mat : numpy.ndarray, optional
        Array with one row vector per word. Defaults to the notebook-level
        ``word_matrix``.

    Returns
    -------
    scalar
        Dot product of the two unit-normalized row vectors.

    Raises
    ------
    KeyError
        If either word is not a key of ``dic``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if dic is None:
        dic = word_dic
    if mat is None:
        mat = word_matrix
    return normalize_vec(mat[dic[word1]]) @ normalize_vec(mat[dic[word2]])

I can't share the TASA corpus, so to demonstrate the LSA code I will instead run it on a set of articles from Wikipedia. This will not produce the same vectors we used in the simulations, but you can download those vectors directly from my GitHub.

In [75]:
# Load the corpus (the corpus file is available from GitHub).
# NOTE: pickle.load can execute arbitrary code while unpickling, so only run
# this on a corpus file obtained from a source you trust.
with open('mini_wiki_corpus_mar_14', 'rb') as f:
    # presumably a sequence of articles, each a sequence of words (later cells iterate it that way)
    corpus = pickle.load(f)

In [76]:
# Flatten the corpus into one long list holding every word of every article.
word_list = [word for article in corpus for word in article]

In [77]:
# Frequency of every word as [word, count] pairs, most frequent first.
# Counter.most_common() sorts by descending count and keeps words with equal
# counts in the order they were first encountered.
count_list = [[word, freq] for word, freq in Counter(word_list).most_common()]

In [78]:
# Keep only the 50,000 most frequent words as the vocabulary.
word_list = [word for word, _freq in count_list[:50000]]

In [79]:
# Map each vocabulary word to the index of its row in the matrix.
word_dic = {word: row for row, word in enumerate(word_list)}

In [80]:
# Start from an all-zero word-by-document matrix.
word_matrix = np.zeros(
    (len(word_list), len(corpus)),  # rows: vocabulary words, columns: articles
    dtype=np.float32,
)

In [81]:
# Build the raw count matrix: entry (row, column) is how many times that
# vocabulary word occurs in that article.
for doc_idx, article in enumerate(corpus):
    for word in article:
        row = word_dic.get(word)
        # Words outside the vocabulary have no row and are skipped.
        if row is not None:
            word_matrix[row, doc_idx] += 1

In [82]:
# Entropy weight for each row (word): 1 plus the sum of p * log2(p), each term
# divided by log2 of the number of columns, where p is the row divided by its sum.
entropy_vec = []
for counts in word_matrix:
    p_vec = counts / np.sum(counts)
    p_vec = p_vec[p_vec > 0]  # drop zero entries so log2 is defined
    scaled_terms = (p_vec * np.log2(p_vec)) / np.log2(len(counts))
    entropy_vec.append(1 + np.sum(scaled_terms))

In [83]:
# Replace each count row with log(count + 1) scaled by that row's entropy weight.
for row, weight in zip(word_matrix, entropy_vec):
    # row is a view into word_matrix, so assigning through [:] updates the matrix itself.
    row[:] = np.log(row + 1) * weight

In [84]:
# Set up the truncated SVD: 300 components, using the 'arpack' algorithm.
svd = TruncatedSVD(
    n_components=300,
    algorithm='arpack',
)

In [85]:
# Compute the SVD (takes a while) and display the elapsed time in seconds.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring durations; time.time() can jump if the system clock is adjusted.
# Note: word_matrix is rebound here from the weighted count matrix to the
# output of fit_transform, so re-running this cell on its own would feed the
# SVD output back into the SVD.
start = time.perf_counter()
word_matrix = svd.fit_transform(word_matrix)
end = time.perf_counter()
end - start

328.68871784210205

In [86]:
# Row-normalized copy of the SVD output: each row is divided by its Euclidean length.
normed_matrix = normalize_matrix(word_matrix)

In [62]:
# Nearest neighbors of 'nfl' by cosine similarity (n defaults to 20).
find_neighbors('nfl', word_dic, word_matrix)

[('lineman', 0.9032235),
 ('quarterback', 0.8827136),
 ('sacks', 0.87340903),
 ('tackle', 0.8715938),
 ('punter', 0.86636007),
 ('chargers', 0.86122316),
 ('packers', 0.8430953),
 ('defensive', 0.8400181),
 ('cowboys', 0.8371231),
 ('linebacker', 0.82938904),
 ('tackles', 0.8258298),
 ('interception', 0.80563587),
 ('bengals', 0.8055953),
 ('buccaneers', 0.80507517),
 ('vikings', 0.79548335),
 ('ers', 0.79499865),
 ('bowl', 0.7928308),
 ('seahawks', 0.790975),
 ('punt', 0.7885048),
 ('fumble', 0.7804296)]

In [88]:
# Cosine similarity between the vectors for 'quarterback' and 'linebacker'.
cosine('quarterback', 'linebacker')

0.8420482