In [23]:
import numpy as np
import time
from collections import Counter
from sklearn.decomposition import TruncatedSVD

In [24]:
def sel_second(x):
    """Return the item at index 1 of `x`; used below as a sort key for pairs."""
    second = x[1]
    return second

In [25]:
def normalize_vec(vec):
    """Scale `vec` to unit Euclidean length.

    If the sum of squares is not greater than zero, a copy of `vec` is
    returned unchanged instead of dividing by zero.
    """
    squared_norm = np.sum(vec**2)
    if not squared_norm > 0:
        # Nothing to scale: hand back a copy so the caller's array is not aliased
        return vec.copy()
    return vec / np.sqrt(squared_norm)

In [26]:
def normalize_matrix(mat):
    """Scale every row of a 2-D array to unit Euclidean length.

    Rows whose magnitude is zero are returned as zeros rather than NaN, which
    matches how `normalize_vec` treats a zero vector.

    Parameters
    ----------
    mat : array-like, 2-D
        One vector per row.

    Returns
    -------
    numpy.ndarray
        New array of the same shape; `mat` itself is not modified.
    """
    mat = np.asarray(mat)
    # keepdims=True keeps a column of magnitudes that broadcasts across each row
    mag = np.sqrt(np.sum(mat**2, axis=1, keepdims=True))
    # Dividing an all-zero row by 1 leaves it at zero instead of producing 0/0 = NaN
    mag[mag == 0] = 1
    return mat / mag

In [27]:
def find_neighbors(word, dic, mat, n=20):
    """Return the `n` words most similar to `word` by cosine similarity.

    Parameters
    ----------
    word : hashable
        Query word; it is looked up with `dic[word]`.
    dic : dict
        Maps each word to the index of its row in `mat`.
    mat : numpy.ndarray
        2-D array with one row vector per word.
    n : int, optional
        Maximum number of neighbors to return (default 20).

    Returns
    -------
    list of (word, similarity) tuples
        Sorted from most to least similar. The query word itself is never
        included.
    """
    cos_sim = normalize_vec(mat[dic[word]]) @ np.transpose(normalize_matrix(mat))
    # Look each similarity up by the word's own row index rather than relying on
    # the dictionary's iteration order, and drop the query word explicitly
    # instead of assuming it always sorts into first place.
    scored = [(other, cos_sim[row]) for other, row in dic.items() if other != word]
    # NaN compares unequal to itself; rank such entries last so they cannot
    # scramble the ordering of the real similarities.
    scored.sort(key=lambda pair: pair[1] if pair[1] == pair[1] else -np.inf,
                reverse=True)
    return scored[:max(n, 0)]

In [28]:
def cosine(word1, word2, dic=None, mat=None):
    """Return the cosine similarity between the vectors of two words.

    Parameters
    ----------
    word1, word2 : hashable
        Words to compare; each is looked up with `dic[...]`.
    dic : dict, optional
        Maps each word to its row index in `mat`. Defaults to the
        notebook-level `word_dic`.
    mat : numpy.ndarray, optional
        One row vector per word. Defaults to the notebook-level `word_matrix`.

    Returns
    -------
    scalar
        Dot product of the two length-normalized row vectors.
    """
    # Fall back to the notebook globals so existing two-argument calls still work
    if dic is None:
        dic = word_dic
    if mat is None:
        mat = word_matrix
    return normalize_vec(mat[dic[word1]]) @ normalize_vec(mat[dic[word2]])

I can't share the TASA corpus, so instead, to demonstrate the LSA code, I will run the code on a set of articles from Wikipedia. This will not create the same vectors we used in the simulations, but you can download those vectors directly from our OSF page at: https://osf.io/6mys9/

In [29]:
# Read the entire corpus file into a single string (corpus file available from GitHub)
with open('mini_wiki_corpus.txt', encoding='utf-8') as corpus_file:
    df = corpus_file.read()

In [30]:
# Process corpus into lists of words by document (one document per line)
# and collect every token so the words can be frequency-counted afterwards
corpus = []
word_list = []
# Only split while df is still the raw string, so re-running this cell
# (when df is already a list of lines) does not fail
if isinstance(df, str):
    df = df.split('\n')
for line in df:
    # Blank lines and repeated spaces make split(' ') emit empty strings;
    # discard them so '' is never counted as a word
    words = [token for token in line.strip().split(' ') if token]
    if not words:
        # A line with no tokens (e.g. a trailing newline) is not a document
        continue
    corpus.append(words)
    word_list.extend(words)

In [31]:
# Tally every token, then order the [word, count] pairs from most to least frequent
count_list = sorted(
    ([word, freq] for word, freq in Counter(word_list).items()),
    key=sel_second,
    reverse=True,
)

In [32]:
# Keep the words of the first 50,000 entries of the frequency-sorted list as the vocabulary
word_list = [word for word, _ in count_list[:50000]]

In [33]:
# Map each vocabulary word to the index of its row in the matrix
word_dic = {word: row for row, word in enumerate(word_list)}

In [34]:
# Allocate the all-zero matrix: one row per vocabulary word, one column per document
matrix_shape = (len(word_list), len(corpus))
word_matrix = np.zeros(matrix_shape, dtype=np.float32)

In [35]:
# Build the raw count matrix: for every in-vocabulary token, add 1 to the cell
# at (row of that word, column of the document it occurs in)
for doc_idx, document in enumerate(corpus):
    for token in document:
        row = word_dic.get(token)
        if row is not None:
            word_matrix[row, doc_idx] += 1

In [36]:
# calculate entropy weights for each row:
#   weight = 1 + sum(p * log2(p)) / log2(number of columns)
# where p is the row divided by its own sum
# The log of the column count is the same for every row, so compute it once
log_n_docs = np.log2(word_matrix.shape[1])
entropy_vec = []
for counts in word_matrix:
    total = np.sum(counts)
    if total > 0 and log_n_docs > 0:
        p_vec = counts / total
        p_vec = p_vec[p_vec > 0]  # drop zeros so log2 is never taken of 0
        log_entropy = 1 + np.sum((p_vec * np.log2(p_vec)) / log_n_docs)
    else:
        # With a single column log2(1) is 0 and the ratio would be 0/0 (NaN);
        # a row with no counts would divide by a zero total. Use weight 1 for both.
        log_entropy = 1.0
    entropy_vec.append(log_entropy)

In [37]:
# Replace each row, in place, with log(count + 1) scaled by that row's entropy weight
for row_idx, row in enumerate(word_matrix):
    row[:] = np.log(row + 1) * entropy_vec[row_idx]

In [38]:
# code for svd: reduce the matrix to 300 components with the ARPACK solver
# random_state is fixed so that repeated runs give the same vectors
# (see the scikit-learn TruncatedSVD documentation for how the seed is used)
svd = TruncatedSVD(n_components=300, algorithm='arpack', random_state=0)

In [None]:
# compute svd (takes a while)
# NOTE: this rebinds word_matrix to the reduced output of fit_transform, so the
# weighted count matrix is gone afterwards; rebuild it before re-running this cell.
# perf_counter is a monotonic clock meant for measuring elapsed time
start = time.perf_counter()
word_matrix = svd.fit_transform(word_matrix)
end = time.perf_counter()
# elapsed seconds, shown as the cell output
end - start

In [40]:
# Keep a copy of the word vectors with every row scaled to unit length
normed_matrix = normalize_matrix(mat=word_matrix)

In [None]:
# Show the 10 words whose vectors are closest to 'nfl'
find_neighbors('nfl', dic=word_dic, mat=word_matrix, n=10)

In [None]:
# Cosine similarity between the vectors of two example words
cosine(word1='quarterback', word2='linebacker')