In [33]:
import numpy as np
import pickle
from sklearn.decomposition import TruncatedSVD
import time
from itertools import product

In [34]:
def sel_second(x):
    """Return the item at index 1 of ``x``; used as a sort key for pairs."""
    second_item = x[1]
    return second_item

In [35]:
def normalize_matrix(mat):
    """Scale every row of ``mat`` to unit Euclidean length.

    Rows whose length is zero are divided by 1 instead, so they stay all-zero
    rather than producing NaNs. A new array is returned; ``mat`` itself is
    not modified.
    """
    # keepdims=True yields a column of row lengths that broadcasts across columns.
    row_norms = np.sqrt(np.sum(mat ** 2, axis=1, keepdims=True))
    row_norms[row_norms == 0] = 1  # guard against division by zero
    return mat / row_norms

In [36]:
# Load the pickled word list (TASA words plus the nonwords from experiment 3).
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this against a file from a trusted source.
with open('All TASA words plus nonwords from experiment', mode='rb') as word_file:
    word_list = pickle.load(word_file)

In [53]:
# Extract SERIOL2-style open bigrams, each paired with a distance-based weight.
def extract_bigrams(word, weights=(1, 0.7, 0.5)):
    """Return the weighted open bigrams of ``word``.

    The word is padded with a ``'*'`` edge marker on both sides. Every ordered
    pair of characters that lie between 1 and ``len(weights)`` positions apart
    in the padded string becomes a bigram.

    Parameters
    ----------
    word : str
        The letter string to decompose.
    weights : sequence of numbers, optional
        ``weights[k - 1]`` is the weight given to a bigram whose two
        characters are ``k`` positions apart. The number of weights sets the
        maximum separation, so sequences of any length are accepted.

    Returns
    -------
    list of (str, number)
        ``(bigram, weight)`` tuples ordered by descending weight. Bigrams with
        equal weight keep their left-to-right order of appearance. The
        ``'**'`` pair (edge marker with edge marker) is never included.
    """
    padded = '*' + word + '*'
    bigrams = []
    for start in range(len(padded)):
        for gap, weight in enumerate(weights, start=1):
            end = start + gap
            if end >= len(padded):
                break  # larger gaps would run past the end as well
            pair = padded[start] + padded[end]
            if pair != '**':
                bigrams.append((pair, weight))
    # list.sort is stable, so equal-weight bigrams stay in extraction order.
    bigrams.sort(key=lambda item: item[1], reverse=True)
    return bigrams

In [54]:
# Demo: weighted open bigrams produced for the word 'pink'.
extract_bigrams(word='pink')

[('*p', 1),
 ('pi', 1),
 ('in', 1),
 ('nk', 1),
 ('k*', 1),
 ('*i', 0.7),
 ('pn', 0.7),
 ('ik', 0.7),
 ('n*', 0.7),
 ('*n', 0.5),
 ('pk', 0.5),
 ('i*', 0.5)]

In [38]:
# Collect every distinct bigram string that occurs in any word of word_list.
# The result is sorted so the bigram order is identical on every run:
# iterating a set of strings is not stable across interpreter sessions
# because of string hash randomization.
unique_strings = sorted(
    {bigram for word in word_list for bigram, _weight in extract_bigrams(word)}
)

In [39]:
# Allocate an all-zero float32 matrix with one row per entry of word_list
# and one column per entry of unique_strings; it is filled in later.
ortho_matrix = np.zeros(
    shape=(len(word_list), len(unique_strings)),
    dtype=np.float32,
)

In [40]:
# Map each word to its position in word_list, which is also its row index in
# the word-by-bigram matrix. The lookup is built from word_list itself so the
# cell runs on a fresh kernel.
ortho_dic = {word: row for row, word in enumerate(word_list)}

In [41]:
# Map each bigram string to its position in unique_strings, used as the
# column index of the matrix.
string_dic = {bigram: col for col, bigram in enumerate(unique_strings)}

In [42]:
# Fill the word-by-bigram matrix: for every word, add each bigram's weight to
# the cell at (row of the word, column of the bigram).
for word in word_list:
    for bigram, weight in extract_bigrams(word):
        ortho_matrix[ortho_dic[word], string_dic[bigram]] += weight

In [43]:
# Configure the Singular Value Decomposition: a truncated SVD keeping 300
# components, computed with the 'arpack' algorithm.
svd = TruncatedSVD(
    algorithm='arpack',
    n_components=300,
)

In [44]:
# Conduct the singular value decomposition and time how long it takes.
# NOTE: ortho_matrix is overwritten with the output of fit_transform, so this
# cell is not idempotent -- re-running it would pass the already-transformed
# matrix back into the SVD. Rebuild ortho_matrix before running it again.
start = time.time()
ortho_matrix = svd.fit_transform(ortho_matrix)
end = time.time()
# Elapsed wall-clock time in seconds.
print(end-start)

30.34664297103882


In [45]:
# Rescale each row of ortho_matrix with normalize_matrix and store the result
# back under the same name.
ortho_matrix = normalize_matrix(mat=ortho_matrix)

In [46]:
def find_neighbors(x, dic, mat, n=30):
    """Return the ``n`` entries of ``dic`` most similar to ``x``.

    Similarity is the dot product between the row of ``mat`` belonging to
    ``x`` and each other row; this equals cosine similarity when the rows of
    ``mat`` have unit length.

    Parameters
    ----------
    x : hashable
        The query key; must be present in ``dic``.
    dic : dict
        Maps each key to its row index in ``mat``.
    mat : numpy.ndarray
        2-D array with one row per key.
    n : int, optional
        Maximum number of neighbors to return (default 30).

    Returns
    -------
    list of (key, similarity)
        Up to ``n`` tuples in descending order of similarity. The query key
        itself is never included.

    Raises
    ------
    KeyError
        If ``x`` is not a key of ``dic``.
    """
    simvec = mat[dic[x]] @ np.transpose(mat)
    # Look each similarity up by its stored row index rather than relying on
    # dict iteration order, and drop the query by key rather than by rank:
    # with tied scores or unnormalized rows the query is not guaranteed to
    # sort first.
    neighbors = [(word, simvec[row]) for word, row in dic.items() if word != x]
    neighbors.sort(key=lambda pair: pair[1], reverse=True)
    return neighbors[:n]

In [47]:
def cosine(word1, word2):
    """Return the dot product of the ``ortho_matrix`` rows for two words.

    Rows are looked up through the module-level ``ortho_dic``. The value is a
    cosine similarity only when the rows of ``ortho_matrix`` have unit length.
    """
    first_vec = ortho_matrix[ortho_dic[word1]]
    second_vec = ortho_matrix[ortho_dic[word2]]
    return first_vec @ second_vec

In [50]:
# Demo: similarity between two words that share the same letters.
cosine(word1='three', word2='there')

0.9030265

In [68]:
# Demo: the ten nearest neighbors of 'snow' in the reduced matrix.
find_neighbors(x='snow', dic=ortho_dic, mat=ortho_matrix, n=10)

[('snowy', 0.8676627),
 ('snows', 0.8621983),
 ('sno', 0.84822625),
 ('snowplow', 0.7896492),
 ('now', 0.7622118),
 ('sown', 0.75407207),
 ('snob', 0.7506656),
 ('snowplows', 0.74525374),
 ('sow', 0.71746284),
 ('snowes', 0.7093685)]