In [1]:
# import packages
import numpy as np
from sklearn.utils.extmath import randomized_svd

In [2]:
# Sort-key helper: pick out the second element of a sequence
def sel_second(x):
    """Return the item stored at index 1 of ``x``.

    Used in this notebook as the ``key`` function when sorting
    ``(label, score)`` pairs by their score.
    """
    second_item = x[1]
    return second_item

In [3]:
# Scale a single vector to unit Euclidean length
def normalize_vec(x):
    """Return ``x`` divided by its Euclidean norm.

    When the sum of squared entries is not greater than zero (for example an
    all-zero vector), a copy of ``x`` is returned unchanged so that no
    division by zero takes place.
    """
    sum_of_squares = np.sum(x**2)
    if not sum_of_squares > 0:
        return x.copy()
    return x / np.sqrt(sum_of_squares)

In [4]:
# Scale every row vector of a matrix to unit Euclidean length
def normalize_matrix(mat):
    """Return a copy of ``mat`` in which each row has been divided by its norm.

    Norms are taken along ``axis=1``. Rows whose norm is exactly zero are
    divided by 1 instead, so they come back unchanged rather than as NaNs.
    """
    row_norms = np.sqrt(np.sum(mat**2, axis=1))
    # Substitute 1 for zero norms to avoid dividing by zero.
    row_norms[row_norms == 0] = 1
    # Transposing lets the per-row norms broadcast along the last axis; the
    # outer transpose restores the original orientation.
    return np.transpose(np.transpose(mat) / row_norms)

In [5]:
# Function to find near neighbors of a word
def find_neighbors(x, dic, mat, n=30):
    """Return the ``n`` words whose vectors are most similar to that of ``x``.

    Parameters
    ----------
    x : hashable
        Target word; must be a key of ``dic`` (otherwise ``KeyError``).
    dic : dict
        Maps each word to the index of its row in ``mat``.
    mat : numpy.ndarray
        Matrix holding one vector per row.
    n : int, default 30
        Maximum number of neighbors to return.

    Returns
    -------
    list of (word, similarity) tuples
        Sorted by cosine similarity, highest first. The target word itself is
        never included.
    """
    # Cosine similarity of the target row against every row of the matrix.
    simvec = normalize_vec(mat[dic[x]]) @ np.transpose(normalize_matrix(mat))
    # Look each word's similarity up through its own row index instead of
    # relying on the dictionary's iteration order matching the row order, and
    # drop the target by name rather than assuming it sorts into first place
    # (ties, duplicate vectors or an all-zero target vector break that).
    sim_list = [(word, simvec[row]) for word, row in dic.items() if word != x]
    sim_list.sort(key=sel_second, reverse=True)
    return sim_list[:n]

In [6]:
# Function to find cosine similarity between two words
def cosine(word1, word2, dic=None, mat=None):
    """Return the cosine similarity between the vectors of two words.

    Parameters
    ----------
    word1, word2 : hashable
        Words to compare; both must be keys of ``dic`` (otherwise ``KeyError``).
    dic : dict, optional
        Maps each word to its row index in ``mat``. Defaults to the
        notebook-level ``ortho_dic``.
    mat : numpy.ndarray, optional
        Matrix with one vector per row. Defaults to the notebook-level
        ``ortho_matrix``.

    Returns
    -------
    The dot product of the two unit-normalized row vectors.
    """
    # Fall back to the notebook globals at call time so that existing
    # two-argument calls keep working exactly as before.
    if dic is None:
        dic = ortho_dic
    if mat is None:
        mat = ortho_matrix
    return normalize_vec(mat[dic[word1]]) @ normalize_vec(mat[dic[word2]])

The word_list.txt file can be downloaded from our OSF page at https://osf.io/6mys9/. It contains the 100,000 most frequent words from the Wikipedia corpus as well as the nonwords used in Simulations 1c and 2c.

In [7]:
# read the list of words (one entry per line) to build orthographic vectors for
with open('word_list.txt', 'r', encoding='utf-8') as f:
    word_list = list(f)

In [8]:
# strip leading and trailing whitespace (including the newline) from every entry
word_list = list(map(str.strip, word_list))

In [9]:
# Function to extract SERIOL2 open bigrams with weights
def extract_bigrams(word, weights=(1, 0.7, 0.5)):
    """Return the weighted open bigrams of ``word``.

    The word is wrapped in ``'*'`` boundary markers, and every ordered pair of
    characters that lie 1 to ``len(weights)`` positions apart becomes a bigram.

    Parameters
    ----------
    word : str
        Word to decompose.
    weights : sequence of numbers, default (1, 0.7, 0.5)
        ``weights[k - 1]`` is the weight of a bigram whose two characters are
        ``k`` positions apart; the length of the sequence sets the largest
        separation considered.

    Returns
    -------
    list of (bigram, weight) tuples
        Sorted by weight, highest first; bigrams of equal weight keep their
        left-to-right order of occurrence. The boundary-only pair ``'**'`` is
        left out.
    """
    seriol_word = '*' + word + '*'
    length = len(seriol_word)
    bigrams = []
    for start in range(length):
        for gap, weight in enumerate(weights, start=1):
            end = start + gap
            if end >= length:
                # Larger gaps would run past the end of the string as well.
                break
            pair = seriol_word[start] + seriol_word[end]
            # A pair made of the two boundary markers carries no letter
            # information (it arises for words shorter than the largest gap).
            if pair != '**':
                bigrams.append((pair, weight))
    # list.sort is stable, so equal-weight bigrams stay in positional order.
    bigrams.sort(key=lambda item: item[1], reverse=True)
    return bigrams

In [10]:
# Example: weighted open bigrams for the word 'pink'.
# '*' marks the word boundaries; the weight drops from 1 to 0.7 to 0.5 as the
# two characters of a bigram lie 1, 2 or 3 positions apart.
extract_bigrams('pink')

[('*p', 1),
 ('pi', 1),
 ('in', 1),
 ('nk', 1),
 ('k*', 1),
 ('*i', 0.7),
 ('pn', 0.7),
 ('ik', 0.7),
 ('n*', 0.7),
 ('*n', 0.5),
 ('pk', 0.5),
 ('i*', 0.5)]

In [11]:
# find all unique bigrams
# Collected in a single set and then sorted: plain set iteration order for
# strings can differ from one interpreter run to the next (hash randomization),
# which would shuffle the matrix columns and make results irreproducible.
unique_strings = sorted(
    {bigram for word in word_list for bigram, _ in extract_bigrams(word)}
)

In [12]:
# allocate the word-by-bigram matrix (one row per word, one column per unique
# bigram), initially all zeros
ortho_matrix = np.zeros(
    shape=(len(word_list), len(unique_strings)),
    dtype=np.float32,
)

In [13]:
# map each word to the index of its row in the matrix
# (if a word occurs more than once in word_list, its last position wins)
ortho_dic = {word: row for row, word in enumerate(word_list)}

In [14]:
# map each bigram to the index of its column in the matrix
string_dic = {bigram: col for col, bigram in enumerate(unique_strings)}

In [15]:
# fill in the word-by-bigram matrix
for word in word_list:
    # Basic indexing returns a view, so updating it writes into ortho_matrix.
    word_row = ortho_matrix[ortho_dic[word]]
    for bigram, weight in extract_bigrams(word):
        # accumulate the weight of each bigram occurrence in the word's row
        word_row[string_dic[bigram]] += weight

In [16]:
# Apply Singular Value Decomposition to reduce dimensionality of the matrix.
# The first returned array (one 250-dimensional row per word) replaces
# ortho_matrix; the singular values `s` and `VT` are kept alongside it.
# random_state is pinned so the randomized solver yields the same
# decomposition, and therefore the same similarities, on every run.
# NOTE: this cell overwrites its own input, so rebuild the word-by-bigram
# matrix before executing it a second time.
ortho_matrix, s, VT = randomized_svd(ortho_matrix, n_components=250, random_state=0)

In [17]:
# Normalize every row vector of the reduced matrix to unit length
# (rows that are entirely zero are left unchanged by normalize_matrix).
ortho_matrix = normalize_matrix(ortho_matrix)

In [18]:
# Find the 10 nearest orthographic neighbors of the target word 'pair';
# each result is a (word, cosine similarity) pair, most similar first.
find_neighbors('pair', ortho_dic, ortho_matrix, n=10)

[('paire', 0.8497597),
 ('pairs', 0.83499545),
 ('pairc', 0.73929787),
 ('air', 0.7273965),
 ('paired', 0.7271911),
 ('pari', 0.71619946),
 ('pira', 0.701304),
 ('pairing', 0.69715226),
 ('pašić', 0.6848196),
 ('pai', 0.6691694)]

In [19]:
# Cosine similarity between the orthographic vectors of 'pair' and 'air'
cosine('pair', 'air')

0.7273965