In [1]:
# import packages
import numpy as np
import string
from collections import Counter
from sklearn.utils.extmath import randomized_svd

In [2]:
# Function to find second element in a list
def sel_second(x):
    return x[1]

In [3]:
# Function to remove punctuation
def strip_punctuation(x):
    letters = [i for i in x if i not in string.punctuation]
    word = "".join(letters)
    return word

In [4]:
# Function to normalize a vector
def normalize_vec(x):
    if np.sum(x**2)>0:
        x_normed = x / np.sqrt(np.sum(x**2))
    else:
        x_normed = x.copy()
    return x_normed

In [5]:
# Function to normalize all vectors in a matrix
def normalize_matrix(x):
    sum_sq = np.sum(x**2, axis=1)
    mag = np.sqrt(sum_sq)
    mag[mag == 0] = 1
    normed_x = np.transpose(np.transpose(x) / mag)
    return normed_x

In [6]:
# Function to find a word's near neighbors
def find_neighbors(x, word_dic, word_matrix, n=30):
    word_list = list(word_dic)
    sim_list = normalize_vec(word_matrix[word_dic[x]]) @ np.transpose(normalize_matrix(word_matrix))
    output = list(zip(word_list, sim_list))
    output.sort(key=sel_second, reverse=True)
    return output[1:1+n]

In [7]:
# Function to find cosine simlarity between two word vectors
def cosine(x, y, word_dic, word_matrix):
    x_vec = normalize_vec(word_matrix[word_dic[x]])
    y_vec = normalize_vec(word_matrix[word_dic[y]])
    return x_vec @ y_vec

The full wikipedia corpus is too large, so I shared a snippet of the corpus to demonstrate how the code works. The snippet of the wikipedia corpus can be found at https://osf.io/6mys9/

In [8]:
# load txt file of wikipedia corpus
with open('mini_wiki_corpus.txt', 'r', encoding='utf-8') as f:
    df = f.readlines()

In [9]:
# strip paragraph indents between documents
df = [i.strip() for i in df]

In [10]:
# convert txt file to python lists of words

corpus = [] # words grouped by documents
all_words = [] # list of all words (helpful to caluculate frequency of words in the corpus)
for i in df:
    words = i.split(' ') # split string by spaces
    words = [x.strip() for x in words] # get rid of any extra white space
    words = [x.lower() for x in words] # convert letters to lowercase
    words = [strip_punctuation(x) for x in words] # remove punctuation from words
    words = [x for x in words if x!=''] # remove any blank entries
    corpus.append(words)
    all_words.extend(words)

In [11]:
# Count frequency of all words in corpus in a dictionary object
count_dic = Counter(all_words)

In [12]:
# Convert dictionary object to list object
count_list = [(i, count_dic[i]) for i in count_dic]

In [13]:
# Sort by frequency (most frequent words first)
count_list.sort(key=sel_second, reverse=True)

In [14]:
# Find 100,000 most frequent words for the rows of the matrix
target_words = [i[0] for i in count_list[:100000]]

In [15]:
# Find 25,000 most frequent words for columns of the matrix
# Excludes the 200 most frequent words, which consist mostly of function words
context_words = [i[0] for i in count_list[200:25200]]

In [16]:
# set up a dictionary to match words to rows in the matrix
target_dic = {}
for i in range(0, len(target_words)):
    target_dic[target_words[i]] = i

In [17]:
# set up a dictionary to match words to columns in the matrix
context_dic = {}
for i in range(0, len(context_words)):
    context_dic[context_words[i]] = i

In [18]:
# Create matrix of zeros
word_matrix = np.zeros((len(target_dic), len(context_dic)), dtype=np.float32)

In [19]:
# Define window (two words to left [negative values] and right [positive values])
window = [-2, -1, 1, 2]

In [20]:
# Scan corpus word-by-word and add counts to word matrix
for doc in corpus:
    for x in range(0, len(doc)):
        if doc[x] in target_dic:
            for c in window:
                if (x+c) >= 0 and (x+c) < len(doc):
                    if doc[x+c] in context_dic:
                        word_matrix[target_dic[doc[x]], context_dic[doc[x+c]]] += 1

In [21]:
# calculate column sums, row sums, and total sum of all co-occurrence counts (needed to calculate Pointwise Mutual Information)
colsums = np.sum(word_matrix, axis=0)
rowsums = np.sum(word_matrix, axis=1)
total = np.sum(word_matrix)

In [22]:
# convert co-occurrence counts to Pointwise Mutual Information (PMI) values
for i in range(0, len(word_matrix)):
    if np.sum(word_matrix[i]) > 0:
        word_matrix[i] = np.log2((word_matrix[i] * total) / (colsums * rowsums[i]))

  word_matrix[i] = np.log2((word_matrix[i] * total) / (colsums * rowsums[i]))


In [23]:
# convert PMI to Positive PMI (PPMI)
word_matrix[word_matrix < 0] = 0

In [24]:
# apply Singular Value Decomposition to reduce dimensionality of word matrix
word_matrix, s, vt = randomized_svd(word_matrix, n_components=500)

Note that for the following examples, the model was trained on a much smaller corpus compared to the full wikipedia, so the neighbors will be different compared to when the model is trained with the full corpus.

In [25]:
# Find semantic neighbors of a word
find_neighbors('football', target_dic, word_matrix, n=10)

[('basketball', 0.5864471),
 ('soccer', 0.5713351),
 ('disaffiliated', 0.5420472),
 ('heike', 0.52209055),
 ('rugby', 0.5115758),
 ('league', 0.5051365),
 ('deflated', 0.4872858),
 ('volleyball', 0.4528676),
 ('softball', 0.45131442),
 ('hanwell', 0.44987047)]

In [26]:
# Find cosine similarity between two words
cosine('football', 'rugby', target_dic, word_matrix)

0.5115758