## Create distributional semantic word vectors

In [45]:
import numpy as np
import scipy as sp
import re

* Compute the co-occurrence matrix

In [46]:
f = open("dist_sim_data.txt")
word_set = set()
lines = f.readlines()
for line in lines:
    for word in line[:-1].split(" "):
        word_set.add(word)
word_set

{'bite', 'dogs', 'feed', 'like', 'men', 'the', 'women'}

In [47]:
index_to_word = {}
word_to_index = {}
index = 0
for word in word_set:
    index_to_word[index] = word
    word_to_index[word] = index
    index += 1
index_to_word

{0: 'dogs', 1: 'men', 2: 'feed', 3: 'women', 4: 'the', 5: 'bite', 6: 'like'}

In [48]:
size = len(word_set)
count = np.zeros(size)
co_count = np.zeros((size, size))
for line in lines:
    pre_word = ""
    for i, word in enumerate(line[:-1].split(" ")):
        count[word_to_index[word]] += 1
        if i != 0:
            pre_word_index = word_to_index[pre_word]
            word_index = word_to_index[word]
            co_count[pre_word_index, word_index] += 1
            co_count[word_index, pre_word_index] += 1
        pre_word = word

In [49]:
count

array([ 9.,  8.,  4.,  5., 22.,  3.,  4.])

In [50]:
co_count

array([[0., 0., 0., 0., 9., 3., 1.],
       [0., 0., 2., 0., 8., 0., 2.],
       [0., 2., 0., 2., 4., 0., 0.],
       [0., 0., 2., 0., 5., 0., 1.],
       [9., 8., 4., 5., 0., 3., 4.],
       [3., 0., 0., 0., 3., 0., 0.],
       [1., 2., 0., 1., 4., 0., 0.]])

* Multiply your entire matrix by 10 (to pretend that we see these sentences 10 times) and then smooth the counts by adding 1 to all cells.

In [53]:
co_count_after_smooth = co_count * 10 + 1

In [54]:
p_w = count / count.sum()
p_c = p_w
p_w

array([0.16363636, 0.14545455, 0.07272727, 0.09090909, 0.4       ,
       0.05454545, 0.07272727])

In [55]:
p_wc = co_count_after_smooth / ((co_count_after_smooth.sum() - co_count_after_smooth.trace()) / 2 + co_count_after_smooth.trace())
p_wc

array([[0.00213675, 0.00213675, 0.00213675, 0.00213675, 0.19444444,
        0.06623932, 0.02350427],
       [0.00213675, 0.00213675, 0.04487179, 0.00213675, 0.17307692,
        0.00213675, 0.04487179],
       [0.00213675, 0.04487179, 0.00213675, 0.04487179, 0.08760684,
        0.00213675, 0.00213675],
       [0.00213675, 0.00213675, 0.04487179, 0.00213675, 0.10897436,
        0.00213675, 0.02350427],
       [0.19444444, 0.17307692, 0.08760684, 0.10897436, 0.00213675,
        0.06623932, 0.08760684],
       [0.06623932, 0.00213675, 0.00213675, 0.00213675, 0.06623932,
        0.00213675, 0.00213675],
       [0.02350427, 0.04487179, 0.00213675, 0.02350427, 0.08760684,
        0.00213675, 0.00213675]])

In [56]:
p_w = p_w.reshape(size, 1)
p_c = p_c.reshape(1, size)
PPMI = np.log(p_wc / (p_w * p_c))
PPMI[PPMI < 0] = 0
PPMI

array([[0.        , 0.        , 0.        , 0.        , 1.08879055,
        2.00434841, 0.68057441],
       [0.        , 0.        , 1.44498461, 0.        , 1.09016323,
        0.        , 1.44498461],
       [0.        , 1.44498461, 0.        , 1.91498824, 1.10243333,
        0.        , 0.        ],
       [0.        , 0.        , 1.91498824, 0.        , 1.09754334,
        0.        , 1.26836107],
       [1.08879055, 1.09016323, 1.10243333, 1.09754334, 0.        ,
        1.11053054, 1.10243333],
       [2.00434841, 0.        , 0.        , 0.        , 1.11053054,
        0.        , 0.        ],
       [0.68057441, 1.44498461, 0.        , 1.26836107, 1.10243333,
        0.        , 0.        ]])

* Now instead of using your count matrix, use the PPMI matrix as a weighted count matrix. Then compare the word vector for “dogs” before and after PPMI reweighting. Does PPMI do the right thing to the count matrix? Why? Explain in a few sentences how PPMI helps (short prose will do here; no need to show any math, rather just an intuitive understanding).

In [61]:
print(index_to_word)

{0: 'dogs', 1: 'men', 2: 'feed', 3: 'women', 4: 'the', 5: 'bite', 6: 'like'}


In [63]:
PPMI[word_to_index['dogs']]

array([0.        , 0.        , 0.        , 0.        , 1.08879055,
       2.00434841, 0.68057441])

In [59]:
co_count[word_to_index['dogs']]

array([0., 0., 0., 0., 9., 3., 1.])

* Compute the Euclidean distance between the following pairs (you can use the command scipy.linalg.norm to compute the length/norm of a vector)

In [73]:
def calculate_dist(word1, word2, matrix):
    word1_index = word_to_index[word1]
    word2_index = word_to_index[word2]
    dist = np.linalg.norm(matrix[word1_index] - matrix[word2_index])
    print("the distance between {} and {} is {}".format(word1, word2, dist))

In [74]:
calculate_dist('women', 'men', PPMI)
calculate_dist('women', 'dogs', PPMI)
calculate_dist('dogs', 'men', PPMI)
calculate_dist('feed', 'like', PPMI)
calculate_dist('feed', 'bite', PPMI)
calculate_dist('like', 'bite', PPMI)

the distance between women and men is 0.5021491320750741
the distance between women and dogs is 2.8337540983685705
the distance between dogs and men is 2.5864489017628296
the distance between feed and like is 0.9387801747237632
the distance between feed and bite is 3.1261219748521154
the distance between like and bite is 2.3343443434858924


* Decompose the matrix using singular-value decomposition (SVD)

In [75]:
import scipy.linalg as scipy_linalg
U, E, Vt = scipy_linalg.svd(PPMI, full_matrices=False)
U = np.matrix(U) # compute U
E = np.matrix(np.diag(E)) # compute E
Vt = np.matrix(Vt) # compute Vt = conjugage transpose of V 
V = Vt.T # compute V = conjugate transpose of Vt

In [76]:
reduced_PPMI = PPMI * V[:, 0:3]
reduced_PPMI

matrix([[-1.30374974, -0.5414387 ,  1.28534614],
        [-1.69086253, -1.43533568, -0.497668  ],
        [-1.84500612,  1.62973463, -0.76979506],
        [-1.81517978, -1.60125958, -0.64873377],
        [-2.29424987,  0.06162287,  0.25882216],
        [-1.12706364,  0.3266854 ,  1.31523362],
        [-1.78245186,  1.41545814, -0.17537147]])

* Compute the Euclidean distances of the human/animal nouns/verbs again but on the reduced PPMI-weighted count matrix.

In [77]:
calculate_dist('women', 'men', reduced_PPMI)
calculate_dist('women', 'dogs', reduced_PPMI)
calculate_dist('dogs', 'men', reduced_PPMI)
calculate_dist('feed', 'like', reduced_PPMI)
calculate_dist('feed', 'bite', reduced_PPMI)
calculate_dist('like', 'bite', reduced_PPMI)

the distance between women and men is 0.2565275547623126
the distance between women and dogs is 2.2639448068163546
the distance between dogs and men is 2.0317597198696906
the distance between feed and like is 0.6349542208353525
the distance between feed and bite is 2.561390890605675
the distance between like and bite is 1.9587912889282821
