## Create distributional semantic word vectors

In [None]:
import numpy as np
import scipy as sp
import re

* Compute the co-occurrence matrix

In [None]:
f = open("dist_sim_data.txt")
word_set = set()
lines = f.readlines()
for line in lines:
    for word in line[:-1].split(" "):
        word_set.add(word)
word_set

{'bite', 'dogs', 'feed', 'like', 'men', 'the', 'women'}

In [36]:
index_to_word = {}
word_to_index = {}
index = 0
for word in word_set:
    index_to_word[index] = word
    word_to_index[word] = index
    index += 1
index_to_word

{0: 'feed', 1: 'dogs', 2: 'bite', 3: 'men', 4: 'like', 5: 'women', 6: 'the'}

In [38]:
size = len(word_set)
count = np.zeros((size, size))
for line in lines:
    pre_word = ""
    for i, word in enumerate(line[:-1].split(" ")):
        if i != 0:
            pre_word_index = word_to_index[pre_word]
            word_index = word_to_index[word]
            count[pre_word_index, word_index] += 1
            count[word_index, pre_word_index] += 1
        pre_word = word
count

array([[0., 0., 0., 2., 0., 2., 4.],
       [0., 0., 3., 0., 1., 0., 9.],
       [0., 3., 0., 0., 0., 0., 3.],
       [2., 0., 0., 0., 2., 0., 8.],
       [0., 1., 0., 2., 0., 1., 4.],
       [2., 0., 0., 0., 1., 0., 5.],
       [4., 9., 3., 8., 4., 5., 0.]])

* Multiply your entire matrix by 10 (to pretend that we see these sentences 10 times) and then smooth the counts by adding 1 to all cells.

In [39]:
count = count * 10 + 1
count

array([[ 1.,  1.,  1., 21.,  1., 21., 41.],
       [ 1.,  1., 31.,  1., 11.,  1., 91.],
       [ 1., 31.,  1.,  1.,  1.,  1., 31.],
       [21.,  1.,  1.,  1., 21.,  1., 81.],
       [ 1., 11.,  1., 21.,  1., 11., 41.],
       [21.,  1.,  1.,  1., 11.,  1., 51.],
       [41., 91., 31., 81., 41., 51.,  1.]])

In [41]:
p_w = count.sum(axis=1) / count.sum()
p_c = p_w
p_w

array([0.09364909, 0.1474704 , 0.07212056, 0.13670614, 0.09364909,
       0.09364909, 0.36275565])

In [43]:
p_wc = count / ((count.sum() - count.trace()) / 2 + count.trace())
p_wc

array([[0.00213675, 0.00213675, 0.00213675, 0.04487179, 0.00213675,
        0.04487179, 0.08760684],
       [0.00213675, 0.00213675, 0.06623932, 0.00213675, 0.02350427,
        0.00213675, 0.19444444],
       [0.00213675, 0.06623932, 0.00213675, 0.00213675, 0.00213675,
        0.00213675, 0.06623932],
       [0.04487179, 0.00213675, 0.00213675, 0.00213675, 0.04487179,
        0.00213675, 0.17307692],
       [0.00213675, 0.02350427, 0.00213675, 0.04487179, 0.00213675,
        0.02350427, 0.08760684],
       [0.04487179, 0.00213675, 0.00213675, 0.00213675, 0.02350427,
        0.00213675, 0.10897436],
       [0.08760684, 0.19444444, 0.06623932, 0.17307692, 0.08760684,
        0.10897436, 0.00213675]])

In [62]:
p_w = p_w.reshape(size, 1)
p_c = p_c.reshape(1, size)
PPMI = np.log(p_wc / (p_w * p_c))
PPMI[PPMI < 0] = 0
PPMI

array([[0.        , 0.        , 0.        , 1.25417641, 0.        ,
        1.63245538, 0.9473302 ],
       [0.        , 0.        , 1.82906284, 0.        , 0.53175541,
        0.        , 1.29054483],
       [0.        , 1.82906284, 0.        , 0.        , 0.        ,
        0.        , 0.92896084],
       [1.25417641, 0.        , 0.        , 0.        , 1.25417641,
        0.        , 1.24992832],
       [0.        , 0.53175541, 0.        , 1.25417641, 0.        ,
        0.98582822, 0.9473302 ],
       [1.63245538, 0.        , 0.        , 0.        , 0.98582822,
        0.        , 1.16558377],
       [0.9473302 , 1.29054483, 0.92896084, 1.24992832, 0.9473302 ,
        1.16558377, 0.        ]])

* Now instead of using your count matrix, use the PPMI matrix as a weighted count matrix. Then compare the word vector for “dogs” before and after PPMI reweighting. Does PPMI do the right thing to the count matrix? Why? Explain in a few sentences how PPMI helps (short prose will do here; no need to show any math, rather just an intuitive understanding).

* Compute the Euclidean distance between the following pairs (you can use the command scipy.linalg.norm to compute the length/norm of a vector)

* Decompose the matrix using singular-value decomposition (SVD)

In [66]:
import scipy.linalg as scipy_linalg
U, E, Vt = scipy_linalg.svd(PPMI, full_matrices=False)
U = np.matrix(U) # compute U
E = np.matrix(np.diag(E)) # compute E
Vt = np.matrix(Vt) # compute Vt = conjugage transpose of V 
V = Vt.T # compute V = conjugate transpose of Vt

In [68]:
reduced_PPMI = PPMI * V[:, 0:3]
reduced_PPMI

matrix([[-1.58752293, -1.23443582, -0.72159619],
        [-1.34260467,  0.67455208,  1.14166954],
        [-1.07539883, -0.21988618,  1.15410849],
        [-1.58546891,  1.35984716, -0.48222781],
        [-1.51369124, -1.03137443, -0.21285735],
        [-1.58658081,  1.40856204, -0.60043088],
        [-2.24829813, -0.68456521,  0.18280598]])