<h1> LSA part: </h1>

<h1> Step 1: Importing packages and corpus </h1>

In [1]:
import re
import numpy as np
import math

import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.collocations import *

from scipy import sparse
from scipy.sparse import rand
from scipy.stats import uniform, pearsonr

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
text = brown.words()
text

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

<h1> Step 2: Extracting most common words and update list </h1>

In [3]:
def preprocess_text(raw_text):
    word_check = re.compile('\w+')
    filter_non_word = re.compile('.*(\W|\d).*')
    stop_words = set(stopwords.words('english'))

    processed_text = []

    for w in raw_text:
        w = w.lower()
        if (word_check.match(w)) and (w not in stop_words) and (not filter_non_word.match(w)):
            processed_text.append(w)
            
    return processed_text

processed_t = preprocess_text(text)

In [4]:
def get_frequency_dist(processed_text):
    return nltk.FreqDist(w for w in processed_text)

fdist = get_frequency_dist(processed_t)

<h3> Most common and least common words: </h3>

In [5]:
W = fdist.most_common(5000)

In [6]:
vocab = []

for w in W:
    vocab.append(w[0])

print(len(vocab))

5000


In [7]:
print("The most frequent five words in the most frequent 5000 words in the Brown Corpus:")

W[:5]

The most frequent five words in the most frequent 5000 words in the Brown Corpus:


[('one', 3292),
 ('would', 2714),
 ('said', 1961),
 ('new', 1635),
 ('could', 1601)]

In [8]:
print("The least frequent five words in the most frequent 5000 words in the Brown Corpus:")

W[-5:]

The least frequent five words in the most frequent 5000 words in the Brown Corpus:


[('advances', 18),
 ('applicable', 18),
 ('humble', 18),
 ('defended', 18),
 ('spectacle', 18)]

<h3> Update existing list: </h3>

In [9]:
new_words = ['grin', 'shore', 'gem', 'hill', 'pillow', 'cock', 'lad', 'cord', 'rooster', 'food', 'forest', 'autograph', 'monk', 'noon', 'bird', 'graveyard', 'woodland', 'car', 'crane', 'fruit', 'sage', 'cemetery', 'wizard', 'voyage', 'coast', 'implement', 'brother', 'automobile', 'journey', 'tool', 'stove', 'jewel', 'asylum', 'madhouse', 'string', 'rock', 'tumbler', 'oracle', 'cushion', 'smile', 'magician', 'boy', 'slave', 'mound', 'glass', 'serf', 'midday', 'furnace', 'signature']

def add_words(old_w_list, new_w_list):
    count = 0
    min_count = fdist[old_w_list[-1][0]]
    for w in set(new_w_list):
        if w not in vocab and fdist[w] > 0:
            old_w_list.append((tuple((w, fdist[w]))))

add_words(W, new_words)

In [10]:
len_W = len(W)
print(len_W)

5029


In [11]:
for w in W:
    if w[0] not in vocab:
        vocab.append(w[0])

print(len(vocab))

5029


<h1> Step 3: Construct a word-context vector model </h3>

In [12]:
matrix = rand(len(vocab), len(vocab), density=0, format="csr", random_state=0)
matrix

<5029x5029 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [13]:
M1 = matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
bigram_freq_dist = FreqDist()
bigram_text = nltk.bigrams(processed_t)
bigram_list = list(bigram_text)

len(bigram_list)

509266

In [15]:
for b in bigram_list:
    if b[0] in vocab and b[1] in vocab:
        M1[vocab.index(b[1])][vocab.index(b[0])] += 1.0

<h1> Step 4: Build PPMI Matrix: </h1>

In [16]:
def sum_unigram_freq(uni_W):
    f_sum = 0
    
    for i in range(len(uni_W)):
        f_sum += uni_W[i][1]
        
    return f_sum

sum_uni_freq = sum_unigram_freq(W)
print(sum_uni_freq)

389739


In [17]:
def sum_bigram_freq(bigram_W):
    f_sum = 0
    
    for i in range(len(bigram_W)):
        for j in range(len(bigram_W[i])):
            f_sum += bigram_W[i][j]
            
    return f_sum

sum_bi_freq = sum_bigram_freq(M1)
print(sum_bi_freq)

304578.0


In [18]:
def ppmi(i_w1, i_w2, freq_joint, f_uni_sum, f_bi_sum, uni_W):
    p_w1 = uni_W[i_w1][1] / float(f_uni_sum)
    p_w2 = uni_W[i_w2][1] / float(f_uni_sum)
    p_w1_w2 = freq_joint / float(f_bi_sum)
    
    try:
        if p_w1*p_w2 == 0 or p_w1_w2/float(p_w1*p_w2) == 0.0:
            return 0.0
        else:
            return max(math.log(p_w1_w2/float(p_w1*p_w2), 2), 0.0)
    except:
        0.0

In [19]:
M1_plus = matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
for i in range(len(M1_plus)):
    for j in range(len(M1_plus)):
        M1_plus[i][j] = ppmi(i, j, M1[i][j], sum_uni_freq, sum_bi_freq, W)

<h1> Step 5: Perform PCA: </h1>

In [21]:
def cal_PCA(ppmi_matrix, dim):
    fit_transformer = StandardScaler().fit_transform(ppmi_matrix)
    pca = PCA(n_components=dim)
    return pca.fit_transform(ppmi_matrix)

In [22]:
M2_10 = cal_PCA(M1_plus, 10)

In [23]:
print(len(M2_10[0]))

10


In [24]:
M2_100 = cal_PCA(M1_plus, 100)

In [25]:
print(len(M2_100[0]))

100


In [26]:
M2_300 = cal_PCA(M1_plus, 300)

In [27]:
print(len(M2_300[0]))

300


<h1> Step 6: Find all pairs of words in Table 1 and record the human-judged similarities: </h1>

In [28]:
P_dict = {('cord', 'smile'): 0.02, ('rooster', 'voyage'): 0.04, ('noon', 'string'): 0.04, 
    ('fruit', 'furnace'): 0.05, ('autograph', 'shore'): 0.06, ('automobile', 'wizard'): 0.11,
    ('mound', 'stove'): 0.14, ('grin', 'implement'): 0.18, ('asylum', 'fruit'): 0.19,
    ('asylum', 'monk'): 0.39, ('graveyard', 'madhouse'): 0.42, ('glass', 'magician'): 0.44,
    ('boy', 'rooster'): 0.44, ('cushion', 'jewel'): 0.45, ('monk', 'slave'): 0.57,
    ('asylum', 'cemetery'): 0.79, ('coast', 'forest'): 0.85, ('grin', 'lad'): 0.88,
    ('shore', 'woodland'): 0.90, ('monk', 'oracle'): 0.91, ('boy', 'sage'): 0.96,
    ('automobile', 'cushion'): 0.97, ('mound', 'shore'): 0.97, ('lad', 'wizard'): 0.99,
    ('forest', 'graveyard'): 1.00, ('food', 'rooster'): 1.09, ('cemetery', 'woodland'): 1.18,
    ('shore', 'voyage'): 1.22, ('bird', 'woodland'): 1.24, ('coast', 'hill'): 1.26,
    ('furnace', 'implement'): 1.37, ('crane', 'rooster'): 1.41, ('hill', 'woodland'): 1.48,
    ('car', 'journey'): 1.55, ('cemetery', 'mound'): 1.69, ('glass', 'jewel'): 1.78,
    ('magician', 'oracle'): 1.82, ('crane', 'implement'): 2.37, ('brother', 'lad'): 2.41,
    ('sage', 'wizard'): 2.46, ('oracle', 'sage'): 2.61, ('bird', 'crane'): 2.63,
    ('bird', 'rock'): 2.63, ('food', 'fruit'): 2.69, ('brother', 'monk'): 2.74,
    ('asylum', 'madhouse'): 3.04, ('furnace', 'stove'): 3.11, ('magician', 'wizard'): 3.21,
    ('hill', 'mound'): 3.29, ('cord', 'string'): 3.41, ('glass', 'tumbler'): 3.45,
    ('grin', 'smile'): 3.46, ('serf', 'slave'): 3.46, ('journey', 'voyage'): 3.58,
    ('autograph', 'signature'): 3.59, ('coast', 'shore'): 3.60, ('forest', 'woodland'): 3.65,
    ('implement', 'tool'): 3.66, ('cock', 'rooster'): 3.68, ('boy', 'lad'): 3.82,
    ('cushion', 'pillow'): 3.84, ('cemetery', 'graveyard'): 3.88, ('automobile', 'car'): 3.92,
    ('midday', 'noon'): 3.94, ('gem', 'jewel'): 3.94}

In [29]:
del_list = []

for key in P_dict:
    if (key[0] not in vocab) or (key[1] not in vocab):
        del_list.append(key)
        
for del_pair in del_list:
    del P_dict[del_pair]

In [30]:
print(len(P_dict))

64


In [33]:
P_list = list(P_dict.keys())
P = set(P_list)

In [34]:
S = []
for key in P_dict:
    S.append(P_dict[key])

In [35]:
print(len(S))

64


<h1> Step 7: Calculate Cosine Similarities: </h1>

In [36]:
def cal_cos_sim(word_dict, info_matrix, vocab_list):
    cos_sim_dict = {}
    for key in word_dict:
        i_w1 = vocab_list.index(key[0])
        i_w2 = vocab_list.index(key[1])
        cos_sim_dict[key] = cosine_similarity(sparse.csr_matrix(info_matrix[i_w1]), sparse.csr_matrix(info_matrix[i_w2]))
        
    return cos_sim_dict

<h4> Cosine Similarities for M1: </h4>

In [37]:
S_M1 = cal_cos_sim(P_dict, M1, vocab)

<h4> Cosine Similarities for M1_plus: </h4>

In [38]:
S_M1_plus = cal_cos_sim(P_dict, M1_plus, vocab)

<h4> Cosine Similarities for M2_10: </h4>

In [39]:
S_M2_10 = cal_cos_sim(P_dict, M2_10, vocab)

<h4> Cosine Similarities for M2_100: </h4>

In [40]:
S_M2_100 = cal_cos_sim(P_dict, M2_100, vocab)

<h4> Cosine Similarities for M2_300: </h4>

In [41]:
S_M2_300 = cal_cos_sim(P_dict, M2_300, vocab)

<h1> Step 8: Calculate Pearson Correlation: </h1>

<h4> Pearson Correlation for M1: </h4>

In [42]:
S_list_M1 = []
for key in S_M1:
    S_list_M1.append(S_M1[key][0][0])

In [43]:
corr_M1, p_value_M1 = pearsonr(S_list_M1, S)

In [44]:
print(corr_M1)

0.11843435038906105


<h4> Pearson Correlation for M1_plus: </h4>

In [45]:
S_list_M1_plus = []
for key in S_M1_plus:
    S_list_M1_plus.append(S_M1_plus[key][0][0])

In [46]:
corr_M1_plus, p_value_M1_plus = pearsonr(S_list_M1_plus, S)

In [47]:
print(corr_M1_plus)

0.11993626872007863


<h4> Pearson Correlation for M2_10: </h4>

In [48]:
S_list_M2_10 = []
for key in S_M2_10:
    S_list_M2_10.append(S_M2_10[key][0][0])

In [49]:
corr_M2_10, p_value_M2_10 = pearsonr(S_list_M2_10, S)

In [50]:
print(corr_M2_10)

0.07028427881086727


<h4> Pearson Correlation for M2_100: </h4>

In [51]:
S_list_M2_100 = []
for key in S_M2_100:
    S_list_M2_100.append(S_M2_100[key][0][0])

In [52]:
corr_M2_100, p_value_M2_100 = pearsonr(S_list_M2_100, S)

In [53]:
print(corr_M2_100)

0.1143811990634159


<h4> Pearson Correlation for M2_300: </h4>

In [54]:
S_list_M2_300 = []
for key in S_M2_300:
    S_list_M2_300.append(S_M2_300[key][0][0])

In [55]:
corr_M2_300, p_value_M2_300 = pearsonr(S_list_M2_300, S)

In [56]:
print(corr_M2_300)

0.14137103020616631


<h1> Word2Vec part: </h1>

In [57]:
from gensim.models import KeyedVectors
from gensim import utils, matutils
from numpy import dot

In [58]:
model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz',binary=True)

In [59]:
w2v_matrix = []

for item in new_words:
    w2v_matrix.append(model[item])

<h4> Cosine Similarities for w2v model: </h4>

In [60]:
S_w2v = cal_cos_sim(P_dict, w2v_matrix, new_words)

<h4> Pearson Correlation for w2v model: </h4>

In [61]:
S_list_w2v = []
for key in S_w2v:
    S_list_w2v.append(S_w2v[key][0][0])

In [62]:
corr_w2v, p_value_w2v = pearsonr(S_list_w2v, S)

In [63]:
print(corr_w2v)

0.7521151672039903


In [64]:
print(p_value_w2v)

7.849322751325408e-13


In [65]:
word_pairs = [line.rstrip('\n') for line in open('./word_test.v1.txt')]
print(len(word_pairs))

19559


In [66]:
semantic_pairs = []
syntactic_pairs = []
split_line = re.compile(': gram1-adjective-to-adverb')
wanted_line = re.compile('\W.+')
semantic = True

for line in word_pairs:
    if split_line.match(line):
        semantic = False
    if semantic and not wanted_line.match(line):
        semantic_pairs.append(line)
    elif semantic == False and not wanted_line.match(line):
        syntactic_pairs.append(line)

print(len(semantic_pairs))
print(len(syntactic_pairs))

8869
10675


In [67]:
def filter_word_pairs(list_of_lines):
    new_list = []
    
    for line in list_of_lines:
        words = line.split()
        if set(words).issubset(vocab):
            new_list.append(line)
    
    return new_list
            
semantic_pairs = filter_word_pairs(semantic_pairs)
print(len(semantic_pairs))
syntactic_pairs = filter_word_pairs(syntactic_pairs)
print(len(syntactic_pairs))

56
2080


In [68]:
def analogy_test_w2v(list_of_lines, w2v_model):
    successful = 0
    
    for line in list_of_lines:
        words = line.split()
        prediction = w2v_model.most_similar(positive=[words[3], words[0]], negative=[words[2]])[0]
        
        if prediction[0] == words[1]:
            successful += 1
                
    return successful
                
print(analogy_test_w2v(semantic_pairs, model))
print(analogy_test_w2v(syntactic_pairs, model))

51
1441


In [None]:
def analogy_test_lsa(list_of_lines, M_lsa, vocab_list):
    successful = 0
    
    for line in list_of_lines:
        words = line.split()
        prediction_vec = M_lsa[vocab_list.index(words[3])] - M_lsa[vocab_list.index(words[2])] + M_lsa[vocab_list.index(words[0])]
        cos_sim_list = []
        for word in vocab_list:
            i_w = vocab_list.index(word)
            cos_sim_list.append(cosine_similarity(sparse.csr_matrix(prediction_vec), sparse.csr_matrix(M_lsa[i_w])))
        
        if vocab_list[vocab_list.index(words[1])] == vocab_list[cos_sim_list.index(max(cos_sim_list))]:
            successful += 1
                
    return successful

print(analogy_test_lsa(semantic_pairs, M2_300, vocab))
print(analogy_test_lsa(syntactic_pairs, M2_300, vocab))

0<br>
0