In [1]:
import os
import sklearn
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats
from numpy import inf
import matplotlib.pyplot as plt

In [2]:
ai517_assn1_base_path = './ai517_assn1'


ai517_assn1_file_list = os.listdir('./ai517_assn1')

men_path = os.path.join(ai517_assn1_base_path,ai517_assn1_file_list[0])
simlex_path = os.path.join(ai517_assn1_base_path,ai517_assn1_file_list[1])
vocab_25k_path = os.path.join(ai517_assn1_base_path,ai517_assn1_file_list[3])
vocab_wordsim_path = os.path.join(ai517_assn1_base_path,ai517_assn1_file_list[5])
wiki_01percent_path = os.path.join(ai517_assn1_base_path,ai517_assn1_file_list[-2])

In [3]:
wordsim_path = os.path.join(ai517_assn1_base_path,ai517_assn1_file_list[0])

In [4]:
ai517_assn1_file_list

['men.txt',
 'simlex-999.txt',
 'vocab-25k+wordsim.txt',
 'vocab-25k.txt',
 'vocab-3k.txt',
 'vocab-wordsim.txt',
 'wiki-0.1percent.txt',
 'wiki-1percent.txt',
 '__MACOSX']

## 1.1 Distributional Counting (20 points)

In [5]:
vocab_25k = open(vocab_25k_path, encoding='UTF8')
vocab_wordsim =open(vocab_wordsim_path)
wiki_01percent_list = open(wiki_01percent_path, encoding='UTF8')

wiki_01percent_list = [word.split('\n')[0].strip() for word in list(wiki_01percent_list.readlines())]
vocab_word_sim_list = [word.split('\n')[0].strip() for word in list(vocab_wordsim.readlines())]
vocab_25k_words_list = [word.split('\n')[0].strip() for word in list(vocab_25k.readlines())]

vocab_words_index_dict = {}
vocab_25k_words_index_dict = {}

for i,word in enumerate(vocab_word_sim_list):
    vocab_words_index_dict[word]=i
for i,word in enumerate(vocab_25k_words_list):
    vocab_25k_words_index_dict[word]=i

In [6]:
def context_matrix(w):
    word_context_matrix = np.zeros((len(vocab_word_sim_list),len(vocab_25k_words_list)))
    for sentence in tqdm(wiki_01percent_list):
        word_list = ['<s>']+sentence.split(' ')+['</s>']
        for i,word in enumerate(word_list):
            word = word.strip()
            if word in vocab_word_sim_list:
                for j in range(max(i-w,0),min(i+w,len(word_list))):
                    if word_list[j] in vocab_25k_words_list:
                        word_context_matrix[vocab_words_index_dict[word],vocab_25k_words_index_dict[word_list[j]]]+=1

    return word_context_matrix
        


In [7]:
wc =context_matrix(3)

100%|██████████| 997898/997898 [06:32<00:00, 2539.24it/s]


In [8]:
wc

array([[ 75.,  95., 323., ...,   0.,   0.,   0.],
       [  7.,  22.,  56., ...,   0.,   0.,   0.],
       [101.,  65., 762., ...,   0.,   0.,   0.],
       ...,
       [ 10.,   7.,  26., ...,   0.,   0.,   0.],
       [ 99., 153., 629., ...,   0.,   0.,   0.],
       [ 47.,  40., 181., ...,   0.,   0.,   0.]])

In [9]:
simlex999_df = pd.read_csv(simlex_path,sep = "\t" )
men_df = pd.read_csv(men_path,sep = "\t" )

In [10]:
men_index_score = []
word_1_index_list = [vocab_words_index_dict[word] for word in list(men_df['word1'])]
men_index_score.append(word_1_index_list)
word_2_index_list = [vocab_words_index_dict[word] for word in list(men_df['word2'])]
men_index_score.append(word_2_index_list)
men_index_score.append(list(men_df['score']))

simplex_index_score = []
word_1_index_list = [vocab_words_index_dict[word] for word in list(simlex999_df['word1'])]
simplex_index_score.append(word_1_index_list)
word_2_index_list = [vocab_words_index_dict[word] for word in list(simlex999_df['word2'])]
simplex_index_score.append(word_2_index_list)
simplex_index_score.append(list(simlex999_df['simlex999']))

In [11]:
def cosine_similarity(score_index_list,wc=wc):
    cos_sim_res = []
    word_1_list = np.asarray(score_index_list[0])
    word_2_list = np.asarray(score_index_list[1])
    for i in range(0,len(word_1_list)):
        word1= wc[word_1_list[i]]
        word2 = wc[word_2_list[i]]
        
        dot_prod = np.inner(word1,word2)
        norm = np.linalg.norm(word1)*np.linalg.norm(word2)
        if norm ==0:
            cos_sim_res.append(0)
        else:
            cos_sim_res.append(dot_prod/norm)
    
    return cos_sim_res
    

In [12]:
men_result = cosine_similarity(men_index_score,wc)
simplex_result = cosine_similarity(simplex_index_score,wc)

To measure, consine similarity,
$$\frac{u^Tv}{||u|| ||v||}$$
First have to calulate dot product between $u$ and $v$,
$$u^Tv = \sum_{i}{u_iv_i}$$.
Now we have to calculate each norm in $u$ and $v$.
$$||u|| = \sqrt{\sum{u_i^2}}$$
$$||v|| = \sqrt{\sum{v_i^2}}$$

In this case, $u$ is word1 column in men.txt file and $v$ is word2 column in men.txt
Each rows means consist of words to compare similarity

Here are results of cosine similarity. we can see to close 22% correlation value as 20.8%.

In [13]:
stats.spearmanr(men_result,men_index_score[2])

SpearmanrResult(correlation=0.20820333877448238, pvalue=9.809684451459659e-31)

In [14]:
stats.spearmanr(simplex_result,simplex_index_score[2])

SpearmanrResult(correlation=0.09903643187567658, pvalue=0.0017242367116980877)

## Computing PMIs (10 points)

In [15]:
def pmi(wc):
    pmi_matrix = np.zeros(wc.shape)
    col_totals = wc.sum(axis=0)
    total  =wc.sum(axis=0).sum()
    row_totals =wc.sum(axis=1)
    expected = np.outer(row_totals, col_totals) / total
    
    pmi_matrix = np.nan_to_num(wc / expected)
    pmi_matrix = np.log2(pmi_matrix)
    pmi_matrix[pmi_matrix == -inf] = 0
    return (pmi_matrix+np.abs(pmi_matrix))/2

pointwise mutual information's formula is
$$pmi(x,y) =  log_2 \frac{p(x,y)}{p(x)p(y)}$$

To represent probability, we use word context matrix before we calculated. Word context matrix measure when row word is detected, column frequency of word in fixed window size. So we can represent probability by dividing total summation of word context matrix. Preventing calculating error we replace nan to 0 using np.nan_to_num function. After we apply log function and replace -inf to 0. To get Positive Point mutual  information, replace -value to 0. 

In [16]:
c_pmi = pmi(wc)

  pmi_matrix = np.nan_to_num(wc / expected)
  pmi_matrix = np.log2(pmi_matrix)


In [17]:
men_result = cosine_similarity(men_index_score,c_pmi)
simplex_result = cosine_similarity(simplex_index_score,c_pmi)

We can see improved result correlation values.

In [18]:
stats.spearmanr(men_result,men_index_score[2])

SpearmanrResult(correlation=0.5377823646816318, pvalue=1.559072394052421e-224)

In [19]:
stats.spearmanr(simplex_result,simplex_index_score[2])

SpearmanrResult(correlation=0.23667620706495082, pvalue=3.47731035304304e-14)

##  Experimentation (5 points)

In [20]:
w_1 =1
w_3 =3
w_6 =6

In [21]:
w_1_C = context_matrix(w_1)
w_1_C_pmi = pmi(w_1_C)
w_3_C = context_matrix(w_3)
w_3_C_pmi = pmi(w_3_C)
w_6_C = context_matrix(w_6)
w_6_C_pmi = pmi(w_6_C)

100%|██████████| 997898/997898 [04:33<00:00, 3646.03it/s]
  pmi_matrix = np.nan_to_num(wc / expected)
  pmi_matrix = np.log2(pmi_matrix)
100%|██████████| 997898/997898 [06:29<00:00, 2560.38it/s]
100%|██████████| 997898/997898 [09:19<00:00, 1782.36it/s]


In [22]:
w_1men_result = cosine_similarity(men_index_score,w_1_C)
w_1simplex_result = cosine_similarity(simplex_index_score,w_1_C)
w_1pmi_men_result = cosine_similarity(men_index_score,w_1_C_pmi)
w_1pmi_simplex_result = cosine_similarity(simplex_index_score,w_1_C_pmi)

In [23]:
print("w1 word context men spearman : ",stats.spearmanr(w_1men_result,men_index_score[2]))
print("w1 word context simplex spearman : ",stats.spearmanr(w_1simplex_result,simplex_index_score[2]))

print("w1 pmi word context men spearman : ",stats.spearmanr(w_1pmi_men_result,men_index_score[2]))
print("w1 pmi word context simplex spearman : ",stats.spearmanr(w_1pmi_simplex_result,simplex_index_score[2]))

w1 word context men spearman :  SpearmanrResult(correlation=0.12293212069871598, pvalue=1.4175295638434407e-11)
w1 word context simplex spearman :  SpearmanrResult(correlation=0.04712986861809134, pvalue=0.13659426390516577)
w1 pmi word context men spearman :  SpearmanrResult(correlation=0.4439109892960475, pvalue=4.3453146136887976e-145)
w1 pmi word context simplex spearman :  SpearmanrResult(correlation=0.23452325465185517, pvalue=5.99261076726805e-14)


In [24]:
w_3men_result = cosine_similarity(men_index_score,w_3_C)
w_3simplex_result = cosine_similarity(simplex_index_score,w_3_C)
w_3pmi_men_result = cosine_similarity(men_index_score,w_3_C_pmi)
w_3pmi_simplex_result = cosine_similarity(simplex_index_score,w_3_C_pmi)

In [60]:
print("w3 word context men spearman : ",stats.spearmanr(w_3men_result,men_index_score[2]))
print("w3 word context simplex spearman : ",stats.spearmanr(w_3simplex_result,simplex_index_score[2]))
print("w3 pmi word context men spearman : ",stats.spearmanr(w_3pmi_men_result,men_index_score[2]))
print("w3 pmi word context simplex spearman : ",stats.spearmanr(w_3pmi_simplex_result,simplex_index_score[2]))

w3 word context men spearman :  SpearmanrResult(correlation=0.20820333877448238, pvalue=9.809684451459659e-31)
w3 word context simplex spearman :  SpearmanrResult(correlation=0.09903643187567658, pvalue=0.0017242367116980877)
w3 pmi word context men spearman :  SpearmanrResult(correlation=0.5377823646816318, pvalue=1.559072394052421e-224)
w3 pmi word context simplex spearman :  SpearmanrResult(correlation=0.23667620706495082, pvalue=3.47731035304304e-14)


In [26]:
w_6men_result = cosine_similarity(men_index_score,w_6_C)
w_6simplex_result = cosine_similarity(simplex_index_score,w_6_C)
w_6pmi_men_result = cosine_similarity(men_index_score,w_6_C_pmi)
w_6pmi_simplex_result = cosine_similarity(simplex_index_score,w_6_C_pmi)

In [27]:
print("w6 word context men spearman : ",stats.spearmanr(w_6men_result,men_index_score[2]))
print("w6 word context simplex spearman : ",stats.spearmanr(w_6simplex_result,simplex_index_score[2]))
print("w6 pmi word context men spearman : ",stats.spearmanr(w_6pmi_men_result,men_index_score[2]))
print("w6 pmi word context simplex spearman : ",stats.spearmanr(w_6pmi_simplex_result,simplex_index_score[2]))

w6 word context men spearman :  SpearmanrResult(correlation=0.27774002424144634, pvalue=2.864642513740823e-54)
w6 word context simplex spearman :  SpearmanrResult(correlation=0.09520122480250133, pvalue=0.002594490106883811)
w6 pmi word context men spearman :  SpearmanrResult(correlation=0.5321975387032889, pvalue=4.441877569169441e-219)
w6 pmi word context simplex spearman :  SpearmanrResult(correlation=0.1697210205069143, pvalue=6.782142903503612e-08)


As we can see correlation about word men, c_i and pmi show increasing trends. Setting large window size can cover more large words. But in window size 6, pmi score of men is lower than window 3. This trend also can be found at simplex-999. To get high correlation spearman score, we should find proper window size not too small and not too large. 

## 1.4.1 Warm-up: Printing nearest neighbors (3 points)

In [28]:
def context_matrix_v2(w):
    word_context_matrix = np.zeros((len(vocab_25k_words_list),len(vocab_25k_words_list)))
    for sentence in tqdm(wiki_01percent_list):
        word_list = ['<s>']+sentence.split(' ')+['</s>']
        for i,word in enumerate(word_list):
            word = word.strip()
            if word in vocab_25k_words_list:
                for j in range(max(i-w,0),min(i+w,len(word_list))):
                    if word_list[j] in vocab_25k_words_list:
                        word_context_matrix[vocab_25k_words_index_dict[word],vocab_25k_words_index_dict[word_list[j]]]+=1
    return word_context_matrix
        


In [29]:
wc_w1_v2 = context_matrix_v2(1)
wc_w6_v2 = context_matrix_v2(6)

100%|██████████| 997898/997898 [17:45<00:00, 936.45it/s] 
100%|██████████| 997898/997898 [1:15:53<00:00, 219.16it/s]


In [30]:
pmi_w1_v2 =pmi(wc_w1_v2)
pmi_w6_v2 =pmi(wc_w6_v2)

  pmi_matrix = np.nan_to_num(wc / expected)
  pmi_matrix = np.log2(pmi_matrix)


In [57]:
def print_10_nearest_neighbors(pmi,word):
    pmi_index_list = list(pmi[vocab_25k_words_index_dict[word]])
    pmi_index_list.sort(reverse=True)
    
    for i in range(1,11):
        index = list(pmi[vocab_25k_words_index_dict[word]]).index(pmi_index_list[i])
        print(vocab_25k_words_list[index])

In [58]:
print("w=1 ")
print_10_nearest_neighbors(pmi_w1_v2,'monster')

w=1 
ness
cookie
spaghetti
delicious
duel
commonplace
humanoid
closet
mister
mighty


In [59]:
print("w=6 ")
print_10_nearest_neighbors(pmi_w6_v2,'monster')

w=6 
frankenstein
ness
bonuses
cookie
clown
loch
slaughtered
rune
psycho
gaga


## 1.4.2 Part-of-speech tag similarity in nearest neighbors (7 points)

In [83]:
print("query word ness ")
print_10_nearest_neighbors(pmi_w6_v2,'ness')

query word ness 
loch
headland
yates
kris
eliot
caledonian
polk
33rd
monster
vanessa


In [82]:
print("query word frankenstein")
print_10_nearest_neighbors(pmi_w6_v2,'frankenstein')

query word frankenstein
metallica
gorgeous
mummy
dracula
helpless
octopus
shelley
monster
fay
weighs


In window size 6, we can see frankenstein and ness nearest list contain 'monster'. both of them are in monster's nearest neighbors.

In [84]:
print("query word ness ")
print_10_nearest_neighbors(pmi_w1_v2,'ness')

query word ness 
loch
eliot
fife
marilyn
jennifer
van
mike
frederick
fair
makes


In [85]:
print("query word cookie ")
print_10_nearest_neighbors(pmi_w1_v2,'cookie')

query word cookie 
chip
butter
http
authentication
sandwich
fortune
chocolate
well-known
helps
howard


In win dow size 1, as monster's nearest neighbors, cookie and ness show their nearest neighbors list. They didn't contain monster. This means window size can affect consisting nearest neighbors.

## 1.4.3 Words with multiple senses (5 points

In [87]:
words_list = ['bank', 'blue', 'apple', 'love', 'water', 'flame', 'light', 'well']
for word in words_list:
    print('__________________________________________\n')
    print('Key words is ',word)
    print("w1 : ")
    print_10_nearest_neighbors(pmi_w1_v2,word)
    print('__________________________________________\n')
    print("w6 : ")
    print_10_nearest_neighbors(pmi_w6_v2,word)

__________________________________________

Key words is  bank
w1 : 
savings
habib
sulphur
fargo
macquarie
deutsche
pci
co-operative
grassy
planters
__________________________________________

w6 : 
robber
anglo
imf
savings
habib
robbery
lending
sava
fargo
planters
__________________________________________

Key words is  blue
w1 : 
nypd
winnipeg
greenish
caspian
cobalt
zebra
translucent
columbus
pale
brighter
__________________________________________

w6 : 
jays
jackets
nypd
ribbon
heron
shirts
pigments
shirt
stripe
indigo
__________________________________________

Key words is  apple
w1 : 
cemented
poisoned
crab
domesticated
dessert
fiona
ipad
4000
jamaican
benefited
__________________________________________

w6 : 
macintosh
ipod
pineapple
pear
strawberry
orchards
pie
s1
ipad
iphone
__________________________________________

Key words is  love
w1 : 
unconditional
courtney
passionate
angelina
denoting
dude
obsessive
careless
latex
gotta
__________________________________________



We try following word list bank, blue, apple, love, water, flame, light, well. In window size 6, it can find the reasonable nearest neighbors at query words. One interest thing is in fruit key words apple, we can find ipad as nearest neighbor. In window size 6, iphone is also founded. This means frequently used words can be grouped regardless of their properties. On the other hand, among words, each nearest neighbor is grouped based on use of frequence.