## Word similarity task

In [1]:
# imports
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, spearmanr

In [17]:
def _load_word_vectors(filename):
    """Read a whitespace-separated embedding file into a ``{word: tokens}`` dict.

    Every non-blank line is split on whitespace; the first token is used as
    the word and the remaining tokens are kept (still as strings) as its
    vector. Blank lines are skipped. If a word occurs more than once, the
    last occurrence wins.
    """
    vectors = {}
    with open(filename, "r") as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # A blank line carries no word; indexing tokens[0] would fail.
                continue
            vectors[tokens[0]] = tokens[1:]
    return vectors


def _score_word_pairs(words_dict, pairs_path, score_column, skip_header=False):
    """Collect model similarities and gold ratings for one word-pair dataset.

    Parameters
    ----------
    words_dict : dict
        Mapping from word to its vector components (string tokens).
    pairs_path : str
        Whitespace-separated file whose first two columns are the two words.
    score_column : int
        Index of the column that holds the gold rating.
    skip_header : bool, optional
        When true, the first line of the file is discarded.

    Returns
    -------
    tuple of (list of float, list of float)
        ``(model_scores, gold_scores)``, aligned by position. Only pairs
        where both words are present in ``words_dict`` are included.
    """
    model_scores = []
    gold_scores = []
    with open(pairs_path, "r") as f:
        if skip_header:
            # The default keeps an empty file from raising StopIteration.
            next(f, None)
        for line in f:
            fields = line.split()
            if len(fields) <= score_column:
                # Blank or truncated line: there is no rating to read.
                continue
            word1, word2 = fields[0], fields[1]
            if word1 in words_dict and word2 in words_dict:
                gold_scores.append(float(fields[score_column]))
                # Convert the stored string tokens to floats explicitly
                # instead of relying on coercion inside cosine_similarity.
                vec1 = [float(x) for x in words_dict[word1]]
                vec2 = [float(x) for x in words_dict[word2]]
                sim = cosine_similarity([vec1], [vec2])
                # cosine_similarity returns a 1x1 matrix here; take the single
                # element rather than calling float() on the whole array.
                model_scores.append(float(sim[0][0]))
    return model_scores, gold_scores


def get_pearson_spearman(filename, simlex_path="SimLex-999.txt",
                         men_path="MEN_dataset_natural_form_full"):
    """Correlate embedding cosine similarities with SimLex and MEN ratings.

    Parameters
    ----------
    filename : str
        Embedding file with one ``word v1 v2 ...`` entry per line.
    simlex_path : str, optional
        SimLex file. Its first line is skipped as a header and the rating
        is read from column index 3.
    men_path : str, optional
        MEN file. It is read without skipping a header and the rating is
        taken from column index 2.

    Returns
    -------
    list
        ``[pearson_simlex, spearman_simlex, pearson_MEN, spearman_MEN]``,
        each being the object returned by ``scipy.stats.pearsonr`` or
        ``scipy.stats.spearmanr``.

    Notes
    -----
    Word pairs with a word missing from the embedding file are ignored, so
    the correlations cover only the pairs the model can score.
    """
    # get dictionary of word:vector
    words_dict = _load_word_vectors(filename)

    # similarities of every covered word pair in simlex, then correlations
    model_simlex, simlex = _score_word_pairs(
        words_dict, simlex_path, 3, skip_header=True
    )
    pearson_simlex = pearsonr(model_simlex, simlex)
    spearman_simlex = spearmanr(model_simlex, simlex)

    # similarities of every covered word pair in MEN, then correlations
    model_MEN, MEN = _score_word_pairs(words_dict, men_path, 2)
    pearson_MEN = pearsonr(model_MEN, MEN)
    spearman_MEN = spearmanr(model_MEN, MEN)

    return [pearson_simlex, spearman_simlex, pearson_MEN, spearman_MEN]
    

In [19]:
# Evaluate the "deps.words" embeddings; each print emits a heading followed by
# the Pearson result and then the Spearman result, one item per line.
p_lex, s_lex, p_men, s_men = get_pearson_spearman("deps.words")
print('simlex correlations', p_lex, s_lex, sep='\n')
print('MEN correlations', p_men, s_men, sep='\n')

simlex correlations
(0.46190134429723567, 6.838935662413988e-54)
SpearmanrResult(correlation=0.44564093493303847, pvalue=7.414295711467646e-50)
MEN correlations
(0.5974016044666725, 1.0183783114142845e-289)
SpearmanrResult(correlation=0.6178227900207052, pvalue=2.37026036e-315)


In [20]:
# Evaluate the "bow5.words" embeddings; each print emits a heading followed by
# the Pearson result and then the Spearman result, one item per line.
p_lex, s_lex, p_men, s_men = get_pearson_spearman("bow5.words")
print('simlex correlations', p_lex, s_lex, sep='\n')
print('MEN correlations', p_men, s_men, sep='\n')

simlex correlations
(0.3756005970668715, 8.607410572298394e-35)
SpearmanrResult(correlation=0.36739613669787896, pvalue=2.9775781067162087e-33)
MEN correlations
(0.708236248047157, 0.0)
SpearmanrResult(correlation=0.7231686561368845, pvalue=0.0)


In [21]:
# Evaluate the "bow2.words" embeddings; each print emits a heading followed by
# the Pearson result and then the Spearman result, one item per line.
p_lex, s_lex, p_men, s_men = get_pearson_spearman("bow2.words")
print('simlex correlations', p_lex, s_lex, sep='\n')
print('MEN correlations', p_men, s_men, sep='\n')

simlex correlations
(0.4284586618164498, 7.992996652236848e-46)
SpearmanrResult(correlation=0.41414576777339385, pvalue=1.226812869252797e-42)
MEN correlations
(0.6776982244699229, 0.0)
SpearmanrResult(correlation=0.699904755830819, pvalue=0.0)
