# Evaluating embeddings via the semantic similarity task

In [1]:
# importing libraries 
import gzip

import numpy as np
import scipy
from numpy.linalg import norm
from scipy import stats

## Reading files 

In [2]:
def read_embeddings(filename):
    """Read a word-vector file and return unit-length vectors keyed by word.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated vector components. The whole line is lowercased
    before parsing, so every key in the returned dict is lowercase. If the
    same (lowercased) word occurs more than once, the last occurrence wins.

    Parameters
    ----------
    filename : str
        Path to the embeddings file. A name ending in ``.gz`` is read through
        ``gzip``; anything else is opened as plain UTF-8 text.

    Returns
    -------
    dict
        Maps each word (``str``) to a 1-D float ``np.ndarray`` scaled to unit
        Euclidean norm. A vector whose norm is zero is stored unchanged
        rather than being turned into NaNs by a division by zero.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    # NOTE(review): a word2vec-style header line ("<vocab_size> <dim>") would be
    # stored as an ordinary entry - confirm the input files carry no header.
    word_vecs = {}  # keys: str(words) ; values: np.array(normalized vectors)

    # 'rt' = open for reading as text; valid for both gzip.open and open
    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'rt', encoding='utf-8') as file:
        for line in file:
            elements = line.strip().lower().split()

            # blank lines (e.g. a trailing newline at end of file) carry no entry
            if not elements:
                continue

            word = elements[0]
            vec = np.array([float(val) for val in elements[1:]], dtype=float)

            # get Euclidean norm
            vec_norm = np.linalg.norm(vec)

            # normalize vector; leave zero-norm vectors as they are to avoid NaNs
            word_vecs[word] = vec / vec_norm if vec_norm > 0 else vec

    return word_vecs

# Load the first embedding set (reported below as the "original vectors");
# read_embeddings lowercases the words and scales each vector to unit length.
word_vecs_pretrain = read_embeddings("embeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean")


In [8]:
# Load the second embedding set (reported below as the "retrofitted vectors"),
# parsed and unit-normalized the same way as the first.
word_vecs_retro = read_embeddings("embeddings/out_naz_250.txt")

In [9]:
def get_X_Y(filename, word_vecs):
    """Read a word-similarity file and return human and system scores.

    Each usable line holds two words followed by a human similarity score as
    its last whitespace-separated field. Pairs with a word missing from
    ``word_vecs`` are skipped, as are blank lines and lines with fewer than
    three fields.

    A word is looked up verbatim first and, failing that, in lowercase.
    ``read_embeddings`` lowercases every key, so without the fallback any
    capitalised word in the dataset would be dropped as out-of-vocabulary.

    Parameters
    ----------
    filename : str
        Path to the UTF-8 word-similarity file.
    word_vecs : dict
        Maps words to vectors. The system score is the dot product of the two
        vectors, which equals cosine similarity when they are unit-length.

    Returns
    -------
    tuple of np.ndarray
        ``(X, Y)``: human scores and system scores, aligned index by index.

    Raises
    ------
    ValueError
        If the last field of a retained line cannot be parsed as a float.
    """
    X = []  # human scores
    Y = []  # system scores

    with open(filename, "r", encoding='utf-8') as file:
        for line in file:
            fields = line.strip().split()

            # a usable line needs two words and a score
            if len(fields) < 3:
                continue

            # exact match first (keeps any cased vocabulary working), then lowercase
            first, second = (
                token if token in word_vecs else token.lower()
                for token in fields[:2]
            )

            # skip words we don't have embeddings for
            if first not in word_vecs or second not in word_vecs:
                continue

            # human score is last element in line
            X.append(float(fields[-1]))

            # get dot product since vectors already normalized
            Y.append(np.dot(word_vecs[first], word_vecs[second]))

    return np.array(X), np.array(Y)
    
# Score each embedding set separately and keep both human-score arrays, so the
# first one is no longer silently overwritten by the second call.
X_pretrain, Y_pretrain = get_X_Y("datasets/ws353.txt", word_vecs_pretrain)
X_retro, Y_retro = get_X_Y("datasets/ws353.txt", word_vecs_retro)

# Later cells correlate a single `X` against both `Y_pretrain` and `Y_retro`.
# That pairing is only valid when both embedding sets cover the same word pairs.
assert np.array_equal(X_pretrain, X_retro), (
    "the two embedding sets cover different word pairs; "
    "human scores are not aligned with both system score arrays"
)
X = X_retro

In [10]:
# sanity check: system and human score arrays should have the same length
print(Y_retro.shape, X.shape, sep="\n")

(335,)
(335,)


## Using `scipy.stats.spearmanr` to compare system similarity scores with human judgments

In [11]:
# Spearman rank correlation between the human scores and the system scores
# of each embedding set (pretrained first, then retrofitted)
res_pretrain = stats.spearmanr(X, Y_pretrain)
res_retro = stats.spearmanr(X, Y_retro)

# element 0 of each result is the correlation coefficient
print(f"Spearman's correlation (original vectors): {res_pretrain[0]: .2}")
print(f"Spearman's correlation (retrofitted vectors): {res_retro[0]: .2}")

Spearman's correlation (original vectors):  0.69
Spearman's correlation (retrofitted vectors):  0.71


## Remarks

- Retrofitting gives a slight increase in Spearman correlation (0.69 → 0.71), but the original pretrained vectors already correlate relatively highly with the human scores
- Same result as with vectors retrofitted using Faruqui et al.'s original program

## French data

In [12]:
# TODO