# Evaluating embeddings via the semantic similarity task

In [57]:
# importing libraries 
import numpy as np
import scipy
from scipy import stats
from numpy.linalg import norm

## Reading files 

In [58]:
def read_embeddings(filename):
    """read a file containing word vectors and return their normalized forms in a dict"""
    word_vecs = {}  # keys: str(words) ; values: np.array(normalized vectors)
    with (gzip.open(filename, 'rt', encoding='utf-8') if filename.endswith('.gz') else open(filename, 'r', encoding='utf-8')) as file:  # 'rt' = open for reading as text file
        for line in file:
            elements = line.strip().lower().split()
            word = elements[0]
            vec = np.array([float(val) for val in elements[1:]], dtype=float)
            
            # get Euclidean norm
            vec_norm = np.linalg.norm(vec)
            
            # normalize vector
            word_vecs[word] = vec / vec_norm

    return word_vecs

word_vecs_pretrain = read_embeddings("embeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean")


In [59]:
word_vecs_retro = read_embeddings("embeddings/out_naz_ppdb_250.txt")

In [60]:
q_hat_king = word_vecs_pretrain["king"]
q_hat_man = word_vecs_pretrain["man"]
q_hat_woman = word_vecs_pretrain["woman"]


q_pred = q_hat_king - q_hat_man + q_hat_woman

q_hat_queen = word_vecs_pretrain["queen"]

print(f"similiarity between pred embedding and `queen` embedding: {np.dot(q_pred, q_hat_queen): .2}")

similiarity between pred embedding and `queen` embedding:  0.85


In [61]:
q_king = word_vecs_retro["king"]
q_man = word_vecs_retro["man"]
q_woman = word_vecs_retro["woman"]


q_pred = q_king - q_man + q_woman

q_queen = word_vecs_retro["queen"]

print(f"similiarity between pred embedding and `queen` retrofitted embedding: {np.dot(q_pred, q_queen): .2}")

similiarity between pred embedding and `queen` retrofitted embedding:  0.85


In [62]:
q_hat_costly = word_vecs_pretrain["costly"]

q_hat_pricey = word_vecs_pretrain["pricey"]

print(f"similarity between `costly` and `pricey` before retrofitting: {np.dot(q_hat_costly, q_hat_pricey): .2}")

similarity between `costly` and `pricey` before retrofitting:  0.44


In [63]:
q_costly = word_vecs_retro["costly"]

q_pricey = word_vecs_retro["pricey"]

print(f"similarity between `costly` and `pricey` after retrofitting: {np.dot(q_costly, q_pricey): .2}")

similarity between `costly` and `pricey` after retrofitting:  0.78


In [64]:
def get_X_Y(filename, word_vecs):
    """read a word similarity file and return arrays containing human scores and system scores respectively"""
    with open(filename, "r", encoding='utf-8') as file:
        
        X = [] # human scores
        Y = [] # system scores
        for line in file:
            
            line = line.strip().split()
     
            # skip words we don't have embeddings for
            if line[0] not in word_vecs or line[1] not in word_vecs:
                continue
                
            # human score is last element in line
            X.append(float(line[-1]))
                        
            # get dot product since vectors already normalized
            Y.append(np.dot(word_vecs[line[0]], word_vecs[line[1]]))
            
    return np.array(X), np.array(Y)
    
X, Y_pretrain = get_X_Y("datasets/ws353.txt", word_vecs_pretrain)
X, Y_retro = get_X_Y("datasets/ws353.txt", word_vecs_retro)

In [65]:
print(Y_retro.shape)
print(X.shape)

(335,)
(335,)


## Using `scipy.stats.spearmanr` to compute and compare word similarities

In [66]:
# computing Spearman correlation coefficient for pretrained vectors and retrofitted ppdb vectors
res_pretrain = scipy.stats.spearmanr(X, Y_pretrain)

res_retro = scipy.stats.spearmanr(X, Y_retro)

print(f"Spearman's correlation (original vectors): {res_pretrain[0]: .2}")
print(f"Spearman's correlation (retrofitted vectors): {res_retro[0]: .2}")

Spearman's correlation (original vectors):  0.69
Spearman's correlation (retrofitted vectors):  0.64


## Remarks

- Slight decrase in Spearman correlation when using ppdb but original pretrained vectors already correlate relatively highly
- Lower as with vectors retrofitted using Faruqui et al.'s original program

## French data

In [67]:
# TODO