# Evaluating embeddings via the semantic similarity task

In [1]:
# importing libraries 
import gzip

import numpy as np
import scipy
from numpy.linalg import norm
from scipy import stats

## Reading files 

In [22]:
def read_embeddings(filename):
    """Read a word-vector file and return the L2-normalized vectors in a dict.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace. Files whose name ends in ``.gz``
    are read through ``gzip``; everything else is read as plain UTF-8 text.

    Args:
        filename: path to the embeddings file (plain text or ``.gz``).

    Returns:
        dict mapping each word (str) to its vector (np.ndarray of float)
        divided by its Euclidean norm. An all-zero vector is stored unchanged,
        because dividing by a zero norm would turn it into NaNs that poison
        every later dot product. If a word occurs more than once, the last
        occurrence wins.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    # Choose the opener from the extension; mode 'rt' makes gzip decode to text
    # and is equivalent to 'r' for the built-in open.
    opener = gzip.open if filename.endswith('.gz') else open

    word_vecs = {}  # keys: str(words) ; values: np.array(normalized vectors)
    with opener(filename, 'rt', encoding='utf-8') as file:
        for line in file:
            elements = line.split()

            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no word; skip them instead of failing on elements[0].
            if not elements:
                continue

            word = elements[0]
            vec = np.array([float(val) for val in elements[1:]], dtype=float)

            # get Euclidean norm
            vec_norm = np.linalg.norm(vec)

            # normalize vector, leaving zero-norm vectors untouched (no NaNs)
            if vec_norm > 0:
                vec = vec / vec_norm
            word_vecs[word] = vec

    return word_vecs

# Load the pretrained embeddings as a dict of word -> normalized vector.
# The path does not end in '.gz', so read_embeddings reads it as plain text.
word_vecs_pretrain = read_embeddings("embeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean")


In [3]:
# Load a second embedding set in the same word -> normalized vector form.
# Presumably the PPDB-retrofitted version of the pretrained vectors (going by
# the file name) — TODO confirm.
word_vecs_retro = read_embeddings("embeddings/out_retrofitted_ppdb_250.txt")

In [4]:
# Analogy check on the pretrained vectors: compare king - man + woman with queen.
q_hat_king = word_vecs_pretrain["king"]
q_hat_man = word_vecs_pretrain["man"]
q_hat_woman = word_vecs_pretrain["woman"]


# Predicted vector built from three vectors that were normalized on load.
# NOTE(review): q_pred itself is not re-normalized, so the value printed below
# is a plain dot product rather than a cosine similarity — confirm this is intended.
q_pred = q_hat_king - q_hat_man + q_hat_woman

q_hat_queen = word_vecs_pretrain["queen"]

# Format spec ' .2' prints two significant digits with a leading space for positive values.
print(f"similiarity between pred embedding and `queen` embedding: {np.dot(q_pred, q_hat_queen): .2}")

similiarity between pred embedding and `queen` embedding:  0.85


In [5]:
# Same analogy check (king - man + woman vs. queen) on the vectors in word_vecs_retro.
q_king = word_vecs_retro["king"]
q_man = word_vecs_retro["man"]
q_woman = word_vecs_retro["woman"]


# This rebinds the global q_pred that the pretrained analogy cell also assigns.
# NOTE(review): q_pred is not re-normalized, so the value printed below is a
# plain dot product rather than a cosine similarity — confirm this is intended.
q_pred = q_king - q_man + q_woman

q_queen = word_vecs_retro["queen"]

print(f"similiarity between pred embedding and `queen` retrofitted embedding: {np.dot(q_pred, q_queen): .2}")

similiarity between pred embedding and `queen` retrofitted embedding:  0.85


In [6]:
# Compare the words `costly` and `pricey` in the pretrained embeddings.
q_hat_costly = word_vecs_pretrain["costly"]

q_hat_pricey = word_vecs_pretrain["pricey"]

# Both vectors were normalized on load, so their dot product is their cosine similarity.
print(f"similarity between `costly` and `pricey` before retrofitting: {np.dot(q_hat_costly, q_hat_pricey): .2}")

similarity between `costly` and `pricey` before retrofitting:  0.44


In [7]:
# Same `costly` / `pricey` comparison, now with the vectors in word_vecs_retro.
q_costly = word_vecs_retro["costly"]

q_pricey = word_vecs_retro["pricey"]

# Both vectors were normalized on load, so their dot product is their cosine similarity.
print(f"similarity between `costly` and `pricey` after retrofitting: {np.dot(q_costly, q_pricey): .2}")

similarity between `costly` and `pricey` after retrofitting:  0.78


In [23]:
def get_X_Y(filename, word_vecs):
    """Read a word-similarity file and return human scores and system scores.

    Each data line is expected to hold two words followed by a human
    similarity score as its last whitespace-separated field.

    Args:
        filename: path to the UTF-8 word-similarity file.
        word_vecs: dict mapping words to vectors. The system score of a pair
            is the dot product of its two vectors, which equals the cosine
            similarity when the vectors are unit length.

    Returns:
        Tuple ``(X, Y)`` of equal-length 1-D np.ndarrays in file order:
        ``X`` holds the human scores and ``Y`` the system scores. Pairs with a
        word missing from ``word_vecs`` and lines with fewer than three
        fields (including blank lines) are left out of both arrays.

    Raises:
        ValueError: if the last field of a kept line is not a number.
    """
    X = []  # human scores
    Y = []  # system scores

    with open(filename, "r", encoding='utf-8') as file:
        for line in file:
            fields = line.split()

            # A blank or truncated line cannot hold "word1 word2 score";
            # skip it instead of failing with IndexError on fields[0]/fields[1].
            if len(fields) < 3:
                continue

            first, second = fields[0], fields[1]

            # skip words we don't have embeddings for; this runs before the
            # score is parsed, so a textual header line is ignored as well
            if first not in word_vecs or second not in word_vecs:
                continue

            # human score is last element in line
            X.append(float(fields[-1]))

            # get dot product since vectors already normalized
            Y.append(np.dot(word_vecs[first], word_vecs[second]))

    return np.array(X), np.array(Y)
    
# Score the ws353.txt word pairs with both embedding sets. get_X_Y drops pairs
# that contain an out-of-vocabulary word, so each call returns its own array of
# human scores; keep them under separate names instead of overwriting one X.
X_pretrain, Y_pretrain = get_X_Y("datasets/ws353.txt", word_vecs_pretrain)
X_retro, Y_retro = get_X_Y("datasets/ws353.txt", word_vecs_retro)

# Later cells correlate the single name `X` with both Y_pretrain and Y_retro.
# That is only valid when both embedding sets kept exactly the same pairs, so
# fail loudly here rather than silently correlating misaligned arrays.
assert np.array_equal(X_pretrain, X_retro), (
    "pretrained and retrofitted embeddings cover different word pairs; "
    "correlate each Y with its own X"
)
X = X_retro  # shared human scores, kept under the name later cells use

In [9]:
# Sanity check: the system scores and the human scores must have the same
# length before they can be correlated.
print(Y_retro.shape)
print(X.shape)

(335,)
(335,)


## Using `scipy.stats.spearmanr` to compute and compare word similarities

In [10]:
# computing Spearman correlation coefficient for pretrained vectors and retrofitted ppdb vectors
# X holds the human scores; Y_pretrain / Y_retro hold the dot products returned by get_X_Y.
res_pretrain = scipy.stats.spearmanr(X, Y_pretrain)

res_retro = scipy.stats.spearmanr(X, Y_retro)

# Index 0 of the spearmanr result is the correlation coefficient (index 1 is the p-value).
# Format spec ' .2' prints two significant digits with a leading space for positive values.
print(f"Spearman's correlation (original vectors): {res_pretrain[0]: .2}")
print(f"Spearman's correlation (retrofitted vectors) (ppdb): {res_retro[0]: .2}")

Spearman's correlation (original vectors):  0.69
Spearman's correlation (retrofitted vectors) (ppdb):  0.64


In [11]:
# Repeat the ws353.txt evaluation with the `wn_syn` retrofitted vectors
# (presumably WordNet synonyms, going by the file name — TODO confirm).
word_vecs_retro_wn_syn = read_embeddings("embeddings/out_retrofitted_wn_syn_250.txt")
# NOTE(review): this rebinds the global X to the human scores of the pairs
# covered by this embedding set.
X, Y_retro_wn_syn = get_X_Y("datasets/ws353.txt", word_vecs_retro_wn_syn)
res_retro_wn_syn = scipy.stats.spearmanr(X, Y_retro_wn_syn)

# Index 0 of the spearmanr result is the correlation coefficient.
print(f"Spearman's correlation (retrofitted vectors) (wn_syn): {res_retro_wn_syn[0]: .2}")

Spearman's correlation (retrofitted vectors) (wn_syn):  0.64


In [16]:
# Repeat the ws353.txt evaluation with the `wn_all` retrofitted vectors
# (presumably all WordNet relations, going by the file name — TODO confirm).
word_vecs_retro_wn_all = read_embeddings("embeddings/out_retrofitted_wn_all_250.txt")
# NOTE(review): this rebinds the global X to the human scores of the pairs
# covered by this embedding set.
X, Y_retro_wn_all = get_X_Y("datasets/ws353.txt", word_vecs_retro_wn_all)
res_retro_wn_all = scipy.stats.spearmanr(X, Y_retro_wn_all)

# Index 0 of the spearmanr result is the correlation coefficient.
print(f"Spearman's correlation (retrofitted vectors) (wn_all): {res_retro_wn_all[0]: .2}")

Spearman's correlation (retrofitted vectors) (wn_all):  0.69


## Remarks

- Retrofitting with PPDB slightly decreases the Spearman correlation (0.69 → 0.64), although the original pretrained vectors already correlate relatively highly with the human scores.
- The use of PPDB version 2.0 seems to be the cause of this decrease.

## French data

In [13]:
# French evaluation: load the French vectors and score the rg65_french.txt word pairs.
word_vecs_fr = read_embeddings("embeddings/vecs100-linear-frwiki")
# NOTE(review): from here on the global X holds the human scores of
# rg65_french.txt; the ws353.txt scores are no longer available under that name.
X, Y = get_X_Y("datasets/rg65_french.txt", word_vecs_fr)
res_pretrain_fr = scipy.stats.spearmanr(X, Y)

# Index 0 of the spearmanr result is the correlation coefficient.
print(f"Spearman's correlation (pretrained french vectors): {res_pretrain_fr[0]: .2}")

Spearman's correlation (pretrained french vectors):  0.7


In [14]:
# French evaluation with the `fr_wn_syn` retrofitted vectors.
word_vecs_fr_wn_syn = read_embeddings("embeddings/out_retrofitted_fr_wn_syn_100.txt")
# NOTE(review): Y_retro_wn_syn and res_retro_wn_syn are the same names the
# ws353.txt `wn_syn` cell assigns, so the English results are overwritten once
# this cell runs.
X, Y_retro_wn_syn = get_X_Y("datasets/rg65_french.txt", word_vecs_fr_wn_syn)
res_retro_wn_syn = scipy.stats.spearmanr(X, Y_retro_wn_syn)

# Index 0 of the spearmanr result is the correlation coefficient.
print(f"Spearman's correlation (retrofitted vectors) (fr_wn_syn): {res_retro_wn_syn[0]: .2}")

Spearman's correlation (retrofitted vectors) (fr_wn_syn):  0.63


In [15]:
# French evaluation with the `fr_wn_all` retrofitted vectors.
word_vecs_fr_wn_all = read_embeddings("embeddings/out_retrofitted_fr_wn_all_100.txt")
# NOTE(review): Y_retro_wn_all and res_retro_wn_all are the same names the
# ws353.txt `wn_all` cell assigns, so the English results are overwritten once
# this cell runs.
X, Y_retro_wn_all = get_X_Y("datasets/rg65_french.txt", word_vecs_fr_wn_all)
res_retro_wn_all = scipy.stats.spearmanr(X, Y_retro_wn_all)

# Index 0 of the spearmanr result is the correlation coefficient.
print(f"Spearman's correlation (retrofitted vectors) (fr_wn_all): {res_retro_wn_all[0]: .2}")

Spearman's correlation (retrofitted vectors) (fr_wn_all):  0.67


In [24]:
# French evaluation with the `fr_ppdb` retrofitted vectors.
word_vecs_fr_ppdb = read_embeddings("embeddings/out_retrofitted_fr_ppdb_100.txt")
# X is rebound to the human scores of the rg65_french.txt pairs covered by this embedding set.
X, Y_retro_ppdb = get_X_Y("datasets/rg65_french.txt", word_vecs_fr_ppdb)
res_retro_ppdb = scipy.stats.spearmanr(X, Y_retro_ppdb)

# Index 0 of the spearmanr result is the correlation coefficient.
# NOTE(review): the printed label "(ppdb)" does not mention French, unlike the
# "(fr_wn_syn)" / "(fr_wn_all)" labels of the neighbouring cells.
print(f"Spearman's correlation (retrofitted vectors) (ppdb): {res_retro_ppdb[0]: .2}")

Spearman's correlation (retrofitted vectors) (ppdb):  0.73
