In [29]:
import os
import pickle
import argparse
import matplotlib
import numpy as np
import pandas as pd

from operator import itemgetter
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# Base directory from which the pickled vocabulary / embedding files are read.
data_dir = "data"
# Language code; used as the sub-directory name inside data_dir.
lan = "it"
# Directory that holds the WS353 word-similarity file.
wsd_dir = "WS353"

In [91]:
# Word similarity dataset WS353 (file MWS353_Italian.txt).
# The columns used further down are Word1, Word2 and 'Average Score'.
sim_ds_path = os.path.join(wsd_dir, 'MWS353_Italian.txt')
sim_ds = pd.read_csv(sim_ds_path, sep=",")
sim_ds

Unnamed: 0,Word1,Word2,1,2,3,4,5,6,7,8,9,10,11,12,13,Average Score
0,amore,sesso,8.0,8.0,7.0,8.0,6.0,8,7.0,8.0,9.0,7,10.0,9.0,10.0,8.08
1,tigre,gatto,8.0,10.0,6.0,7.0,9.0,7,8.0,7.0,10.0,7,9.0,8.0,10.0,8.15
2,tigre,tigre,10.0,10.0,10.0,10.0,10.0,10,10.0,10.0,10.0,10,10.0,10.0,10.0,10.00
3,libro,carta,6.0,10.0,8.0,6.0,8.0,8,7.0,8.0,10.0,9,10.0,9.0,10.0,8.38
4,computer,tastiera,6.0,10.0,10.0,7.0,8.0,6,6.0,8.0,10.0,9,10.0,9.0,10.0,8.38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,doccia,alluvione,4.0,5.0,5.0,5.0,7.0,9,2.0,5.0,5.0,6,2.0,5.0,4.0,4.92
346,tempo,previsioni,9.0,8.0,7.5,8.5,8.0,6,6.0,9.0,9.0,7,8.0,9.0,8.0,7.92
347,disastro,area,7.0,7.0,3.0,4.5,2.0,9,4.0,3.0,7.0,5,4.0,6.0,7.0,5.27
348,governatore,ufficio,8.0,5.0,4.0,6.0,6.0,8,6.0,6.0,7.0,7,6.0,7.0,8.0,6.46


In [75]:
def load_pickle(filename):
    """Unpickle ``data_dir/lan/<filename>`` and return the stored object.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as loading finishes, rather than staying open for the kernel's lifetime.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by a trusted source.
    with open(os.path.join(data_dir, lan, filename), 'rb') as handle:
        return pickle.load(handle)


# dictionary = { word: index }
word2idx = load_pickle('word2idx.dat')
# ndarray ( vocabulary_size x embedding_size ). Row i = embedding WITHOUT SUBWORDS' INFO of word with index i.
idx2vec = load_pickle('idx2vec.dat')
# ndarray ( vocabulary_size x embedding_size ). Row i = embedding WITH SUBWORDS' INFO of word with index i.
idx2vec_ngrams = load_pickle('idx2vec_ngrams.dat')

In [76]:
def _with_zero_last_row(vectors):
    """Return `vectors` with an all-zero row at the end.

    Later cells encode a missing word as index -1, and indexing with -1 selects
    the last row, so that row must be a zero vector.

    * The row width is taken from `vectors` instead of a hard-coded 300, so
      embeddings of any size work.
    * If the last row is already all zeros, nothing is appended, so re-running
      the cell does not keep growing the matrix.
    """
    if len(vectors) and not vectors[-1].any():
        return vectors
    return np.concatenate((vectors, np.zeros((1, vectors.shape[1]))), axis=0)


idx2vec = _with_zero_last_row(idx2vec)
idx2vec_ngrams = _with_zero_last_row(idx2vec_ngrams)

In [33]:
# Number of words in common between our datasets

# Every distinct word that appears on either side of a similarity pair.
# `sim_ds` is the frame loaded from the WS353 file; the previous name `ws353`
# is not defined anywhere in this notebook and fails on a fresh kernel.
sim_words = set(sim_ds['Word1'].values).union(set(sim_ds['Word2'].values))
# Words for which an index (and therefore an embedding row) exists.
train_words = set(word2idx.keys())
common_words = [w for w in sim_words if w in train_words]
print(f"There are {len(common_words)} words in common")

There are 353 words in common


In [43]:
# 2-column array: one [Word1, Word2] row per entry of the similarity dataset.
pair_of_words = sim_ds.loc[:, ['Word1', 'Word2']].to_numpy()

In [51]:
# Translate every word pair into a pair of vocabulary indices.
# A word that is not in `common_words` is encoded as -1; when -1 is later used
# as a row index it selects the last row of the embedding matrix.
known_words = set(common_words)  # constant-time membership instead of scanning the list per word
pair_of_words_idx = [
    [word2idx[word] if word in known_words else -1 for word in (pair[0], pair[1])]
    for pair in pair_of_words
]

In [107]:
# Cosine similarity of each word pair, using the embeddings from `idx2vec`.
# NOTE(review): a -1 index reads the last row of the matrix, so pairs with a
# missing word are still scored rather than skipped. Confirm this is intended.
train_score = np.array([
    cosine_similarity(
        idx2vec[first].reshape(1, -1),
        idx2vec[second].reshape(1, -1),
    ).item()
    for first, second in pair_of_words_idx
]).reshape(-1, 1)  # column vector, one row per pair
train_score.shape

(350, 1)

In [105]:
# Cosine similarity of each word pair, using the embeddings from `idx2vec_ngrams`.
# NOTE(review): a -1 index reads the last row of the matrix, so pairs with a
# missing word are still scored rather than skipped. Confirm this is intended.
train_score_ngrams = np.array([
    cosine_similarity(
        idx2vec_ngrams[first].reshape(1, -1),
        idx2vec_ngrams[second].reshape(1, -1),
    ).item()
    for first, second in pair_of_words_idx
]).reshape(-1, 1)  # column vector, one row per pair
train_score_ngrams.shape

(350, 1)

In [106]:
# 'Average Score' column of the similarity dataset as a column vector,
# the same layout as the model scores it is compared against.
sim_score = sim_ds.loc[:, 'Average Score'].to_numpy().reshape(-1, 1)
sim_score.shape

(350, 1)

In [98]:
from scipy.stats import spearmanr

In [108]:
# Spearman rank correlation between the dataset's average scores and the
# cosine similarities computed from `idx2vec`.
spearmanr(a=sim_score, b=train_score)

SpearmanrResult(correlation=0.09829328284981041, pvalue=0.06624374165788502)

In [109]:
# Spearman rank correlation between the dataset's average scores and the
# cosine similarities computed from `idx2vec_ngrams`.
spearmanr(a=sim_score, b=train_score_ngrams)

SpearmanrResult(correlation=-0.009554328236838704, pvalue=0.8586367830112147)