In [172]:
import os
import pickle
import argparse
import matplotlib
import numpy as np
import pandas as pd

from operator import itemgetter
from sklearn.metrics.pairwise import cosine_similarity

In [173]:
# Notebook configuration.
lan = "it"          # language code; used as a sub-directory of data_dir and as a key for the WS353 file name
data_dir = "data"   # root directory of the pickled vocabulary / embedding files
wsd_dir = "WS353"   # directory holding the word-similarity dataset files

In [174]:
# Load the multilingual WS353 word-similarity dataset for the configured language.
# The language code selects the language name used in the file name.
lan_to_file = {
    'de': 'German',
    'en': 'English',
    'it': 'Italian',
}
ws353_path = os.path.join('6', wsd_dir, f'MWS353_{lan_to_file[lan]}.txt')
print(ws353_path)  # show which file is being read
sim_ds = pd.read_csv(ws353_path, sep=",")
sim_ds

6/WS353/MWS353_Italian.txt


Unnamed: 0,Word1,Word2,1,2,3,4,5,6,7,8,9,10,11,12,13,Average Score
0,amore,sesso,8.0,8.0,7.0,8.0,6.0,8,7.0,8.0,9.0,7,10.0,9.0,10.0,8.08
1,tigre,gatto,8.0,10.0,6.0,7.0,9.0,7,8.0,7.0,10.0,7,9.0,8.0,10.0,8.15
2,tigre,tigre,10.0,10.0,10.0,10.0,10.0,10,10.0,10.0,10.0,10,10.0,10.0,10.0,10.0
3,libro,carta,6.0,10.0,8.0,6.0,8.0,8,7.0,8.0,10.0,9,10.0,9.0,10.0,8.38
4,computer,tastiera,6.0,10.0,10.0,7.0,8.0,6,6.0,8.0,10.0,9,10.0,9.0,10.0,8.38
5,computer,internet,7.0,10.0,10.0,7.5,8.0,8,5.0,8.5,9.0,8,10.0,9.0,10.0,8.46
6,aereo,macchina,6.0,5.0,0.0,5.0,7.0,7,3.0,5.0,9.0,7,8.0,4.0,8.0,5.69
7,treno,macchina,6.0,5.0,0.0,5.0,7.0,7,4.0,5.0,9.0,7,8.0,4.0,8.0,5.77
8,telefono,comunicazione,9.0,10.0,10.0,6.0,8.0,9,6.0,8.5,10.0,8,10.0,9.0,10.0,8.73
9,televisione,radio,7.0,9.0,10.0,5.0,8.0,7,7.0,8.0,9.0,7,9.0,7.0,9.0,7.85


In [175]:
def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file handle afterwards.

    NOTE(review): pickle.load can execute arbitrary code; only use this on
    files produced by a trusted training run.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# dictionary = { word: index }
word2idx = _load_pickle(os.path.join(data_dir, lan, 'word2idx.dat'))
# ndarray ( vocabulary_size x embedding_size ). Row i = embedding WITHOUT SUBWORDS' INFO of word with index i.
idx2vec = _load_pickle(os.path.join(data_dir, lan, 'idx2vec.dat'))
# ndarray ( vocabulary_size x embedding_size ). Row i = embedding WITH SUBWORDS' INFO of word with index i.
idx2vec_ngrams = _load_pickle(os.path.join(data_dir, lan, 'idx2vec_ngrams.dat'))

In [176]:
# Append one all-zero row to each embedding matrix. Presumably this lets the
# index -1 (used as the "word not in vocabulary" marker in pair_of_words_idx)
# resolve to a zero vector -- TODO confirm.
# The row width is taken from the matrix itself instead of a hard-coded 300, so
# the cell works for any embedding size.
# NOTE: this cell is not idempotent -- every re-run appends another zero row.
idx2vec = np.concatenate((idx2vec, np.zeros((1, np.shape(idx2vec)[1]))), axis=0)
idx2vec_ngrams = np.concatenate((idx2vec_ngrams, np.zeros((1, np.shape(idx2vec_ngrams)[1]))), axis=0)

In [177]:
# Number of words shared by the similarity dataset and the training vocabulary.
ws353 = sim_ds
sim_words = set(ws353['Word1'].values) | set(ws353['Word2'].values)
train_words = set(word2idx)
# Keep the dataset words that also have a trained embedding.
common_words = [word for word in sim_words if word in train_words]
print(f"There are {len(common_words)} words in common")
common_words

There are 353 words in common


['parola',
 'rivista',
 'compagnia',
 'paura',
 'ragione',
 'imparzialità',
 'troupe',
 'pianificazione',
 'partito',
 'lusso',
 'dottore',
 'FBI',
 'pane',
 'treno',
 'rete',
 'piano',
 'ritardo',
 'comunicazione',
 'pelle',
 'bere',
 'uovo',
 'arrivo',
 'telespettatore',
 'regina',
 'guadagno',
 'macchina',
 'Gerusalemme',
 'fisica',
 'cibo',
 'occhio',
 'prossimità',
 'pregiudizio',
 'foresta',
 'assicurazione',
 'vestiti',
 'commercio',
 'metallo',
 'star',
 'costa',
 'gioco',
 'commissione',
 'profitto',
 'Arafat',
 'Marte',
 'genere',
 'bambini',
 'organismo',
 'cento',
 'presidente',
 'partita',
 'aumento',
 'scatto',
 'studente',
 'prenotazione',
 'industria',
 'blocco',
 'rendimento',
 'droga',
 'antecedente',
 'sole',
 'fratello',
 'viale',
 'vittoria',
 'ufficio',
 'popolazione',
 'situazione',
 'calcolo',
 'istituzione',
 'famiglia',
 'ministro',
 'ritiro',
 'turno',
 'tazza',
 'crisi',
 'amore',
 'acqua',
 'doccia',
 'aeroporto',
 'software',
 'campionato',
 'sistemazione'

In [178]:
# One row per dataset entry: [Word1, Word2].
word_columns = ['Word1', 'Word2']
pair_of_words = sim_ds[word_columns].values
pair_of_words

array([['amore', 'sesso'],
       ['tigre', 'gatto'],
       ['tigre', 'tigre'],
       ['libro', 'carta'],
       ['computer', 'tastiera'],
       ['computer', 'internet'],
       ['aereo', 'macchina'],
       ['treno', 'macchina'],
       ['telefono', 'comunicazione'],
       ['televisione', 'radio'],
       ['media', 'radio'],
       ['droga', 'abuso'],
       ['pane', 'burro'],
       ['cetriolo', 'patata'],
       ['dottore', 'infermiera'],
       ['professore', 'dottore'],
       ['studente', 'professore'],
       ['intelligente', 'studente'],
       ['intelligente', 'stupido'],
       ['compagnia', 'azioni'],
       ['provvista', 'mercato'],
       ['provvista', 'telefono'],
       ['provvista', 'CD'],
       ['provvista', 'giaguaro'],
       ['provvista', 'uovo'],
       ['fertilit', 'uovo'],
       ['provvista', 'vivo'],
       ['provvista', 'vita'],
       ['libro', 'biblioteca'],
       ['banca', 'denaro'],
       ['legno', 'foresta'],
       ['denaro', 'contanti'],
       [

In [179]:
# Translate every dataset pair into vocabulary indices.
#   pair_of_words_idx : one [idx1, idx2] entry per dataset row; -1 marks a word
#                       that is not among common_words.
#   common_pairs      : the pairs whose two words are both in common_words.
#   common_is         : positions of those pairs in pair_of_words.
#   common_pairs_idx  : their [idx1, idx2] vocabulary indices.
# A set gives constant-time membership tests instead of scanning the
# common_words list twice for every pair.
known_words = set(common_words)

pair_of_words_idx = []
common_pairs = []
common_is = []
common_pairs_idx = []
for i, pair in enumerate(pair_of_words):
    first_known = pair[0] in known_words
    second_known = pair[1] in known_words
    first_idx = word2idx[pair[0]] if first_known else -1
    second_idx = word2idx[pair[1]] if second_known else -1
    pair_of_words_idx.append([first_idx, second_idx])
    if first_known and second_known:
        common_pairs.append(pair)
        common_is.append(i)
        common_pairs_idx.append([first_idx, second_idx])

In [180]:
# Cosine similarity of the base (no sub-word information) embeddings for every
# pair whose two words are in the vocabulary. Iterating over pair_of_words_idx
# instead would cover all dataset rows, including the -1 placeholders.
train_score = np.array([
    cosine_similarity(idx2vec[[left]], idx2vec[[right]]).item()
    for left, right in common_pairs_idx
]).reshape(-1, 1)
train_score.shape

(251, 1)

In [181]:
# Cosine similarity of the sub-word (n-gram) embeddings for every pair whose two
# words are in the vocabulary. Iterating over pair_of_words_idx instead would
# cover all dataset rows, including the -1 placeholders.
train_score_ngrams = np.array([
    cosine_similarity(idx2vec_ngrams[[left]], idx2vec_ngrams[[right]]).item()
    for left, right in common_pairs_idx
]).reshape(-1, 1)
train_score_ngrams.shape

(251, 1)

In [182]:
# Dataset rows whose two words both have an embedding (positions collected in common_is).
common_sim = sim_ds.iloc[common_is]
# Human average ratings as an (n, 1) column, matching the shape of the model scores.
gold_ratings = common_sim['Average Score'].values
sim_score = gold_ratings[:, None]
sim_score.shape

(251, 1)

In [183]:
from scipy.stats import spearmanr

In [184]:
# Spearman rank correlation between the human ratings and the base-embedding cosine scores.
spearman_base = spearmanr(sim_score, train_score)
spearman_base

SpearmanrResult(correlation=0.19189302698057475, pvalue=0.002262576215787963)

In [185]:
# Spearman rank correlation between the human ratings and the n-gram-embedding cosine scores.
spearman_ngrams = spearmanr(sim_score, train_score_ngrams)
spearman_ngrams

SpearmanrResult(correlation=0.012359655678332786, pvalue=0.8455152285059001)

In [186]:
# let's have a look - which words are actually correlated according to the rank?
# and which are not?
# note: if there was perfect correspondence, model score * 10 would be the avg score
#
# The new columns are built with DataFrame.assign, which returns a fresh frame.
# Writing them with .loc into common_sim (an iloc selection of sim_ds) raises
# pandas' SettingWithCopyWarning; assign avoids that and keeps the cell re-runnable.
# The score arrays have shape (n, 1), so they are flattened before becoming columns.
common_sim = common_sim.assign(
    base=np.round(np.ravel(train_score) * 10, 2),
    ngram=np.round(np.ravel(train_score_ngrams) * 10, 2),
)
# Absolute distance of each model score from the human average score.
common_sim = common_sim.assign(
    bdiff=np.abs(common_sim['Average Score'] - common_sim['base']),
    ngdiff=np.abs(common_sim['Average Score'] - common_sim['ngram']),
)
ngram_better = (common_sim['bdiff'] > common_sim['ngdiff']).sum()
# Round after scaling to percent so the printed value has no float artefacts
# such as 88.40000000000001.
print('Similarity scores of ngram based approach was better in ', ngram_better, 'of', len(common_sim),
      'cases. (', round(100 * ngram_better / len(common_sim), 1), '%)')
pd.set_option('display.max_rows', None)
common_sim[['Word1', 'Word2', 'Average Score', 'base', 'ngram', 'bdiff', 'ngdiff']]

Similarity scores of ngram based approach was better in  222 of 251 cases. ( 88.4 %)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,Word1,Word2,Average Score,base,ngram,bdiff,ngdiff
0,amore,sesso,8.08,0.91,2.99,7.17,5.09
1,tigre,gatto,8.15,5.0,8.49,3.15,0.34
2,tigre,tigre,10.0,10.0,10.0,0.0,0.0
3,libro,carta,8.38,0.8,4.7,7.58,3.68
4,computer,tastiera,8.38,0.24,3.44,8.14,4.94
5,computer,internet,8.46,1.57,3.93,6.89,4.53
6,aereo,macchina,5.69,0.75,4.01,4.94,1.68
7,treno,macchina,5.77,0.65,3.06,5.12,2.71
8,telefono,comunicazione,8.73,0.82,4.31,7.91,4.42
9,televisione,radio,7.85,2.18,1.51,5.67,6.34


In [187]:
# how much better? And in which cases did it fail?
much_worse = common_sim[common_sim['bdiff'] < common_sim['ngdiff'] - 3]
print('largest/mean avg score where ngram model was outperformed by base model:', 
      much_worse['Average Score'].max(), '/', round(much_worse['Average Score'].mean(),2))
much_worse

largest/mean avg score where ngram model was outperformed by base model: 3.38 / 1.7


Unnamed: 0,Word1,Word2,1,2,3,4,5,6,7,8,9,10,11,12,13,Average Score,base,ngram,bdiff,ngdiff
81,cimitero,foresta,0.0,0.0,0.0,2.0,1.0,2,0.0,3.0,4.0,0,2.0,4.0,2.0,1.54,1.65,8.29,0.11,6.75
91,mezzogiorno,corda,0.0,0.0,0.0,1.5,0.0,0,0.0,1.0,0.0,0,0.0,3.0,0.0,0.42,0.32,8.45,0.1,8.03
177,volontario,motto,5.0,2.0,1.0,4.0,8.0,0,5.0,3.0,2.0,0,4.0,7.0,3.0,3.38,3.61,7.43,0.23,4.05
323,zucchero,approccio,0.0,0.0,1.0,0.0,0.0,4,2.0,1.0,3.0,4,0.0,1.0,3.0,1.46,1.28,5.9,0.18,4.44
