# Esercizio 1

la prima parte di questo esercizio consiste nell'implementare tre misure di similarità basate su WordNet.

Per ciascuna di tali misure di similarità, calcolare
- gli indici di correlazione di Spearman e
- gli indici di correlazione di Pearson fra i risultati ottenuti e quelli ‘target’ presenti nel file annotato.

In [14]:
import pandas as pd
import numpy as np
from nltk.corpus import wordnet as wn
from scipy.stats import pearsonr
from scipy.stats import spearmanr

corpus = pd.read_csv('datasets/WordSim353.csv', sep=',', engine='python')

corpus

Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.00
3,book,paper,7.46
4,computer,keyboard,7.62
...,...,...,...
348,shower,flood,6.03
349,weather,forecast,8.34
350,disaster,area,6.25
351,governor,office,6.34


# Utils

- Termini vs sensi: sim(w1, w2) = max[sim(c1, c2)]

In [15]:
from itertools import product
from nltk.corpus import wordnet_ic

max_depth = max(max(len(hyp_path) for hyp_path in ss.hypernym_paths()) for ss in wn.all_synsets())
def get_hyponyms(synset):
    hyponyms = set()
    for hyponym in synset.hyponyms():
        hyponyms |= set(get_hyponyms(hyponym))
    return hyponyms | set(synset.hyponyms())

In [16]:
syn1 = wn.synset('love.n.01')
syn2 = wn.synset('arouse.v.07')

for path in syn1.hypernym_paths():
    path = list(reversed(path))
    print(([synset.name for synset in path]))

for path in syn2.hypernym_paths():
    path = list(reversed(path))
    print(([synset.name for synset in path]))

[<bound method Synset.name of Synset('love.n.01')>, <bound method Synset.name of Synset('emotion.n.01')>, <bound method Synset.name of Synset('feeling.n.01')>, <bound method Synset.name of Synset('state.n.02')>, <bound method Synset.name of Synset('attribute.n.02')>, <bound method Synset.name of Synset('abstraction.n.06')>, <bound method Synset.name of Synset('entity.n.01')>]
[<bound method Synset.name of Synset('arouse.v.07')>, <bound method Synset.name of Synset('stimulate.v.03')>, <bound method Synset.name of Synset('arouse.v.01')>, <bound method Synset.name of Synset('make.v.03')>]


In [17]:
def get_depth_lcs(syn1,syn2):
    paths1 = (syn1.hypernym_paths())
    paths1 = [list(reversed(path)) for path in paths1]
    
    paths2 = syn2.hypernym_paths()
    paths2 = [list(reversed(path)) for path in paths2]

    best_index_lcs = max_depth
    best_syn_lcs = None
    best_depth_lcs = 0

    for (path1, path2) in list(product(paths1, paths2)):
        index_lcs = -1
        depth_lcs = 0

        i = 0
        while i < len(path1) and index_lcs == -1:
            if path1[i] in path2:
                index_lcs = path2.index(path1[i])
                depth_lcs = len(path2[index_lcs].hypernym_paths()[0])
            i = i + 1

        if index_lcs != -1 and depth_lcs > best_depth_lcs:
            best_index_lcs = index_lcs
            best_syn_lcs = path2[best_index_lcs]
            best_depth_lcs = depth_lcs

    #brown_ic = wordnet_ic.ic('ic-brown.dat')
    #print('atteso: ', syn1.lowest_common_hypernyms(syn2 ,brown_ic)[0])
    #print('trovato: ', best_syn_lcs)

    return len(sorted(best_syn_lcs.hypernym_paths(), key=len)[0]) if (best_depth_lcs != 0) else 0


get_depth_lcs(syn1, syn2)


0

In [None]:
correct = True
for (syn1,syn2) in list(product(wn.all_synsets(),wn.all_synsets())):
    correct = (get_depth_lcs(syn1, syn2) in syn1.lowest_common_hypernyms(syn2)) & correct
    if syn1.shortest_path_distance(syn2) == get_depth_lcs(syn1, syn2):
        print('senso 1: ', syn1, ' senso 2: ',syn2)
print(correct)

In [None]:
def get_shortest_distance(syn1,syn2):
    paths1 = (syn1.hypernym_paths())
    paths1 = [list(reversed(path)) for path in paths1]
    
    paths2 = syn2.hypernym_paths()
    paths2 = [list(reversed(path)) for path in paths2]

    best_syn_distance = max_depth * 2

    for (path1, path2) in list(product(paths1, paths2)):
        index_lcs = -1
        syn_distance = max_depth * 2

        i = 0
        while i < len(path1) and index_lcs == -1:
            if path1[i] in path2:
                index_lcs = path2.index(path1[i])
                syn_distance = index_lcs + i
            i = i + 1

        if (index_lcs != -1 and syn_distance < best_syn_distance):
            best_syn_distance = syn_distance

    
    return best_syn_distance

print('atteso: ', syn1.shortest_path_distance(syn2))
print('trovato: ', get_shortest_distance(syn1, syn2))

In [None]:
correct = True
for (syn1,syn2) in list(product(wn.all_synsets(),wn.all_synsets())):
    correct = syn1.shortest_path_distance(syn2) == get_shortest_distance(syn1, syn2) & correct
    if not syn1.shortest_path_distance(syn2) == get_shortest_distance(syn1, syn2):
        print('senso 1: ', syn1, ' senso 2: ',syn2)
print(correct)

# Wu-Palmer metric

cs(s1, s2) = 2 · depth(LCS) / depth(s1) + depth(s2)

In [None]:
def wup_similarity(syn1,syn2):
    depth_lcs = get_depth_lcs(syn1, syn2)
    depth_syn1 = len(sorted(syn1.hypernym_paths(), key=len)[0])
    depth_syn2 = len(sorted(syn2.hypernym_paths(), key=len)[0])

    return ((2 * depth_lcs) / (depth_syn1 + depth_syn2)) * 10

# Shortest path metric

simpath(s1, s2)=2 · depthMax - len(s1, s2)

In [None]:
def sp_similarity(syn1,syn2):
    shortest_distance = get_shortest_distance(syn1, syn2)

    return (2 * max_depth - shortest_distance) / (2 * max_depth) * 10

# Leakcock & Chodorow metric

simLC (s1, s2) =  log (len(s1, s2) / 2 · depthMax)

In [None]:
import math

def lch_similarity(syn1,syn2):
    distance = get_shortest_distance(syn1, syn2)
    return -math.log((distance) / (2.0 * max_depth))


In [None]:
token1 = 'love'
token2 = 'sex'

syns1 = wn.synsets(token1)
syns2 = wn.synsets(token2)

best_similarity = 0
best_syn1 = None
best_syn2 = None

for (syn1, syn2) in list(product(syns1, syns2)):
    similarity = sp_similarity(syn1, syn2)

    print('syn1: ', syn1)
    print('syn2: ', syn2)
    print('atteso: ', syn1.path_similarity(syn2) * 10)
    print('trovato: ', similarity)
    print('----------------')

    if similarity != None and similarity > best_similarity:
        best_similarity = similarity
        best_syn1 = syn1
        best_syn2 = syn2

print('token1: ', token1)
print('token2: ', token2)
print('atteso: ', row[1]['Human (mean)'])
print('atteso: 6.77')
print('trovato: ', best_similarity)
print('\n')

# TEST

In [None]:
def compute_similarity(metric):
    for row in corpus.iterrows():
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']

        syns1 = wn.synsets(token1)
        syns2 = wn.synsets(token2)

        best_similarity = 0
        best_syn1 = None
        best_syn2 = None

        for (syn1, syn2) in list(product(syns1, syns2)):
            if metric == 'wup_similarity':
                similarity = wup_similarity(syn1, syn2)
            elif metric == 'sp_similarity':
                similarity = sp_similarity(syn1, syn2)
            
            if similarity != None and similarity > best_similarity:
                best_similarity = similarity
                best_syn1 = syn1
                best_syn2 = syn2

        print('token1: ', token1)
        print('token2: ', token2)
        print('ipotizzato: ', row[1]['Human (mean)'])
        print('calcolato: ', best_similarity)
        print('\n')

In [None]:
compute_similarity('wup_similarity')

In [None]:
compute_similarity('sp_similarity')