# Esercizio 1

la prima parte di questo esercizio consiste nell'implementare tre misure di similarità basate su WordNet.

Per ciascuna di tali misure di similarità, calcolare
- gli indici di correlazione di Spearman e
- gli indici di correlazione di Pearson fra i risultati ottenuti e quelli ‘target’ presenti nel file annotato.

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import wordnet as wn
from scipy.stats import pearsonr
from scipy.stats import spearmanr

corpus = pd.read_csv('datasets/WordSim353.csv', sep=',', engine='python')

corpus

Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.00
3,book,paper,7.46
4,computer,keyboard,7.62
...,...,...,...
348,shower,flood,6.03
349,weather,forecast,8.34
350,disaster,area,6.25
351,governor,office,6.34


# Utils

- Termini vs sensi: sim(w1, w2) = max[sim(c1, c2)]

In [2]:
from itertools import product
from nltk.corpus import wordnet_ic

max_depth = max(max(len(hyp_path) for hyp_path in ss.hypernym_paths()) for ss in wn.all_synsets())
def get_hyponyms(synset):
    hyponyms = set()
    for hyponym in synset.hyponyms():
        hyponyms |= set(get_hyponyms(hyponym))
    return hyponyms | set(synset.hyponyms())

In [32]:
syn1 = wn.synset('cat.n.01')
syn2 = wn.synset('mouse.n.01')

for path in syn1.hypernym_paths():
    path = list(reversed(path))
    print(([synset.name for synset in path]))

for path in syn2.hypernym_paths():
    path = list(reversed(path))
    print(([synset.name for synset in path]))

[<bound method Synset.name of Synset('cat.n.01')>, <bound method Synset.name of Synset('feline.n.01')>, <bound method Synset.name of Synset('carnivore.n.01')>, <bound method Synset.name of Synset('placental.n.01')>, <bound method Synset.name of Synset('mammal.n.01')>, <bound method Synset.name of Synset('vertebrate.n.01')>, <bound method Synset.name of Synset('chordate.n.01')>, <bound method Synset.name of Synset('animal.n.01')>, <bound method Synset.name of Synset('organism.n.01')>, <bound method Synset.name of Synset('living_thing.n.01')>, <bound method Synset.name of Synset('whole.n.02')>, <bound method Synset.name of Synset('object.n.01')>, <bound method Synset.name of Synset('physical_entity.n.01')>, <bound method Synset.name of Synset('entity.n.01')>]
[<bound method Synset.name of Synset('placental.n.01')>, <bound method Synset.name of Synset('mammal.n.01')>, <bound method Synset.name of Synset('vertebrate.n.01')>, <bound method Synset.name of Synset('chordate.n.01')>, <bound met

In [30]:
def depth_lcs(syn1,syn2):
    paths1 = (syn1.hypernym_paths())
    paths1 = [list(reversed(path)) for path in paths1]
    
    paths2 = syn2.hypernym_paths()
    paths2 = [list(reversed(path)) for path in paths2]

    best_index_lcs = max_depth
    best_syn_lcs = None
    best_depth_lcs = 0

    for (path1, path2) in list(product(paths1, paths2)):
        index_lcs = -1
        depth_lcs = 0

        i = 0
        while i < len(path1) and index_lcs == -1:
            if path1[i] in path2:
                index_lcs = path2.index(path1[i])
                depth_lcs = len(path2[index_lcs].hypernym_paths()[0])
            i = i + 1

        if (index_lcs != -1 and depth_lcs > best_depth_lcs):
            best_index_lcs = index_lcs
            best_syn_lcs = path2[best_index_lcs]
            best_depth_lcs = depth_lcs

    #brown_ic = wordnet_ic.ic('ic-brown.dat')
    #print('atteso: ', syn1.lowest_common_hypernyms(syn2 ,brown_ic)[0])
    #print('trovato: ', best_syn_lcs

    paths = best_syn_lcs.hypernym_paths()
    #get the shortest path
    paths = [list(reversed(path)) for path in paths]
    paths = sorted(paths, key=len)
    print(len(paths[0]))
    return paths[0][0]    

brown_ic = wordnet_ic.ic('ic-brown.dat')
print('atteso: ', syn1.lowest_common_hypernyms(syn2 ,brown_ic)[0])
print('trovato: ', depth_lcs(syn1, syn2))


atteso:  Synset('placental.n.01')
11
trovato:  Synset('placental.n.01')


In [12]:
def shortest_distance(syn1,syn2):
    paths1 = (syn1.hypernym_paths())
    paths1 = [list(reversed(path)) for path in paths1]
    
    paths2 = syn2.hypernym_paths()
    paths2 = [list(reversed(path)) for path in paths2]

    best_syn_distance = max_depth * 2

    for (path1, path2) in list(product(paths1, paths2)):
        index_lcs = -1
        syn_distance = max_depth * 2

        i = 0
        while i < len(path1) and index_lcs == -1:
            if path1[i] in path2:
                index_lcs = path2.index(path1[i])
                syn_distance = index_lcs + i
            i = i + 1

        if (index_lcs != -1 and syn_distance < best_syn_distance):
            best_syn_distance = syn_distance

    
    return best_syn_distance

print('atteso: ', syn1.shortest_path_distance(syn2))
print('trovato: ', shortest_distance(syn1, syn2))

atteso:  5
trovato:  5


# Wu-Palmer metric

cs(s1, s2) = 2 · depth(LCS) / depth(s1) + depth(s2)

In [26]:

for row in corpus.iterrows():
    token1 = row[1]['Word 1']
    token2 = row[1]['Word 2']
    #token1 = 'love'
    #token2 = 'sex'

    syns1 = wn.synsets(token1)
    syns2 = wn.synsets(token2)

    best_similarity = 0
    best_syn1 = None
    best_syn2 = None

    for (syn1, syn2) in list(product(syns1, syns2)):
        similarity = syn1.path_similarity(syn2)
        if similarity != None and similarity > best_similarity:
            best_similarity = similarity
            best_syn1 = syn1
            best_syn2 = syn2

    print('token1: ', token1)
    print('token2: ', token2)
    print('atteso: ', row[1]['Human (mean)'])
    #print('atteso: 6.77')
    print('trovato: ', best_similarity * 10)
    print('\n')



token1:  love
token2:  sex
atteso:  6.77
trovato:  5.0


token1:  tiger
token2:  cat
atteso:  7.35
trovato:  5.0


token1:  tiger
token2:  tiger
atteso:  10.0
trovato:  10.0


token1:  book
token2:  paper
atteso:  7.46
trovato:  3.333333333333333


token1:  computer
token2:  keyboard
atteso:  7.62
trovato:  2.5


token1:  computer
token2:  internet
atteso:  7.58
trovato:  1.25


token1:  plane
token2:  car
atteso:  5.77
trovato:  1.4285714285714284


token1:  train
token2:  car
atteso:  6.31
trovato:  1.6666666666666665


token1:  telephone
token2:  communication
atteso:  7.5
trovato:  1.1111111111111112


token1:  television
token2:  radio
atteso:  6.77
trovato:  3.333333333333333


token1:  media
token2:  radio
atteso:  7.42
trovato:  2.5


token1:  drug
token2:  abuse
atteso:  6.85
trovato:  2.0


token1:  bread
token2:  butter
atteso:  6.19
trovato:  3.333333333333333


token1:  cucumber
token2:  potato
atteso:  5.92
trovato:  2.5


token1:  doctor
token2:  nurse
atteso:  7.0
trova

In [None]:
hypernyms2 = []
for hypernym in hypernyms1:
    hypernyms2.extend(hypernym.hypernyms())
print(hypernyms2)

In [None]:
for ss in wn.synsets(token1):
    print(ss, ss.definition())
    print('depth: ', len(ss.hypernym_paths()[0]))

print('\n')

for ss in wn.synsets(toekn2):
    print(ss, ss.definition())
    print('depth: ', len(ss.hypernym_paths()[0]))

Synset('love.n.01') a strong positive emotion of regard and affection
depth:  7
Synset('love.n.02') any object of warm affection or devotion
depth:  7
Synset('beloved.n.01') a beloved person; used as terms of endearment
depth:  6
Synset('love.n.04') a deep feeling of sexual desire and attraction
depth:  8
Synset('love.n.05') a score of zero in tennis or squash
depth:  7
Synset('sexual_love.n.02') sexual activities (often including sexual intercourse) between two people
depth:  7
Synset('love.v.01') have a great affection or liking for
depth:  1
Synset('love.v.02') get pleasure from
depth:  2
Synset('love.v.03') be enamored or in love with
depth:  2
Synset('sleep_together.v.01') have sexual intercourse with
depth:  4


Synset('sexual_activity.n.01') activities associated with sexual intercourse
depth:  6
Synset('sex.n.02') either of the two categories (male or female) into which most organisms are divided
depth:  6
Synset('sex.n.03') all of the feelings resulting from the urge to gratif

# Shortest path metric

simpath(s1, s2)=2 · depthMax - len(s1, s2)

# Leakcock & Chodorow metric

simLC (s1, s2) =  log (len(s1, s2) / 2 · depthMax)