# Esercizio 1

la prima parte di questo esercizio consiste nell'implementare tre misure di similarità basate su WordNet.

Per ciascuna di tali misure di similarità, calcolare
- gli indici di correlazione di Spearman e
- gli indici di correlazione di Pearson fra i risultati ottenuti e quelli ‘target’ presenti nel file annotato.

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import wordnet as wn
from scipy.stats import pearsonr
from scipy.stats import spearmanr

corpus = pd.read_csv('datasets/WordSim353.csv', sep=',', engine='python')

corpus

Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.00
3,book,paper,7.46
4,computer,keyboard,7.62
...,...,...,...
348,shower,flood,6.03
349,weather,forecast,8.34
350,disaster,area,6.25
351,governor,office,6.34


# Utils

- Termini vs sensi: sim(w1, w2) = max[sim(c1, c2)]

In [146]:
from itertools import product
from nltk.corpus import wordnet_ic

max_depth = max(max(len(hyp_path) for hyp_path in ss.hypernym_paths()) for ss in wn.all_synsets())
def get_hyponyms(synset):
    hyponyms = set()
    for hyponym in synset.hyponyms():
        hyponyms |= set(get_hyponyms(hyponym))
    return hyponyms | set(synset.hyponyms())

In [156]:
syn1 = wn.synset('cat.n.01')
syn2 = wn.synset('mouse.n.01')

In [157]:
def depth_lcs(syn1,syn2):
    paths1 = (syn1.hypernym_paths())
    paths1 = [list(reversed(path)) for path in paths1]
    
    paths2 = syn2.hypernym_paths()
    paths2 = [list(reversed(path)) for path in paths2]

    best_index_lcs = max_depth
    best_syn_lcs = None
    best_depth_lcs = 0

    for (path1, path2) in list(product(paths1, paths2)):
        index_lcs = -1
        depth_lcs = 0

        i = 0
        while i < len(path1) and index_lcs == -1:
            if path1[i] in path2:
                index_lcs = path2.index(path1[i])
                depth_lcs = len(path2[index_lcs].hypernym_paths()[0])
            i = i + 1

        if (index_lcs != -1 and depth_lcs > best_depth_lcs):
            best_index_lcs = index_lcs
            best_syn_lcs = path2[best_index_lcs]
            best_depth_lcs = depth_lcs
    
    return best_syn_lcs
    

brown_ic = wordnet_ic.ic('ic-brown.dat')
print('atteso: ', syn1.lowest_common_hypernyms(syn2 ,brown_ic))
print('trovato: ', depth_lcs(syn1, syn2))


atteso:  [Synset('placental.n.01')]
trovato:  Synset('placental.n.01')


In [158]:
def length_syn1_syn2(syn1,syn2):
    paths1 = (syn1.hypernym_paths())
    paths1 = [list(reversed(path)) for path in paths1]
    
    paths2 = syn2.hypernym_paths()
    paths2 = [list(reversed(path)) for path in paths2]

    best_syn_distance = max_depth * 2

    for (path1, path2) in list(product(paths1, paths2)):
        index_lcs = -1
        syn_distance = max_depth * 2

        i = 0
        while i < len(path1) and index_lcs == -1:
            if path1[i] in path2:
                index_lcs = path2.index(path1[i])
                syn_distance = index_lcs + i
            i = i + 1

        if (index_lcs != -1 and syn_distance < best_syn_distance):
            best_syn_distance = syn_distance

    
    return best_syn_distance

print('atteso: ', syn1.shortest_path_distance(syn2))
print('trovato: ', length_syn1_syn2(syn1, syn2))

atteso:  5
trovato:  5


In [139]:
print(syn1.hyponyms())
paths = syn1.hypernym_paths()

for path in paths:
    path = list(reversed(path))
    print(([synset.name for synset in path]))
    

[Synset('appointment_book.n.01'), Synset('authority.n.07'), Synset('bestiary.n.01'), Synset('booklet.n.01'), Synset('catalog.n.01'), Synset('catechism.n.02'), Synset('copybook.n.01'), Synset('curiosa.n.01'), Synset('formulary.n.01'), Synset('phrase_book.n.01'), Synset('playbook.n.02'), Synset('pop-up_book.n.01'), Synset('prayer_book.n.01'), Synset('reference_book.n.01'), Synset('review_copy.n.01'), Synset('songbook.n.01'), Synset('storybook.n.01'), Synset('textbook.n.01'), Synset('tome.n.01'), Synset('trade_book.n.01'), Synset('workbook.n.01'), Synset('yearbook.n.01')]
[<bound method Synset.name of Synset('book.n.01')>, <bound method Synset.name of Synset('publication.n.01')>, <bound method Synset.name of Synset('work.n.02')>, <bound method Synset.name of Synset('product.n.02')>, <bound method Synset.name of Synset('creation.n.02')>, <bound method Synset.name of Synset('artifact.n.01')>, <bound method Synset.name of Synset('whole.n.02')>, <bound method Synset.name of Synset('object.n.0

In [140]:
print(syn2.hypernyms())
paths = syn2.hypernym_paths()

for path in paths:
    path = list(reversed(path))
    print(([synset.name for synset in path]))

[Synset('motor_vehicle.n.01')]
[<bound method Synset.name of Synset('car.n.01')>, <bound method Synset.name of Synset('motor_vehicle.n.01')>, <bound method Synset.name of Synset('self-propelled_vehicle.n.01')>, <bound method Synset.name of Synset('wheeled_vehicle.n.01')>, <bound method Synset.name of Synset('container.n.01')>, <bound method Synset.name of Synset('instrumentality.n.03')>, <bound method Synset.name of Synset('artifact.n.01')>, <bound method Synset.name of Synset('whole.n.02')>, <bound method Synset.name of Synset('object.n.01')>, <bound method Synset.name of Synset('physical_entity.n.01')>, <bound method Synset.name of Synset('entity.n.01')>]
[<bound method Synset.name of Synset('car.n.01')>, <bound method Synset.name of Synset('motor_vehicle.n.01')>, <bound method Synset.name of Synset('self-propelled_vehicle.n.01')>, <bound method Synset.name of Synset('wheeled_vehicle.n.01')>, <bound method Synset.name of Synset('vehicle.n.01')>, <bound method Synset.name of Synset('c

# Wu-Palmer metric

cs(s1, s2) = 2 · depth(LCS) / depth(s1) + depth(s2)

In [30]:
#get first row of corpus
token1 = corpus.iloc[0]['Word 1']
toekn2 = corpus.iloc[0]['Word 2']


#get one synset for token1
synset1 = wn.synsets(token1)[0]


hypernyms1 = synset1.hypernyms()[0]
print(hypernyms)



[Synset('emotion.n.01')]


In [None]:
hypernyms2 = []
for hypernym in hypernyms1:
    hypernyms2.extend(hypernym.hypernyms())
print(hypernyms2)

In [24]:
for ss in wn.synsets(token1):
    print(ss, ss.definition())
    print('depth: ', len(ss.hypernym_paths()[0]))

print('\n')

for ss in wn.synsets(toekn2):
    print(ss, ss.definition())
    print('depth: ', len(ss.hypernym_paths()[0]))

Synset('love.n.01') a strong positive emotion of regard and affection
depth:  7
Synset('love.n.02') any object of warm affection or devotion
depth:  7
Synset('beloved.n.01') a beloved person; used as terms of endearment
depth:  6
Synset('love.n.04') a deep feeling of sexual desire and attraction
depth:  8
Synset('love.n.05') a score of zero in tennis or squash
depth:  7
Synset('sexual_love.n.02') sexual activities (often including sexual intercourse) between two people
depth:  7
Synset('love.v.01') have a great affection or liking for
depth:  1
Synset('love.v.02') get pleasure from
depth:  2
Synset('love.v.03') be enamored or in love with
depth:  2
Synset('sleep_together.v.01') have sexual intercourse with
depth:  4


Synset('sexual_activity.n.01') activities associated with sexual intercourse
depth:  6
Synset('sex.n.02') either of the two categories (male or female) into which most organisms are divided
depth:  6
Synset('sex.n.03') all of the feelings resulting from the urge to gratif

# Shortest path metric

simpath(s1, s2)=2 · depthMax - len(s1, s2)

# Leakcock & Chodorow metric

simLC (s1, s2) =  log (len(s1, s2) / 2 · depthMax)