# Esercizio 1

la prima parte di questo esercizio consiste nell'implementare tre misure di similarità basate su WordNet.

Per ciascuna di tali misure di similarità, calcolare
- gli indici di correlazione di Spearman e
- gli indici di correlazione di Pearson fra i risultati ottenuti e quelli ‘target’ presenti nel file annotato.

In [43]:
import pandas as pd
from nltk.corpus import wordnet as wn
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from itertools import product

corpus = pd.read_csv('datasets/WordSim353.csv', sep=',', engine='python')
corpus

Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.00
3,book,paper,7.46
4,computer,keyboard,7.62
...,...,...,...
348,shower,flood,6.03
349,weather,forecast,8.34
350,disaster,area,6.25
351,governor,office,6.34


# Utils

- Termini vs sensi: sim(w1, w2) = max[sim(c1, c2)]

### CONSTANT MAX_DEPTH: calcolo la profondità massima del grafo

In [44]:
MAX_DEPTH = max(max(len(hyp_path) for hyp_path in ss.hypernym_paths()) for ss in wn.all_synsets())

### MIN_DEPTH: calcola il percorso più breve dalla radice al senso

Alcune volte synset.hypernyms() non restituiva alcun iperonimo, quindi, guardando l'implementazione del metodo min_depth di nltk abbiamo inserito negli iperonimi il risulato di synset.instance_hypernyms()

In [45]:
'''
Return the length of the shortest hypernym path from this synset to the root
'''
def min_depth(synset):
    hyperonyms = synset.hypernyms() + synset.instance_hypernyms()
    if not hyperonyms:
        return 0
    else:
        depths = []
        for hyp in hyperonyms:
            dep = 1 + min_depth(hyp)
            depths.append(dep)
        return min(depths)

#### Test min_depth

In [46]:
ret = True
for row in corpus.iterrows():
    try:
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']

        
        for syn1 in wn.synsets(token1):
            if(syn1.min_depth()):
                if (not min_depth(syn1) == syn1.min_depth()):
                    print(syn1,min_depth(syn1),syn1.min_depth(),(min_depth(syn1) == syn1.min_depth()))
                ret = (min_depth(syn1) == syn1.min_depth()) & ret
        for syn2 in wn.synsets(token2):
            if(syn2.min_depth()):
                if (not min_depth(syn1) == syn1.min_depth()):
                    print(syn2,min_depth(syn2),syn2.min_depth(),(min_depth(syn2) == syn2.min_depth()))
                ret = (min_depth(syn2) == syn2.min_depth()) & ret
    except IndexError:
        print('problem with', token1, token2)

print(ret)

True


### MAX_DEPTH: calcola il percorso più lungo dalla radice al senso

Alcune volte synset.hypernyms() non restituiva alcun iperonimo, quindi, guardando l'implementazione del metodo min_depth di nltk abbiamo inserito negli iperonimi il risulato di synset.instance_hypernyms()

In [47]:
'''
Return the length of the longest hypernym path from this synset to the root
'''
def max_depth(synset):
    hyperonyms = synset.hypernyms() + synset.instance_hypernyms()
    
    if not hyperonyms:
        return 0
    else:
        depths = []
        for hyp in hyperonyms:
            dep = 1 + max_depth(hyp)
            depths.append(dep)
        return max(depths)

#### Test max_depth

In [48]:
ret = True
for row in corpus.iterrows():
    try:
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']

        for syn1 in wn.synsets(token1):
            if(syn1.max_depth()):
                if (not max_depth(syn1) == syn1.max_depth()):
                    print(syn1,max_depth(syn1),syn1.max_depth(),(max_depth(syn1) == syn1.max_depth()))
                ret = (max_depth(syn1) == syn1.max_depth()) & ret
        for syn2 in wn.synsets(token2):
            if(syn2.max_depth()):
                if (not max_depth(syn1) == syn1.max_depth()):
                    print(syn2,max_depth(syn2),syn2.max_depth(),(max_depth(syn2) == syn2.max_depth()))
                ret = (max_depth(syn2) == syn2.max_depth()) & ret
    except IndexError:
        print('problem with', token1, token2)

print(ret)

True


### GET_DEPTH_LCS: calcola la profondità del lower common hyperonim

A differenza del metodo lowest_common_hypernyms che non solo considera gli iperonimi dei synset ma anche il nodo stesso, noi consideriamo solo gli iperonimi dei synset. Questo è il motivo per cui alcune volte i nostri risultati non coincidono con quelli della libreria. Si tratta di una scelta implementativa

In [54]:
'''
Restitusice il synset più vicino ad entrami i sensi
'''
def get_depth_lcs(syn1,syn2):
    paths1 = (syn1.hypernym_paths())
    paths1 = [list(reversed(path)) for path in paths1]
    
    paths2 = syn2.hypernym_paths()
    paths2 = [list(reversed(path)) for path in paths2]

    best_index_lcs = MAX_DEPTH
    best_syn_lcs = None
    best_depth_lcs = 0

    for (path1, path2) in product(paths1, paths2):
        index_lcs = -1
        depth_lcs = 0
        path1 = path1[1:]
        path2[1:]

        i = 0
        while i < len(path1) and index_lcs == -1:
            #trova l'iperonimo più vicino
            if path1[i] in path2:
                index_lcs = path2.index(path1[i])
                depth_lcs = min_depth(path2[index_lcs])
            i = i + 1

        if index_lcs != -1 and depth_lcs >= best_depth_lcs:
            best_index_lcs = index_lcs
            best_syn_lcs = path2[best_index_lcs]
            best_depth_lcs = depth_lcs

    return best_syn_lcs

#### Test get_depth_lcs

In [60]:
for row in corpus.iterrows():
    try:
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']
    
        for syn1 in wn.synsets(token1):
            for syn2 in wn.synsets(token2):        
                predicted = get_depth_lcs(syn1,syn2)
                actual = syn1.lowest_common_hypernyms(syn2, use_min_depth= True)

                if not (actual == [] and predicted == None):
                    if (not (get_depth_lcs(syn1,syn2) in syn1.lowest_common_hypernyms(syn2, use_min_depth= True))):
                        print('syn1: ',syn1)
                        print('syn2: ',syn2)
                        for path in syn1.hypernym_paths():
                            path = reversed(path)
                            print([synset.name for synset in path])
                        for path in syn2.hypernym_paths():
                            path = reversed(path)
                            print([synset.name for synset in path])
                        print('actual', actual)
                        print('predicted', predicted)
                        print('\n')            
    except IndexError:
        print('problem with', token1, token2)


syn1:  Synset('tiger.n.01')
syn2:  Synset('guy.n.01')
[<bound method Synset.name of Synset('tiger.n.01')>, <bound method Synset.name of Synset('person.n.01')>, <bound method Synset.name of Synset('causal_agent.n.01')>, <bound method Synset.name of Synset('physical_entity.n.01')>, <bound method Synset.name of Synset('entity.n.01')>]
[<bound method Synset.name of Synset('tiger.n.01')>, <bound method Synset.name of Synset('person.n.01')>, <bound method Synset.name of Synset('organism.n.01')>, <bound method Synset.name of Synset('living_thing.n.01')>, <bound method Synset.name of Synset('whole.n.02')>, <bound method Synset.name of Synset('object.n.01')>, <bound method Synset.name of Synset('physical_entity.n.01')>, <bound method Synset.name of Synset('entity.n.01')>]
[<bound method Synset.name of Synset('guy.n.01')>, <bound method Synset.name of Synset('man.n.01')>, <bound method Synset.name of Synset('adult.n.01')>, <bound method Synset.name of Synset('person.n.01')>, <bound method Synset

In [52]:
def get_shortest_distance(syn1,syn2):
    paths1 = (syn1.hypernym_paths())
    paths1 = [list(reversed(path)) for path in paths1]
    
    paths2 = syn2.hypernym_paths()
    paths2 = [list(reversed(path)) for path in paths2]

    best_syn_distance = max_depth * 2

    for (path1, path2) in list(product(paths1, paths2)):
        index_lcs = -1
        syn_distance = max_depth * 2

        i = 0
        while i < len(path1) and index_lcs == -1:
            if path1[i] in path2:
                index_lcs = path2.index(path1[i])
                syn_distance = index_lcs + i
            i = i + 1

        if index_lcs != -1 and syn_distance < best_syn_distance:
            best_syn_distance = syn_distance

    
    return best_syn_distance

In [53]:
for row in corpus.iterrows():
    try:
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']
        syn1 = wn.synsets(token1)[0]
        syn2 = wn.synsets(token2)[0]
        print('syn1 -> ',syn1, 'syn2 ->',syn2)
        for path in syn1.hypernym_paths():
            path = reversed(path)
            print([synset.name for synset in path])
        for path in syn2.hypernym_paths():
            path = reversed(path)
            print([synset.name for synset in path])
        print('actual', syn1.shortest_path_distance(syn2))
        print('predicted', get_shortest_distance(syn1,syn2))
        print('--------------------------')
        ret = (get_shortest_distance(syn1,syn2) == syn1.shortest_path_distance(syn2)) & ret
    except IndexError:
        print('problem with', token1, token2)
ret

syn1 ->  Synset('love.n.01') syn2 -> Synset('sexual_activity.n.01')
[<bound method Synset.name of Synset('love.n.01')>, <bound method Synset.name of Synset('emotion.n.01')>, <bound method Synset.name of Synset('feeling.n.01')>, <bound method Synset.name of Synset('state.n.02')>, <bound method Synset.name of Synset('attribute.n.02')>, <bound method Synset.name of Synset('abstraction.n.06')>, <bound method Synset.name of Synset('entity.n.01')>]
[<bound method Synset.name of Synset('sexual_activity.n.01')>, <bound method Synset.name of Synset('bodily_process.n.01')>, <bound method Synset.name of Synset('organic_process.n.01')>, <bound method Synset.name of Synset('process.n.06')>, <bound method Synset.name of Synset('physical_entity.n.01')>, <bound method Synset.name of Synset('entity.n.01')>]
actual 11


TypeError: unsupported operand type(s) for *: 'function' and 'int'

# Wu-Palmer metric

cs(s1, s2) = 2 · depth(LCS) / depth(s1) + depth(s2)

In [None]:
def wup_similarity(syn1,syn2):
    depth_lcs = get_depth_lcs(syn1, syn2)
    depth_syn1 = len(sorted(syn1.hypernym_paths(), key=len)[0])
    depth_syn2 = len(sorted(syn2.hypernym_paths(), key=len)[0])

    return ((2 * depth_lcs) / (depth_syn1 + depth_syn2)) * 10

# Shortest path metric

simpath(s1, s2)=2 · depthMax - len(s1, s2)

In [None]:
def sp_similarity(syn1,syn2):
    shortest_distance = get_shortest_distance(syn1, syn2)

    return (2 * max_depth - shortest_distance) / (2 * max_depth) * 10

# Leakcock & Chodorow metric

simLC (s1, s2) =  log (len(s1, s2) / 2 · depthMax)

In [None]:
import math

def lch_similarity(syn1,syn2):
    distance = get_shortest_distance(syn1, syn2)
    if distance is None or distance < 0 or max_depth == 0:
            return None
    return -math.log((distance) / (2.0 * max_depth))


In [None]:
for row in corpus.iterrows():
    try:
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']
        syn1 = wn.synsets(token1)[0]
        syn2 = wn.synsets(token2)[0]
        print('syn1 -> ',syn1, 'syn2 ->',syn2)
        print('actual', syn1.lch_similarity(syn2))
        print('predicted', lch_similarity(syn1,syn2))
        print('--------------------------')
        ret = (get_shortest_distance(syn1,syn2) == syn1.shortest_path_distance(syn2)) & ret
    except IndexError:
        print('problem with', token1, token2)

ret

syn1 ->  Synset('love.n.01') syn2 -> Synset('sexual_activity.n.01')
actual 1.1526795099383855


TypeError: unsupported operand type(s) for *: 'function' and 'int'

In [None]:
token1 = 'love'
token2 = 'sex'

syns1 = wn.synsets(token1)
syns2 = wn.synsets(token2)

best_similarity = 0
best_syn1 = None
best_syn2 = None

for (syn1, syn2) in list(product(syns1, syns2)):
    similarity = sp_similarity(syn1, syn2)

    print('syn1: ', syn1)
    print('syn2: ', syn2)
    print('atteso: ', syn1.path_similarity(syn2) * 10)
    print('trovato: ', similarity)
    print('----------------')

    if similarity is not None and similarity > best_similarity:
        best_similarity = similarity
        best_syn1 = syn1
        best_syn2 = syn2

print('token1: ', token1)
print('token2: ', token2)
print('atteso: ', row[1]['Human (mean)'])
print('atteso: 6.77')
print('trovato: ', best_similarity)
print('\n')

# TEST

In [None]:
def compute_similarity(metric):
    for row in corpus.iterrows():
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']

        syns1 = wn.synsets(token1)
        syns2 = wn.synsets(token2)

        best_similarity = 0
        best_syn1 = None
        best_syn2 = None

        for (syn1, syn2) in list(product(syns1, syns2)):
            if metric == 'wup_similarity':
                similarity = wup_similarity(syn1, syn2)
            elif metric == 'sp_similarity':
                similarity = sp_similarity(syn1, syn2)
            
            if similarity != None and similarity > best_similarity:
                best_similarity = similarity
                best_syn1 = syn1
                best_syn2 = syn2

        print('token1: ', token1)
        print('token2: ', token2)
        print('ipotizzato: ', row[1]['Human (mean)'])
        print('calcolato: ', best_similarity)
        print('\n')

In [None]:
compute_similarity('wup_similarity')

TypeError: unsupported operand type(s) for *: 'int' and 'Synset'

In [None]:
compute_similarity('sp_similarity')

token1:  love
token2:  sex
ipotizzato:  6.77
calcolato:  9.75


token1:  tiger
token2:  cat
ipotizzato:  7.35
calcolato:  9.75


token1:  tiger
token2:  tiger
ipotizzato:  10.0
calcolato:  10.0


token1:  book
token2:  paper
ipotizzato:  7.46
calcolato:  9.5


token1:  computer
token2:  keyboard
ipotizzato:  7.62
calcolato:  9.25


token1:  computer
token2:  internet
ipotizzato:  7.58
calcolato:  8.25


token1:  plane
token2:  car
ipotizzato:  5.77
calcolato:  8.5


token1:  train
token2:  car
ipotizzato:  6.31
calcolato:  8.75


token1:  telephone
token2:  communication
ipotizzato:  7.5
calcolato:  7.5


token1:  television
token2:  radio
ipotizzato:  6.77
calcolato:  9.5


token1:  media
token2:  radio
ipotizzato:  7.42
calcolato:  9.25


token1:  drug
token2:  abuse
ipotizzato:  6.85
calcolato:  7.75


token1:  bread
token2:  butter
ipotizzato:  6.19
calcolato:  9.5


token1:  cucumber
token2:  potato
ipotizzato:  5.92
calcolato:  9.25


token1:  doctor
token2:  nurse
ipotizzato:  7