# Esercizio 1

la prima parte di questo esercizio consiste nell'implementare tre misure di similarità basate su WordNet.

Per ciascuna di tali misure di similarità, calcolare
- gli indici di correlazione di Spearman e
- gli indici di correlazione di Pearson fra i risultati ottenuti e quelli ‘target’ presenti nel file annotato.

In [91]:
import pandas as pd
from nltk.corpus import wordnet as wn
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from itertools import product
import math

corpus = pd.read_csv('datasets/WordSim353.csv', sep=',', engine='python')
corpus

Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.00
3,book,paper,7.46
4,computer,keyboard,7.62
...,...,...,...
348,shower,flood,6.03
349,weather,forecast,8.34
350,disaster,area,6.25
351,governor,office,6.34


# Utils

- Termini vs sensi: sim(w1, w2) = max[sim(c1, c2)]

### CONSTANT MAX_DEPTH: calcolo la profondità massima del grafo

In [92]:
MAX_DEPTH = max(max(len(hyp_path) for hyp_path in ss.hypernym_paths()) for ss in wn.all_synsets())

### MIN_DEPTH: calcola il percorso più breve dalla radice al senso

Alcune volte synset.hypernyms() non restituiva alcun iperonimo, quindi, guardando l'implementazione del metodo min_depth di nltk abbiamo inserito negli iperonimi il risulato di synset.instance_hypernyms()

In [93]:
'''
Return the length of the shortest hypernym path from this synset to the root
'''
def min_depth(synset):
    if not synset:
        return 0
    
    hyperonyms = synset.hypernyms() + synset.instance_hypernyms()
    if not hyperonyms:
        return 0
    else:
        depths = []
        for hyp in hyperonyms:
            dep = 1 + min_depth(hyp)
            depths.append(dep)
        return min(depths)


#### Test min_depth

In [94]:
ret = True
for row in corpus.iterrows():
    try:
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']

        
        for syn1 in wn.synsets(token1):
            if(syn1.min_depth()):
                if (not min_depth(syn1) == syn1.min_depth()):
                    print(syn1,min_depth(syn1),syn1.min_depth(),(min_depth(syn1) == syn1.min_depth()))
        for syn2 in wn.synsets(token2):
            if(syn2.min_depth()):
                if (not min_depth(syn1) == syn1.min_depth()):
                    print(syn2,min_depth(syn2),syn2.min_depth(),(min_depth(syn2) == syn2.min_depth()))
    except IndexError:
        print('problem with', token1, token2)

### MAX_DEPTH: calcola il percorso più lungo dalla radice al senso

Alcune volte synset.hypernyms() non restituiva alcun iperonimo, quindi, guardando l'implementazione del metodo min_depth di nltk abbiamo inserito negli iperonimi il risulato di synset.instance_hypernyms()

In [95]:
'''
Return the length of the longest hypernym path from this synset to the root
'''
def max_depth(synset):
    if not synset:
        return 0

    hyperonyms = synset.hypernyms() + synset.instance_hypernyms()
    
    if not hyperonyms:
        return 0
    else:
        depths = []
        for hyp in hyperonyms:
            dep = 1 + max_depth(hyp)
            depths.append(dep)
        return max(depths)

#### Test max_depth

In [96]:
ret = True
for row in corpus.iterrows():
    try:
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']

        for syn1 in wn.synsets(token1):
            if(syn1.max_depth()):
                if (not max_depth(syn1) == syn1.max_depth()):
                    print(syn1,max_depth(syn1),syn1.max_depth(),(max_depth(syn1) == syn1.max_depth()))
                ret = (max_depth(syn1) == syn1.max_depth()) & ret
        for syn2 in wn.synsets(token2):
            if(syn2.max_depth()):
                if (not max_depth(syn1) == syn1.max_depth()):
                    print(syn2,max_depth(syn2),syn2.max_depth(),(max_depth(syn2) == syn2.max_depth()))
                ret = (max_depth(syn2) == syn2.max_depth()) & ret
    except IndexError:
        print('problem with', token1, token2)

print(ret)

True


### GET_DEPTH_LCS: calcola la profondità del lower common hyperonim

A differenza del metodo lowest_common_hypernyms che non solo considera gli iperonimi dei synset ma anche il nodo stesso, noi consideriamo solo gli iperonimi dei synset. Questo è il motivo per cui alcune volte i nostri risultati non coincidono con quelli della libreria. Si tratta di una scelta implementativa

In [97]:
'''
Restitusice il synset più vicino ad entrami i sensi
'''
def get_depth_lcs(syn1,syn2):
    paths1 = (syn1.hypernym_paths())
    paths1 = [list(reversed(path)) for path in paths1]
    
    paths2 = syn2.hypernym_paths()
    paths2 = [list(reversed(path)) for path in paths2]

    best_index_lcs = MAX_DEPTH
    best_syn_lcs = None
    best_depth_lcs = 0

    for (path1, path2) in product(paths1, paths2):
        index_lcs = -1
        depth_lcs = 0
        path1 = path1[1:]
        path2[1:]

        i = 0
        while i < len(path1) and index_lcs == -1:
            #trova l'iperonimo più vicino
            if path1[i] in path2:
                index_lcs = path2.index(path1[i])
                depth_lcs = min_depth(path2[index_lcs])
            i = i + 1

        if index_lcs != -1 and depth_lcs >= best_depth_lcs:
            best_index_lcs = index_lcs
            best_syn_lcs = path2[best_index_lcs]
            best_depth_lcs = depth_lcs

    return best_syn_lcs

#### Test get_depth_lcs

In [98]:
for row in corpus.iterrows():
    try:
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']
    
        for syn1 in wn.synsets(token1):
            for syn2 in wn.synsets(token2):        
                predicted = get_depth_lcs(syn1,syn2)
                actual = syn1.lowest_common_hypernyms(syn2, use_min_depth= True)

                if not (actual == [] and predicted == None):
                    if (not (get_depth_lcs(syn1,syn2) in syn1.lowest_common_hypernyms(syn2, use_min_depth= True))):
                        print('syn1: ',syn1)
                        print('syn2: ',syn2)
                        for path in syn1.hypernym_paths():
                            path = reversed(path)
                            print([synset.name for synset in path])
                        for path in syn2.hypernym_paths():
                            path = reversed(path)
                            print([synset.name for synset in path])
                        print('actual', actual)
                        print('predicted', predicted)
                        print('\n')            
    except IndexError:
        print('problem with', token1, token2)


syn1:  Synset('tiger.n.01')
syn2:  Synset('guy.n.01')
[<bound method Synset.name of Synset('tiger.n.01')>, <bound method Synset.name of Synset('person.n.01')>, <bound method Synset.name of Synset('causal_agent.n.01')>, <bound method Synset.name of Synset('physical_entity.n.01')>, <bound method Synset.name of Synset('entity.n.01')>]
[<bound method Synset.name of Synset('tiger.n.01')>, <bound method Synset.name of Synset('person.n.01')>, <bound method Synset.name of Synset('organism.n.01')>, <bound method Synset.name of Synset('living_thing.n.01')>, <bound method Synset.name of Synset('whole.n.02')>, <bound method Synset.name of Synset('object.n.01')>, <bound method Synset.name of Synset('physical_entity.n.01')>, <bound method Synset.name of Synset('entity.n.01')>]
[<bound method Synset.name of Synset('guy.n.01')>, <bound method Synset.name of Synset('man.n.01')>, <bound method Synset.name of Synset('adult.n.01')>, <bound method Synset.name of Synset('person.n.01')>, <bound method Synset

### GET_SHORTEST_DISTANCE: restituisce il minor numero di passi tra un synset e l'altro (restituisce un intero)

I risultati coincidono con quelli del metodo shortest_path_distance della libreria nltk

In [115]:
def get_shortest_distance(syn1,syn2):
    if not syn1 or not syn2:
        return 0

    paths1 = (syn1.hypernym_paths())
    paths1 = [list(reversed(path)) for path in paths1]
    
    paths2 = syn2.hypernym_paths()
    paths2 = [list(reversed(path)) for path in paths2]

    best_syn_distance = MAX_DEPTH * 2

    for (path1, path2) in list(product(paths1, paths2)):
        index_lcs = -1
        syn_distance = MAX_DEPTH * 2

        i = 0
        while i < len(path1) and index_lcs == -1:
            if path1[i] in path2:
                index_lcs = path2.index(path1[i])
                syn_distance = index_lcs + i
            i = i + 1

        if index_lcs != -1 and syn_distance < best_syn_distance:
            best_syn_distance = syn_distance

    return best_syn_distance

#### Test get_shortest_distance

In [100]:
for row in corpus.iterrows():
    try:
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']

        for syn1 in wn.synsets(token1):
            for syn2 in wn.synsets(token2):
                actual = syn1.shortest_path_distance(syn2)
                predicted = get_shortest_distance(syn1,syn2)

                if not (actual == None and predicted == MAX_DEPTH * 2):
                    if (not (get_shortest_distance(syn1,syn2) == syn1.shortest_path_distance(syn2))):
                        print('syn1: ',syn1)
                        print('syn2: ',syn2)
                        for path in syn1.hypernym_paths():
                            path = reversed(path)
                            print([synset.name for synset in path])
                        for path in syn2.hypernym_paths():
                            path = reversed(path)
                            print([synset.name for synset in path])
                        print('actual', actual)
                        print('predicted', predicted)
                        print('\n')
    except IndexError:
        print('problem with', token1, token2)

## Wu-Palmer metric

cs(s1, s2) = 2 · depth(LCS) / depth(s1) + depth(s2)

In [130]:
def wup_similarity(syn1,syn2):
    depth_lcs = max_depth(get_depth_lcs(syn1, syn2)) + 1 #+1 perche' così al posto di calcolare i rami calcola i nodi
    depth_syn1 = len(sorted(syn1.hypernym_paths(), key=len)[0])
    depth_syn2 = len(sorted(syn2.hypernym_paths(), key=len)[0])

    return ((2 * depth_lcs) / (depth_syn1 + depth_syn2))

## Shortest path metric

simpath(s1, s2)=2 · depthMax - len(s1, s2)

In [117]:
def sp_similarity(syn1,syn2):
    shortest_distance = get_shortest_distance(syn1, syn2)
    if shortest_distance == None:
        shortest_distance = 0

    return (2 * MAX_DEPTH) - shortest_distance

## Leakcock & Chodorow metric

simLC (s1, s2) =  log (len(s1, s2) / 2 · depthMax)

In [119]:
def lch_similarity(syn1,syn2):
    distance = get_shortest_distance(syn1, syn2)
    if distance is None or distance <= 0 or MAX_DEPTH == 0:
        return None
    
    return -math.log((distance) / (2.0 * MAX_DEPTH))

### Test sulle metriche

In [120]:
for row in corpus.iterrows():
    try:
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']

        for syn1 in wn.synsets(token1):
            for syn2 in wn.synsets(token2):
                if syn1.pos() == syn2.pos():
                    print('syn1 -> ',syn1, 'syn2 ->',syn2)

                    print('actual wup', syn1.wup_similarity(syn2))
                    print('predicted wup', wup_similarity(syn1,syn2))

                    print('actual sp', syn1.shortest_path_distance(syn2))
                    print('predicted sp', sp_similarity(syn1,syn2))

                    print('actual lch', syn1.lch_similarity(syn2))
                    print('predicted lch', lch_similarity(syn1,syn2))
                    print('\n')
                    ret = (get_shortest_distance(syn1,syn2) == syn1.shortest_path_distance(syn2)) & ret
    except IndexError:
        print('problem with', token1, token2)

print(ret)

syn1 ->  Synset('love.n.01') syn2 -> Synset('sexual_activity.n.01')
actual wup 0.15384615384615385
predicted wup 0.15384615384615385
actual sp 11
predicted sp 29
actual lch 1.1526795099383855
predicted lch 1.2909841813155656


syn1 ->  Synset('love.n.01') syn2 -> Synset('sex.n.02')
actual wup 0.3076923076923077
predicted wup 0.3076923076923077
actual sp 9
predicted sp 31
actual lch 1.3350010667323402
predicted lch 1.491654876777717


syn1 ->  Synset('love.n.01') syn2 -> Synset('sex.n.03')
actual wup 0.7692307692307693
predicted wup 0.7692307692307693
actual sp 3
predicted sp 37
actual lch 2.2512917986064953
predicted lch 2.5902671654458267


syn1 ->  Synset('love.n.01') syn2 -> Synset('sex.n.04')
actual wup 0.42857142857142855
predicted wup 0.42857142857142855
actual sp 8
predicted sp 32
actual lch 1.4403615823901665
predicted lch 1.6094379124341003


syn1 ->  Synset('love.n.02') syn2 -> Synset('sexual_activity.n.01')
actual wup 0.15384615384615385
predicted wup 0.15384615384615385
act

## TEST SUL CORPUS

In [140]:
def compute_similarity(metric='wup_similarity'):
    corpus_copy = corpus.copy()

    for row in corpus.iterrows():
        token1 = row[1]['Word 1']
        token2 = row[1]['Word 2']

        syns1 = wn.synsets(token1)
        syns2 = wn.synsets(token2)

        best_similarity = 0
        best_nltk_similarity = 0

        for (syn1, syn2) in list(product(syns1, syns2)):
            if syn1.pos() == syn2.pos():
                if metric == 'wup_similarity':
                    similarity = wup_similarity(syn1, syn2)
                    nltk_similarity = syn1.wup_similarity(syn2)
                elif metric == 'sp_similarity':
                    similarity = sp_similarity(syn1, syn2)
                    nltk_similarity = syn1.shortest_path_distance(syn2)
                elif metric == 'lch_similarity':
                    similarity = lch_similarity(syn1, syn2)
                    nltk_similarity = syn1.lch_similarity(syn2)
                
                if similarity != None and similarity > best_similarity:
                    best_similarity = similarity
                    best_nltk_similarity = nltk_similarity

        #print('token1: ', token1)
        #print('token2: ', token2)
        #print('hypotesis: ', row[1]['Human (mean)'])
        #print('predicted: ', best_similarity)
        #print('actual:', best_nltk_similarity)
        #print('\n')

        corpus_copy.loc[row[0], metric] = best_similarity

    corpus_copy.to_csv('datasets/WordSim353_' + metric + '.csv', sep=',', index=False)



In [141]:
compute_similarity('wup_similarity')

In [142]:
compute_similarity('sp_similarity')

In [143]:
compute_similarity('lch_similarity')

## Calcolo indice di correlazione

### Indice di Pearson

In [145]:
def pearson_correlation(metric='wup_similarity'):
    if (metric == 'wup_similarity'):
        corpus = pd.read_csv('datasets/WordSim353_wup_similarity.csv', sep=',', engine='python')
    elif (metric == 'sp_similarity'):
        corpus = pd.read_csv('datasets/WordSim353_sp_similarity.csv', sep=',', engine='python')
    elif (metric == 'lch_similarity'):
        corpus = pd.read_csv('datasets/WordSim353_lch_similarity.csv', sep=',', engine='python')

    actual = corpus['Human (mean)']
    predicted = corpus[metric]

    corr, _ = pearsonr(actual, predicted)
    print('Pearsons correlation: %.3f' % corr)

### Indice di Spearman

In [147]:
def spearman_correlation(metric='wup_similarity'):
    if (metric == 'wup_similarity'):
        corpus = pd.read_csv('datasets/WordSim353_wup_similarity.csv', sep=',', engine='python')
    elif (metric == 'sp_similarity'):
        corpus = pd.read_csv('datasets/WordSim353_sp_similarity.csv', sep=',', engine='python')
    elif (metric == 'lch_similarity'):
        corpus = pd.read_csv('datasets/WordSim353_lch_similarity.csv', sep=',', engine='python')

    actual = corpus['Human (mean)']
    predicted = corpus[metric]

    corr, _ = spearmanr(actual, predicted)
    print('Spearmans correlation: %.3f' % corr)

### Test sugli indici

In [148]:
print('INDICE DI PEARSON')
print('Wup similarity')
pearson_correlation('wup_similarity')
print('\n')
print('Sp similarity')
pearson_correlation('sp_similarity')
print('\n')
print('Lch similarity')
pearson_correlation('lch_similarity')
print('\n')

print('INDICE DI SPEARMAN')
print('Wup similarity')
spearman_correlation('wup_similarity')
print('\n')
print('Sp similarity')
spearman_correlation('sp_similarity')
print('\n')
print('Lch similarity')
spearman_correlation('lch_similarity')
print('\n')

INDICE DI PEARSON
Wup similarity
Pearsons correlation: 0.259


Sp similarity
Pearsons correlation: 0.167


Lch similarity
Pearsons correlation: 0.240


INDICE DI SPEARMAN
Wup similarity
Spearmans correlation: 0.289


Sp similarity
Spearmans correlation: 0.290


Lch similarity
Spearmans correlation: 0.224


