# Esercizio 2

Implementare l’algoritmo di Lesk (!= usare implementazione esistente, e.g., in nltk…).
- Estrarre 50 frasi dal corpus SemCor (corpus annotato con i synset di WN) e disambiguare (almeno) un sostantivo per frase. Calcolare l’accuratezza del sistema implementato sulla base dei sensi annotati in SemCor (SemCor è disponibile all’URL http://web.eecs.umich.edu/~mihalcea/downloads.html)
- Randomizzare la selezione delle 50 frasi e la selezione del termine da disambiguare, e restituire l’accuratezza media su (per esempio) 10 esecuzioni del programma

In [110]:
import numpy as np
import random
import nltk
from nltk.corpus import wordnet as wn
#nltk.('sedownloadmcor')
from nltk.corpus import semcor
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


In [111]:
def lemma_list(sent):
    return [l.label() if isinstance(l, nltk.tree.Tree) else None for l in sent]

In [119]:
def SimplifiedLesk(word, sentence):
    best_sense = wn.synsets(word)[0] if len(wn.synsets(word)) > 0 else None
    max_overlap = 0
    context = set(sentence)
    
    for sense in wn.synsets(word):
        signature = set(sense.definition().split()).union(set(sense.examples()))
        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
    
    return best_sense

In [120]:
#get 50 random sentences from semcor
sentences_with_tag = semcor.tagged_sents(tag = 'sem')[:100]
sentences = semcor.sents()[:100]

np.random.seed(0)
indices = np.random.permutation(len(sentences_with_tag))
indices = indices[:50]

my_sentences_with_tag = [sentences_with_tag[i] for i in indices]
my_sentences = [sentences[i] for i in indices]

#print(my_sentences_with_tag[0])
#print(my_sentences[0])

for i, sentence in enumerate(my_sentences):
    #lowercase
    #sentence = [w.lower() for w in sentence]
    #remove stop words
    sentence_without_stopwords = [w for w in sentence if not w in stop_words]
    #remove punctuation
    sentence_without_stopwords = [w for w in sentence_without_stopwords if w.isalpha()]
    #get random word
    random_word = random.choice(sentence_without_stopwords)
    index = sentence.index(random_word)

    #print(random_word)

    actual_synset = []
    for j, cell in enumerate(my_sentences_with_tag[i]):
        if random_word in cell[0]:
            actual_synset.append(cell.label() if isinstance(cell, nltk.tree.Tree) else None)

    #run lesk
    predicted_synset = SimplifiedLesk(random_word, sentence)
    print(random_word)
    print(predicted_synset)
    print(actual_synset)
    print('-----------------')






divorce
Synset('divorce.n.01')
[Lemma('divorce.n.01.divorce')]
-----------------
permit
Synset('license.n.01')
[Lemma('license.n.01.permit')]
-----------------
jury
Synset('jury.n.01')
[Lemma('jury.n.01.jury')]
-----------------
item
Synset('item.n.01')
[Lemma('detail.n.01.item')]
-----------------
Felix
None
[Lemma('person.n.01.person')]
-----------------
recommend
Synset('commend.v.04')
[Lemma('recommend.v.01.recommend')]
-----------------
burden
Synset('charge.v.18')
[Lemma('burden.n.01.burden')]
-----------------
give
Synset('give.n.01')
[Lemma('give.v.10.give')]
-----------------
work
Synset('work.n.05')
[Lemma('work.v.01.work')]
-----------------
escheat
Synset('escheat.n.01')
[Lemma('escheat.n.01.escheat')]
-----------------
allotted
Synset('accord.v.02')
[Lemma('accord.v.02.allot')]
-----------------
committee
Synset('committee.n.01')
[Lemma('committee.n.01.committee'), Lemma('subcommittee.n.01.subcommittee')]
-----------------
saw
Synset('saw.v.01')
[Lemma('witness.v.02.see')]