# Esercizio 2

Implementare l’algoritmo di Lesk (!= usare implementazione esistente, e.g., in nltk…).
- Estrarre 50 frasi dal corpus SemCor (corpus annotato con i synset di WN) e disambiguare (almeno) un sostantivo per frase. Calcolare l’accuratezza del sistema implementato sulla base dei sensi annotati in SemCor (SemCor è disponibile all’URL http://web.eecs.umich.edu/~mihalcea/downloads.html)
- Randomizzare la selezione delle 50 frasi e la selezione del termine da disambiguare, e restituire l’accuratezza media su (per esempio) 10 esecuzioni del programma

Opzionale: implementare corpus_lesk_algorithm utilizando semCor

In [1]:
import numpy as np
import random
import nltk
from nltk import MWETokenizer, WordNetLemmatizer
from nltk.corpus import wordnet as wn
#nltk.('sedownloadmcor')
from nltk.corpus import semcor
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [2]:
#show semcore
semcor.sents()[:100]


[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term', 'end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [3]:
import re
stop_words = set(stopwords.words('english')) #remove stop words
mwes = [x for x in wn.all_lemma_names() if '_' in x]
mwes = [tuple(x.split('_')) for x in mwes]
tokenizer = MWETokenizer(mwes, separator=' ')
lemmatizer = WordNetLemmatizer()

def preprocessing(text):
    text = re.sub(r'[^\w\s]',' ',text) #remove punctuation
    text = text.lower()
    text = tokenizer.tokenize(text.split())
    text = [lemmatizer.lemmatize(token) for token in text]
    text = [w for w in text if not w in stop_words]
    return text

In [4]:
semcor.tagged_sents(tag = 'sem')[:100]

[[['The'], Tree(Lemma('group.n.01.group'), [Tree('NE', ['Fulton', 'County', 'Grand', 'Jury'])]), Tree(Lemma('state.v.01.say'), ['said']), Tree(Lemma('friday.n.01.Friday'), ['Friday']), ['an'], Tree(Lemma('probe.n.01.investigation'), ['investigation']), ['of'], Tree(Lemma('atlanta.n.01.Atlanta'), ['Atlanta']), ["'s"], Tree(Lemma('late.s.03.recent'), ['recent']), Tree(Lemma('primary.n.01.primary_election'), ['primary', 'election']), Tree(Lemma('produce.v.04.produce'), ['produced']), ['``'], ['no'], Tree(Lemma('evidence.n.01.evidence'), ['evidence']), ["''"], ['that'], ['any'], Tree(Lemma('abnormality.n.04.irregularity'), ['irregularities']), Tree(Lemma('happen.v.01.take_place'), ['took', 'place']), ['.']], [['The'], Tree(Lemma('jury.n.01.jury'), ['jury']), Tree(Lemma('far.r.02.far'), ['further']), Tree(Lemma('state.v.01.say'), ['said']), ['in'], Tree(Lemma('term.n.02.term'), ['term']), Tree(Lemma('end.n.02.end'), ['end']), Tree(Lemma('presentment.n.01.presentment'), ['presentments']), ['

In [5]:
def lemma_list(sent):
    return [l.label() if isinstance(l, nltk.tree.Tree) else None for l in sent]

In [6]:
def SimplifiedLesk(word, sentence):
    best_sense = wn.synsets(word)[0] if len(wn.synsets(word)) > 0 else None
    max_overlap = 0
    context = set(sentence)
    
    for sense in wn.synsets(word):
        signature = set(preprocessing(sense.definition())).union(set(preprocessing(' '.join(sense.examples()))))
        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
    
    return best_sense

def corpus_lesk(corpus):
    correct = 0
    total = 0
    
    for sent in corpus.tagged_sents(tag='sem'):
        sentence = lemma_list(sent)
        for i, (word, tag) in enumerate(sent):
            if tag is not None and word not in stop_words:
                total += 1
                if SimplifiedLesk(word, sentence) == sent[i].label():
                    correct += 1
    
    return correct/total

In [None]:
def isCorrect(predicted_synset, actual_synset):


In [43]:
#get 50 random sentences from semcor
sentences_with_tag = semcor.tagged_sents(tag = 'sem')[:100]
sentences = semcor.sents()[:100]
np.random.seed(0)
indices = np.random.permutation(len(sentences_with_tag))
indices = indices[:50]

my_sentences_with_tag = [sentences_with_tag[i] for i in indices]
my_sentences = [sentences[i] for i in indices]

#print(my_sentences_with_tag[0])
#print(my_sentences[0])

for i, sentence in enumerate(my_sentences):
    #lowercase
    #sentence = [w.lower() for w in sentence]
    #remove stop words
    sentence_without_stopwords = [w for w in sentence if not w in stop_words]
    #remove punctuation
    sentence_without_stopwords = [w for w in sentence_without_stopwords if w.isalpha()]
    #get random word
    random_word = random.choice(sentence_without_stopwords)
    index = sentence.index(random_word)

    #print(random_word)

    actual_synset = []
    for j, cell in enumerate(my_sentences_with_tag[i]):
        if random_word in cell[0]:
            label = cell.label() if isinstance(cell, nltk.tree.Tree) else None
            label = str(label).split("'")[1]
            actual_synset.append(wn.synset('.'.join(str(label).split(".")[:-1])))

    #run lesk
    predicted_synset = SimplifiedLesk(random_word, sentence)
    print(random_word)
    print(predicted_synset)
    print(actual_synset[0])
    print("correct: ", predicted_synset == actual_synset[0])
    print('-----------------')

Pearl
Synset('pearl.n.01')
Synset('person.n.01')
correct:  False
-----------------
Williams
Synset('williams.n.01')
Synset('person.n.01')
correct:  False
-----------------
term
Synset('term.n.01')
Synset('term.n.02')
correct:  False
-----------------
bond
Synset('bond.v.03')
Synset('bond.n.02')
correct:  False
-----------------
agriculture
Synset('agribusiness.n.01')
Synset('agribusiness.n.01')
correct:  True
-----------------
taunted
Synset('tease.v.02')
Synset('tease.v.02')
correct:  True
-----------------
continue
Synset('retain.v.02')
Synset('continue.v.01')
correct:  False
-----------------
teacher
Synset('teacher.n.02')
Synset('teacher.n.01')
correct:  False
-----------------
expected
Synset('expect.v.03')
Synset('expect.v.01')
correct:  False
-----------------
told
Synset('tell.v.03')
Synset('state.v.01')
correct:  False
-----------------
study
Synset('sketch.n.01')
Synset('survey.n.01')
correct:  False
-----------------


IndexError: list index out of range