# Esercizio 1b

Implementare l’algoritmo di Lesk (!= usare implementazione esistente, e.g., in nltk…).
- Estrarre 50 frasi dal corpus SemCor (corpus annotato con i synset di WN) e disambiguare (almeno) un sostantivo per frase. Calcolare l’accuratezza del sistema implementato sulla base dei sensi annotati in SemCor (SemCor è disponibile all’URL http://web.eecs.umich.edu/~mihalcea/downloads.html)
- Randomizzare la selezione delle 50 frasi e la selezione del termine da disambiguare, e restituire l’accuratezza media su (per esempio) 10 esecuzioni del programma

Opzionale: implementare corpus_lesk_algorithm utilizando semCor

In [1]:
import numpy as np
import random
import nltk
import re
from nltk import MWETokenizer, WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import semcor
from nltk.corpus import stopwords
#nltk.('sedownloadmcor')

stop_words = set(stopwords.words('english'))

### Pre-processing

In [2]:
stop_words = set(stopwords.words('english')) #remove stop words
mwes = [x for x in wn.all_lemma_names() if '_' in x]
mwes = [tuple(x.split('_')) for x in mwes]
tokenizer = MWETokenizer(mwes, separator=' ')
lemmatizer = WordNetLemmatizer()

def preprocessing(text):
    text = re.sub(r'[^\w\s]',' ',text) #remove punctuation
    text = text.lower()
    text = tokenizer.tokenize(text.split())
    text = [lemmatizer.lemmatize(token) for token in text]
    text = [w for w in text if not w in stop_words]
    return text

### LEMMA_LIST: resituisce il lemma della frase del corpus etichettato

In [44]:
def lemma_list(sent):
    ret = []
    try:
        ret = [l.label() if isinstance(l, nltk.tree.Tree) else None for l in sent]
        ret = [x.synset() for x in ret]
    except ValueError:
        print('error')

    return ret

### GET_CONTEXT_FROM_SEMCOR: dato un senso, vengono estratte tutte le frasi di semcor contenenti quel senso al fine di utilizzarlo come contesto nel corpus lesk algorithm

In [45]:
def get_context_from_semcor(target_sense):
    sentences = []
    
    #get the sentences in which the target sense is present
    for sent in semcor.tagged_sents(tag = 'sem'):
        if target_sense in lemma_list(sent):
            sentences.append(sent)

    return set(preprocessing(' '.join(sentences)))

## SIMPLIFIED LESK

In [46]:
def simplified_lesk(word, sentence):
    word = wn.morphy(word) if wn.morphy(word) is not None else word
    best_sense = wn.synsets(word)[0] if len(wn.synsets(word)) > 0 else None #wordnet ordina i sysnet in base alla frequenza
    max_overlap = 0
    context = set(sentence)
    
    for sense in wn.synsets(word):
        signature = set(preprocessing(sense.definition())).union(set(preprocessing(' '.join(sense.examples()))))
        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
    
    return best_sense

## CORPUS LESK

In [47]:
def corpus_lesk(word, sentence):
    best_sense = wn.synsets(word)[0] if len(wn.synsets(word)) > 0 else None #wordnet ordina i sysnet in base alla frequenza
    max_overlap = 0
    context = set(sentence)
    
    for sense in wn.synsets(word):
        signature = set(preprocessing(sense.definition())).union(set(preprocessing(' '.join(sense.examples()))))
        signature = signature.union(get_context_from_semcor(sense))
        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense

## TEST

In [58]:
def test():
    # prendo 50 frasi random dal corpus semcor
    sentences_with_tag = semcor.tagged_sents(tag = 'sem')[:100]
    sentences = semcor.sents()[:100]
    np.random.seed(0)
    indices = np.random.permutation(len(sentences_with_tag))
    indices = indices[:50]

    my_sentences_with_tag = [sentences_with_tag[i] for i in indices]
    my_sentences = [sentences[i] for i in indices]
    num_sentences = len(my_sentences)

    num_correct = 0

    for i, sentence in enumerate(my_sentences):
        sentence_without_stopwords = [w for w in sentence if not w in stop_words]
        sentence_without_stopwords = [w for w in sentence_without_stopwords if w.isalpha()] #tolgo punteggiatura
        
        # seleziono una parola random dalla frase
        random_word = random.choice(sentence_without_stopwords)
        actual_synset = None

        for j, cell in enumerate(my_sentences_with_tag[i]):
            if random_word in cell[0]:
                label = cell.label() if isinstance(cell, nltk.tree.Tree) else None
                if label and hasattr(label, 'synset'):
                    actual_synset = label.synset()

        predicted_synset = simplified_lesk(random_word, sentence)
        if actual_synset is not None and predicted_synset is not None:
            num_correct += predicted_synset == actual_synset
        else:
            num_sentences -= 1

    accuracy = num_correct/num_sentences
    print('accuracy: ', accuracy)
    
    return accuracy

#### Calcolo accuratezza media

In [59]:
accuracies = []
for i in range(10):
    accuracies.append(test())

print('\n')
print('mean accuracy: ', np.mean(accuracies))

accuracy:  0.34146341463414637
accuracy:  0.46511627906976744
accuracy:  0.3953488372093023
accuracy:  0.4166666666666667
accuracy:  0.425
accuracy:  0.34210526315789475
accuracy:  0.27906976744186046
accuracy:  0.36363636363636365
accuracy:  0.5681818181818182
accuracy:  0.40540540540540543


mean accuracy:  0.40019938154032253


In [42]:
semcor.tagged_sents(tag="sem")[0][1].label().synset()

Synset('group.n.01')