# Esercizio 1b

Implementare l’algoritmo di Lesk (senza usare un’implementazione esistente, e.g., quella di nltk…).
- Estrarre 50 frasi dal corpus SemCor (corpus annotato con i synset di WN) e disambiguare (almeno) un sostantivo per frase. Calcolare l’accuratezza del sistema implementato sulla base dei sensi annotati in SemCor (SemCor è disponibile all’URL http://web.eecs.umich.edu/~mihalcea/downloads.html)
- Randomizzare la selezione delle 50 frasi e la selezione del termine da disambiguare, e restituire l’accuratezza media su (per esempio) 10 esecuzioni del programma

Opzionale: implementare corpus_lesk_algorithm utilizzando SemCor

In [10]:
import numpy as np
import random
import nltk
import re
from nltk import MWETokenizer, WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import semcor
from nltk.corpus import stopwords
# One-time setup: run nltk.download('semcor') if the SemCor corpus is not installed.

# English stop words, kept as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Total number of sense-tagged sentences in SemCor.
SEMCORE_LEN = len(semcor.tagged_sents(tag = 'sem'))

### Pre-processing

In [2]:
# English stop words; tokens found in this set are discarded during preprocessing.
stop_words = set(stopwords.words('english'))

# WordNet multi-word expressions (lemma names containing '_'), each stored as a
# tuple of its component words, which is the format MWETokenizer is given here.
mwes = [tuple(name.split('_')) for name in wn.all_lemma_names() if '_' in name]

# Tokenizer that re-joins known multi-word expressions with a single space.
tokenizer = MWETokenizer(mwes, separator=' ')
lemmatizer = WordNetLemmatizer()

def preprocessing(text):
    """Convert raw text into a list of lower-cased, lemmatized content tokens.

    Steps, in order: replace every character that is neither a word character
    nor whitespace with a space, lower-case, split on whitespace, merge known
    multi-word expressions with the module-level ``tokenizer``, lemmatize each
    token with the module-level ``lemmatizer``, and drop tokens found in
    ``stop_words``.

    Args:
        text: The string to process.

    Returns:
        The remaining tokens, in their original order (duplicates are kept).
    """
    without_punctuation = re.sub(r'[^\w\s]',' ',text)
    words = without_punctuation.lower().split()
    merged_tokens = tokenizer.tokenize(words)

    kept = []
    for token in merged_tokens:
        lemma = lemmatizer.lemmatize(token)
        # Stop words are filtered after lemmatization, on the lemma itself.
        if lemma not in stop_words:
            kept.append(lemma)
    return kept

### LEMMA_LIST: restituisce la lista dei synset annotati nella frase del corpus etichettato

In [3]:
def lemma_list(sent):
    """Return the synsets annotated on the chunks of a sense-tagged sentence.

    Only chunks that are ``nltk.tree.Tree`` instances are considered, and only
    those whose label is truthy and exposes a ``synset`` attribute contribute
    a synset; every other chunk is ignored.

    Args:
        sent: An iterable of chunks, e.g. one item of
            ``semcor.tagged_sents(tag='sem')``.

    Returns:
        A list of synsets in sentence order. If reading the annotations
        fails, ``'error'`` is printed and an empty list is returned.
    """
    synsets = []
    try:
        for chunk in sent:
            if not isinstance(chunk, nltk.tree.Tree):
                continue
            label = chunk.label()
            if label and hasattr(label, 'synset'):
                synsets.append(label.synset())
    except Exception:
        # Best effort, as before, but KeyboardInterrupt/SystemExit now
        # propagate so a long corpus scan can still be interrupted.
        print('error')
        # Never hand back a half-built result.
        return []

    return synsets

### GET_CONTEXT_FROM_SEMCOR: dato un senso, vengono estratte le prime frasi di SemCor che contengono quel senso (al massimo due), al fine di utilizzarle come contesto nel corpus lesk algorithm

In [14]:
def get_context_from_semcor(target_sense, max_sentences=2):
    """Collect a bag of words from SemCor sentences annotated with a sense.

    SemCor is scanned from the beginning and the first ``max_sentences``
    sentences whose annotations contain ``target_sense`` are kept. Their text
    is concatenated and passed through ``preprocessing``.

    Args:
        target_sense: The synset to look for in the sentence annotations.
        max_sentences: Maximum number of matching sentences to collect.
            Defaults to 2.

    Returns:
        The set of preprocessed tokens from the collected sentences (empty
        if no sentence matches).
    """
    print('Target sense:', target_sense)
    sentences = []

    # Build each corpus view once and walk both in lockstep. Each call to
    # semcor.tagged_sents()/semcor.sents() returns a fresh lazy view, so
    # re-creating and indexing them by position on every iteration repeats
    # work that a single sequential pass avoids.
    tagged_sents = semcor.tagged_sents(tag = 'sem')
    plain_sents = semcor.sents()

    for i, (sent_annotated, words) in enumerate(zip(tagged_sents, plain_sents)):
        if len(sentences) >= max_sentences:
            break

        if target_sense in lemma_list(sent_annotated):
            sentences.append(' '.join(words))
            print('Sentence', i, 'added')

    print('Number of sentences for', target_sense, ':', len(sentences))
    return set(preprocessing(' '.join(sentences)))

## SIMPLIFIED LESK

Consideriamo anche il pos tag per andare a escludere i synset non adeguati 

In [5]:
def simplified_lesk(word, pos, context):
    """Pick the WordNet sense of ``word`` that best overlaps with ``context``.

    Each candidate sense is scored by the number of context tokens that also
    occur in its signature, i.e. the preprocessed tokens of its definition
    and examples. A sense replaces the current best only when its overlap is
    strictly larger, so ties keep the earlier sense.

    Args:
        word: The word to disambiguate; reduced with ``wn.morphy`` when that
            returns a form.
        pos: WordNet part-of-speech tag used to restrict the candidates.
        context: A set of tokens surrounding the word.

    Returns:
        The best-scoring synset. When no candidate overlaps with the context,
        the first synset with the requested ``pos`` is returned; if none has
        that ``pos``, the first synset of any part of speech; ``None`` if
        WordNet has no synset for the word at all.
    """
    base_form = wn.morphy(word)
    if base_form is not None:
        word = base_form

    candidates = wn.synsets(word, pos=pos)

    # The fallback must respect the requested part of speech too: taking the
    # first synset of *any* POS could return a sense that is excluded from
    # the candidates. Senses of other POS are used only as a last resort.
    fallback_pool = candidates or wn.synsets(word)
    best_sense = fallback_pool[0] if fallback_pool else None
    max_overlap = 0

    for sense in candidates:
        gloss_tokens = set(preprocessing(sense.definition()))
        example_tokens = set(preprocessing(' '.join(sense.examples())))
        signature = gloss_tokens | example_tokens

        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense

## CORPUS LESK

In [6]:
def corpus_lesk(word, pos, context):
    """Pick the sense of ``word`` that best overlaps with ``context``.

    Works like simplified Lesk, but each sense signature is extended with the
    tokens returned by ``get_context_from_semcor`` for that sense, in
    addition to the preprocessed tokens of its definition and examples. A
    sense replaces the current best only when its overlap is strictly larger.

    Args:
        word: The word to disambiguate.
        pos: WordNet part-of-speech tag used to restrict the candidates.
        context: A set of tokens surrounding the word.

    Returns:
        The best-scoring synset. When no candidate overlaps with the context,
        the first synset with the requested ``pos`` is returned; if none has
        that ``pos``, the first synset of any part of speech; ``None`` if
        WordNet has no synset for the word at all.
    """
    candidates = wn.synsets(word, pos=pos)

    # The fallback must respect the requested part of speech too: taking the
    # first synset of *any* POS could return a sense that is excluded from
    # the candidates. Senses of other POS are used only as a last resort.
    fallback_pool = candidates or wn.synsets(word)
    best_sense = fallback_pool[0] if fallback_pool else None
    max_overlap = 0

    for sense in candidates:
        gloss_tokens = set(preprocessing(sense.definition()))
        example_tokens = set(preprocessing(' '.join(sense.examples())))
        # Corpus extension: words from SemCor sentences tagged with this sense.
        signature = gloss_tokens | example_tokens | get_context_from_semcor(sense)

        overlap = len(context.intersection(signature))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense

## TEST

In [15]:
def test(sample_size=50, pool_size=100):
    """Estimate the accuracy of ``corpus_lesk`` on random SemCor sentences.

    ``sample_size`` sentences are drawn at random from the first ``pool_size``
    SemCor sentences. In each one a random alphabetic, non-stop-word token is
    chosen and disambiguated with ``corpus_lesk``, using the part of speech of
    the gold synset and the preprocessed sentence as context. A sentence is
    scored only if the chosen token has a gold synset annotation and a sense
    was predicted; otherwise it is left out of the denominator.

    Args:
        sample_size: Number of sentences to sample. Defaults to 50.
        pool_size: Number of leading SemCor sentences to sample from.
            Defaults to 100.

    Returns:
        The fraction of scored sentences whose predicted synset equals the
        annotated one, or 0.0 when no sentence could be scored. The value is
        also printed.
    """
    sentences_with_tag = semcor.tagged_sents(tag = 'sem')[:pool_size]
    sentences = semcor.sents()[:pool_size]
    np.random.seed()
    indices = np.random.permutation(len(sentences_with_tag))[:sample_size]

    num_scored = 0
    num_correct = 0

    for idx in indices:
        idx = int(idx)
        sentence = sentences[idx]
        tagged_sentence = sentences_with_tag[idx]

        # Candidate targets: alphabetic tokens that are not stop words. The
        # stop-word test is case-insensitive so that e.g. a sentence-initial
        # "The" is not picked as a target.
        candidates = [w for w in sentence
                      if w.isalpha() and w.lower() not in stop_words]
        if not candidates:
            # random.choice raises IndexError on an empty list.
            continue
        random_word = random.choice(candidates)

        # Sense signatures are built with preprocessing(), so the context has
        # to be normalized the same way for the overlap to be meaningful.
        context = set(preprocessing(' '.join(sentence)))

        for cell in tagged_sentence:
            if not isinstance(cell, nltk.tree.Tree):
                continue
            # Compare against whole tokens of the chunk. Testing
            # `random_word in cell[0]` is a substring check whenever the
            # first child is a string (e.g. "man" would match "manager").
            if random_word not in cell.leaves():
                continue

            label = cell.label()
            if not (label and hasattr(label, 'synset')):
                continue

            actual_synset = label.synset()
            predicted_synset = corpus_lesk(random_word, actual_synset.pos(), context)
            if predicted_synset is not None:
                num_scored += 1
                num_correct += predicted_synset == actual_synset
            # Only the first annotated occurrence of the word is evaluated.
            break

    # Avoid ZeroDivisionError when no sampled sentence could be scored.
    accuracy = num_correct / num_scored if num_scored else 0.0
    print('accuracy: ', accuracy)

    return accuracy

# Run one randomized evaluation; test() prints the accuracy and returns it.
test()

Target sense: Synset('travel.v.01')
Sentence 199 added
Sentence 238 added
Number of sentences for Synset('travel.v.01') : 2
Target sense: Synset('go.v.02')
Sentence 92 added
Sentence 496 added
Number of sentences for Synset('go.v.02') : 2
Target sense: Synset('go.v.03')


KeyboardInterrupt: 

#### Calcolo accuratezza media

In [None]:
# Repeat the randomized evaluation 10 times and report the average accuracy.
accuracies = [test() for _ in range(10)]

print('\n')
print('mean accuracy: ', np.mean(accuracies))

AttributeError: 'NoneType' object has no attribute 'synset'