# Esercitazione 2

Frame annotati:
- Frame OLIVERio:
    - [x] Concessive
    - [x] History
    - [x] Change_resistance
    - [x] Emptying
    - [x] Performers_and_roles
    

- Frame TOMATIS
    - [x] Deciding
    - [x] Intentionally_act
    - [x] Competition
    - [x] Fairness_evaluation
    - [x] Process_continue

In [1]:
from nltk.corpus import framenet as fn
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer #tiene conto delle multiword expressions
from nltk.corpus import wordnet as wn
from nltk.corpus import framenet as fn
import re
import numpy as np
import pandas as pd

In [2]:
#creazione tokenizzatore per le multiword expressions
mwes = [x for x in wn.all_lemma_names() if '_' in x]
mwes = [tuple(x.split('_')) for x in mwes]
tokenizer = MWETokenizer(mwes, separator=' ')

def make_set(sentence):
    sentence = sentence.lower() #lowercase
    sentence = re.sub(r'[^\w\s]','',sentence) #remove punctuation
    sentence = tokenizer.tokenize(sentence.split()) #tokenize
    sentence = [w for w in sentence if not w.isdigit()] #remove numbers
    stop_words = set(stopwords.words('english')) #remove stop words
    sentence = [w for w in sentence if not w in stop_words]
    lemmatizer = WordNetLemmatizer() #lemmatization of definition
    sentence = [lemmatizer.lemmatize(w) for w in sentence]

    res = []
    for w in sentence:
        res.append(w.replace(' ', '_'))

    return set(res)

## Metodi per la creazione dei contesti (framenet e wordnet)

### Creazione contesto elementi framenet

In [3]:
def ctx_frame_name_fn(frame):
    name = frame.name
    definition = frame.definition
    FEs = frame.FE
    LUs = frame.lexUnit

    sentence = name + ' ' + definition
    for fe in FEs:
        sentence += ' ' + FEs[fe].definition
    for lu in LUs:
        sentence += ' ' + LUs[lu].definition

    return make_set(sentence)

def ctx_frame_element_fn(frame_element):
    name = frame_element.name
    definition = frame_element.definition
    #semtype = frame_element.semType.name

    sentence = name + ' ' + definition # + ' ' + semtype
   
    return make_set(sentence)

def ctx_lexical_unit_fn(lexical_unit):
    name = lexical_unit.name
    definition = lexical_unit.definition
    exemplars = lexical_unit.exemplars
    
    sentence = name + ' ' + definition
    for ex in exemplars:
        sentence += ' ' + ex.annotationSet[0].text

    return make_set(sentence)


### Crezione contesto del synset wordnet

In [4]:
def get_hyponyms(synset):
    hyponyms = set()
    for hyponym in synset.hyponyms():
        hyponyms |= set(get_hyponyms(hyponym))
    return hyponyms | set(synset.hyponyms())

def create_sentences(synset, depth=1):
    sentence = synset.definition()
    for example in synset.examples():
        sentence += ' ' + example
    for lemma in synset.lemmas():
        sentence += ' ' + lemma.name()

    if (depth >= 0):
        for hypernym in synset.hypernyms():
            sentence += ' ' + create_sentences(hypernym, depth-1)
        for hyponym in get_hyponyms(synset):
            sentence += ' ' + create_sentences(hyponym, depth-1)  

    return sentence


def ctx_synset_WN(synset):
    sentence = create_sentences(synset)
    return make_set(sentence)

## Approccio bag of words

Scelta del senso che permette di massimizzare l'intersezione tra i contesti. Non vengono associati i synset a tutti i token perchè alcuni non hanno alcun synset su wordnet (es. if, although, ...)

In [5]:
def bag_of_words(key, ctx_fn):
    best_syn = None

    token = key.split('.')[0]
    syns = wn.synsets(token)
    max_overlap = 0

    # dati i synset andiamo a prendere quello più accurato con il frame
    for syn in syns:
        overlap = len(ctx_synset_WN(syn).intersection(ctx_fn)) + 1
        if overlap > max_overlap:
            max_overlap = overlap
            best_syn = syn

    return best_syn

## Approccio grafico

In [6]:
def get_paths_between_synsets(synset1, synset2, L=3):
    paths = []
    visited = set()

    def dfs(synset, path):
        if synset in visited or len(path) > L:
            return
        if synset == synset2:
            paths.append(path + [synset])
            return
        visited.add(synset)
        for hypernym in synset.hypernyms():
            dfs(hypernym, path + [synset])
        for hyponym in synset.hyponyms():
            dfs(hyponym, path + [synset])

    dfs(synset1, [])
    return [path for path in paths if len(path) <= L]

def score(syn_fn, word_fn, ctx_fn):
    res = 0
    for word in ctx_fn:
        for syn in wn.synsets(word):
            #get all connection path between syn and syn_fn
            paths = list(get_paths_between_synsets(syn, syn_fn, 3))

            for path in paths:
                res += np.exp(-len(path)-1)

    return res

def prob(syn_fn, word_fn, ctx_fn):
    sum = 0

    for syn in wn.synsets(word_fn):
        sum += score(syn, word_fn, ctx_fn)

    return score(syn_fn, word_fn, ctx_fn) / sum

def argmax_prob(word_fn, ctx_fn):
    max_prob = 0
    max_syn = None
    for syn in wn.synsets(word_fn):
        prob_syn = prob(syn, word_fn, ctx_fn)
        if prob_syn > max_prob:
            max_prob = prob_syn
            max_syn = syn
            
    return max_syn

## Valutazione

La funzionalità di valutazione confronta i synset restituiti in output dal sistema con quelli annotati a mano; su questa base deve essere calcolata l'accuratezza del sistema, semplicemente come rapporto degli elementi corretti sul totale degli elementi.

Opzionale
- Confronto fra l'output dei due approcci descritti (bag-of-words e con grafo).
- Sviluppo di metriche che considerino anche la distanza semantica fra eventuali synset errati e corretti

In [7]:
def evaluation(corpus, method='bag_of_words'):
    current_frame_id = None
    totale = 0
    corretti = 0
    
    for index, row in corpus.iterrows():
        try:
            key = row['word'].replace(' ', '_')

            if row['fn_tag'] == 'FN': #frame name
                frame = fn.frame_by_name(row['word'])
                current_frame_id = frame.ID
                context = ctx_frame_name_fn(frame)
            elif row['fn_tag'] == 'FE': #frame element
                frame = fn.frame_by_id(current_frame_id)
                context = ctx_frame_element_fn(frame.FE[row['word']])
            elif row['fn_tag'] == 'LU': #lexical unit
                frame = fn.frame_by_id(current_frame_id)
                context = ctx_lexical_unit_fn(frame.lexUnit[row['word']])

            if row['syn'] != 'None':
                predicted_syn = None
                
                if method == 'bag_of_words':
                    predicted_syn = bag_of_words(key, context)
                elif method == 'approccio_grafico':
                    predicted_syn = argmax_prob(key, context)

                if predicted_syn is not None:
                    actual_syn = wn.synset(row['syn'])

                    #check if the predicted synset is correct
                    #print('Predicted:', predicted_syn.name(), 'actual:',  actual_syn.name())
                    if predicted_syn.name() == actual_syn.name():
                        corretti += 1
                
                totale += 1
        except:
            pass   

    return corretti/totale

In [8]:
corpus = pd.read_csv('frame_annotati.csv', error_bad_lines=False)

print('Accuracy bag of words:', evaluation(corpus, method='bag_of_words'))
print('Accuracy approccio grafico:', evaluation(corpus, method='approccio_grafico'))



  corpus = pd.read_csv('frame_annotati.csv', error_bad_lines=False)


Evaluation...
Accuracy bag of words: 0.40476190476190477
Evaluation...
Accuracy approccio grafico: 0.30120481927710846
