# Step 1: annotazione

Frame OLIVERio:
- [x] Concessive
- [x] History
- [x] Change_resistance
- [x] Emptying
- [x] Performers_and_roles

Frame TOMATIS
- [x] Deciding
- [x] Intentionally_act
- [x] Competition
- [x] Fairness_evaluation
- [x] Process_continue

In [2]:
from nltk.corpus import framenet as fn
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer #tiene conto delle multiword expressions
from nltk.corpus import wordnet as wn
from nltk.corpus import framenet as fn
import re
import numpy as np
import pandas as pd

frame_names = [
    'Concessive',
    'History',
    'Change_resistance',
    'Emptying',
    'Performers_and_roles',
    'Deciding',
    'Intentionally_act',
    'Competition',
    'Fairness_evaluation',
    'Process_continue',
]

In [3]:
#creazione tokenizzatore per le multiword expressions
mwes = [x for x in wn.all_lemma_names() if '_' in x]
mwes = [tuple(x.split('_')) for x in mwes]
tokenizer = MWETokenizer(mwes, separator=' ')

def make_set(sentence):
    sentence = sentence.lower() #lowercase
    sentence = re.sub(r'[^\w\s]','',sentence) #remove punctuation
    sentence = tokenizer.tokenize(sentence.split()) #tokenize
    sentence = [w for w in sentence if not w.isdigit()] #remove numbers
    stop_words = set(stopwords.words('english')) #remove stop words
    sentence = [w for w in sentence if not w in stop_words]
    lemmatizer = WordNetLemmatizer() #lemmatization of definition
    sentence = [lemmatizer.lemmatize(w) for w in sentence]

    res = []
    for w in sentence:
        res.append(w.replace(' ', '_'))

    return set(res)

# Step 2: mapping automatico

## Approccio bag of words

Scelta del senso che permette di massimizzare l'intersezione tra i contesti

In [4]:
def ctx_frame_name_fn(frame):
    name = frame.name
    definition = frame.definition
    FEs = frame.FE
    LUs = frame.lexUnit

    sentence = name + ' ' + definition
    for fe in FEs:
        sentence += ' ' + FEs[fe].definition
    for lu in LUs:
        sentence += ' ' + LUs[lu].definition

    return make_set(sentence)

def ctx_frame_element_fn(frame_element):
    name = frame_element.name
    definition = frame_element.definition
    #semtype = frame_element.semType.name

    sentence = name + ' ' + definition # + ' ' + semtype
   
    return make_set(sentence)

def ctx_lexical_unit_fn(lexical_unit):
    name = lexical_unit.name
    definition = lexical_unit.definition
    exemplars = lexical_unit.exemplars
    
    sentence = name + ' ' + definition
    for ex in exemplars:
        sentence += ' ' + ex.annotationSet[0].text

    return make_set(sentence)


### Crezione del contesto (WordNet)

In [5]:
def get_hyponyms(synset):
    hyponyms = set()
    for hyponym in synset.hyponyms():
        hyponyms |= set(get_hyponyms(hyponym))
    return hyponyms | set(synset.hyponyms())

### CREATE_SENTENCES

In [6]:
def create_sentences(synset, depth=1):
    sentence = synset.definition()
    for example in synset.examples():
        sentence += ' ' + example
    for lemma in synset.lemmas():
        sentence += ' ' + lemma.name()

    if (depth >= 0):
        for hypernym in synset.hypernyms():
            sentence += ' ' + create_sentences(hypernym, depth-1)
        for hyponym in get_hyponyms(synset):
            sentence += ' ' + create_sentences(hyponym, depth-1)  

    return sentence


def ctx_synset_WN(synset):
    sentence = create_sentences(synset)
    return make_set(sentence)

### BAG_OF_WORDS: 

In [7]:
def bag_of_words(ctx_fn):
    mappings = {}
    for key in ctx_fn:
        token = key.split('.')[0]
        syns = wn.synsets(token)
        max_overlap = 0

        # dati i synset andiamo a prendere quello più accurato con il frame
        for syn in syns:
            overlap = len(ctx_synset_WN(syn).intersection(ctx_fn[key])) + 1
            if overlap > max_overlap:
                max_overlap = overlap
                mappings[key] = syn

    return mappings

#### Creazione contesto frame net

In [8]:
def get_context_fn(frame):
    ctx_fn = {}

    #frame
    ctx_fn[frame.name.replace(' ', '_')] = ctx_frame_name_fn(frame)

    #FEs
    for fe in frame.FE:
        ctx_fn[fe.replace(' ', '_')] = ctx_frame_element_fn(frame.FE[fe])
        break

    #LUs
    for lu in frame.lexUnit:
        ctx_fn[lu.replace(' ', '_')] = ctx_lexical_unit_fn(frame.lexUnit[lu])

    return ctx_fn

#### bag of words

Non vengono associati i synset a tutti i token perchè alcuni non hanno alcun synset su wordnet (es. if, although, ...)

In [9]:
for frame_name in frame_names:
    frame = fn.frame_by_name(frame_name)
    ctx_fn = get_context_fn(frame)
    print(frame_name, bag_of_words(ctx_fn))
    print('\n')

Concessive {'Concessive': Synset('concessive.a.01'), 'despite.prep': Synset('contempt.n.01'), 'though.scon': Synset('though.r.01'), 'while.scon': Synset('while.n.01'), 'much_as.scon': Synset('much_as.r.01')}


History {'History': Synset('history.n.05'), 'Topic': Synset('topic.n.02'), 'history.n': Synset('history.n.05')}


Change_resistance {'Agent': Synset('agent.n.01'), 'buttress.v': Synset('buttress.v.01'), 'bulwark.v': Synset('bulwark.v.01'), 'brace.v': Synset('brace.n.01')}


Emptying {'Emptying': Synset('empty.v.01'), 'Agent': Synset('agent.n.01'), 'clear.v': Synset('unclutter.v.01'), 'drain.v': Synset('drain.n.02'), 'empty.v': Synset('empty.v.01'), 'purge.v': Synset('purify.v.02'), 'strip.v': Synset('strip.n.02'), 'divest.v': Synset('strip.v.13'), 'rid.v': Synset('rid.v.01'), 'disembowel.v': Synset('disembowel.v.01'), 'skin.v': Synset('skin.n.01'), 'core.v': Synset('kernel.n.03'), 'peel.v': Synset('peel.n.02'), 'gut.v': Synset('gut.v.02'), 'bone.v': Synset('bone.n.02'), 'debug.v'

Traceback (most recent call last):
  File "C:\Users\OliverioM\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\OliverioM\AppData\Local\Temp\ipykernel_11472\3085725460.py", line 4, in <module>
    print(frame_name, bag_of_words(ctx_fn))
                      ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\OliverioM\AppData\Local\Temp\ipykernel_11472\2514460105.py", line 10, in bag_of_words
    overlap = len(ctx_synset_WN(syn).intersection(ctx_fn[key])) + 1
                  ^^^^^^^^^^^^^^^^^^
  File "C:\Users\OliverioM\AppData\Local\Temp\ipykernel_11472\1374051683.py", line 19, in ctx_synset_WN
    return make_set(sentence)
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\OliverioM\AppData\Local\Temp\ipykernel_11472\2863402790.py", line 14, in make_set
    sentence = [lemmatizer.lemmatize(w) for w in sentence]
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  Fil

## Approccio grafico

In [10]:
def get_context_fn_2(frame):
    ctx_fn = {}

    #FEs
    for fe in frame.FE: 
        ctx_fn[fe.replace(' ', '_')] = ctx_frame_element_fn(frame.FE[fe])
    #LUs
    for lu in frame.lexUnit:
        ctx_fn[lu.replace(' ', '_')] = ctx_lexical_unit_fn(frame.lexUnit[lu])

    return ctx_fn

In [11]:
def get_paths_between_synsets(synset1, synset2, L=3):
    paths = []
    visited = set()

    def dfs(synset, path):
        if synset in visited or len(path) > L:
            return
        if synset == synset2:
            paths.append(path + [synset])
            return
        visited.add(synset)
        for hypernym in synset.hypernyms():
            dfs(hypernym, path + [synset])
        for hyponym in synset.hyponyms():
            dfs(hyponym, path + [synset])

    dfs(synset1, [])
    return [path for path in paths if len(path) <= L]

In [12]:
def score(syn_fn, word_fn, ctx_fn):
    ctx_word = ctx_fn[word_fn]

    res = 0
    for word in ctx_word:
        for syn in wn.synsets(word):
            #get all connection path between syn and syn_fn
            paths = list(get_paths_between_synsets(syn, syn_fn, 3))

            for path in paths:
                res += np.exp(-len(path)-1)

    return res

def prob(syn_fn, word_fn, ctx_fn):
    sum = 0
    for key in ctx_fn:
        for syn in wn.synsets(key):
            sum += score(syn, key, ctx_fn)

    return score(syn_fn, word_fn, ctx_fn) / sum

def argmax_prob(word_fn, ctx_fn):
    max_prob = 0
    max_syn = None
    for syn in wn.synsets(word_fn):
        prob_syn = prob(syn, word_fn, ctx_fn)
        if prob_syn > max_prob:
            max_prob = prob_syn
            max_syn = syn
            
    return max_syn

argmax_prob('Deciding', get_context_fn(fn.frame_by_name('Deciding')))

KeyboardInterrupt: 

# Step 3: valutazione

La funzionalità di valutazione confronta i synset restituiti in output dal sistema con quelli annotati a mano; su questa base deve essere calcolata l'accuratezza del sistema, semplicemente come rapporto degli elementi corretti sul totale degli elementi.

Opzionale
- Confronto fra l'output dei due approcci descritti (bag-of-words e con grafo).
- Sviluppo di metriche che considerino anche la distanza semantica fra eventuali synset errati e corretti

In [101]:
corpus = pd.read_csv('frame_annotati.csv', error_bad_lines=False)



  corpus = pd.read_csv('frame_annotati.csv', error_bad_lines=False)
Skipping line 21: expected 3 fields, saw 4



In [27]:
def bag_of_words2(key, ctx_fn):
    best_syn = None

    token = key.split('.')[0]
    syns = wn.synsets(token)
    max_overlap = 0

    # dati i synset andiamo a prendere quello più accurato con il frame
    for syn in syns:
        overlap = len(ctx_synset_WN(syn).intersection(ctx_fn)) + 1
        if overlap > max_overlap:
            max_overlap = overlap
            best_syn = syn

    return best_syn

In [102]:
current_frame_id = None

totale = 0
corretti = 0
for index, row in corpus.iterrows():
    try:
        key = row['word'].replace(' ', '_')

        if row['fn_tag'] == 'FN': #frame name
            frame = fn.frame_by_name(row['word'])
            current_frame_id = frame.ID
            context = ctx_frame_name_fn(frame)
        elif row['fn_tag'] == 'FE': #frame element
            frame = fn.frame_by_id(current_frame_id)
            context = ctx_frame_element_fn(frame.FE[row['word']])
        elif row['fn_tag'] == 'LU': #lexical unit
            frame = fn.frame_by_id(current_frame_id)
            context = ctx_lexical_unit_fn(frame.lexUnit[row['word']])

        if row['syn'] != 'None':
            predicted_syn = bag_of_words2(key, context)

            if predicted_syn is not None:
                actual_syn = wn.synset(row['syn'])

                
                #check if the predicted synset is correct
                print('Predicted:', predicted_syn.name(), 'actual:',  actual_syn.name())
                if predicted_syn.name() == actual_syn.name():
                    corretti += 1
            
            totale += 1
    except:
        pass
       

print('Accuracy:', corretti/totale)

decide.v.01
Predicted: decision_making.n.01 actual: decide.v.01
None
decision.n.01
Predicted: decision.n.01 actual: decision.n.01
topographic_point.n.01
Predicted: topographic_point.n.01 actual: topographic_point.n.01
possibility.n.02
Predicted: hypothesis.n.02 actual: possibility.n.02
time.n.01
Predicted: time.n.05 actual: time.n.01
circumstance.n.01
Predicted: circumstance.n.01 actual: circumstance.n.01
explanation.n.03
Predicted: explanation.n.02 actual: explanation.n.03
purpose.n.01
manner.n.01
Predicted: manner.n.01 actual: manner.n.01
decide.v.01
Predicted: decide.v.02 actual: decide.v.01
determine.v.03
Predicted: determine.v.02 actual: determine.v.03
rule_out.v.02
Predicted: rule_out.v.02 actual: rule_out.v.02
 None
concessive.a.01
Predicted: concessive.a.01 actual: concessive.a.01
situation.n.01
subject.n.01
Predicted: topic.n.02 actual: subject.n.01
 None
emptying.n.01
Predicted: empty.v.01 actual: emptying.n.01
agent.n.01
Predicted: agent.n.01 actual: agent.n.01
subject.n.01
