This notebook generates the SynsetExplorer component and its results that will be useful for the generation of the dataset

# Setup dependencies and variables

In [1]:
! pip install nltk



In [2]:
import os, sys

import numpy as np
import torch

import requests
import zipfile

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
from copy import deepcopy

# Download the NLTK data packages used by the `wordnet` corpus reader imported above.
# NOTE(review): 'omw-1.4' is presumably required by the installed NLTK version - confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to /home/marco/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marco/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Sanity check: the first hyponym of entity.n.01 must be found again in a second, independent hyponyms() call.
wordnet.synset('entity.n.01').hyponyms()[0] in wordnet.synset('entity.n.01').hyponyms()

True

In [4]:
# Archive of the VerbAtlas release that the download cell fetches.
va_download_url = 'https://verbatlas.org/downloads/VerbAtlas-1.1.0.zip'

# Local working folder; created here if it does not exist yet.
va_res_path = './va_resources/'
os.makedirs(va_res_path, exist_ok=True)
# Folder whose presence means "already downloaded", and the nested folder handed to SynsetExplorer as the .tsv location.
va_folder_path = os.path.join(va_res_path,'VerbAtlas-1.1.0')
va_root_tsvs = os.path.join(va_folder_path, 'VerbAtlas-1.1.0')

# NOTE(review): presumably the location of the final synset -> frame mapping file - confirm against the saving cell.
va_results_name = 'results.tsv'
va_results_path = os.path.join(va_res_path, va_results_name)

# NOTE(review): not used in the cells visible here; presumably the output folder of the generated datasets.
datasets_path = './datasets/'

In [5]:
if not os.path.isdir(va_folder_path):
    print('downloading data...')

    va_zip_path = va_folder_path + '.zip'

    # Stream the archive to disk. Failing fast on HTTP errors prevents an error page
    # from being saved (and later unzipped) as if it were the archive; the timeout
    # prevents the cell from hanging forever on a stalled connection.
    with requests.get(va_download_url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(va_zip_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=1 << 20):
                fd.write(chunk)

    try:
        with zipfile.ZipFile(va_zip_path, "r") as zip_ref:
            zip_ref.extractall(va_res_path)
    finally:
        # The archive is only a temporary artefact: remove it even when extraction fails.
        os.remove(va_zip_path)

    print('data download complete.')

else:
    print('data already downloaded.')

data already downloaded.


In [6]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

Setting the seed for reproducibility:

In [7]:
SEED = 28

# random.seed(SEED) # not used
# Seed the global NumPy and PyTorch random generators.
np.random.seed(SEED)
torch.manual_seed(SEED)
# cuDNN determinism flag (see the PyTorch reproducibility notes).
torch.backends.cudnn.deterministic = True

# Creating the class for exploring WordNet and connecting it with BabelNet and VerbAtlas

In [8]:
class SynsetExplorer():
    def __init__(self, tsvs_folder, wordnet):
        """A class useful for mapping and exploring VerbAtlas, BabelNet and WordNet.

        Args:
            tsvs_folder (string): the path to the folder containing all .tsv files necessary for mapping synsets (e.g. VA_bn2va.tsv)
            wordnet (WordNetCorpusReader): the instance of the WordNet reader from NLTK
        """
        self.va2bns = {} # from VerbAtlas frame to list of BabelNet synsets
        self.bn2va = {} # from BabelNet synset to VerbAtlas synset
        self.bn2wn = {} # from BabelNet synset to WordNet synset
        self.wn2bn = {} # from WordNet synset to BabelNet synset
        self.va2name = {} # from VerbAtlas frame id to VerbAtlas name
        self.name2va = {} # from VerbAtlas name to VerbAtlas frame id
        self.wn = wordnet
        with open(os.path.join(tsvs_folder,'VA_bn2va.tsv')) as file:
            for line in file:
                l = line.strip('\n').split('\t')
                if len(l) == 2: # l[0] = bn, l[1] = va
                    self.bn2va[l[0]] = l[1]
                    self.va2bns[l[1]] = [l[0]] if l[1] not in self.va2bns else self.va2bns[l[1]] + [l[0]]
        with open(os.path.join(tsvs_folder,'bn2wn.tsv')) as file:
            for line in file:
                l = line.strip('\n').split('\t')
                if len(l) == 2: # l[0] = bn, l[1] = wn
                    self.bn2wn[l[0]] = l[1]
                    self.wn2bn[l[1]] = l[0]
        with open(os.path.join(tsvs_folder,'VA_frame_info.tsv')) as file:
            for line in file:
                l = line.strip('\n').split('\t')
                if len(l) > 2: # l[0] = va, l[1] = name
                    self.va2name[l[0]] = l[1].upper()
                    self.name2va[l[1].upper()] = l[0]
                    
    def explore_hypernyms(self, negative_synsets = []):
        """
        Explore the graph of the derivationally related forms and their hypernyms, 
        starting from VerbAtlas frames.

        Args:
            negative_synsets (list, optional): a list of synsets that, if encountered via DFS on the observed synset, it means
            that the observed synset is must not be returned. Defaults to [].

        Returns:
            list: a list of lists, with this format: [[synset_name, number_of_visits, synset_definition], ...].\n
            The list is ordered by descending oder, so that the first elements are the most visited synsets.
        """
        counter = Counter()
        
        for va_frame in self.va2bns.keys():
            va_frame_name = self.va2name[va_frame]
            
            for bn_offset_pos in self.va2bns[va_frame]:
                wn_offset_pos = self.bn2wn[bn_offset_pos]
                wn_synset = self.get_synset_wn(wn_offset_pos)
                # for each lemma in the synset, we check its derivationally related form
                for wn_lemma in wn_synset.lemmas(): 
                    wn_der_rel_forms = wn_lemma.derivationally_related_forms()
                    # for each derivationally related forms, we collect all hypernyms and
                    # their definition, with also how many synsets they are connected to
                    for wn_der_rel_form in wn_der_rel_forms: 
                        wn_der_rel_form_synset = wn_der_rel_form.synset()

                        if len(negative_synsets) == 0 or (not self.dfs(wn_der_rel_form_synset, negative_synsets)):

                            counter[wn_der_rel_form_synset] += 1
                            explore_list = wn_der_rel_form_synset.hypernyms()
                            while len(explore_list) > 0:
                                popped_synset = explore_list.pop(0)

                                if len(negative_synsets) == 0 or (not self.dfs(popped_synset, negative_synsets)):

                                    counter[popped_synset] += 1
                                explore_list = popped_synset.hypernyms() + explore_list

        return [[s.name(), str(n), s.definition()] for s,n in counter.most_common()]

    def get_candidates(self, positive_synsets = [], extend_to_hypernyms = False, remove_multiframe_synsets = False):
        """
        Explore the graph of the derivationally related forms and their hypernyms, 
        starting from VerbAtlas frames.

        Args:
            positive_synsets (list, optional): a list of synsets that, if encountered via DFS on the observed synset, it means
            that the observed synset is a possible candidate for the events. Defaults to [].
            extend_to_hypernyms (bool, optional): if true, the synsets saved in the results are also hypernyms of derivationally related forms. Defaults to False.
            remove_multiframe_synsets (bool, optional): if true, the synsets that have more than one associated frame are excluded. Defaults to False.

        Returns:
            dict: a dictionary, where the key is the synset name and the values are:\n
            - definition (str): the string definition of that synset\n
            - frames (set): the set of frames where its verbal synsets are directly connected with it (using derivationally related form)\n
            - indirect_frames (set): the set of frames where its verbal synsets are connected with it using hypernyms\n
            - wn_bases (set): the set of verbal synsets that are directly connected with it (using derivationally related form)\n
            - indirect_wn_bases (set): the set of verbal synsets that are connected with it using hypernyms\n
            - is_drf: if that synset is directly connected with a frame (using derivationally related form)
        """
        result = {}
        for va_frame in self.va2bns.keys():
            va_frame_name = self.va2name[va_frame]
            
            for bn_offset_pos in self.va2bns[va_frame]:
                wn_offset_pos = self.bn2wn[bn_offset_pos]
                wn_synset = self.get_synset_wn(wn_offset_pos)
                # for each lemma in the synset, we check its derivationally related forms candidates
                for wn_lemma in wn_synset.lemmas(): 
                    wn_der_rel_forms = wn_lemma.derivationally_related_forms()
                    # for each derivationally related forms candidates, we check if by exploring
                    # the hypernym path we find any synset included in positive_synsets. If so,
                    # that candidate is included in the result.
                    for wn_der_rel_form in wn_der_rel_forms: 
                        wn_der_rel_form_synset = wn_der_rel_form.synset()
                        if self.dfs(wn_der_rel_form_synset, positive_synsets):
                            if wn_der_rel_form_synset.name() not in result:
                                result[wn_der_rel_form_synset.name()] = {
                                    'definition':wn_der_rel_form_synset.definition(), 
                                    'frames':{va_frame_name}, 'indirect_frames':set(), 
                                    'wn_bases':{wn_synset.name()}, 'indirect_wn_bases':set(), 'is_drf': True}
                            else:
                                result[wn_der_rel_form_synset.name()]['frames'].add(va_frame_name)
                                result[wn_der_rel_form_synset.name()]['wn_bases'].add(wn_synset.name())
                            result[wn_der_rel_form_synset.name()]['is_drf'] = True

                            if extend_to_hypernyms:
                                explore_list = wn_der_rel_form_synset.hypernyms()
                                while len(explore_list) > 0:
                                    popped_synset = explore_list.pop(0)

                                    if self.dfs(popped_synset, positive_synsets):
                                        if popped_synset.name() not in result:
                                            result[popped_synset.name()] = {
                                                'definition':popped_synset.definition(), 
                                                'frames':set(), 'indirect_frames':{va_frame_name}, 
                                                'wn_bases':set(), 'indirect_wn_bases':{wn_synset.name()}, 'is_drf': False}
                                        else:
                                            result[popped_synset.name()]['indirect_frames'].add(va_frame_name)
                                            result[popped_synset.name()]['indirect_wn_bases'].add(wn_synset.name())

                                    explore_list = popped_synset.hypernyms() + explore_list

        return {k:v for k,v in result.items() if len(v['frames']) == 1} if remove_multiframe_synsets else result

    def resolve_ambiguity_statistically(self, positive_synsets = [], print_resolution = False):
        """
        It uses the returning dictionary in get_candidates() (see documentation).\n
        The frame in which the derivationally related form is connected the most (or its hypernyms if there is uncertainty), it will be put in it.

        Args:
            positive_synsets (list, optional): a list of synsets that, if encountered via DFS on the observed synset, it means
            that the observed synset is a possible candidate for the events. Defaults to [].
            print_resolution (bool, optional): if true, it prints out the resolved frame for each synset. Use for debugging. Defaults to False.

        Returns:
            dict: the dictionary returned from get_candidates(), but without hypernyms and solving ambiguity.
        """
        candidates_extended = self.get_candidates(positive_synsets, extend_to_hypernyms=True, remove_multiframe_synsets=False)
        for candidate_synset_name, candidate_values in candidates_extended.items():
            
            if len(candidate_values['frames']) > 1 and candidate_values['is_drf']: # ambiguous candidate connected to a baseframe
                frames_connections = {
                    frame_name:{
                        'current' : set(), # current verbal synsets of frame_name encountered by the candidate synset (or its hypernyms)
                        'v_synsets' : len(set( self.get_synset_wn(self.bn2wn[bn]).name() for bn in self.va2bns[self.name2va[frame_name]] )) # number of verbal synsets of frame_name
                    } for frame_name in candidate_values['frames']
                }
                # implementing a DFS using a list: if there is only one frame in which the candidate synset is directly connected the most (through derivationally related form),
                # then the candidate synset will be put in that frame; else if multiple frames have the same number of direct connections, the ambiguity is solved by using also
                # the hypernyms until there will be only one predominant frame
                curr_candidate_synset_list = [ self.wn.synset(candidate_synset_name) ]
                candidate_frame = list(candidate_values['frames'])[0] # random taking here, it will be updated after with the right candidate
                while True:
                    curr_candidate_synset = curr_candidate_synset_list.pop(0)
                    curr_candidate_synset_name = curr_candidate_synset.name()
                    if curr_candidate_synset_name in candidates_extended.keys():
                        for wn_v_direct in candidates_extended[curr_candidate_synset_name]['wn_bases']:
                            va_frame_origin_name = self.va2name[self.bn2va[self.wn2bn[self.get_encoded_wn(self.wn.synset(wn_v_direct))]]]
                            if va_frame_origin_name in frames_connections.keys():
                                frames_connections[va_frame_origin_name]['current'].add(wn_v_direct)
                    
                    best_frames = {k:v for k,v in frames_connections.items() if len(v['current']) == len(frames_connections[max(frames_connections, key = lambda f: len(frames_connections[f]['current']))]['current']) }
                    candidate_frame = list(best_frames.keys())[0]
                    curr_candidate_synset_list = curr_candidate_synset.hypernyms() + curr_candidate_synset_list
                    if len(best_frames) == 1 or curr_candidate_synset_list == []:
                        break
                
                if print_resolution:
                    print(candidate_synset_name,'with frames',candidates_extended[candidate_synset_name]['frames'],'is put in',candidate_frame)

                candidates_extended[candidate_synset_name]['frames'] = {candidate_frame} # ambiguity resolved

        return {k:v for k,v in candidates_extended.items() if v['is_drf']} # removing hypernyms from results

    def dfs(self, wn_synset, positive_synsets = []):
        """Simple implementation of a Depth First Search algorithm

        Args:
            wn_synset (Synset): a WordNet Synset from wich to start from
            positive_synsets (list, optional): the names of the synsets that, if encountered, stops the DFS search 
            and returns true. If none of them is ever encounteted, then the algorithm returns false. Defaults to [].

        Returns:
            bool: If true, then the starting synset or one of its hypernyms are present in the positive_synsets. If false,
            then the algorithm didn't find any of the synsets name in the positive_synsets list.
        """
        if wn_synset.name() in positive_synsets:
            return True
        for wn_synset_hypernym in wn_synset.hypernyms():
            if self.dfs(wn_synset_hypernym, positive_synsets):
                return True
        return False

    def calculate_frames_connections(self, candidates = {}):
        """Compute how many VerbAtlas frames are connected with nominal synsets via a counter.

        Args:
            candidates (dict, optional): all possible candidates for the specified problem, generally
            computed by get_candidates(). Defaults to {}.

        Returns:
            list: a list of pairs in that format: [(frame_name, number_of_connections), ...]
        """
        frames_connections = Counter({n:0 for k,n in self.va2name.items()})
        for cf in candidates.values():
            for frame in cf['frames']:
                frames_connections[frame] += 1
        return frames_connections.most_common()

    def _path_finding(self, syn_name_from,syn_name_to, limit = 80, path = []):
        """function used for path_finding function. Do not use separately.

        Args:
            s_name_from (str): the WordNet verbal synset that is in a VerbAtlas frame to start with.
            s_name_to (str): the WordNet nominal synset to find.
            limit (int, optional): limit of recursion. The higher, the better for finding paths, but more expensive. Defaults to 80.
            path (list, optional): list used recursively to save final path. Do not initialize it. Defaults to [].

        Returns:
            list: list of synsets that represent the path chosen
        """
        if syn_name_from == syn_name_to:
            return path + [syn_name_from]
        elif limit == 0:
            return []
        for wn_synset_hypernym in self.wn.synset(syn_name_from).hypernyms():
            new_path = self._path_finding(wn_synset_hypernym.name(), syn_name_to, limit - 1, path+[syn_name_from])
            if new_path != []:
                return new_path
        return path + [syn_name_from]

    def path_finding(self, s_name_from, s_name_to, positive_synsets = []):
        """Find the path between a synset in a VerbAtlas frame and a possible nominal synset.

        Args:
            s_name_from (str): the WordNet verbal synset that is in a VerbAtlas frame to start with.
            s_name_to (str): the WordNet nominal synset to find.
            positive_synsets (list, optional): a list of synsets that, if encountered via DFS on the observed synset, it means
            that the observed synset is a possible candidate for the events. Defaults to [].

        Returns:
            str: A detailed string representing the path computed to go from s_name_from to s_name_to
        """
        explored_graph_list = list(self.get_candidates(positive_synsets, extend_to_hypernyms = True, remove_multiframe_synsets = False).keys())
        s = self.wn.synset(s_name_from)
        va_frame_origin = self.va2name[self.bn2va[self.wn2bn[self.get_encoded_wn(s)]]]
        
        for l in s.lemmas():
            for drf in l.derivationally_related_forms():
                ss = drf.synset()
                pp = self._path_finding(ss.name(), s_name_to)
                if all(elem in explored_graph_list for elem in pp):
                    return f'|VA_FRAME:{va_frame_origin}| {s.name()} --lemma--> {l.name()} --drf--> {drf.name()} --synset--> {pp[0]} {"--hypern-->" if len(pp[1:])>0 else ""} {" --hypern--> ".join(pp[1:])}'
    
    def save_results(self, list_of_lists, output_path):
        """Write rows of strings to a tab-separated file, one row per line.

        Args:
            list_of_lists (list): the rows; each row is a list of strings
            output_path (str): the output filepath (an existing file is overwritten)
        """
        with open(os.path.join(output_path), 'w') as tsvfile:
            tsvfile.writelines('\t'.join(values) + '\n' for values in list_of_lists)

    def get_synset_wn(self, wn_offset_pos):
        """Returns the NLTK Synset instance from a offset-POS string synset

        Args:
            wn_offset_pos (str): a string of that format: "wn:<offset><POS>"

        Returns:
            Synset: the NLTK Synset instance
        """
        return self.wn.synset_from_pos_and_offset( wn_offset_pos[-1] , int(wn_offset_pos[3:-1]) )

    def get_encoded_wn(self, wn_synset):
        """Convert an NLTK Synset into its offset-POS synset id.

        Args:
            wn_synset (Synset): the NLTK Synset instance

        Returns:
            str: a string of that format: "wn:<offset><POS>", where the offset is left-padded
            with zeros to at least 8 characters
        """
        pos_tag = str(wn_synset.pos())
        offset_text = str(wn_synset.offset())
        return f'wn:{offset_text.rjust(8, "0")}{pos_tag}'



# Build the explorer from the folder holding the VerbAtlas .tsv files and the NLTK WordNet reader imported above.
explorer = SynsetExplorer(va_root_tsvs, wordnet)

VerbAtlas frames:

In [9]:
# Alphabetical list of the VerbAtlas frame names (iterating a dict yields its keys).
print(sorted(explorer.name2va))

['ABSORB', 'ABSTAIN_AVOID_REFRAIN', 'ACCOMPANY', 'ACCUSE', 'ACHIEVE', 'ADD', 'ADJUST_CORRECT', 'AFFECT', 'AFFIRM', 'AGREE_ACCEPT', 'AIR', 'ALLY_ASSOCIATE_MARRY', 'ALTERNATE', 'AMASS', 'AMELIORATE', 'ANALYZE', 'ANSWER', 'APPEAR', 'APPLY', 'APPROVE_PRAISE', 'ARGUE-IN-DEFENSE', 'AROUSE_WAKE_ENLIVEN', 'ARRIVE', 'ASCRIBE', 'ASK_REQUEST', 'ASSIGN-SMT-TO-SMN', 'ATTACH', 'ATTACK_BOMB', 'ATTEND', 'ATTRACT_SUCK', 'AUTHORIZE_ADMIT', 'AUTOMATIZE', 'AUXILIARY', 'AUX_MOD', 'BE-LOCATED_BASE', 'BEFRIEND', 'BEGIN', 'BEHAVE', 'BELIEVE', 'BEND', 'BENEFIT_EXPLOIT', 'BETRAY', 'BEWITCH', 'BID', 'BLIND', 'BORDER', 'BREAK_DETERIORATE', 'BREATH_BLOW', 'BRING', 'BULGE-OUT', 'BURDEN_BEAR', 'BURN', 'BURY_PLANT', 'BUY', 'CAGE_IMPRISON', 'CALCULATE_ESTIMATE', 'CANCEL_ELIMINATE', 'CARRY-OUT-ACTION', 'CARRY_TRANSPORT', 'CASTRATE', 'CATCH', 'CATCH_EMBARK', 'CAUSE-MENTAL-STATE', 'CAUSE-SMT', 'CAVE_CARVE', 'CELEBRATE_PARTY', 'CHANGE-APPEARANCE/STATE', 'CHANGE-HANDS', 'CHANGE-TASTE', 'CHANGE_SWITCH', 'CHARGE', 'CHASE', '

Each VerbAtlas Frame is composed of a cluster of BabelNet synsets:

In [10]:
# Look up the BabelNet synset ids clustered under a single VerbAtlas frame id.
va_frame_example = 'va:0051f' # = EAT_BITE
bn_synsets_offset_example = explorer.va2bns[va_frame_example]
print(bn_synsets_offset_example)

['bn:00083099v', 'bn:00083503v', 'bn:00083515v', 'bn:00083519v', 'bn:00083752v', 'bn:00083753v', 'bn:00083755v', 'bn:00084009v', 'bn:00084145v', 'bn:00084147v', 'bn:00084153v', 'bn:00084464v', 'bn:00084764v', 'bn:00084765v', 'bn:00084843v', 'bn:00084888v', 'bn:00085688v', 'bn:00085689v', 'bn:00086112v', 'bn:00086765v', 'bn:00086815v', 'bn:00086819v', 'bn:00086843v', 'bn:00086845v', 'bn:00087374v', 'bn:00087460v', 'bn:00087461v', 'bn:00087462v', 'bn:00087468v', 'bn:00088039v', 'bn:00088181v', 'bn:00088449v', 'bn:00088574v', 'bn:00088759v', 'bn:00088911v', 'bn:00088950v', 'bn:00089118v', 'bn:00090519v', 'bn:00090694v', 'bn:00090748v', 'bn:00091055v', 'bn:00091056v', 'bn:00091057v', 'bn:00091065v', 'bn:00091091v', 'bn:00091449v', 'bn:00091507v', 'bn:00091509v', 'bn:00091611v', 'bn:00091847v', 'bn:00091942v', 'bn:00092210v', 'bn:00092393v', 'bn:00093167v', 'bn:00093334v', 'bn:00093918v', 'bn:00093968v', 'bn:00094593v', 'bn:00094750v', 'bn:00095538v', 'bn:00095654v', 'bn:00095787v', 'bn:000

Each BabelNet synset can be converted into a WordNet synset:

In [11]:
# Map the first BabelNet id of the example frame to its "wn:<offset><POS>" id, then to the NLTK Synset.
wn_synset_offset_example = explorer.bn2wn[bn_synsets_offset_example[0]]
print('e.g.:',bn_synsets_offset_example[0], '->', wn_synset_offset_example, '=', explorer.get_synset_wn(wn_synset_offset_example))
# Same conversion for every BabelNet id of the frame, keeping only the synset names.
print([explorer.get_synset_wn(explorer.bn2wn[e]).name() for e in bn_synsets_offset_example])

e.g.: bn:00083099v -> wn:01185981v = Synset('feast.v.01')
['feast.v.01', 'gorge.v.01', 'bite.v.01', 'snap_at.v.01', 'bolt.v.03', 'gobble.v.01', 'garbage_down.v.01', 'breakfast.v.01', 'crop.v.05', 'browse.v.04', 'brunch.v.01', 'cannibalize.v.01', 'champ.v.01', 'chomp.v.01', 'chaw.v.01', 'chew.v.01', 'devour.v.03', 'consume.v.02', 'crunch.v.03', 'devour.v.04', 'pitch_in.v.01', 'digest.v.01', 'dine.v.01', 'eat_in.v.01', 'drop.v.17', 'eat.v.01', 'eat.v.02', 'feed.v.06', 'eat_up.v.01', 'fare.v.02', 'fill_up.v.04', 'forage.v.02', 'gluttonize.v.01', 'swallow.v.01', 'gnaw.v.01', 'go_down.v.05', 'mumble.v.02', 'lunch.v.01', 'masticate.v.01', 'mess.v.01', 'nibble.v.01', 'nibble.v.02', 'nibble.v.03', 'nip.v.02', 'nosh.v.01', 'partake.v.03', 'peck.v.02', 'pick_at.v.02', 'picnic.v.01', 'pop.v.11', 'predigest.v.01', 'tuck_in.v.01', 'raven.v.04', 'ruminate.v.01', 'scavenge.v.03', 'slurp.v.01', 'snap.v.12', 'sup.v.01', 'take_out.v.12', 'victual.v.03', 'wash_down.v.01', 'wine_and_dine.v.01', 'wolf.v.01

The first thing to notice is that a very small part of the VerbAtlas frames have no BabelNet synsets connected to them:

In [12]:
# Total number of BabelNet synset ids over all mapped frames (each frame maps to a list of ids).
n_of_bn_synsets = sum(len(bn_synsets) for bn_synsets in explorer.va2bns.values())

print(len(explorer.va2bns),'VerbAtlas frames are mapped into BabelNet synsets out of',len(explorer.va2name))
print('The total number of BabelNet synsets in the latter is',n_of_bn_synsets)

425 VerbAtlas frames are mapped into BabelNet synsets out of 432
The total number of BabelNet synsets in the latter is 13767


# Exploring WordNet graph

In order to identify (and exploit) the most common synsets in the graph, some procedures must be done: <br>
<ul>
    <li> For each VerbAtlas frame, we pick every BabelNet synset.
    <li> For each of them, we convert it to the corresponding WordNet synset.
    <li> Then, by using the derivationally related forms of the lemmas of each WordNet synset, we explore the corresponding hypernyms recursively, via DFS. 
</ul>

In [13]:
# First pass without pruning: every synset reached from the frames is counted.
explored_graph = explorer.explore_hypernyms()
# Same target file as the pruned pass further down, written without the redundant './' segment.
explorer.save_results(explored_graph, os.path.join(va_res_path,'explored_graph.tsv'))

The output of the file has three columns:
<ul>
    <li> the WordNet id
    <li> the number of times that the entity is visited
    <li> its definition 
</ul>
Here are some examples:

In [14]:
# Peek at the four most visited synsets; each row is [synset_name, number_of_visits, definition].
explored_graph[:4]

[['entity.n.01',
  '27293',
  'that which is perceived or known or inferred to have its own distinct existence (living or nonliving)'],
 ['abstraction.n.06',
  '14845',
  'a general concept formed by extracting common features from specific examples'],
 ['physical_entity.n.01', '12442', 'an entity that has physical existence'],
 ['psychological_feature.n.01',
  '9321',
  'a feature of the mental life of a living organism']]

# Finding best hypernyms

Now, by searching for possible candidates (using keywords like "act", "event" and so on) and by manually double-checking them through their hyponyms and definitions, we can select the hypernyms whose hyponyms can be considered nominal events!

Computing the candidates:

Take the most significant ones and check whether there are other synsets that the temporary candidates do not cover yet!
Let's put the most prominent ones in the list and see whether (and which) synsets still remain untouched (via DFS):<br>
<ol>
<li> Explore the graph and prune it with the temporary positive_synsets: i.e., if a synset is one of the synsets in the positive_synsets list, or a hyponym of one of them, it is removed (because the candidate synsets in the final results.tsv file will have the synsets in positive_synsets as their hypernyms)
<li> Check for remaining synsets in the generated file explored_graph.tsv that could be put in positive_synsets. If so, redo from step 1., else stop
</ol>

1)

In [15]:
# Hypernym roots currently accepted as markers of nominal events.
# The commented-out entries are candidates that were examined and left out.
positive_synsets = [
    'event.n.01',
    'act.n.02',
    # communication.n.02, # something that is communicated by or to or between people or groups | I don't think it's a good candidate, need to understand better
    'process.n.06',
    # process.n.02, # (psychology) the performance of some composite cognitive activity; an operation that affects mental contents
    # feeling.n.01,
    # thinking.n.01,
]

# Explore again, this time skipping every derivationally related synset that lies under an accepted root,
# so that the saved file (which overwrites the unpruned one) lists only the synsets not covered yet.
explored_graph = explorer.explore_hypernyms(negative_synsets=positive_synsets)
explorer.save_results(explored_graph, os.path.join(va_res_path,'explored_graph.tsv'))

2)

In [16]:
# To judge a borderline synset, look at its own definition and at the definitions of its direct hyponyms:
excluded_synset = 'feeling.n.01'
sysn = wordnet.synset(excluded_synset)
print('#', sysn, sysn.definition())
print('-----------------------')
for ssysn in sysn.hyponyms():
    print(ssysn, ssysn.definition())

# Synset('feeling.n.01') the experiencing of affective and emotional states
-----------------------
Synset('affect.n.01') the conscious subjective aspect of feeling or emotion
Synset('affection.n.01') a positive feeling of liking
Synset('agitation.n.03') the feeling of being agitated; not calm
Synset('ambivalence.n.01') mixed feelings or emotions
Synset('apathy.n.01') an absence of emotion or enthusiasm
Synset('astonishment.n.01') the feeling that accompanies something extremely surprising
Synset('calmness.n.03') a feeling of calm; an absence of agitation or excitement
Synset('complex.n.03') (psychoanalysis) a combination of emotions and impulses that have been rejected from awareness but still influence a person's behavior
Synset('desire.n.01') the feeling that accompanies an unsatisfied state
Synset('despair.n.02') the feeling that everything is wrong and nothing will turn out well
Synset('devastation.n.02') the feeling of being confounded or overwhelmed
Synset('dislike.n.02') a feel

In [17]:
# Candidates: derivationally related synsets that reach one of positive_synsets through their hypernyms
# (with extend_to_hypernyms=False the hypernyms themselves are not added to the result).
poss_cand = explorer.get_candidates(positive_synsets, extend_to_hypernyms = False, remove_multiframe_synsets = False)
print('There are',len(poss_cand.keys()),'possible candidates!')

There are 3974 possible candidates!


We have obtained the same result with less effort and synsets!

With the method get_candidates() we have obtained around 4000 possible candidates to be part of the nominal resource. One possible example:

In [18]:
# Example of a candidate entry.
ex_pc = 'eating.n.01'
print(ex_pc, poss_cand[ex_pc])

eating.n.01 {'definition': 'the act of consuming food', 'frames': {'EAT_BITE'}, 'indirect_frames': set(), 'wn_bases': {'eat.v.02', 'feed.v.06', 'eat.v.01'}, 'indirect_wn_bases': set(), 'is_drf': True}


# Resolving ambiguity

There is one main problem that was not addressed so far:
<ul>
    <li> What if a synset is ambiguous? (i.e. it can be in multiple frames?)
</ul><br>
Here is an example of ambiguity:

In [19]:
# Example of a candidate whose 'frames' set holds more than one frame.
ex_amb = 'adoption.n.01'
print(ex_amb, poss_cand[ex_amb])

adoption.n.01 {'definition': 'the act of accepting with approval; favorable reception', 'frames': {'FOLLOW_SUPPORT_SPONSOR_FUND', 'AGREE_ACCEPT', 'TOLERATE'}, 'indirect_frames': set(), 'wn_bases': {'accept.v.04', 'accept.v.03', 'accept.v.02', 'adopt.v.01'}, 'indirect_wn_bases': set(), 'is_drf': True}


A clever solution must be found. If we remove the ambiguous ones:

In [20]:
# remove_multiframe_synsets=True keeps only the candidates connected to exactly one frame.
poss_cand_unamb = explorer.get_candidates(positive_synsets, remove_multiframe_synsets = True)
print('There are',len(poss_cand_unamb.keys()),'unambiguous candidates, so we have',len(poss_cand.keys())-len(poss_cand_unamb.keys()),'ambiguous!')

There are 2899 unambiguous candidates, so we have 1075 ambiguous!


There are various solutions that can be made:
<ol>
<li> Put the ambiguous synset in all the corresponding frames. This could be a solution in order to understand if two or more frames need to be merged.
<li> Remove the ambiguous synset from all the corresponding frames. This can't be an optimal solution.
<li> Put the ambiguous synset in a particular frame using some heuristics. This is the optimal solution, removing the ambiguity of the frame.
</ol>
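The third solution will be applied, using statistics: the synset is put in the frame to which it is connected through the largest number of verbal synsets (via derivationally related forms); if several frames are tied, the connections of its hypernyms are added until a single frame prevails.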

Here I will print an example of an ambiguous synset candidate, with also its hypernym:

In [21]:
# Candidates including the hypernyms of the derivationally related synsets.
candidates_extended = explorer.get_candidates(positive_synsets, extend_to_hypernyms=True, remove_multiframe_synsets=False)

# An ambiguous candidate ...
candidate_synset_name = 'articulation.n.03'
candidate_values = candidates_extended[candidate_synset_name]

# ... and the entry of its first hypernym.
candidate_synset_name_hyper = wordnet.synset(candidate_synset_name).hypernyms()[0].name()
candidate_values_hyper = candidates_extended[candidate_synset_name_hyper]

print(candidate_synset_name, '---hypern--->', candidate_synset_name_hyper)
print('--------------- synset: ----------------')
print(candidate_values)
print('--------------- hypernymy: ----------------')
print(candidate_values_hyper)

articulation.n.03 ---hypern---> expression.n.03
--------------- synset: ----------------
{'definition': 'expressing in coherent verbal form', 'frames': {'PRONOUNCE', 'SPEAK', 'EXPLAIN'}, 'indirect_frames': set(), 'wn_bases': {'voice.v.01', 'give_voice.v.01', 'pronounce.v.01', 'articulate.v.05'}, 'indirect_wn_bases': set(), 'is_drf': True}
--------------- hypernymy: ----------------
{'definition': 'the communication (in speech or writing) of your beliefs or opinions', 'frames': {'SPEAK'}, 'indirect_frames': {'CAUSE-MENTAL-STATE', 'SPEAK', 'APPROVE_PRAISE', 'PRONOUNCE', 'EXPLAIN'}, 'wn_bases': {'express.v.02'}, 'indirect_wn_bases': {'voice.v.01', 'congratulate.v.02', 'articulate.v.05', 'pronounce.v.01', 'compliment.v.01', 'pride.v.01', 'give_voice.v.01'}, 'is_drf': True}


Here, by using path finding for the candidate and its hypernym, I will show the path to reach the candidate starting from the verbal synsets:

In [22]:
# NOTE: every path_finding() call recomputes get_candidates() internally, so this cell repeats that work once per printed path.
for wn_base in candidate_values['wn_bases']:
    print( explorer.path_finding(wn_base, candidate_synset_name, positive_synsets) )
print('---------- hypernymy direct connections (as drf) -------')
for wn_base in candidate_values_hyper['wn_bases']:
    print( explorer.path_finding(wn_base, candidate_synset_name_hyper, positive_synsets) )
print('---------- hypernymy indirect connections (e.g. hypernymy of...) -------')
for wn_base in candidate_values_hyper['indirect_wn_bases']:
    print( explorer.path_finding(wn_base, candidate_synset_name_hyper, positive_synsets) )

|VA_FRAME:SPEAK| voice.v.01 --lemma--> voice --drf--> voice --synset--> articulation.n.03  
|VA_FRAME:SPEAK| give_voice.v.01 --lemma--> articulate --drf--> articulation --synset--> articulation.n.03  
|VA_FRAME:PRONOUNCE| pronounce.v.01 --lemma--> articulate --drf--> articulation --synset--> articulation.n.03  
|VA_FRAME:EXPLAIN| articulate.v.05 --lemma--> articulate --drf--> articulation --synset--> articulation.n.03  
---------- hypernymy direct connections (as drf) -------
|VA_FRAME:SPEAK| express.v.02 --lemma--> express --drf--> expression --synset--> expression.n.03  
---------- hypernymy indirect connections (e.g. hypernymy of...) -------
|VA_FRAME:SPEAK| voice.v.01 --lemma--> voice --drf--> voice --synset--> articulation.n.03 --hypern--> expression.n.03
|VA_FRAME:APPROVE_PRAISE| congratulate.v.02 --lemma--> congratulate --drf--> congratulation --synset--> congratulation.n.02 --hypern--> expression.n.03
|VA_FRAME:EXPLAIN| articulate.v.05 --lemma--> articulate --drf--> articulatio

Now, solving ambiguity statistically:

In [23]:
poss_cand_solved_statistically = explorer.resolve_ambiguity_statistically(positive_synsets=positive_synsets)
# Despite its name, n_ambiguous_now holds the still-ambiguous entries themselves (a dict); its length is the count.
n_ambiguous_now = {k:v for k,v in poss_cand_solved_statistically.items() if len(v['frames']) > 1}
print('There are',len(n_ambiguous_now.keys()),'ambiguous candidates now!', 'Total synsets now: ', len(poss_cand_solved_statistically))

There are 0 ambiguous candidates now! Total synsets now:  3974


We can also see how many frames are not connected with any noun:

In [24]:
# frame_connections is a list of (frame_name, number_of_connections) pairs covering every known frame name.
frame_connections = explorer.calculate_frames_connections(poss_cand_solved_statistically)
frames_not_touched = [e for e in frame_connections if e[1] == 0]
print('There are',len(frames_not_touched),'frames that are not connected with nominal synsets out of',len(frame_connections))

There are 37 frames that are not connected with nominal synsets out of 432


In [25]:
# One row per candidate: synset name, definition, then its frame(s). The redundant './' path segment is dropped.
explorer.save_results([[s,v['definition']] + list(v['frames']) for s,v in poss_cand_solved_statistically.items()], os.path.join(va_res_path,'possible_candidates.tsv'))

After a final double-check, we are ready to save them!

# Saving final results

In [26]:
# One row per candidate: synset name and its single resolved frame.
# va_results_path (defined with the other paths) already points to results.tsv inside va_res_path.
explorer.save_results([[s,list(v['frames'])[0]] for s,v in poss_cand_solved_statistically.items()], va_results_path)

# Saving nominal synset graph

In [27]:
import json

class SetEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``set`` / ``frozenset`` values as JSON arrays.

    Set iteration order is not stable across interpreter runs (string hashing is
    randomized), so the elements are sorted whenever they are mutually comparable.
    This keeps the dumped files identical from one run to the next. Sets whose
    elements cannot be ordered fall back to plain iteration order.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Sets and frozensets become lists; any other type is delegated to the base
        class, which raises ``TypeError`` for unsupported objects.
        """
        if isinstance(obj, (set, frozenset)):
            try:
                return sorted(obj)
            except TypeError:
                # Heterogeneous / non-orderable elements: keep the original behaviour.
                return list(obj)
        return super().default(obj)

# Three candidate views requested from the explorer, differing only in the two flags:
# hypernym extension on/off and multi-frame synsets kept/removed.
explored_nominal_graph = explorer.get_candidates(
    positive_synsets, extend_to_hypernyms=True, remove_multiframe_synsets=False)
explored_nominal_graph_unambiguous = explorer.get_candidates(
    positive_synsets, extend_to_hypernyms=True, remove_multiframe_synsets=True)
candidates_unambiguous = explorer.get_candidates(
    positive_synsets, extend_to_hypernyms=False, remove_multiframe_synsets=True)

# Dump each structure to its own JSON file under va_res_path (sets handled by SetEncoder).
for json_name, json_payload in (
    ('explored_nominal_graph.json', explored_nominal_graph),
    ('explored_nominal_graph_unambiguous.json', explored_nominal_graph_unambiguous),
    ('candidates_unambiguous.json', candidates_unambiguous),
    ('poss_cand_solved_statistically.json', poss_cand_solved_statistically),
):
    with open(os.path.join(va_res_path, json_name), 'w') as outfile:
        json.dump(json_payload, outfile, cls=SetEncoder, indent=4)

In [28]:
# Sizes of the three candidate structures, in the order they were built.
tuple(map(len, (explored_nominal_graph, explored_nominal_graph_unambiguous, candidates_unambiguous)))

(4346, 2899, 2899)

In [29]:
# Persist the positive synsets next to the other VerbAtlas resources (sets handled by SetEncoder).
with open(os.path.join(va_res_path, 'positive_synsets.json'), 'w') as outfile:
    outfile.write(json.dumps(positive_synsets, indent=4, cls=SetEncoder))

# Creating dataset for Nominal Identificator

In [30]:
# Share of the samples assigned to the training split; the remainder goes to validation.
ratio = 0.8

ni_train = []
ni_valid = []

# ---- Positive samples (label '1'): every synset returned by get_candidates ----
nominal_part = explorer.get_candidates(positive_synsets, extend_to_hypernyms = True, remove_multiframe_synsets = False)
ni_train_dim = int(len(nominal_part)*ratio)
# Validation takes what is left, so integer truncation can never silently drop a sample
# (int(n*0.8) + int(n*0.2) may be smaller than n).
ni_valid_dim = len(nominal_part) - ni_train_dim

for i, (wn_syn, vals) in enumerate(nominal_part.items()):
    # Sample layout: [surface form, definition, label, synset name]
    result_sample = [
        wn_syn.split('.')[0].replace('_',' '),
        vals['definition'],
        '1',
        wn_syn]
    if i < ni_train_dim:
        ni_train.append(result_sample)
    else:
        ni_valid.append(result_sample)


# ---- Negative samples (label '0') ----
def _iter_negative_samples():
    """Lazily yield one label-'0' sample per distinct derivationally related synset.

    Walks every VerbAtlas frame -> BabelNet id -> WordNet synset -> lemma ->
    derivationally related form, and keeps the synsets for which
    ``explorer.dfs(synset, positive_synsets)`` is falsy. Each synset is considered
    only once: the same derived synset can be reached from several verb lemmas,
    and emitting it repeatedly would create duplicate rows and let the same
    sample land in both the training and the validation split.
    """
    seen_synsets = set()
    for va_frame in explorer.va2bns.keys():
        for bn_offset_pos in explorer.va2bns[va_frame]:
            wn_offset_pos = explorer.bn2wn[bn_offset_pos]
            wn_synset = explorer.get_synset_wn(wn_offset_pos)

            for wn_lemma in wn_synset.lemmas():
                for wn_der_rel_form in wn_lemma.derivationally_related_forms():
                    wn_der_rel_form_synset = wn_der_rel_form.synset()
                    synset_name = wn_der_rel_form_synset.name()

                    if synset_name in seen_synsets:
                        continue
                    seen_synsets.add(synset_name)

                    if explorer.dfs(wn_der_rel_form_synset, positive_synsets):
                        continue

                    yield [
                        synset_name.split('.')[0].replace('_',' '),
                        wn_der_rel_form_synset.definition(),
                        '0',
                        synset_name]


# Collect as many negatives as positives, split with the same train/valid sizes.
# Pulling from the generator stops the whole traversal as soon as the quota is reached.
num_of_samples = 0
negative_samples = _iter_negative_samples()
while num_of_samples < ni_train_dim + ni_valid_dim:
    result_sample = next(negative_samples, None)
    if result_sample is None:
        # Fewer distinct negatives available than requested: keep what was found.
        break

    if num_of_samples < ni_train_dim:
        ni_train.append(result_sample)
    else:
        ni_valid.append(result_sample)

    num_of_samples += 1


In [31]:
# Write both splits of the nominal identification dataset into their own folder.
ni_dataset_dir = os.path.join(datasets_path, 'nominal_iden_dataset')
os.makedirs(ni_dataset_dir, exist_ok=True)

for split_file, split_rows in (('train.tsv', ni_train), ('valid.tsv', ni_valid)):
    explorer.save_results(split_rows, os.path.join(ni_dataset_dir, split_file))