In this notebook, the nominal synset disambiguation task is approached using a zero-shot classification model

## Prerequisites

In [1]:
import os, sys

import numpy as np
import tqdm
import torch

  from .autonotebook import tqdm as notebook_tqdm


Important paths for the notebook:

In [2]:
datasets_root_path = './temp_files/'
srl_dataset_path = os.path.join(datasets_root_path, 'maven_nounited_srl')
checkpoints_dir_path = './checkpoints/'
model_dir_path = os.path.join(checkpoints_dir_path, 'other')

srl_dataset_dict_paths = {}
for lang in os.listdir(srl_dataset_path):
    dataset_lang_path = os.path.join(srl_dataset_path, lang)
    if os.path.isdir(dataset_lang_path):
        srl_dataset_dict_paths[lang] = {}
        for d_type in os.listdir(dataset_lang_path):
            d_name = d_type.split('.')[0]
            srl_dataset_dict_paths[lang][d_name] = os.path.join(dataset_lang_path, d_type)

In [3]:
%load_ext autoreload
%autoreload 2

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

Setting the seed for reproducibility:

In [4]:
SEED = 28

# random.seed(SEED) # not used
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
global_params = np.load(os.path.join(checkpoints_dir_path, 'models_nounited_maven', 'global_params.npy'), allow_pickle=True).tolist()

getting the nominal synsets with hypernyms that were explored via SynsetExplorer

In [6]:
import json

va_frame_info = {}
with open('./va_resources/VerbAtlas-1.1.0/VerbAtlas-1.1.0/VA_frame_info.tsv') as file:
    for line in file:
        l = line.strip('\n').split('\t')
        if len(l) > 2: # l[0] = va, l[1] = name, l[2] = info
            va_frame_info[l[1].upper()] = l[2]

explored_nominal_graph_path = './va_resources/explored_nominal_graph.json'
with open(explored_nominal_graph_path, 'r') as json_file:
    explored_nominal_graph = json.load(json_file)

poss_cand_solved_statistically_path = './va_resources/poss_cand_solved_statistically.json'
with open(poss_cand_solved_statistically_path, 'r') as json_file:
    poss_cand_solved_statistically = json.load(json_file)

getting only the ones directly connected with a verbal synset of VerbAtlas

In [7]:
explored_nominal_graph_drf = {k:v for k,v in explored_nominal_graph.items() if v['is_drf'] == True}

getting only the ambiguous ones:

In [8]:
explored_nominal_graph_drf_ambiguous = {k:v for k,v in explored_nominal_graph_drf.items() if len(v['frames']) > 1}

getting only the unambiguous ones:

In [9]:
explored_nominal_graph_drf_unambiguous = {k:v for k,v in explored_nominal_graph_drf.items() if len(v['frames']) == 1}

# Dataset

In [10]:
from torch.utils.data import Dataset

class NCDataset(Dataset):
    def __init__(self, data_dict, ambiguous = False):
        self.data = []
        for syn, e in data_dict.items():
            self.data.append({
                'synset': syn,
                'word': syn.split('.')[0],
                'definition': e['definition'],
                'label': e['frames'][0] if not ambiguous else e['frames'],
            })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    @staticmethod
    def create_collate_fn():
        def collate_fn(batch):
            batch_formatted = {
                'word':[sample['word'] for sample in batch], 
                'definition':[sample['definition'] for sample in batch], 
                'label':[sample['label'] for sample in batch]
            }
            return batch_formatted

In [11]:
nc_all_dataset = NCDataset(explored_nominal_graph_drf, ambiguous = False)

In [46]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

In [47]:
classifier.device

device(type='cpu')

## Getting results

In [13]:
va_frame_info = {}
with open('./va_resources/VerbAtlas-1.1.0/VerbAtlas-1.1.0/VA_frame_info.tsv') as file:
    for line in file:
        l = line.strip('\n').split('\t')
        if len(l) > 2: # l[0] = va, l[1] = name, l[2] = info
            va_frame_info[l[1].upper()] = l[2]

In [19]:
results = {}
for e in nc_all_dataset:
    predict = classifier(e['definition'], explored_nominal_graph_drf[e['synset']]['frames'])
    predicted_frame = explored_nominal_graph_drf[e['synset']]['frames'][ np.argmax(predict['scores']) ]
    results[e['synset']] = [e['definition'], predicted_frame, va_frame_info[predicted_frame]]

In [20]:
with open(os.path.join('./temp_files/synsets_solved_with_zero_shot.tsv'), 'w') as tsvfile:
    for synset, values in results.items():
        row = synset + '\t' + '\t'.join(values)
        print(row, file=tsvfile)

In [21]:
in_frames = 0
for syn, values in results.items():
    if values[1] in explored_nominal_graph_drf[syn]['frames']:
        in_frames += 1
print(in_frames, '/', len(results), '=', in_frames/len(results))

3974 / 3974 = 1.0


In [23]:
in_frames = 0
for syn, values in results.items():
    if len(explored_nominal_graph_drf[syn]['frames']) > 1:
        print(syn, '|', values[0], '|', values[1], '|', values[2])
        print(explored_nominal_graph_drf[syn]['frames'])
        print('----------------------------')

management.n.01 | the act of managing something | MANAGE | An agent MANAGES a theme using an instrument to achieve a goal in favour of a beneficiary
['MANAGE', 'WATCH_LOOK-OUT']
----------------------------
act.n.02 | something that people do or cause to happen | INCITE_INDUCE | An agent INCITES-INDUCES a patient using an instrument to do a result (+attribute)
['INCITE_INDUCE', 'CARRY-OUT-ACTION', 'AROUSE_WAKE_ENLIVEN']
----------------------------
continuance.n.01 | the act of continuing an activity without interruption | GO-FORWARD | A cause makes an agent GO FORWARD from a source to a destination  for an extent using an instrument on a location to achieve a goal
['GO-FORWARD', 'CONTINUE', 'SPEAK']
----------------------------
adoption.n.01 | the act of accepting with approval; favorable reception | FOLLOW_SUPPORT_SPONSOR_FUND | An agent FOLLOWS-SUPPORTS-SPONSORS-FUNDS a beneficiary with an instrument to achieve a goal (+attribute)
['FOLLOW_SUPPORT_SPONSOR_FUND', 'AGREE_ACCEPT', 'TOL

This approach works only if the labels are few, so it must be used in combination with the synset explorer. Positive factors of using this strategy include the removal of the use of an intermediate dataset for the target synsets.