In [1]:
# Resolve the top-level directory of the enclosing Git working tree
from pathlib import Path
import subprocess

# `git rev-parse --show-toplevel` prints the absolute path followed by a
# newline; a non-zero exit status raises subprocess.CalledProcessError.
repo_root = Path(
    subprocess.run(
        ["git", "rev-parse", "--show-toplevel"],
        stdout=subprocess.PIPE,
        text=True,
        check=True,
    ).stdout.strip()
)

In [2]:
import pandas as pd

# Parquet file with the specific definitions, located relative to the repo root
INPUT_DATASET = (
    repo_root
    / "offline_datasets/processed_dataset/WordNet/specific_definitions_test.parquet"
)

df_specific = pd.read_parquet(INPUT_DATASET)
print(f"Loaded dataset contains {len(df_specific):,} rows")
# Preview the first rows (head() defaults to 5)
df_specific.head()

Loaded dataset contains 1,953 rows


Unnamed: 0,meanings_definitions,concepts
0,the legal document stating the reasons for a j...,[opinion]
1,the legal right to sit as a member in a legisl...,[seat]
2,the legal system that allows an accused person...,[bail]
3,the legislative hall where members debate and ...,[floor]
4,the length of a line segment between the cente...,[radius]


### First, the specific definitions
Given a specific concept (e.g., `opinion`), we want to verify whether the model can **associate it with the corresponding definition** (here, `the legal document stating the reasons for a judicial decision`).

In [3]:
# Gather the distinct values of the `meanings_definitions` column into a set
set_definitions = set(df_specific["meanings_definitions"].unique())
print(f"The dataset contains {len(set_definitions):,} definitions")

The dataset contains 1,953 definitions


In [4]:
import random
from tqdm import tqdm

SEED = 29
random.seed(SEED)
# Number of candidates per sample once the true definition is counted: the
# loop below draws N_NEGATIVES - 1 distractors for every row.
N_NEGATIVES = 100

# Iterating a set of strings follows hash order, which differs between Python
# processes (string hash randomization), so seeding alone does not make the
# sampling reproducible. Sorting once gives random.sample a stable population.
# Assumes all definitions are mutually comparable (i.e. plain strings).
ordered_definitions = sorted(set_definitions)

dataset_single_concepts_vs_definitions = []
for current_concept, current_definition in tqdm(
    zip(df_specific.concepts, df_specific.meanings_definitions),
    total=df_specific.shape[0],
):
    # Every definition except the row's own one is a valid distractor
    samplable_definitions = [
        definition
        for definition in ordered_definitions
        if definition != current_definition
    ]
    # Draw N_NEGATIVES - 1 distinct distractors; random.sample already returns
    # a list and raises ValueError if fewer candidates are available.
    negative_definitions = random.sample(samplable_definitions, N_NEGATIVES - 1)
    # Store the output
    dataset_single_concepts_vs_definitions.append(
        {
            "concept": list(current_concept),
            "true_definition": current_definition,
            "negative_definitions": negative_definitions,
        }
    )
print(f"Generated {len(dataset_single_concepts_vs_definitions):,} samples")

100%|██████████| 1953/1953 [00:00<00:00, 3504.21it/s]

Generated 1,953 samples





### See one example

In [5]:
# Inspect the first generated sample
first_specific_sample = dataset_single_concepts_vs_definitions[0]
print(f"The sampled concept is: {first_specific_sample['concept']}")
print(f"\nThe corresponding definition is:\n\t{first_specific_sample['true_definition']}")
# Show at most the first 30 negatives, one per tab-indented line
negative_examples = '\n\t'.join(first_specific_sample['negative_definitions'][:30])
print(f"\nExamples of negative definitions:\n\t{negative_examples}")

The sampled concept is: ['opinion']

The corresponding definition is:
	the legal document stating the reasons for a judicial decision

Examples of negative definitions:
	travel in front of; go in advance of others
	the second sign of the zodiac; the sun is in this sign from about April 20 to May 20
	travel around something
	write about
	the object upon which interest and attention focuses
	the property of being the extent of something from beginning to end
	the marked and rapid transformation of a larva into an adult that occurs in some animals
	usually large pulpy deep-yellow round fruit of the squash family maturing in late summer or early autumn
	the season for gathering crops
	very light rain; stronger than mist but less than a shower
	the relation between two different species of organisms that are interdependent; each gains benefits from the other
	turn or become green
	throw or flash the light of (a lamp)
	toiletry consisting of any of various substances in the form of a thick l

In [6]:
# Persist the specific-concept samples as an indented JSON file
import json
import os  # kept: other cells may rely on this import

OUTPUT_PATH = repo_root / "offline_datasets/processed_dataset/WordNet/rules_vs_definitions/"
# Create the output directory (and any missing parents); no error if it exists
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
with (OUTPUT_PATH / "specific_concepts_vs_definitions.json").open("w+") as f:
    json.dump(dataset_single_concepts_vs_definitions, f, indent=4)

### Now, we move to the generic definitions

In [7]:
import pandas as pd

# Parquet file with the generic (hypernym) definitions, relative to the repo root
INPUT_DATASET = (
    repo_root
    / "offline_datasets/processed_dataset/WordNet/generic_definitions_test.parquet"
)
df_generic = pd.read_parquet(INPUT_DATASET)
print(f"Loaded dataset contains {len(df_generic):,} rows")
# Preview the first rows (head() defaults to 5)
df_generic.head()

Loaded dataset contains 681 rows


Unnamed: 0,hypernym_definitions,concepts
0,the lower side of anything,"[heel, base]"
1,the lowest part of anything,"[foot, base]"
2,the magnitude of something in a particular dir...,"[length, time, height]"
3,the main organ of photosynthesis and transpira...,"[pad, pitcher, blade, scale, frond, greenery]"
4,the male of species Equus caballus,[stallion]


In [8]:
# Gather the distinct values of the `hypernym_definitions` column into a set
# (rebinds `set_definitions`, which the next cell samples from)
set_definitions = set(df_generic["hypernym_definitions"].unique())
print(f"The dataset contains {len(set_definitions):,} definitions")

The dataset contains 681 definitions


In [9]:
# Build one sample per generic definition: its concepts, the true definition,
# and N_NEGATIVES - 1 distractor definitions drawn from the other rows.

# Dedicated generator so this cell yields the same samples regardless of how
# much of the global `random` stream earlier cells consumed (and on re-runs).
generic_rng = random.Random(SEED)

# Iterating a set of strings follows hash order, which differs between Python
# processes (string hash randomization); sorting once gives the sampler a
# stable population. Assumes all definitions are mutually comparable strings.
ordered_definitions = sorted(set_definitions)

dataset_multiple_concepts_vs_definitions = []
for current_concept, current_definition in tqdm(
    zip(df_generic.concepts, df_generic.hypernym_definitions),
    total=df_generic.shape[0],
):
    # Every definition except the row's own one is a valid distractor
    samplable_definitions = [
        definition
        for definition in ordered_definitions
        if definition != current_definition
    ]
    # Draw N_NEGATIVES - 1 distinct distractors; sample() already returns a
    # list and raises ValueError if fewer candidates are available.
    negative_definitions = generic_rng.sample(samplable_definitions, N_NEGATIVES - 1)
    # Store the output
    dataset_multiple_concepts_vs_definitions.append(
        {
            "concept": list(current_concept),
            "true_definition": current_definition,
            "negative_definitions": negative_definitions,
        }
    )
print(f"Generated {len(dataset_multiple_concepts_vs_definitions):,} samples")

100%|██████████| 681/681 [00:00<00:00, 4324.88it/s]

Generated 681 samples





### See an example

In [10]:
# Inspect the first generated sample
first_generic_sample = dataset_multiple_concepts_vs_definitions[0]
print(f"The sampled concepts are: {first_generic_sample['concept']}")
print(f"\nThe corresponding definition is:\n\t{first_generic_sample['true_definition']}")
# Show at most the first 30 negatives, one per tab-indented line
negative_examples = '\n\t'.join(first_generic_sample['negative_definitions'][:30])
print(f"\nExamples of negative definitions:\n\t{negative_examples}")

The sampled concepts are: ['heel', 'base']

The corresponding definition is:
	the lower side of anything

Examples of negative definitions:
	violent action that is hostile and usually unprovoked
	the quality of having the properties that are right for a specific purpose
	undergo training or instruction in preparation for a particular role, function, or profession
	the male organ of copulation (`member' is a euphemism)
	the open circular discharging end of a gun
	the relative magnitude of something with reference to a criterion
	the totality of surrounding conditions
	the state of actually existing objectively
	the marks used to clarify meaning by indicating separation of words into sentences and clauses and phrases
	the subjective sensation of hearing something
	the painted structures of a stage set that are intended to suggest a particular locale
	the style of a particular artist or school or movement
	the time of day immediately following sunset
	wild or domesticated South American c

In [11]:
# Persist the generic-concept samples as an indented JSON file.
# The parent directory is not created here; it must already exist.
OUTPUT_PATH = repo_root / "offline_datasets/processed_dataset/WordNet/rules_vs_definitions/generic_concepts_vs_definitions.json"
with OUTPUT_PATH.open("w+") as f:
    json.dump(dataset_multiple_concepts_vs_definitions, f, indent=4)