### Download the WordNet dataset

#### First, create a directory to store the dataset

In [1]:
from pathlib import Path
import subprocess

repo_root = Path(
    subprocess.check_output(
        ["git", "rev-parse", "--show-toplevel"],
        text=True
    ).strip()
)

DESTINATION_PATH = repo_root / "offline_datasets/original/WordNet/"
DESTINATION_PATH.mkdir(parents=True, exist_ok=True)

#### Then, download the WordNet dataset into that directory

In [2]:
import nltk
from nltk.corpus import wordnet as wn

# Append the DESTINATION_PATH to the available source of data
nltk.data.path.append(DESTINATION_PATH)
# Download wordnet
nltk.download('wordnet', download_dir=DESTINATION_PATH)

[nltk_data] Downloading package wordnet to /home/det_user/mboffa/Proje
[nltk_data]     cts/Jiaxuan_project/LLM_Rule_Constrainer/offline_datas
[nltk_data]     ets/original/WordNet...


True

### Read CommonGen dataset
#### We will constructed a new dataset using CommonGen as "inspiration"

In [3]:
import datasets
import pandas as pd
DATASET = "GEM/common_gen"
CACHE_DIR = repo_root / "offline_datasets/original/CommonGen/"
CACHE_DIR.mkdir(parents=True, exist_ok=True)
data = datasets.load_dataset(DATASET, cache_dir=CACHE_DIR)
df_train = pd.DataFrame(data["train"])
df_valid = pd.DataFrame(data["validation"])
df_test = pd.concat([pd.DataFrame(data["challenge_train_sample"]),
                     pd.DataFrame(data["challenge_validation_sample"])])

  from .autonotebook import tqdm as notebook_tqdm
Downloading data: 100%|██████████| 1.85M/1.85M [00:00<00:00, 2.10MB/s]
Downloading data: 100%|██████████| 87.8k/87.8k [00:00<00:00, 340kB/s]
Generating train split: 100%|██████████| 67389/67389 [00:04<00:00, 14106.16 examples/s]
Generating validation split: 100%|██████████| 993/993 [00:00<00:00, 11448.92 examples/s]
Generating test split: 100%|██████████| 1497/1497 [00:00<00:00, 13702.30 examples/s]
Generating challenge_train_sample split: 100%|██████████| 500/500 [00:00<00:00, 13911.64 examples/s]
Generating challenge_validation_sample split: 100%|██████████| 500/500 [00:00<00:00, 11865.28 examples/s]
Generating challenge_test_scramble split: 100%|██████████| 500/500 [00:00<00:00, 14200.94 examples/s]


### Extract all the CommonGen unique concepts

In [4]:
full_df = pd.concat([df_train, df_valid, df_test])
all_concepts = full_df[["concepts"]].explode("concepts")
unique_concepts = all_concepts.concepts.unique()
print(f"Full CommonGen dataset has {len(unique_concepts):,} unique concepts")

Full CommonGen dataset has 4,792 unique concepts


### Now, for each concept we will try extracting:
- The available examples for each possible meaning 
- The corresponding hypernyms (if any)
- The available examples per hypernyms

In [5]:
from tqdm import tqdm
dataset_synonyms_hypernyms = {}
for concept in tqdm(unique_concepts):
    dataset_synonyms_hypernyms[concept] = {}
    meanings = wn.synsets(concept)
    for it, meaning in enumerate(meanings):
        dataset_synonyms_hypernyms[concept][meaning.name()] = {
            "definition": meaning.definition(),
            "examples": [],
            "hypernyms": [],
        }
        examples = meaning.examples()
        dataset_synonyms_hypernyms[concept][meaning.name()]["examples"] = examples
        higher_order_names = meaning.hypernyms()
        if len(higher_order_names) != 0:
            dict_hypernym = {}
            for higher_order_name in higher_order_names:
                dict_hypernym["name"] = higher_order_name.name()
                dict_hypernym["definition"] = higher_order_name.definition()
                dict_hypernym["examples"] = higher_order_name.examples()
            dataset_synonyms_hypernyms[concept][meaning.name()]["hypernyms"].append(dict_hypernym)

100%|██████████| 4792/4792 [00:09<00:00, 491.55it/s] 


In [6]:
# Initialize all lists outside both loops
concepts, meanings, meaning_definitions, meaning_examples = [], [], [], []
out_hypernym_names, out_hypernym_definitions, out_hypernym_examples = [], [], []

for original_concept in tqdm(dataset_synonyms_hypernyms.keys()):
    total_el = 0
    for meaning in dataset_synonyms_hypernyms[original_concept]:
        inn_hypernym_names, inn_hypernym_definitions, inn_hypernym_examples = [], [], []
        # Process hypernyms for current meaning
        for hypernym in dataset_synonyms_hypernyms[original_concept][meaning]["hypernyms"]:
            inn_hypernym_names.append(hypernym["name"])
            inn_hypernym_definitions.append(hypernym["definition"])
            if len(hypernym['examples']) != 0:
                inn_hypernym_examples.append(hypernym['examples'])
            else:
                inn_hypernym_examples.append(["not available"])
        # Handle case with no hypernyms
        if len(inn_hypernym_names) == 0:
            inn_hypernym_names.append("not available")
            inn_hypernym_definitions.append("not available")
            inn_hypernym_examples.append(["not available"])
        # Add hypernyms to output lists
        out_hypernym_names.extend(inn_hypernym_names)
        out_hypernym_definitions.extend(inn_hypernym_definitions)
        out_hypernym_examples.extend(inn_hypernym_examples)
        # Calculate number of rows needed for this meaning
        n_hypernyms = len(inn_hypernym_names)
        # Add meaning information
        meaning_definitions.extend([dataset_synonyms_hypernyms[original_concept][meaning]["definition"]] * n_hypernyms)
        # Add meaning examples
        if len(dataset_synonyms_hypernyms[original_concept][meaning]["examples"]) != 0:
            meaning_examples.extend([dataset_synonyms_hypernyms[original_concept][meaning]["examples"]] * n_hypernyms)
        else:
            meaning_examples.extend([["not available"]] * n_hypernyms)
        # Add meaning names
        meanings.extend([meaning] * n_hypernyms)
        # Update total elements counter
        total_el += n_hypernyms
    # Add concept names
    concepts.extend([original_concept] * total_el)

100%|██████████| 4792/4792 [00:00<00:00, 32823.39it/s]


In [7]:
# Create the final dataframe
df = pd.DataFrame(
    zip(concepts, meanings, meaning_definitions, meaning_examples, 
        out_hypernym_names, out_hypernym_definitions, out_hypernym_examples),
    columns=["concepts", "meanings", "meanings_definitions", "meaning_examples",
            "hypernym_names", "hypernym_definitions", "hypernym_examples"]
)
df.head(2)

Unnamed: 0,concepts,meanings,meanings_definitions,meaning_examples,hypernym_names,hypernym_definitions,hypernym_examples
0,mountain,mountain.n.01,a land mass that projects well above its surro...,[not available],natural_elevation.n.01,a raised or elevated geological formation,[not available]
1,mountain,batch.n.02,(often followed by `of') a large number or amo...,"[a batch of letters, a deal of trouble, a lot ...",large_indefinite_quantity.n.01,an indefinite quantity that is above the avera...,[not available]


In [8]:
df['meaning_has_examples'] = df.meaning_examples.apply(lambda el: 'not available' not in el)
df['hypernym_has_examples'] = df.hypernym_examples.apply(lambda el: 'not available' not in el)
df.head(5)

Unnamed: 0,concepts,meanings,meanings_definitions,meaning_examples,hypernym_names,hypernym_definitions,hypernym_examples,meaning_has_examples,hypernym_has_examples
0,mountain,mountain.n.01,a land mass that projects well above its surro...,[not available],natural_elevation.n.01,a raised or elevated geological formation,[not available],False,False
1,mountain,batch.n.02,(often followed by `of') a large number or amo...,"[a batch of letters, a deal of trouble, a lot ...",large_indefinite_quantity.n.01,an indefinite quantity that is above the avera...,[not available],True,False
2,ski,ski.n.01,narrow wood or metal or plastic runners used i...,[not available],runner.n.09,device consisting of the parts on which someth...,[not available],False,False
3,ski,ski.v.01,move along on skis,"[We love to ski the Rockies, My children don't...",travel.v.01,"change location; move, travel, or proceed, als...","[How fast does your new car go?, We travelled ...",True,True
4,skier,skier.n.01,someone who skis,[not available],athlete.n.01,a person trained to compete in sports,[not available],False,False


### Save output

In [9]:
import os
OUTPUT_PATH = repo_root / "offline_datasets/processed_dataset/WordNet/"
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
df.to_parquet(os.path.join(OUTPUT_PATH, "common_gen_scraping.parquet"), index=False)

### Extract some stats
#### Hypernyms

**Definition**: a word with a broad meaning constituting a category into which words with more specific meanings fall; a superordinate. For example, colour is a hypernym of red.

Reporting as it is a relevant information when constructing pairs of positive/negative examples for WordChecker

In [10]:
hypernyms_info = df[['hypernym_names', 'hypernym_has_examples']].drop_duplicates()
print(f"The dataset contains {hypernyms_info.shape[0]:,} collective meanings")
print("Among them, how many has associated examples?")
hypernyms_info.value_counts("hypernym_has_examples")

The dataset contains 6,810 collective meanings
Among them, how many has associated examples?


hypernym_has_examples
False    3560
True     3250
Name: count, dtype: int64

#### Meanings

In [11]:
meanings_info = df[['meanings', 'meanings_definitions', 'hypernym_names', 'meaning_has_examples', 'hypernym_definitions']].drop_duplicates()
# Remove those rows in which we do not have an associated hypernym
filtered_meaning_info = meanings_info[meanings_info.hypernym_names != "not available"].copy()
print(f"The dataset contains {filtered_meaning_info.shape[0]:,} valid meanings (tuple 'meaning'+'hypernym')")
print("Among them, how many has associated examples?")
filtered_meaning_info.value_counts("meaning_has_examples")

The dataset contains 17,682 valid meanings (tuple 'meaning'+'hypernym')
Among them, how many has associated examples?


meaning_has_examples
True     9258
False    8424
Name: count, dtype: int64

### Lets check how many specification we have per hypernym

In [12]:
agg_by_hypernym = filtered_meaning_info.groupby(["hypernym_names", 'hypernym_definitions']).agg({
                                                                                                'meanings': list,
                                                                                                'meanings_definitions': list
                                                                                                }).reset_index()
agg_by_hypernym['n_meanings'] = agg_by_hypernym.meanings.apply(lambda el: len(el))
agg_by_hypernym.sort_values(by='n_meanings', ascending=False).head(3)

Unnamed: 0,hypernym_names,hypernym_definitions,meanings,meanings_definitions,n_meanings
973,change.v.01,cause to change; make different; cause a trans...,"[equal.v.03, cloud.v.06, shade.v.04, put.v.02,...","[make equal, uniform, corresponding, or matchi...",136
538,be.v.01,"have the quality of being; (copula, used with ...","[lie.v.04, hang.v.06, hail.v.02, fall.v.04, to...",[be and remain in a particular state or condit...,102
6368,travel.v.01,"change location; move, travel, or proceed, als...","[ski.v.01, carry.v.36, drive.v.02, drive.v.14,...","[move along on skis, cover a certain distance ...",95


#### See one example

In [16]:
first_row = agg_by_hypernym.sort_values(by='n_meanings', ascending=False).iloc[0]
print(f"All hypernyms associated to the concept: '{first_row.hypernym_names}'")
for meaning, definition in zip(first_row.meanings, first_row.meanings_definitions):
    print(f"\t{meaning} --> Definition: {definition}")

All hypernyms associated to the concept: 'change.v.01'
	equal.v.03 --> Definition: make equal, uniform, corresponding, or matching
	cloud.v.06 --> Definition: make less clear
	shade.v.04 --> Definition: vary slightly
	put.v.02 --> Definition: cause to be in a certain state; cause to be in a certain relation
	color.v.01 --> Definition: add color to
	fill.v.01 --> Definition: make full, also in a metaphorical sense
	accelerate.v.02 --> Definition: cause to move faster
	lend.v.01 --> Definition: bestow a quality on
	commercialize.v.02 --> Definition: make commercial
	sauce.v.03 --> Definition: add zest or flavor to, make more interesting
	decorate.v.01 --> Definition: make more attractive by adding ornament, colour, etc.
	port.v.08 --> Definition: modify (software) for use on a different machine or platform
	bring.v.05 --> Definition: bring into a different state
	lift.v.10 --> Definition: raise in rank or condition
	shape.v.03 --> Definition: give shape or form to
	turn.v.16 --> Definiti