In [1]:
# Load root folder of the Git project
from pathlib import Path
import subprocess

repo_root = Path(
    subprocess.check_output(
        ["git", "rev-parse", "--show-toplevel"],
        text=True
    ).strip()
)

In [3]:
import pandas as pd
WORDNET_INPUT_DATASET = repo_root / "offline_datasets/processed_dataset/WordNet/common_gen_scraping.parquet"
df_wordnet = pd.read_parquet(WORDNET_INPUT_DATASET)
print(f"Loaded dataset contains {df_wordnet.shape[0]:,} rows")
df_wordnet.head(5)

Loaded dataset contains 24,886 rows


Unnamed: 0,concepts,meanings,meanings_definitions,meaning_examples,hypernym_names,hypernym_definitions,hypernym_examples,meaning_has_examples,hypernym_has_examples
0,mountain,mountain.n.01,a land mass that projects well above its surro...,[not available],natural_elevation.n.01,a raised or elevated geological formation,[not available],False,False
1,mountain,batch.n.02,(often followed by `of') a large number or amo...,"[a batch of letters, a deal of trouble, a lot ...",large_indefinite_quantity.n.01,an indefinite quantity that is above the avera...,[not available],True,False
2,ski,ski.n.01,narrow wood or metal or plastic runners used i...,[not available],runner.n.09,device consisting of the parts on which someth...,[not available],False,False
3,ski,ski.v.01,move along on skis,"[We love to ski the Rockies, My children don't...",travel.v.01,"change location; move, travel, or proceed, als...","[How fast does your new car go?, We travelled ...",True,True
4,skier,skier.n.01,someone who skis,[not available],athlete.n.01,a person trained to compete in sports,[not available],False,False


### Get specific definitions > ones related to the single concepts

In [4]:
df_specific_definitions = df_wordnet.groupby("meanings_definitions")['concepts'].apply(lambda el: list(set(el))).reset_index()
print(f"Identified {df_specific_definitions.shape[0]:,} definitions for specific concepts")
df_specific_definitions.head(2)

Identified 19,527 definitions for specific concepts


Unnamed: 0,meanings_definitions,concepts
0,(American football) a complete play to advance...,[down]
1,(American football) a play in which a defensiv...,[mousetrap]


In [5]:
### Get stats on the number of concepts per target
n_concepts_x_target = df_specific_definitions['concepts'].apply(len)
n_concepts_x_target.describe()

count    19527.000000
mean         1.274338
std          0.662936
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         15.000000
Name: concepts, dtype: float64

### Get generic definitions > ones related to the concepts' hypernyms

In [6]:
df_generic_definitions = df_wordnet.groupby("hypernym_definitions")['concepts'].apply(lambda el: list(set(el))).reset_index()
print(f"Identified {df_generic_definitions.shape[0]:,} definitions for broader concepts")
df_generic_definitions.head(2)

Identified 6,801 definitions for broader concepts


Unnamed: 0,hypernym_definitions,concepts
0,(American football) a play by the offensive team,"[pass, run, tackle]"
1,(American football) a play in which a player a...,"[sweep, draw, rush, reverse, return]"


In [7]:
### Get stats on the number of concepts per target
n_concepts_x_target = df_generic_definitions['concepts'].apply(len)
n_concepts_x_target.describe()

count    6801.000000
mean        3.276871
std        12.613075
min         1.000000
25%         1.000000
50%         2.000000
75%         3.000000
max       946.000000
Name: concepts, dtype: float64

### For each dataset, select a subset of data we will use only for test

In [8]:
# First specific definitions train/test split
import os
TRAIN_PERCENTAGE = 0.9
OUTPUT_PATH = repo_root / "offline_datasets/processed_dataset/WordNet/"
n_specific_definitions = df_specific_definitions.shape[0]
df_tmp = df_specific_definitions.copy()
df_specific_definitions_train = df_tmp.iloc[:int(n_specific_definitions*TRAIN_PERCENTAGE)].copy()
df_specific_definitions_test = df_tmp.iloc[int(n_specific_definitions*TRAIN_PERCENTAGE):].copy()
print(f"Reserved {df_specific_definitions_train.shape[0]:,} specific definitions for train and {df_specific_definitions_test.shape[0]:,} for test")
df_specific_definitions_train.to_parquet(os.path.join(OUTPUT_PATH, "specific_definitions_train.parquet"), index=False)
df_specific_definitions_test.to_parquet(os.path.join(OUTPUT_PATH, "specific_definitions_test.parquet"), index=False)

Reserved 17,574 specific definitions for train and 1,953 for test


In [9]:
# Then, generic definitions train/test split
n_generic_definitions = df_generic_definitions.shape[0]
df_tmp = df_generic_definitions.copy()
df_generic_definitions_train = df_tmp.iloc[:int(n_generic_definitions*TRAIN_PERCENTAGE)].copy()
df_generic_definitions_test = df_tmp.iloc[int(n_generic_definitions*TRAIN_PERCENTAGE):].copy()
print(f"Reserved {df_generic_definitions_train.shape[0]:,} generic definitions for train and {df_generic_definitions_test.shape[0]:,} for test")
df_generic_definitions_train.to_parquet(os.path.join(OUTPUT_PATH, "generic_definitions_train.parquet"), index=False)
df_generic_definitions_test.to_parquet(os.path.join(OUTPUT_PATH, "generic_definitions_test.parquet"), index=False)

Reserved 6,120 generic definitions for train and 681 for test


### Now, get all the available examples

In [10]:
specific_examples = df_wordnet[['concepts', 'meaning_examples']].explode('meaning_examples')
# Filter the ones in which an example is 'not available'
valid_specific_examples = specific_examples[specific_examples.meaning_examples != 'not available'].copy()
print(f"Isolated other {valid_specific_examples.shape[0]:,} examples linking a word to a specific meaning")
valid_specific_examples.head(2)

Isolated other 24,496 examples linking a word to a specific meaning


Unnamed: 0,concepts,meaning_examples
1,mountain,a batch of letters
1,mountain,a deal of trouble


### Notice: it could be that some word shares the same example
Let's group by 'meaning_examples' and save also those examples for which it is the case!

In [11]:
composed_examples = valid_specific_examples.groupby('meaning_examples')['concepts'].apply(lambda el: sorted(list(set(el)))).reset_index()
# only keep samples with more than 1 concept
n_concepts_x_meaning = composed_examples.concepts.apply(len)
filtered_composed_examples = composed_examples[n_concepts_x_meaning>1].copy()
print(f"There are {filtered_composed_examples.shape[0]:,} meanings that relates to composed concepts")
filtered_composed_examples.head(2)

There are 5,074 meanings that relates to composed concepts


Unnamed: 0,meaning_examples,concepts
1,'tis now the very witching time of night,"[magic, wizard]"
2,...catapulted Einstein to the pinnacle of fame,"[elevation, height, peak, summit, top]"


## Join the resulting dataset

In [12]:
# Convert concepts to lists
valid_specific_examples['concepts'] = valid_specific_examples['concepts'].apply(lambda el: [el])
joined_df = pd.concat([valid_specific_examples, filtered_composed_examples])
print(f"Isolated other {joined_df.shape[0]:,} examples linking set of words to a specific meaning")
joined_df.head(2)

Isolated other 29,570 examples linking set of words to a specific meaning


Unnamed: 0,concepts,meaning_examples
1,[mountain],a batch of letters
1,[mountain],a deal of trouble


### Now, join the three datasets composing WordNet
Also add a label, to specify that those data come from the WordNet dataset

In [13]:
df_specific_definitions_train.rename({'meanings_definitions':'target'}, axis=1, inplace=True)
df_generic_definitions_train.rename({'hypernym_definitions':'target'}, axis=1, inplace=True)
joined_df.rename({'meaning_examples':'target'}, axis=1, inplace=True)
wordnet_df = pd.concat([df_specific_definitions_train, df_generic_definitions_train, joined_df])
wordnet_df['dataset'] = 'WordNet'
print(f"WordNet contains {wordnet_df.shape[0]:,} elements")
wordnet_df.head(2)

WordNet contains 53,264 elements


Unnamed: 0,target,concepts,dataset
0,(American football) a complete play to advance...,[down],WordNet
1,(American football) a play in which a defensiv...,[mousetrap],WordNet


### Export the new dataset

In [14]:
# Shuffle before saving!
SEED = 29
wordnet_df = wordnet_df.sample(frac=1, random_state=SEED)
wordnet_df.head(2)

Unnamed: 0,target,concepts,dataset
9164,crowd or draw together,[huddle],WordNet
1446,a cord that is drawn through eyelets or around...,[lace],WordNet


In [15]:
OUTPUT_PATH = repo_root / "offline_datasets/original/CommonNet/"
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
wordnet_df.to_parquet(os.path.join(OUTPUT_PATH, "wordNet.parquet"), index=False)