In [1]:
# Load root folder of the Git project
from pathlib import Path
import subprocess

repo_root = Path(
    subprocess.check_output(
        ["git", "rev-parse", "--show-toplevel"],
        text=True
    ).strip()
)

### Loading CommonGen dataset

In [2]:
import datasets
import pandas as pd
DATASET = "GEM/common_gen"
CACHE_DIR = repo_root / "offline_datasets/original/CommonGen/"
data = datasets.load_dataset(DATASET, cache_dir=CACHE_DIR)
df_train = pd.DataFrame(data["train"])
df_valid = pd.DataFrame(data["validation"])
full_df = pd.concat([df_train, df_valid])[['concepts', 'target']]
print(f"CommonGen contains {full_df.shape[0]:,} elements")
full_df.head(2)

  from .autonotebook import tqdm as notebook_tqdm


CommonGen contains 68,382 elements


Unnamed: 0,concepts,target
0,"[mountain, ski, skier]",Skier skis down the mountain
1,"[mountain, ski, skier]",A skier is skiing down a mountain.


### Now we want to augment the dataset
Intuition is that each row is also an example of a smaller concept (e.g., 'Skier skis down the mountain' > ['mountain', 'sky', 'skier'], but also ['mountain', 'sky'])

- For each row, we're going to iterate from 1 to R concepts (where R is the number of concepts for that rule)
- For each tuple of `i` elements (`i` $\in$ R), we will sample `i` random elements from the concepts

Eventually, we'll have a dataset where each target is used possibly multiple times.

In [3]:
# Obtain unique concepts
tmp_df = full_df.copy()
tmp_df['string_concepts'] = tmp_df.concepts.apply(lambda el: " - ".join(sorted(el)))
unique_concepts = tmp_df.string_concepts.unique()
print(f'Dataset contains {len(unique_concepts):,} unique concepts')
unique_concepts[:5]

Dataset contains 33,462 unique concepts


array(['mountain - ski - skier', 'dog - tail - wag',
       'canoe - lake - paddle', 'pull - station - train',
       'eat - hay - horse'], dtype=object)

In [4]:
# Also, for each unique concept extract all associated targets 
targets_x_concept = tmp_df.groupby('string_concepts')['target'].apply(set)
print(f'Identified {targets_x_concept.shape[0]:,} groups of targets')

Identified 33,462 groups of targets


In [5]:
import random 
from tqdm import tqdm
SEED = 29
random.seed(SEED)
# Start iterating over all the rows
new_commonGen_df = []
for concept in tqdm(unique_concepts, desc="Creating new examples starting from current..."):
    list_concepts = concept.split(" - ") 
    # gather all samples associated to a given concept
    targets_for_concept = list(targets_x_concept.loc[concept])
    # for each concept, extract the number of micro-rules
    R_CONCEPTS = len(list_concepts)
    # with the number of micro-rules increasing, create new concepts as sub-sets of the existing ones
    for n_concepts_to_sample in range(1, R_CONCEPTS+1):
        # extract n_concepts_to_sample from the list of concepts
        sampled_concepts = random.sample(list_concepts, k=n_concepts_to_sample)
        new_commonGen_df.append({
            'concepts': sampled_concepts,
            'target': targets_for_concept
        })
new_commonGen_df = pd.DataFrame(new_commonGen_df)
print(f'New dataset has {new_commonGen_df.shape[0]:,} unique concepts')
new_commonGen_df.head(5)


Creating new examples starting from current...: 100%|██████████| 33462/33462 [00:01<00:00, 19317.91it/s]


New dataset has 112,088 unique concepts


Unnamed: 0,concepts,target
0,[skier],[Two skiers traveling down a snowy mountain on...
1,"[mountain, ski]",[Two skiers traveling down a snowy mountain on...
2,"[skier, ski, mountain]",[Two skiers traveling down a snowy mountain on...
3,[wag],"[a dog wags its tail with its heart, The dog i..."
4,"[tail, wag]","[a dog wags its tail with its heart, The dog i..."


In [6]:
# Eventually, explode the target column
commonGen_df = new_commonGen_df.explode('target')
print(f"New CommonGen dataset contains {commonGen_df.shape[0]:,} elements")
commonGen_df.head(5)

New CommonGen dataset contains 221,565 elements


Unnamed: 0,concepts,target
0,[skier],Two skiers traveling down a snowy mountain on ...
0,[skier],A skier is skiing down a mountain.
0,[skier],Skier skis down the mountain
0,[skier],Three skiers are skiing on a snowy mountain.
0,[skier],A skier in mid air on skis in the mountains


In [7]:
# Add a label to specify the belonging dataset
commonGen_df['dataset'] = 'CommonGen'
commonGen_df.head(2)

Unnamed: 0,concepts,target,dataset
0,[skier],Two skiers traveling down a snowy mountain on ...,CommonGen
0,[skier],A skier is skiing down a mountain.,CommonGen


### Save output dataset

In [8]:
import os
OUTPUT_PATH = repo_root / "offline_datasets/original/CommonNet/"
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
commonGen_df.to_parquet(os.path.join(OUTPUT_PATH, 'commonGen.parquet'), index=False)