In [1]:
# Load root folder of the Git project
from pathlib import Path
import subprocess

repo_root = Path(
    subprocess.check_output(
        ["git", "rev-parse", "--show-toplevel"],
        text=True
    ).strip()
)

In [2]:
import datasets
DATASET = "GEM/common_gen"
CACHE_DIR = repo_root / "offline_datasets/original/CommonGen/"
data = datasets.load_dataset(DATASET, cache_dir=CACHE_DIR)
data

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['gem_id', 'gem_parent_id', 'concept_set_id', 'concepts', 'target', 'references'],
        num_rows: 67389
    })
    validation: Dataset({
        features: ['gem_id', 'gem_parent_id', 'concept_set_id', 'concepts', 'target', 'references'],
        num_rows: 993
    })
    test: Dataset({
        features: ['gem_id', 'gem_parent_id', 'concept_set_id', 'concepts', 'target', 'references'],
        num_rows: 1497
    })
    challenge_train_sample: Dataset({
        features: ['gem_id', 'gem_parent_id', 'concept_set_id', 'concepts', 'target', 'references'],
        num_rows: 500
    })
    challenge_validation_sample: Dataset({
        features: ['gem_id', 'gem_parent_id', 'concept_set_id', 'concepts', 'target', 'references'],
        num_rows: 500
    })
    challenge_test_scramble: Dataset({
        features: ['gem_id', 'gem_parent_id', 'concept_set_id', 'concepts', 'target', 'references'],
        num_rows: 500
    })
})

## Convert dataset to dataframes

In [3]:
import pandas as pd

df_train = pd.DataFrame(data["train"])
df_valid = pd.DataFrame(data["validation"])
df_test = pd.concat([pd.DataFrame(data["challenge_train_sample"]),
                     pd.DataFrame(data["challenge_validation_sample"])])
df_test.head(2)

Unnamed: 0,gem_id,gem_parent_id,concept_set_id,concepts,target,references
0,common_gen-challenge_train_sample-0,common_gen-train-56135,25441,"[bathroom, include, shower, toilet]",The interior of a modern bathroom including to...,[]
1,common_gen-challenge_train_sample-1,common_gen-train-57322,25968,"[image, number, step, title]",image titled change the number of recent items...,[]


## Now get all the single concepts  

In [4]:
training_concepts = df_train["concepts"].explode("concepts").to_list()
validation_concepts = df_valid["concepts"].explode("concepts").to_list()
test_concepts = df_test["concepts"].explode("concepts").to_list()
full_concepts = set(training_concepts + validation_concepts + test_concepts)
print(f"The dataset contains {len(full_concepts):,} unique concepts.")

The dataset contains 4,792 unique concepts.


## Now, let's focus on the test samples
- We want to make sure to randomly create samples respecting the rules and other not respecting them
- Each sample here must contains hundreds, if not thousands, of rules

In [5]:
import random
from tqdm import tqdm

# Set the seed for reproducibility
SEED = 29
random.seed(SEED)

n_rules = [5, 10, 100, 500, 1_000]
p_contains_world = 0.5
output_dict = {}

max_rules = max(n_rules)

# We will create different versions of the datasets (with different number of rules)
for n_rule in tqdm(n_rules, total=len(n_rules), desc="Processing..."):
    output_dict[n_rule] = []

# IMPORTANT: build each test row once, then slice for each n_rule
for test_row_id in tqdm(range(df_test.shape[0]), desc="Building nested pools..."):
    test_row = df_test.iloc[test_row_id]
    test_concepts = list(test_row["concepts"])
    test_sentence = test_row["target"]

    # Decide if this row is a "match" row (same decision reused for all pool sizes)
    contains_words = (random.random() < p_contains_world)

    if contains_words:
        # choose how many concepts are actually matched (fixed for this row across pool sizes)
        # (kept similar to your original; adjust if you want to allow all concepts)
        n_matching_concepts = random.randint(1, len(test_concepts) - 1)
        matching_concepts = test_concepts[:n_matching_concepts]
    else:
        matching_concepts = []

    # Build ONE long ordered list of distractors for this row (so pools are nested)
    samplable_concepts = list(full_concepts - set(test_concepts))
    # sample enough distractors to support the largest pool size
    distractors = random.sample(samplable_concepts, k=max_rules)

    for n_rule in n_rules:
        # Number of distractors needed at this pool size
        n_distractors_needed = n_rule - len(matching_concepts)
        if n_distractors_needed < 0:
            # If matching_concepts exceeds n_rule, truncate matching_concepts for this pool size
            # (rare unless n_rule < len(matching_concepts))
            used_matching = matching_concepts[:n_rule]
            used_distractors = []
        else:
            used_matching = matching_concepts
            used_distractors = distractors[:n_distractors_needed]

        sampled_concepts = used_matching + used_distractors

        # Shuffle presentation order (seeded random makes this reproducible)
        shuffled_concepts = sampled_concepts[:]
        random.shuffle(shuffled_concepts)

        sample = {
            "sentence": test_sentence,
            "concepts": shuffled_concepts,
            "contains_concept": contains_words,
            "concepts_contained": used_matching,
        }
        output_dict[n_rule].append(sample)

for n_rule in n_rules:
    print(f"Generated {len(output_dict[n_rule]):,} examples for the scenario with {n_rule} rules")

Processing...: 100%|██████████| 5/5 [00:00<00:00, 28263.50it/s]
Building nested pools...: 100%|██████████| 1000/1000 [00:02<00:00, 383.25it/s]

Generated 1,000 examples for the scenario with 5 rules
Generated 1,000 examples for the scenario with 10 rules
Generated 1,000 examples for the scenario with 100 rules
Generated 1,000 examples for the scenario with 500 rules
Generated 1,000 examples for the scenario with 1000 rules





### Now, an example

In [6]:
EXAMPLE_ID = 36
SCENARIO = 5
print("Contains rule: " + str(output_dict[SCENARIO][EXAMPLE_ID]["contains_concept"]))
if len(output_dict[SCENARIO][EXAMPLE_ID]["concepts_contained"])!=0:
    print(f"\tContains the concepts: {output_dict[SCENARIO][EXAMPLE_ID]['concepts_contained']}")
print()
print("Task: I need to check if a sentence contains one of the following set of words. Just answer True or False.")
print("Sentence: " + output_dict[SCENARIO][EXAMPLE_ID]["sentence"])
concepts = output_dict[SCENARIO][EXAMPLE_ID]["concepts"]
print(f"\nRules ({len(concepts)}): " + " - ".join(concepts))

Contains rule: False

Task: I need to check if a sentence contains one of the following set of words. Just answer True or False.
Sentence: how did horses and traders change the way of life of ethnicity in fiction

Rules (5): poetry - leader - ornate - glacier - cricket


## Save output files

In [8]:
import json
import os
OUTPUT_DIR = repo_root / "offline_datasets/processed_dataset/CommonGen/test_many_rules/"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
for scenario, item in output_dict.items():
    full_path = OUTPUT_DIR / f"exact_matching_{scenario}.json"
    with open(full_path, "w+") as f:
        json.dump(item, f, indent=4)