In [9]:
# Load root folder of the Git project
from pathlib import Path
import subprocess

# Ask Git for the top-level directory of the current working tree and wrap
# it in a Path so later cells can build repository-relative paths.
git_toplevel = subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"], text=True
)
repo_root = Path(git_toplevel.strip())  # strip the trailing newline Git prints

In [10]:
import pandas as pd

# Parquet file holding the test split, addressed relative to the repo root.
PATH_TEST = repo_root.joinpath(
    "offline_datasets/processed_dataset/CommonNet/kfold_division/test.parquet"
)

# Load the full test split, report its row count, and preview two rows.
df = pd.read_parquet(PATH_TEST)
print(f"Original test set contains {len(df):,}")
df.head(2)

Original test set contains 54,896


Unnamed: 0,concepts,target,dataset,hash_rule_sample,positive_sample_id
0,home,all bedrooms of this home for sale have their ...,CommonGen,6570c08fba59de8c0275956ba351b4f8,106a6c241b8797f52e1e77317b96a201
1,home,heavy smoke was still coming from the home abo...,CommonGen,c66edff2eab85ab3642bcbdfb1533c23,12de3a4dab98ef8a7d67aace8150b540-015f28b9df1bd...


In [11]:
# Restrict the test split to rows whose `dataset` column equals "CommonGen".
# The explicit copy keeps `filtered_df` independent of `df`.
filtered_df = df.loc[df["dataset"].eq("CommonGen")].copy()
print(f"The filtered dataset contains {len(filtered_df):,} elements")

The filtered dataset contains 47,904 elements


In [12]:
# Preview the first two rows of the CommonGen-only frame.
filtered_df.head(2)

Unnamed: 0,concepts,target,dataset,hash_rule_sample,positive_sample_id
0,home,all bedrooms of this home for sale have their ...,CommonGen,6570c08fba59de8c0275956ba351b4f8,106a6c241b8797f52e1e77317b96a201
1,home,heavy smoke was still coming from the home abo...,CommonGen,c66edff2eab85ab3642bcbdfb1533c23,12de3a4dab98ef8a7d67aace8150b540-015f28b9df1bd...


In [13]:
from core.data_extraction.utils import export_k_fold_dataset
# Same directory the test parquet was read from.
OUTPUT_DIR = repo_root.joinpath(
    "offline_datasets/processed_dataset/CommonNet/kfold_division/"
)

# Hand the CommonGen-only frame to the project exporter under the
# "filtered_test" partition name. No fold index is passed.
# NOTE(review): presumably this writes the partition into OUTPUT_DIR;
# confirm against export_k_fold_dataset.
export_k_fold_dataset(
    partition="filtered_test",
    fold_idx=None,
    df=filtered_df,
    output_dir=OUTPUT_DIR,
)

Also extract a single-rule dataset to check whether the model memorizes only certain patterns.

In [14]:
from core.data_extraction.utils import create_positive_sample_id

# Keep only the rows whose `positive_sample_id` contains no "-" separator.
# NOTE(review): judging by the previews in this notebook, multi-rule ids look
# like several hashes joined with "-", so an id without "-" is taken to mean a
# single rule; confirm against create_positive_sample_id.
#
# The mask is computed directly on `filtered_df` with a vectorised string
# method: this avoids a row-wise Python lambda and a redundant deep copy of
# the whole frame before filtering. `regex=False` makes "-" a literal match.
is_single_rule = ~filtered_df["positive_sample_id"].str.contains("-", regex=False)

# Single explicit copy so the result is independent of `filtered_df`.
single_rule_dataset = filtered_df[is_single_rule].copy()
print(f"Single rule dataset contains {single_rule_dataset.shape[0]:,} elements")
single_rule_dataset.head(5)

Single rule dataset contains 7,081 elements


Unnamed: 0,concepts,target,dataset,hash_rule_sample,positive_sample_id
0,home,all bedrooms of this home for sale have their ...,CommonGen,6570c08fba59de8c0275956ba351b4f8,106a6c241b8797f52e1e77317b96a201
38,leave,A red fire hydrant surrounded by red leaves.,CommonGen,b60379baf81163b98eae50fc1f126009,6c374e70334072aeeb62ed46ea987838
45,add,fans add to the colour on show at their clash .,CommonGen,9e43a9b2f279336eb9ec2d5311d89e88,34ec78fcc91ffb1e54cd85e4a0924332
52,leave,The band has left the building.,CommonGen,f2e543f4f3866087293e5cf622b5e8c0,6c374e70334072aeeb62ed46ea987838
73,hang,posters hanging on the wall above the green couch,CommonGen,5ecd5c9971f08f761d543b1d371d8b3d,8aaf938064ccbc2f6989eb543beeaca5


In [15]:
# Hand the single-rule subset to the project exporter under its own
# partition name, using the same output directory and no fold index.
# NOTE(review): presumably this writes the partition into OUTPUT_DIR;
# confirm against export_k_fold_dataset.
export_k_fold_dataset(
    partition="single_rules_test_set",
    fold_idx=None,
    df=single_rule_dataset,
    output_dir=OUTPUT_DIR,
)