In [4]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before
# each cell runs, so edits to local packages (e.g. elk_generalization) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch

from elk_generalization.results import viz

# Datasets

### Quirky Integer Comparison (QIC)


In [None]:
from elk_generalization.datasets.integer_comparison_dataset import IntComparisonDataset

# Generate the Quirky Integer Comparison dataset from `n` base examples and
# persist it through `save_balanced`.
n = 300_000
qic_kwargs = dict(
    base_examples=n,
    err_symbols=('<', '>'),
    working_dir=r"\experiments\quirky_intcomparison",
)
ds_full = IntComparisonDataset(**qic_kwargs)
# NOTE(review): 4096 / 1024 are presumably the sizes of the two saved splits -
# confirm against IntComparisonDataset.save_balanced.
ds_full.save_balanced(4096, 1024)


In [None]:
# Create datasets with a "neg_statement" column next to every "statement".
# The negated statements could also be taken directly from the source CSVs;
# this cell exists because downstream code needs a single DatasetDict that
# holds the non-negated and the negated sample side by side.
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset

# Raw strings: the backslashes are Windows path separators, not escape
# sequences (the un-prefixed form only worked because Python keeps unknown
# escapes such as "\d" or "\g" literally, and warns about it on 3.12+).
# NOTE(review): this save_dir is relative while the other cells use a path
# anchored at the drive root - confirm which one is intended.
save_dir = Path(r"experiments\diversify\got_remake")

# (csv file, phrase to search for, phrase that replaces it)
neg_dataset_configs = [
    (r"datasets\geometry-of-truth\cities.csv", " is in ", " is not in "),
    (r"datasets\geometry-of-truth\larger_than.csv", " is larger than ", " is smaller than "),
    (r"datasets\geometry-of-truth\sp_en_trans.csv", " means ", " does not mean "),
]

# 0.0 -> save everything as a single "full" split; otherwise the fraction of
# rows that goes into the "test" split.
test_size = 0.0
for csv_path, target, replacement in neg_dataset_configs:
    csv_file = Path(csv_path)
    df = pd.read_csv(csv_file)

    # Add neg_statement. regex=False forces a literal substring replacement
    # independent of the pandas version (the default of `regex` changed in 2.0).
    df["neg_statement"] = df["statement"].str.replace(target, replacement, regex=False)

    # A statement without the target phrase is left untouched, which would
    # yield a "contrast pair" of two identical sentences - make that visible.
    n_unchanged = int((df["neg_statement"] == df["statement"]).sum())
    if n_unchanged:
        print(f"Warning: {n_unchanged} statement(s) in {csv_file.name} do not contain {target!r} and were not negated")

    if test_size == 0.0:
        print(f"Saving {csv_file.name} (n={len(df)})")
        dataset_dict_tuples = DatasetDict({"full": Dataset.from_pandas(df)})
    else:
        # Split in train and test
        train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)

        # Convert DataFrames to Datasets. preserve_index=False keeps the
        # shuffled pandas index from being written as "__index_level_0__".
        train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
        test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

        print(f"Saving {csv_file.name} (n_train={len(train_dataset)}, n_test={len(test_dataset)})")
        dataset_dict_tuples = DatasetDict({"train": train_dataset, "test": test_dataset})

    # One output directory per source CSV, named after the file stem.
    save_path = save_dir / csv_file.stem
    dataset_dict_tuples.save_to_disk(save_path)


In [None]:
# Create datasets from GoT without tuples: every CSV is converted as-is into a
# DatasetDict (no "neg_statement" column is added in this cell).
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset

# Raw string: the backslashes are path separators, not escape sequences.
save_dir = Path(r"\experiments\diversify\got_remake")
dataset_paths = [
    r"\datasets\geometry-of-truth\cities_cities_conj.csv",
    r"\datasets\geometry-of-truth\cities_cities_disj.csv",
    r"\datasets\geometry-of-truth\common_claim_true_false.csv",
    r"\datasets\geometry-of-truth\companies_true_false.csv",
    r"\datasets\geometry-of-truth\counterfact_true_false.csv",
    r"\datasets\geometry-of-truth\neg_cities.csv",
    r"\datasets\geometry-of-truth\neg_sp_en_trans.csv",
    r"\datasets\geometry-of-truth\smaller_than.csv",
]

# 0.0 -> save everything as a single "full" split; otherwise the fraction of
# rows that goes into the "test" split.
test_size = 0.0
for csv_path in dataset_paths:
    csv_file = Path(csv_path)
    df = pd.read_csv(csv_file)

    if test_size == 0.0:
        print(f"Saving {csv_file.name} (n={len(df)})")
        dataset_dict = DatasetDict({"full": Dataset.from_pandas(df)})
    else:
        # Split in train and test
        train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)

        # Convert DataFrames to Datasets. preserve_index=False keeps the
        # shuffled pandas index from being written as "__index_level_0__".
        train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
        test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

        print(f"Saving {csv_file.name} (n_train={len(train_dataset)}, n_test={len(test_dataset)})")
        dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

    # One output directory per source CSV, named after the file stem.
    save_path = save_dir / csv_file.stem
    dataset_dict.save_to_disk(save_path)

In [None]:
# Create datasets from GoT/counterfact_true_false with tuples.
# Three datasets are written: the rows at even positions ("counterfact_true"),
# the rows at odd positions ("counterfact_false"), and "counterfact_tuples",
# where each even row carries the following row's statement as "neg_statement".
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset

# Raw strings: the backslashes are path separators, not escape sequences.
csv_path = Path(r"\datasets\geometry-of-truth\counterfact_true_false.csv")
save_dir = Path(r"\experiments\diversify\got_remake")
# 0.0 -> save everything as a single "full" split; otherwise the fraction of
# rows that goes into the "test" split.
test_size = 0.0

df = pd.read_csv(csv_path)

# Create dfs with the elements at even/odd positions.
# NOTE(review): this assumes the CSV strictly alternates true, false, true, ...
# - the printed pair below is the only check of that assumption.
df_true = df.iloc[::2].reset_index(drop=True)
df_false = df.iloc[1::2].reset_index(drop=True)

# With an odd number of rows df_true is one longer than df_false, so the
# sampled index must be bounded by the shorter frame.
n_pairs = min(len(df_true), len(df_false))
if n_pairs:
    print("Sanitycheck: The following statements are contrast tuples:")
    index = np.random.randint(0, n_pairs)
    print(df_true.loc[index, "statement"])
    print(df_false.loc[index, "statement"])

# Pair every even row with the statement of the row that follows it, keep only
# those even rows, and drop a trailing row that has no partner. The index is
# reset so Dataset.from_pandas does not store it as "__index_level_0__"
# (the true/false frames above are reset the same way).
df = (
    df.assign(neg_statement=df["statement"].shift(-1))
    .iloc[::2]
    .dropna(subset=["neg_statement"])
    .reset_index(drop=True)
)

named_frames = [
    ("counterfact_tuples", df),
    ("counterfact_true", df_true),
    ("counterfact_false", df_false),
]
for csv_name, ds_df in named_frames:
    if test_size == 0.0:
        print(f"Saving {csv_name} (n={len(ds_df)})")
        dataset_dict = DatasetDict({"full": Dataset.from_pandas(ds_df)})
    else:
        # Split in train and test
        train_df, test_df = train_test_split(ds_df, test_size=test_size, random_state=42)

        # Convert DataFrames to Datasets. preserve_index=False keeps the
        # shuffled pandas index from being written as "__index_level_0__".
        train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
        test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

        print(f"Saving {csv_name} (n_train={len(train_dataset)}, n_test={len(test_dataset)})")
        dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

    dataset_dict.save_to_disk(save_dir / csv_name)

In [None]:
# Create tuple datasets from Azaria & Mitchells datasets.
# For every CSV two datasets are written: the original statements that could
# be negated, and "neg_<name>" with the negated statements and flipped labels.
# Row k of the first corresponds to row k of the second.
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset

save_dir = Path(r"\experiments\diversify_remake")
input_dir = Path(r"\datasets\mitchell_true_false")

# (csv name without extension, ordered list of (phrase, negated phrase)).
# The first phrase of the list that occurs in a statement is the one used.
configs = [
    (
        "animals_true_false",
        [
            (" has ", " does not have "),
            (" is ", " is not "),
            (" uses ", " does not use "),
            (" can ", " can not "),
            (" lacks ", " does not lack "),
            (" migrates ", " does not migrate ")
        ]
    ),
    (
        "inventions_true_false",
        [
            (" lived ", " did not live "),
            (" invented ", " did not invent ")
        ]
    ),
    (
        "elements_true_false",
        [
            (" has ", " does not have "),
            (" is ", " is not "),
            (" appears ", " does not appear "),
            (" forms ", " does not form "),
        ]
    ),
    (
        "facts_true_false",
        [
            (" has ", " does not have "),
            (" is ", " is not "),
            (" are ", " are not "),
            (" exclude ", " include "),
            (" can ", " can not "),
        ]
    )
]


def _negate_statement(statement, replacements):
    """Negate `statement` using the first phrase in `replacements` it contains.

    Only the first occurrence of that phrase is replaced. Returns None when
    none of the phrases occurs in the statement.
    """
    for orig, negation in replacements:
        if orig in statement:
            return statement.replace(orig, negation, 1)
    return None


for csv_name, replacements in configs:
    df = pd.read_csv(input_dir / (csv_name + ".csv"))

    negations = []
    # Index labels of the rows that could not be negated. Tracking them in a
    # list (instead of a lazily created "failed" column) also works when every
    # statement is negated successfully; the column lookup raised KeyError then.
    failed_rows = []
    for i, statement, label in zip(df.index, df["statement"], df["label"]):
        neg_statement = _negate_statement(statement, replacements)
        if neg_statement is None:
            print(f"Could not handle statement: {statement} from {csv_name}")
            failed_rows.append(i)
            continue

        negations.append({
            "statement": neg_statement,
            "label": int(not label)
        })
    failed = len(failed_rows)

    # Explicit columns so the frame is well-formed even if nothing was negated.
    neg_df = pd.DataFrame(negations, columns=["statement", "label"])

    # Remove failed rows and renumber, so that row k of df and row k of neg_df
    # belong together and no gapped index ends up in the saved dataset.
    df = df.drop(index=failed_rows).reset_index(drop=True)

    if len(neg_df):
        print("Examples: ")
        print(df.loc[0, "statement"])
        print(neg_df.loc[0, "statement"])

    print("Failures to negate: ", failed)

    dataset = Dataset.from_pandas(df)
    print(f"Saving {csv_name} (n={len(df)})")
    dataset_dict = DatasetDict({"full": dataset})
    dataset_dict.save_to_disk(save_dir / (csv_name))

    dataset = Dataset.from_pandas(neg_df)
    fname = "neg_" + csv_name
    print(f"Saving {fname} (n={len(dataset)})")
    dataset_dict = DatasetDict({"full": dataset})
    dataset_dict.save_to_disk(save_dir / (fname))
