In [120]:
import numpy as np
import pandas as pd
import json
import os

In [121]:
# Project root = parent directory of the current working directory; every data path below is built from it.
current_dir = os.getcwd()
root_path = os.path.dirname(current_dir)

## EXTRACT SAMPLES FROM LILA (they use Task 7 from NumGLUE)

In [122]:
# Location of the NumGLUE Type 7 (crowdsourced) JSON file inside the LILA data folder.
lila_path = os.path.join(root_path, "data", "lila", "all", "NumGLUE_Type_7_crowdsourced.json")

# JSON is UTF-8 by specification; pass the encoding explicitly so decoding does not
# depend on the platform's default locale encoding.
with open(lila_path, 'r', encoding="utf-8") as f:
    lila_equate = json.load(f)

In [123]:
# Inspect the top-level keys of the loaded JSON object.
lila_equate.keys()

dict_keys(['Source', 'Categories', 'Instances', 'Metadata'])

In [124]:
# Keep only the "Instances" entry. NOTE: this rebinds `lila_equate` from the loaded dict to its
# instances, so the cell is not idempotent -- re-run the loading cell before re-running this one.
lila_equate = lila_equate["Instances"]

In [125]:
# Number of instances available.
len(lila_equate)

6325

In [126]:
# Fields present on the first instance.
lila_equate[0].keys()

dict_keys(['Input', 'Output Program', 'Output Answer', 'split'])

In [127]:
# Raw "Input" string of the first instance; both statements and the options are packed into this one string.
lila_equate[0]["Input"]

' "statement 1": In a deck of less than 72 cards , how many ways are there to select 13 Spade and 13 heart cards without repetition ?, "statement 2" :In a deck of 52 cards , how many ways are there to select 13 Spade and 13 heart cards without repetition ?, "options: " Entailment or contradiction or neutral?'

In [128]:
# First entry of "Output Program" for the second instance.
lila_equate[1]["Output Program"][0]

"RajeshHaveQuestionsS1 = 41 \nRajeshHaveQuestionsS2 = 31\nif RajeshHaveQuestionsS1 is None or RajeshHaveQuestionsS2 is None:\n   print('neutral')\nelif RajeshHaveQuestionsS1==RajeshHaveQuestionsS2:\n      print('Entailment')\nelif RajeshHaveQuestionsS1!=RajeshHaveQuestionsS2:\n     print('contradiction')"

In [129]:
# Split every LILA instance into premise / hypothesis and route it to its split list.
train, val, test = [], [], []
for instance in lila_equate:
    inputs = instance["Input"]
    # Everything before the separator holds statement 1, everything after holds statement 2 (+ options).
    parts = inputs.split(", \"statement 2\" :")
    try:
        raw_premise, raw_hypothesis = parts[0], parts[1]
        record = {
            "premise": raw_premise.split("\"statement 1\": ")[-1],
            "hypothesis": raw_hypothesis.split(", \"options: \"")[0],
            "lila_label": instance["Output Answer"][0],
            "lila_script": instance["Output Program"][0],
        }
        split_name = instance["split"]
        # "train" -> train, "dev" -> val, any other value -> test
        target = train if split_name == "train" else (val if split_name == "dev" else test)
        target.append(record)
    except IndexError:
        # Raised when the separator is missing (or an answer/program list is empty); report and skip.
        print(f"ERROR extracting inputs:\n{inputs}")

In [130]:
# Number of extracted records per split, printed in the order train, val, test.
print(len(train), len(val), len(test))

4302 806 1217


In [131]:
import re

def clean_text(text: str):
    """
    Normalise text so it can be used as a matching key.

    Lower-cases the text, deletes newline characters (they are removed, not replaced by a
    space), collapses every remaining run of whitespace into a single space and trims both ends.
    :param text: the raw text
    :return: the normalised text
    """
    lowered = text.lower()
    without_newlines = lowered.replace("\n", "")
    collapsed = re.sub(r'\s+', ' ', without_newlines)
    return collapsed.strip()

In [132]:
# One DataFrame per split, built from the record lists `train`, `test` and `val`.
lila_train, lila_test, lila_val = (pd.DataFrame(records) for records in (train, test, val))
lila_val.head()

Unnamed: 0,premise,hypothesis,lila_label,lila_script
0,"In a deck of less than 72 cards , how many way...","In a deck of 52 cards , how many ways are ther...",neutral,DeckOfCardStatement1= None \nDeckOfCardStateme...
1,If out of 41 questions solved by Rajesh 37 que...,If out of 31 questions solved by Rajesh 37 que...,contradiction,RajeshHaveQuestionsS1 = 41 \nRajeshHaveQuestio...
2,Alice drives at a constant speed of 30 km per ...,Alice drives at a constant speed of 20 km per ...,contradiction,DrivesSpeedS1 = 30\nDrivesSpeedS2 = 20\nif Dri...
3,Mary is 22 years younger than Albert,Mary is 72 years younger than Albert,contradiction,AgeS1 = 22\nAgeS2 = 72\nif AgeS1 is None or Ag...
4,Assuming that Karen drives at an average speed...,Assuming that Karen drives at an average speed...,neutral,\nDriveS2 = 60\nDriveS1 = None\nif DriveS1 is ...


In [133]:
# Lower-case the labels in every split. The previous version applied the transformation
# three times to `lila_val` and never touched `lila_train` / `lila_test`.
lila_train["lila_label"] = lila_train["lila_label"].apply(lambda label: label.lower())
lila_test["lila_label"] = lila_test["lila_label"].apply(lambda label: label.lower())
lila_val["lila_label"] = lila_val["lila_label"].apply(lambda label: label.lower())

In [134]:
# Add normalised `clean_premise` / `clean_hypothesis` columns to every split.
for col in ["premise", "hypothesis"]:
    clean_col = f"clean_{col}"
    for split_df in (lila_train, lila_test, lila_val):
        split_df[clean_col] = split_df[col].apply(clean_text)

In [135]:
# Within each split keep only the first occurrence of every (clean_premise, clean_hypothesis) pair.
pair_columns = ['clean_premise', 'clean_hypothesis']
lila_train = lila_train.drop_duplicates(subset=pair_columns)
lila_test = lila_test.drop_duplicates(subset=pair_columns)
lila_val = lila_val.drop_duplicates(subset=pair_columns)

In [136]:
# Row counts after de-duplication, printed in the order train, test, val.
print(lila_train.shape[0], lila_test.shape[0], lila_val.shape[0])

4302 1217 806


### Save train-val-test splits from LILA

In [137]:
# Only the normalised text columns are kept; the raw premise / hypothesis are dropped from every split.
raw_text_columns = ["premise", "hypothesis"]
lila_train = lila_train.drop(columns=raw_text_columns)
lila_test = lila_test.drop(columns=raw_text_columns)
lila_val = lila_val.drop(columns=raw_text_columns)
lila_test.columns

Index(['lila_label', 'lila_script', 'clean_premise', 'clean_hypothesis'], dtype='object')

In [138]:
# Write each split to data/lila/ as a CSV file without the index column.
lila_out_dir = os.path.join(root_path, "data", "lila")
for file_name, split_df in (("lila_train.csv", lila_train),
                            ("lila_test.csv", lila_test),
                            ("lila_val.csv", lila_val)):
    split_df.to_csv(os.path.join(lila_out_dir, file_name), index=False)

### Check for duplicates across the entire LILA data

In [109]:
# All extracted records in one list, in the order train, test, val.
lila_all = [*train, *test, *val]

In [114]:
lila_all_df = pd.DataFrame(lila_all)
# The raw records only carry `premise` / `hypothesis`; add the normalised columns here so the
# duplicate checks on `clean_premise` / `clean_hypothesis` do not raise KeyError on a fresh kernel.
for col in ["premise", "hypothesis"]:
    lila_all_df[f"clean_{col}"] = lila_all_df[col].apply(clean_text)
lila_all_df.shape[0]

6325

In [113]:
# Count rows whose (clean_premise, clean_hypothesis) pair repeats an earlier row of the combined frame.
lila_all_df[lila_all_df.duplicated(subset=["clean_premise", "clean_hypothesis"])].shape[0]

0

In [116]:
# Same duplicate count as the preceding cell, evaluated again.
lila_all_df[lila_all_df.duplicated(subset=["clean_premise", "clean_hypothesis"])].shape[0]

0

## Merge LILA samples with EQUATE samples per EQUATE dataset

In [139]:
# (rows, columns) of the LILA training frame that is merged with EQUATE below.
lila_train.shape

(4302, 4)

In [115]:
def extract_completion(full_script: str):
    """
    Extract only the script from a saved .py file, i.e. remove the header comments
    containing the premise, hypothesis and EQUATE label.
    :param full_script: the script as saved in a .py file in `data/generated/[dataset]/gpt4`
    :return: the script without the removed parts, or np.nan for a missing script
             (None, "-", NaN, or a value that is not a string)
    """
    if full_script is None or full_script == "-" or pd.isna(full_script):
        return np.nan
    try:
        lines = full_script.split("\n")
    except AttributeError:
        # Not a string: report it and treat it as missing. Previously execution continued
        # after the print and crashed with UnboundLocalError because `lines` was never assigned.
        print("#############", "\n", full_script)
        return np.nan
    idx = 0
    for idx, line in enumerate(lines):
        if line.startswith("# Golden Label:"):
            break
    # If no "# Golden Label:" line exists, idx ends on the last line and the slice below is empty.
    return "\n".join(lines[idx+3:-1])  # skip label and 2 blank lines; the final line is dropped as well

def add_script_to_df(df: pd.DataFrame, dataset:str):
    """
    Attach the generated script of every sample to `df`.

    Reads `data/generated/<dataset>/gpt4/results_overview.csv`, prints how many rows have "-" as
    `py_file_content` and which `sample_index` values they belong to, derives a `completion`
    column via `extract_completion`, and left-joins the result onto `df` by `sample_index`.
    :param df: frame containing a `sample_index` column
    :param dataset: name of the dataset folder under `data/generated`
    :return: `df` merged with the overview columns, without `llm_answer` and `py_file_content`
    """
    overview_path = os.path.join(root_path, "data", "generated", dataset, "gpt4", "results_overview.csv")
    overview = pd.read_csv(overview_path)

    # Rows for which no script was stored ("-" placeholder).
    without_script = overview[overview["py_file_content"] == "-"]
    print(without_script.shape[0])
    print(without_script["sample_index"].unique())

    overview["completion"] = overview["py_file_content"].apply(extract_completion)
    merged = pd.merge(df, overview, on="sample_index", how="left")
    return merged.drop(columns=["llm_answer", "py_file_content"])

In [145]:
datasets = ["NewsNLI", "RTE_Quant", "RedditNLI", "StressTest", "AWPNLI"]

for dataset in datasets:
    print(f"#######{dataset}#######")
    equate_df = pd.read_csv(os.path.join(root_path, "data", "equate_labelled", f"cleaned_{dataset}_gpt4.csv"))
    print(f"Total instances: {equate_df.shape[0]}")
    equate_df = add_script_to_df(equate_df, dataset)
    print(equate_df.columns)
    print(equate_df.shape[0])

    # EQUATE samples that also occur in the LILA train / test split (matched on the normalised text).
    equate_lila_train = pd.merge(equate_df, lila_train, on=["clean_premise", "clean_hypothesis"], how="inner")
    equate_lila_test = pd.merge(equate_df, lila_test, on=["clean_premise", "clean_hypothesis"], how="inner")
    has_lila_test = equate_lila_test.shape[0] > 0

    if has_lila_test:
        print(f"Test: {equate_lila_test.shape[0]}")
        print(f"Test: {equate_lila_test['lila_label'].value_counts()}")
        # Everything that is not part of the LILA test split stays available for train/val.
        rest_equate = equate_df[~equate_df["sample_index"].isin(equate_lila_test["sample_index"].unique())]
        print(f"Available for FT: {rest_equate[rest_equate['golden_label'] == rest_equate['generated_label']].shape[0]}")
    else:
        rest_equate = equate_df
    print(f"Rest (train/val): {rest_equate.shape[0]}")

    output_path = os.path.join(root_path, "data", "lila-equate", dataset)
    os.makedirs(output_path, exist_ok=True)
    if dataset == "StressTest":
        rest_equate.to_csv(os.path.join(output_path, "train_val.csv"), index=False)
    else:
        rest_equate.to_csv(os.path.join(output_path, "all.csv"), index=False)

    # we need the train samples for code quality comparison
    if equate_lila_train.shape[0] > 0:
        equate_lila_train.to_csv(os.path.join(output_path, "cc_train.csv"), index=False)

    if has_lila_test:
        equate_lila_test.to_csv(os.path.join(output_path, "test.csv"), index=False)
        # The fine-tuning copy of the test split goes into the folder of the *current* dataset
        # (the path used to be hard-coded to "StressTest"), and that folder is created first
        # because to_csv does not create missing directories.
        finetuning_path = os.path.join(root_path, 'data', "finetuning", dataset)
        os.makedirs(finetuning_path, exist_ok=True)
        equate_lila_test.drop(columns=["lila_label", "lila_script"]).to_csv(
            os.path.join(finetuning_path, "test.csv"), index=False)

#######NewsNLI#######
Total instances: 963
0
[]
Index(['sample_index', 'generated_label', 'error_message', 'golden_label',
       'premise', 'hypothesis', 'clean_premise', 'clean_hypothesis',
       'completion'],
      dtype='object')
963
Rest (train/val): 963
#######RTE_Quant#######
Total instances: 165
0
[]
Index(['sample_index', 'generated_label', 'error_message', 'golden_label',
       'premise', 'hypothesis', 'clean_premise', 'clean_hypothesis',
       'completion'],
      dtype='object')
165
Rest (train/val): 165
#######RedditNLI#######
Total instances: 247
0
[]
Index(['sample_index', 'generated_label', 'error_message', 'golden_label',
       'premise', 'hypothesis', 'clean_premise', 'clean_hypothesis',
       'completion'],
      dtype='object')
247
Rest (train/val): 247
#######StressTest#######
Total instances: 6938
11
[5699. 5700. 5701. 5702. 5703. 5704. 5705. 5706. 5707. 5959. 5960.]
Index(['sample_index', 'generated_label', 'error_message', 'golden_label',
       'premise',

## SPLITS

In [146]:
from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All reproduces exactly the same splits.
SPLIT_SEED = 42

total_datasets = {"StressTest": 6938, "AWPNLI": 722, "RTE_Quant": 166, "RedditNLI": 250, "NewsNLI": 968}
test_pctg = 0.25
test_sizes = {"StressTest": 1215, "AWPNLI": int(test_pctg*722), "RTE_Quant": int(test_pctg*166), "RedditNLI": int(test_pctg*250), "NewsNLI": int(test_pctg*968)}
total_test_set_size = np.sum([set_size for _, set_size in test_sizes.items()])
print(total_test_set_size)
print(test_sizes)
# Share of every dataset in the combined test set.
test_pctgs = {ds: round(subset/total_test_set_size, 2) for ds, subset in test_sizes.items()}
print(test_pctgs)

total_ft = 0
for dataset, test_size in test_sizes.items():
    finetuning_dir = os.path.join(root_path, "data", "finetuning", dataset)
    os.makedirs(finetuning_dir, exist_ok=True)
    if dataset != "StressTest":
        # Carve a stratified test split out of all samples of this dataset.
        df = pd.read_csv(os.path.join(root_path, "data", "lila-equate", dataset, "all.csv"))
        train_val, test_df = train_test_split(df, test_size=test_size, stratify=df["golden_label"],
                                              random_state=SPLIT_SEED)
        test_df.to_csv(os.path.join(finetuning_dir, "test.csv"), index=False)
    else:
        # No test split is drawn for StressTest here; only its train_val.csv is read.
        train_val = pd.read_csv(os.path.join(root_path, "data", "lila-equate", dataset, "train_val.csv"))

    # Only samples whose generated label matches the golden label are used for fine-tuning.
    ft = train_val[train_val['golden_label'] == train_val['generated_label']]
    total_ft += ft.shape[0]
    print(f"{dataset} FT data: {ft.shape[0]}")
    # Distinct names (`train_df`, `val_df`, `test_df`) keep the `train` / `val` / `test`
    # record lists built earlier in the notebook from being overwritten.
    train_df, val_df = train_test_split(ft, test_size=0.15, stratify=ft["golden_label"],
                                        random_state=SPLIT_SEED)
    print(f"Train: {train_df.shape[0]}; Val: {val_df.shape[0]}")
    train_df.to_csv(os.path.join(finetuning_dir, "train.csv"), index=False)
    val_df.to_csv(os.path.join(finetuning_dir, "val.csv"), index=False)
print(f"Total: {total_ft}")

1740
{'StressTest': 1215, 'AWPNLI': 180, 'RTE_Quant': 41, 'RedditNLI': 62, 'NewsNLI': 242}
{'StressTest': 0.7, 'AWPNLI': 0.1, 'RTE_Quant': 0.02, 'RedditNLI': 0.04, 'NewsNLI': 0.14}
StressTest FT data: 3689
Train: 3135; Val: 554
AWPNLI FT data: 516
Train: 438; Val: 78
RTE_Quant FT data: 97
Train: 82; Val: 15
RedditNLI FT data: 135
Train: 114; Val: 21
NewsNLI FT data: 548
Train: 465; Val: 83
Total: 4985


### Generate prompt and completion features, which will form the dataset for fine-tuning

In [151]:
from prompts import format_prompt
import os
import pandas as pd

# dataset = "RTE_Quant"   # can also use `dataset` variable set above
# For every dataset and split: read the split file, drop samples without a script,
# build the `prompt` and fenced `completion` columns and save them under `completion/`.
for dataset in ["StressTest", "NewsNLI", "RTE_Quant", "RedditNLI", "AWPNLI"]:
    dataset_dir = os.path.join(root_path, "data", "finetuning", dataset)
    completion_dir = os.path.join(dataset_dir, "completion")
    for split in ["train", "test", "val"]:
        print(f"{split.upper()} set")
        df = pd.read_csv(os.path.join(dataset_dir, f"{split}.csv"))

        missing_scripts = df.loc[df["completion"].isna(), "sample_index"]
        print(f'Samples with no generated script: {missing_scripts.shape[0]}, ({missing_scripts.unique()})')
        print(f"Creating {split} file.")
        df.dropna(subset=["completion"], inplace=True)

        df["prompt"] = df.apply(
            lambda row: format_prompt(dataset.lower().replace("_", ""),
                                      {"premise": row["premise"], "hypothesis": row["hypothesis"]}),
            axis=1,
        )
        # Wrap every script in a python code fence.
        df["completion"] = df["completion"].apply(lambda script: f"```python\n{script}```")

        os.makedirs(completion_dir, exist_ok=True)
        # The test file additionally keeps `sample_index`.
        if split == "test":
            out_columns = ["sample_index", "completion", "prompt"]
        else:
            out_columns = ["completion", "prompt"]
        df[out_columns].to_csv(os.path.join(completion_dir, f"{split}.csv"), index=False)

TRAIN set
Samples with no generated script: 3, ([3256 4683 3594])
Creating train file.
TEST set
Samples with no generated script: 2, ([6454 5929])
Creating test file.
VAL set
Samples with no generated script: 3, ([6573 1563 1122])
Creating val file.


In [148]:
# Show the prompt of the first remaining row. Positional access is used because rows without
# a completion were dropped, so the index label 0 is not guaranteed to exist any more.
df["prompt"].iloc[0]

"### Instruction:\nYou need to reason about weather a hypothesis entails or contradicts a premise, by generating Python scripts. The scripts should classify the relation between the hypothesis and premise based on the quantitative and textual information mentioned in them. All the quantities and textual details in the hypothesis should be entailed by the information in the premise. First, manually extract all the individual quantities from both of the inputs, as valid numbers. Use the variable name to describe what the quantity measures, based on the context. Then, define a Python function that takes the extracted quantities as arguments. Within the function, use these quantities to perform computations based on the context of the premise and hypothesis. Finally, compare the resulting variables to determine the relationship. If the comparison indicates entailment, return True; for contradiction return False. Remember to include brief comments in the script to explain each step of the r

In [20]:
# df["text"] = df.apply(lambda row: f'{row["prompt"]}\n### Response:\n```python\n{row["completion"]}```', axis=1)

In [21]:
# df["text"].to_csv(os.path.join(root_path, "data", "finetuning", "AWPNLI", "text", "val_text.csv"), index=False)

In [22]:
# df["text"][0]

"### Instruction:\nYou need to reason about weather a hypothesis entails or contradicts a premise, by generating Python scripts. The scripts should classify the relation between the hypothesis and premise based on the quantitative and textual information mentioned in them. All the quantities and textual details in the hypothesis should be entailed by the information in the premise. First, manually extract all the individual quantities from both of the inputs, as valid numbers. Use the variable name to describe what the quantity measures, based on the context. Then, define a Python function that takes the extracted quantities as arguments. Within the function, use these quantities to perform computations based on the context of the premise and hypothesis. Finally, compare the resulting variables to determine the relationship. If the comparison indicates entailment, return True; for contradiction return False. Remember to include brief comments in the script to explain each step of the r

## COMBINE ALL DATASETS INTO ONE SET FOR TRAIN/VAL/TEST

In [152]:
# Fixed seed so the shuffled files are identical on every run.
SHUFFLE_SEED = 42
combined_dir = os.path.join(root_path, "data", "finetuning", "completion")
os.makedirs(combined_dir, exist_ok=True)

for split in ["train", "test", "val"]:
    # Collect the per-dataset frames first and concatenate once, instead of growing a
    # DataFrame inside the loop starting from an empty frame.
    per_dataset = []
    for dataset in ["StressTest", "AWPNLI", "NewsNLI", "RedditNLI", "RTE_Quant"]:
        samples = pd.read_csv(os.path.join(root_path, "data", "finetuning", dataset, "completion", f"{split}.csv"))
        if split == "test":
            # Remember which dataset every test sample came from.
            samples["source"] = dataset.lower().replace("_", "")
        per_dataset.append(samples)
    all_df = pd.concat(per_dataset, ignore_index=True)
    print(f"{split} set size: {all_df.shape[0]}")
    all_df = all_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)  # shuffle data
    all_df.to_csv(os.path.join(combined_dir, f"{split}_all.csv"), index=False)

train set size: 4231
test set size: 1738
val set size: 748
