In [1]:
import numpy as np
import pandas as pd
import json
import os

In [2]:
root_path = os.path.dirname(os.getcwd())

## EXTRACT SAMPLES FROM LILA (they use Task 7 from NumGLUE)

In [71]:
lila_path = os.path.join(root_path, "data", "lila", "all", "NumGLUE_Type_7_crowdsourced.json")

with open(lila_path, 'r') as f:
    lila_equate = json.load(f)

In [141]:
lila_equate.keys()

dict_keys(['Source', 'Categories', 'Instances', 'Metadata'])

In [142]:
lila_equate = lila_equate["Instances"]

In [143]:
len(lila_equate)

6325

In [144]:
lila_equate[0].keys()

dict_keys(['Input', 'Output Program', 'Output Answer', 'split'])

In [145]:
lila_equate[0]["Input"]

' "statement 1": In a deck of less than 72 cards , how many ways are there to select 13 Spade and 13 heart cards without repetition ?, "statement 2" :In a deck of 52 cards , how many ways are there to select 13 Spade and 13 heart cards without repetition ?, "options: " Entailment or contradiction or neutral?'

In [146]:
lila_equate[1]["Output Program"][0]

"RajeshHaveQuestionsS1 = 41 \nRajeshHaveQuestionsS2 = 31\nif RajeshHaveQuestionsS1 is None or RajeshHaveQuestionsS2 is None:\n   print('neutral')\nelif RajeshHaveQuestionsS1==RajeshHaveQuestionsS2:\n      print('Entailment')\nelif RajeshHaveQuestionsS1!=RajeshHaveQuestionsS2:\n     print('contradiction')"

In [147]:
train, val, test = [], [], []
for instance in lila_equate:
    new_instance = dict()
    inputs = instance["Input"]
    premise_hypothesis = inputs.split(", \"statement 2\" :")
    try:
        premise, hypothesis = premise_hypothesis[0], premise_hypothesis[1]
        premise = premise.split("\"statement 1\": ")[-1]
        hypothesis = hypothesis.split(", \"options: \"")[0]
        new_instance.update({"premise": premise,
                             "hypothesis": hypothesis,
                             "lila_label": instance["Output Answer"][0],
                             "lila_script": instance["Output Program"][0]})
        split = instance["split"]
        if split == "train":
            train.append(new_instance)
        elif split == "dev":
            val.append(new_instance)
        else:
            test.append(new_instance)
    except IndexError:
        print(f"ERROR extracting inputs:\n{inputs}")

In [148]:
print(len(train), len(val), len(test))

4302 806 1217


In [149]:
import re

def clean_text(text: str):
    return re.sub(r'\s+', ' ', text.lower().replace("\n", "")).strip()

In [150]:
lila_train = pd.DataFrame(train)
lila_test = pd.DataFrame(test)
lila_val = pd.DataFrame(val)
lila_val.head()

Unnamed: 0,premise,hypothesis,lila_label,lila_script
0,"In a deck of less than 72 cards , how many way...","In a deck of 52 cards , how many ways are ther...",neutral,DeckOfCardStatement1= None \nDeckOfCardStateme...
1,If out of 41 questions solved by Rajesh 37 que...,If out of 31 questions solved by Rajesh 37 que...,contradiction,RajeshHaveQuestionsS1 = 41 \nRajeshHaveQuestio...
2,Alice drives at a constant speed of 30 km per ...,Alice drives at a constant speed of 20 km per ...,contradiction,DrivesSpeedS1 = 30\nDrivesSpeedS2 = 20\nif Dri...
3,Mary is 22 years younger than Albert,Mary is 72 years younger than Albert,contradiction,AgeS1 = 22\nAgeS2 = 72\nif AgeS1 is None or Ag...
4,Assuming that Karen drives at an average speed...,Assuming that Karen drives at an average speed...,neutral,\nDriveS2 = 60\nDriveS1 = None\nif DriveS1 is ...


In [151]:
lila_val["lila_label"] = lila_val["lila_label"].apply(lambda label: label.lower())
lila_val["lila_label"] = lila_val["lila_label"].apply(lambda label: label.lower())
lila_val["lila_label"] = lila_val["lila_label"].apply(lambda label: label.lower())

In [152]:
for col in ["premise", "hypothesis"]:
    lila_train[col] = lila_train[col].apply(lambda text: clean_text(text))
    lila_test[col] = lila_test[col].apply(lambda text: clean_text(text))
    lila_val[col] = lila_val[col].apply(lambda text: clean_text(text))

In [153]:
lila_train = lila_train[~lila_train.duplicated(subset=['premise', 'hypothesis'])]
lila_test = lila_test[~lila_test.duplicated(subset=['premise', 'hypothesis'])]
lila_val = lila_val[~lila_val.duplicated(subset=['premise', 'hypothesis'])]

In [154]:
print(lila_train.shape[0], lila_test.shape[0], lila_val.shape[0])

4302 1217 806


### Save train-val-test splits from LILA

In [None]:
lila_train.to_csv(os.path.join(root_path, "data", "lila", "lila_train.csv"), index=False)
lila_test.to_csv(os.path.join(root_path, "data", "lila", "lila_test.csv"), index=False)
lila_val.to_csv(os.path.join(root_path, "data", "lila", "lila_val.csv"), index=False)

### Check for duplicates accross the entire LILA data

In [109]:
lila_all = []
lila_all.extend(train)
lila_all.extend(test)
lila_all.extend(val)

In [114]:
lila_all_df = pd.DataFrame(lila_all)
lila_all_df.shape[0]

6325

In [113]:
lila_all_df[lila_all_df.duplicated(subset=["premise", "hypothesis"])].shape[0]

0

In [115]:
for col in ["premise", "hypothesis"]:
    lila_all_df[col] = lila_all_df[col].apply(lambda text: clean_text(text))

In [116]:
lila_all_df[lila_all_df.duplicated(subset=["premise", "hypothesis"])].shape[0]

0

## Merge LILA samples with EQUATE samples per EQUATE dataset

In [155]:
lila_train.shape

(4302, 4)

In [156]:
from qnli_datasets import read_data

datasets = ["NewsNLI", "RTE_Quant", "RedditNLI", "StressTest", "AWPNLI"]

for dataset in datasets:
    print(f"#######{dataset}#######")
    instances, _ = read_data(f"{dataset}.jsonl")
    equate_df = pd.DataFrame(instances)
    equate_df["sample_index"] = equate_df.index
    equate_df["premise"] = equate_df["premise"].apply(lambda text: clean_text(text))
    equate_df["hypothesis"] = equate_df["hypothesis"].apply(lambda text: clean_text(text))
    equate_df = equate_df[~equate_df.duplicated(subset=['premise', 'hypothesis'])]
    print(f"Total instances: {equate_df.shape[0]}")
    equate_lila_train = pd.merge(equate_df, lila_train, on=["premise", "hypothesis"], how="inner")
    equate_lila_test = pd.merge(equate_df, lila_test, on=["premise", "hypothesis"], how="inner")
    equate_lila_val = pd.merge(equate_df, lila_val, on=["premise", "hypothesis"], how="inner")
    print(f"Train: {equate_lila_train.shape[0]}\nTest: {equate_lila_test.shape[0]}\nVal: {equate_lila_val.shape[0]}")
    print(f"Train: {equate_lila_train['lila_label'].value_counts()}\nTest: {equate_lila_test['lila_label'].value_counts()}\nVal: {equate_lila_val['lila_label'].value_counts()}")
    output_path = os.path.join(root_path, "data", "lila-equate", dataset)
    os.makedirs(output_path, exist_ok=True)
    if equate_lila_train.shape[0] > 0:
        equate_lila_train.to_csv(os.path.join(output_path, "train.csv"), index=False)
    if equate_lila_test.shape[0] > 0:
        equate_lila_test.to_csv(os.path.join(output_path, "test.csv"), index=False)
    if equate_lila_val.shape[0] > 0:
        equate_lila_val.to_csv(os.path.join(output_path, "val.csv"), index=False)

#######NewsNLI#######
Total instances: 963
Train: 562
Test: 0
Val: 0
Train: lila_label
Entailment    312
neutral       250
Name: count, dtype: int64
Test: Series([], Name: count, dtype: int64)
Val: Series([], Name: count, dtype: int64)
#######RTE_Quant#######
Total instances: 165
Train: 103
Test: 0
Val: 0
Train: lila_label
neutral       52
Entailment    51
Name: count, dtype: int64
Test: Series([], Name: count, dtype: int64)
Val: Series([], Name: count, dtype: int64)
#######RedditNLI#######
Total instances: 247
Train: 0
Test: 0
Val: 0
Train: Series([], Name: count, dtype: int64)
Test: Series([], Name: count, dtype: int64)
Val: Series([], Name: count, dtype: int64)
#######StressTest#######
Total instances: 6947
Train: 3112
Test: 1217
Val: 806
Train: lila_label
neutral          1049
contradiction    1048
Entailment       1015
Name: count, dtype: int64
Test: lila_label
contradiction    466
Entailment       382
neutral          369
Name: count, dtype: int64
Val: lila_label
contradiction   

In [81]:
def extract_completion(full_script: str):
    """
    Extract only script from the script .py files (remove comments containing the premise, hypothesis and EQUATE label.
    :param full_script: the script as saved in a .py file in `data/generated/[dataset]/gpt4`
    :return: the script without the removed parts or np.nan for a missing script
    """
    if full_script is None or full_script == "-" or pd.isna(full_script):
        return np.nan
    try:
        lines = full_script.split("\n")
    except AttributeError:
        print("#############", "\n", full_script)
    idx = 0
    for idx, line in enumerate(lines):
        if line.startswith("# Golden Label:"):
            break
    return "\n".join(lines[idx+3:-1])  # skip label and 2 blank lines

In [None]:
dataset = "StressTest"

In [104]:
dataset_df = pd.read_csv(os.path.join(root_path, "data", "equate", f"{dataset}.csv"))
labels_df = pd.read_csv(os.path.join(root_path, "data", "generated", dataset, "gpt4", "random_sample_results_overview.csv"))
print(labels_df[labels_df["py_file_content"]=="-"].shape[0])
print(labels_df[labels_df["py_file_content"]=="-"]["sample_index"].unique())
labels_df["completion"] = labels_df["py_file_content"].apply(lambda full_script: extract_completion(full_script))
merged_df = pd.merge(dataset_df, labels_df, on="sample_index", how="left")
merged_df.shape[0]

11
[5699. 5700. 5701. 5702. 5703. 5704. 5705. 5706. 5707. 5959. 5960.]


7596

In [62]:
merged_df.head()

Unnamed: 0,premise,hypothesis,label,sample_index,llm_answer,py_file_content,completion
0,In 1956 Accardo won the Geneva Competition and...,Accardo composed 24 Caprices .,neutral,0,```python\ncaprices_premise = 24\ncaprices_hyp...,\n# Premise: In 1956 Accardo won the Geneva Co...,caprices_premise = 24\ncaprices_hypothesis = 2...
1,David Golinkin is the editor or author of eigh...,Golinkin has written eighteen books .,neutral,1,```python\ntotal_books_premise = 18\ntotal_boo...,\n# Premise: David Golinkin is the editor or a...,total_books_premise = 18\ntotal_books_hypothes...
2,David Golinkin is single-handedly responsible ...,David Golinkin is the author of dozen of respo...,neutral,2,```python\ndozens_responsa_premise = 24 # one ...,\n# Premise: David Golinkin is single-handedly...,dozens_responsa_premise = 24 # one dozen is 12...
3,During Reinsdorf 's 24 seasons as chairman of ...,Reinsdorf was the chairman of the White Sox fo...,entailment,3,```python\nseasons_as_chairman_premise = 24\ns...,\n# Premise: During Reinsdorf 's 24 seasons as...,seasons_as_chairman_premise = 24\nseasons_as_c...
4,During Reinsdorf 's 24 seasons as chairman of ...,The White Sox have won 24 championships .,neutral,4,```python\nseasons_as_chairman_premise = 24\nd...,\n# Premise: During Reinsdorf 's 24 seasons as...,seasons_as_chairman_premise = 24\ndivision_cha...


In [106]:
merged_df.columns

Index(['premise', 'hypothesis', 'label', 'sample_index', 'llm_answer',
       'py_file_content', 'completion'],
      dtype='object')

## Create train-test-val splits in a 70-20-10 ratio, ensuring the test split from LILA overlaps with our test split

In [105]:
ds_size = merged_df.shape[0]
train_size, val_size, test_size = int(ds_size * 0.7), int(ds_size * 0.1), int(ds_size * 0.2)
print("Total after split: ", train_size + val_size + test_size)
print("Total before split: ", ds_size)
extra = ds_size - (train_size + val_size + test_size)
test_size += extra
print(train_size, val_size, test_size)

Total after split:  7595
Total before split:  7596
5317 759 1520


In [107]:
lila_test_indices = []
try:
    df = pd.read_csv(os.path.join(root_path, "data", "lila-equate", dataset, "test.csv"))
    lila_test_indices = list(df["sample_index"].unique())
except FileNotFoundError:
    pass
data_for_split = merged_df[~merged_df["sample_index"].isin(lila_test_indices)]  # which data we can use for further splitting

print(f"LILA test size: {len(lila_test_indices)}")
test_size = max(test_size, len(lila_test_indices))

extra_samples_needed = test_size - len(lila_test_indices)
print(extra_samples_needed)

LILA test size: 1217
303


In [108]:
test_ratio = extra_samples_needed / data_for_split.shape[0]

print(f"test ratio: {test_ratio}")

from sklearn.model_selection import train_test_split

dev, test = train_test_split(data_for_split, test_size=test_ratio, stratify=data_for_split["label"])

test ratio: 0.04749960808904217


In [109]:
train, val = train_test_split(dev, test_size=val_size/dev.shape[0], stratify=dev["label"])

In [111]:
if len(lila_test_indices) > 0:
    lila_set = merged_df[merged_df["sample_index"].isin(lila_test_indices)]
    test = pd.concat([test, lila_set], ignore_index=True)
print(train.shape[0], val.shape[0], test.shape[0])

5317 759 1520


### Save out train-test-val splits

In [113]:
os.makedirs(os.path.join(root_path, "data", "finetuning", dataset), exist_ok=True)
train.to_csv(os.path.join(root_path, "data", "finetuning", dataset, "train.csv"))
test.to_csv(os.path.join(root_path, "data", "finetuning", dataset, "test.csv"))
val.to_csv(os.path.join(root_path, "data", "finetuning", dataset, "val.csv"))

## Add target label to the datasets

### Generate prompt and completion features, which will form the dataset for fine-tuning

In [71]:
from prompts import format_prompt
import os
import pandas as pd

# dataset = "RTE_Quant"   # can also use `dataset` variable set above

for split in ["train", "test", "val"]:
    print(f"{split.upper()} set")
    df = pd.read_csv(os.path.join(root_path, "data", "finetuning", dataset, f"{split}.csv"))
    missing_scripts = df[df["py_file_content"] == "-"]["sample_index"]
    print(f'Samples with no generated script: {missing_scripts.shape[0]}, ({missing_scripts.unique()})')
    if missing_scripts.shape[0] == 0:
        print(f"Creating {split} file.")
        df["prompt"] = df.apply(lambda row: format_prompt(dataset.lower().replace("_", ""), {"premise": row["premise"], "hypothesis": row["hypothesis"]}), axis=1)
        df["completion"] = df["completion"].apply(lambda completion: f"```python\n{completion}```")
        os.makedirs(os.path.join(root_path, "data", "finetuning", dataset, "completion"), exist_ok=True)
        df[["completion", "prompt"]].to_csv(os.path.join(root_path, "data", "finetuning", dataset, "completion", f"{split}.csv"), index=False)

TRAIN set
Samples with no generated script: 0, ([])
Creating train file.
TEST set
Samples with no generated script: 0, ([])
Creating test file.
VAL set
Samples with no generated script: 0, ([])
Creating val file.


In [63]:
df[["completion", "prompt"]].head(1)["completion"][0]

'```python\nboys_premise = 27.0\ngirls_premise = 35.0\nchildren_left_hypothesis = 8.0\n\ndef entailment_or_contradiction(boys_premise, girls_premise, children_left_hypothesis):\n    # the hypothesis claims that there were 8 children left on the playground\n    # according to the premise, each boy went back inside with a girl\n    # so, the number of children left should be the difference between the number of girls and boys\n    children_left_premise = girls_premise - boys_premise\n    # check if the quantities in the premise and hypothesis are equal\n    return children_left_premise == children_left_hypothesis\n\nprint(entailment_or_contradiction(boys_premise, girls_premise, children_left_hypothesis))\n```'

In [20]:
# df["text"] = df.apply(lambda row: f'{row["prompt"]}\n### Response:\n```python\n{row["completion"]}```', axis=1)

In [21]:
# df["text"].to_csv(os.path.join(root_path, "data", "finetuning", "AWPNLI", "text", "val_text.csv"), index=False)

In [22]:
# df["text"][0]

"### Instruction:\nYou need to reason about weather a hypothesis entails or contradicts a premise, by generating Python scripts. The scripts should classify the relation between the hypothesis and premise based on the quantitative and textual information mentioned in them. All the quantities and textual details in the hypothesis should be entailed by the information in the premise. First, manually extract all the individual quantities from both of the inputs, as valid numbers. Use the variable name to describe what the quantity measures, based on the context. Then, define a Python function that takes the extracted quantities as arguments. Within the function, use these quantities to perform computations based on the context of the premise and hypothesis. Finally, compare the resulting variables to determine the relationship. If the comparison indicates entailment, return True; for contradiction return False. Remember to include brief comments in the script to explain each step of the r