In [2]:
import json
import os
import numpy as np
import pandas as pd

In [3]:
root_path = os.path.dirname(os.getcwd())
lila_path = os.path.join(root_path, "data", "lila", "all", "NumGLUE_Type_7_crowdsourced.json")
print(lila_path)

/Users/ioanamazilu/PycharmProjects/quant_nli/data/lila/all/NumGLUE_Type_7_crowdsourced.json


## SECTION 1: EXTRACT SAMPLES FROM LILA (they use Task 7 from NumGLUE)

In [7]:
with open(lila_path, 'r') as f:
    lila_equate = json.load(f)

In [8]:
lila_equate.keys()

dict_keys(['Source', 'Categories', 'Instances', 'Metadata'])

In [9]:
lila_equate = lila_equate["Instances"]

In [10]:
len(lila_equate)

6325

In [11]:
lila_equate[0].keys()

dict_keys(['Input', 'Output Program', 'Output Answer', 'split'])

In [12]:
lila_equate[0]["Input"]

' "statement 1": In a deck of less than 72 cards , how many ways are there to select 13 Spade and 13 heart cards without repetition ?, "statement 2" :In a deck of 52 cards , how many ways are there to select 13 Spade and 13 heart cards without repetition ?, "options: " Entailment or contradiction or neutral?'

In [13]:
lila_equate[1]["Output Program"][0]

"RajeshHaveQuestionsS1 = 41 \nRajeshHaveQuestionsS2 = 31\nif RajeshHaveQuestionsS1 is None or RajeshHaveQuestionsS2 is None:\n   print('neutral')\nelif RajeshHaveQuestionsS1==RajeshHaveQuestionsS2:\n      print('Entailment')\nelif RajeshHaveQuestionsS1!=RajeshHaveQuestionsS2:\n     print('contradiction')"

In [14]:
train, val, test = [], [], []
for instance in lila_equate:
    new_instance = dict()
    inputs = instance["Input"]
    premise_hypothesis = inputs.split(", \"statement 2\" :")
    try:
        premise, hypothesis = premise_hypothesis[0], premise_hypothesis[1]
        premise = premise.split("\"statement 1\": ")[-1]
        hypothesis = hypothesis.split(", \"options: \"")[0]
        new_instance.update({"premise": premise,
                             "hypothesis": hypothesis,
                             "lila_label": instance["Output Answer"][0],
                             "lila_script": instance["Output Program"][0]})
        split = instance["split"]
        if split == "train":
            train.append(new_instance)
        elif split == "dev":
            val.append(new_instance)
        else:
            test.append(new_instance)
    except IndexError:
        print(f"ERROR extracting inputs:\n{inputs}")

In [15]:
print(len(train), len(val), len(test))

4302 806 1217


In [16]:
import re

def clean_text(text: str):
    return re.sub(r'\s+', ' ', text.lower().replace("\n", "")).strip()

In [17]:
lila_train = pd.DataFrame(train)
lila_test = pd.DataFrame(test)
lila_val = pd.DataFrame(val)
lila_val.head()

Unnamed: 0,premise,hypothesis,lila_label,lila_script
0,"In a deck of less than 72 cards , how many way...","In a deck of 52 cards , how many ways are ther...",neutral,DeckOfCardStatement1= None \nDeckOfCardStateme...
1,If out of 41 questions solved by Rajesh 37 que...,If out of 31 questions solved by Rajesh 37 que...,contradiction,RajeshHaveQuestionsS1 = 41 \nRajeshHaveQuestio...
2,Alice drives at a constant speed of 30 km per ...,Alice drives at a constant speed of 20 km per ...,contradiction,DrivesSpeedS1 = 30\nDrivesSpeedS2 = 20\nif Dri...
3,Mary is 22 years younger than Albert,Mary is 72 years younger than Albert,contradiction,AgeS1 = 22\nAgeS2 = 72\nif AgeS1 is None or Ag...
4,Assuming that Karen drives at an average speed...,Assuming that Karen drives at an average speed...,neutral,\nDriveS2 = 60\nDriveS1 = None\nif DriveS1 is ...


In [18]:
lila_val["lila_label"] = lila_val["lila_label"].apply(lambda label: label.lower())
lila_val["lila_label"] = lila_val["lila_label"].apply(lambda label: label.lower())
lila_val["lila_label"] = lila_val["lila_label"].apply(lambda label: label.lower())

In [19]:
for col in ["premise", "hypothesis"]:
    lila_train[f"clean_{col}"] = lila_train[col].apply(lambda text: clean_text(text))
    lila_test[f"clean_{col}"] = lila_test[col].apply(lambda text: clean_text(text))
    lila_val[f"clean_{col}"] = lila_val[col].apply(lambda text: clean_text(text))

In [20]:
lila_train = lila_train[~lila_train.duplicated(subset=['clean_premise', 'clean_hypothesis'])]
lila_test = lila_test[~lila_test.duplicated(subset=['clean_premise', 'clean_hypothesis'])]
lila_val = lila_val[~lila_val.duplicated(subset=['clean_premise', 'clean_hypothesis'])]

In [21]:
print(lila_train.shape[0], lila_test.shape[0], lila_val.shape[0])

4302 1217 806


We find no duplicates in LILA, at the train/val/test set levels.

### Check for duplicates across the entire LILA data

In [22]:
lila_all = []
lila_all.extend(train)
lila_all.extend(test)
lila_all.extend(val)

In [23]:
lila_all_df = pd.DataFrame(lila_all)
lila_all_df.shape[0]

6325

In [25]:
for col in ["premise", "hypothesis"]:
    lila_all_df[f"clean_{col}"] = lila_all_df[col].apply(lambda text: clean_text(text))

In [26]:
lila_all_df[lila_all_df.duplicated(subset=["clean_premise", "clean_hypothesis"])].shape[0]

0

Also no duplicates at the whole LILA dataset level (i.e. a sample in both the train and test set)

### Save train-val-test splits from LILA

In [42]:
columns = ["premise", "hypothesis", "lila_label", "lila_script"]
lila_train[columns].to_csv(os.path.join(root_path, "data", "lila", "lila_train.csv"), index=False)
lila_test[columns].to_csv(os.path.join(root_path, "data", "lila", "lila_test.csv"), index=False)
lila_val[columns].to_csv(os.path.join(root_path, "data", "lila", "lila_val.csv"), index=False)

## SECTION 2: Merge LILA samples with EQUATE samples per EQUATE dataset

!!! Before this section, run the 03_cleaning.ipynb for the LILA dataset

Read cleaned LILA datasets.

In [4]:
lila_train = pd.read_csv(os.path.join(root_path, "data", "lila", "03_cleaned", "lila_train.csv"))
lila_test= pd.read_csv(os.path.join(root_path, "data", "lila", "03_cleaned", "lila_test.csv"))
lila_val= pd.read_csv(os.path.join(root_path, "data", "lila", "03_cleaned", "lila_val.csv"))

In [5]:
def add_script_to_df(df: pd.DataFrame, dataset:str):
    sample_indices, scripts = [], []
    scripts_path = os.path.join(root_path, "data", "code_quality", "gpt4", dataset, "phase2")
    for scriptfile in os.listdir(scripts_path):
        index = int(scriptfile.split("_")[1].split(".")[0])
        sample_indices.append(index)
        with open(os.path.join(scripts_path, scriptfile), 'r') as f:
            script = "".join(f.readlines())
            scripts.append(script)
    assert len(sample_indices) == len(scripts)
    print(df.shape[0], len(sample_indices))
    return pd.merge(df, pd.DataFrame({"sample_index": sample_indices, "completion": scripts}), on="sample_index", how="left")

In [8]:
datasets = ["StressTest", "RTE_Quant", "RedditNLI", "NewsNLI", "AWPNLI"]

for dataset in datasets:
    print(f"#######{dataset}#######")
    equate_df = pd.read_csv(os.path.join(root_path, "data", "equate_labelled", "processed", f"{dataset}.csv"))
    print(f"Total instances: {equate_df.shape[0]}")
    equate_df = add_script_to_df(equate_df, dataset)
    equate_lila_test = pd.merge(equate_df, lila_test, on=["premise", "hypothesis"], how="inner")
    print(f"Test: {equate_lila_test.shape[0]}")
    print(equate_lila_test[equate_lila_test["golden_label"] != equate_lila_test["reference_label"]].shape[0])
    if equate_lila_test.shape[0] > 0:
        print(f"Test: {equate_lila_test['golden_label'].value_counts()}")
        rest_equate = equate_df[~equate_df["sample_index"].isin(equate_lila_test["sample_index"].unique())]
        ft_data = rest_equate  # [rest_equate['golden_label'] == rest_equate['reference_label']]
    else:
        ft_data = equate_df  # [equate_df['golden_label'] == equate_df['reference_label']]
    print(f"Available for train-val split: {ft_data.shape[0]}")
    print(f"CC: {ft_data[ft_data['golden_label'] == ft_data['reference_label']].shape[0]}")
    output_path = os.path.join(root_path, "data", "lila-equate", dataset)
    os.makedirs(output_path, exist_ok=True)
    os.makedirs(os.path.join(root_path, "data", "finetuning", dataset), exist_ok=True)
    if dataset == "StressTest":
        ft_data.to_csv(os.path.join(output_path, "train_val.csv"), index=False)
    else:
        ft_data.to_csv(os.path.join(output_path, "all.csv"), index=False)
    # # we need the train samples for code quality comparison
    # if equate_lila_train.shape[0] > 0:
    #     equate_lila_train.to_csv(os.path.join(output_path, "cc_train.csv"), index=False)
    if equate_lila_test.shape[0] > 0:
        equate_lila_test.to_csv(os.path.join(output_path, "test.csv"), index=False)
        equate_lila_test.drop(["lila_label", "lila_script"], axis=1, inplace=True)
        equate_lila_test.to_csv(os.path.join(root_path, 'data', "finetuning", "StressTest", "test.csv"), index=False)

#######StressTest#######
Total instances: 6945
6945 6947
Test: 1217
395
Test: golden_label
contradiction    466
entailment       382
neutral          369
Name: count, dtype: int64
Available for train-val split: 5728
CC: 3836
#######RTE_Quant#######
Total instances: 162
162 162
Test: 0
0
Available for train-val split: 162
CC: 137
#######RedditNLI#######
Total instances: 247
247 247
Test: 0
0
Available for train-val split: 247
CC: 175
#######NewsNLI#######
Total instances: 958
958 958
Test: 0
0
Available for train-val split: 958
CC: 696
#######AWPNLI#######
Total instances: 719
719 722
Test: 0
0
Available for train-val split: 719
CC: 692


## SPLITS

In [11]:
from sklearn.model_selection import train_test_split
total_datasets = {"StressTest": 5728, "AWPNLI": 719, "RTE_Quant": 162, "RedditNLI": 247, "NewsNLI": 959}
test_pctg = 0.25
test_sizes = {"StressTest": 1215, "AWPNLI": int(test_pctg*719), "RTE_Quant": int(test_pctg*162), "RedditNLI": int(test_pctg*247), "NewsNLI": int(test_pctg*959)}
total_test_set_size = np.sum([set_size for _, set_size in test_sizes.items()])
print(total_test_set_size)
print(test_sizes)
test_pctgs = {ds: round(subset/total_test_set_size, 2) for ds, subset in test_sizes.items()}
print(test_pctgs)
total_ft = 0
for dataset, test_size in test_sizes.items():
    if dataset != "StressTest":
        df = pd.read_csv(os.path.join(root_path, "data", "lila-equate", dataset, "all.csv"))
        train_val, test = train_test_split(df, test_size=test_size, stratify=df["golden_label"])
        ft = train_val[train_val['golden_label']==train_val['reference_label']]
        total_ft += ft.shape[0]
        train, val = train_test_split(ft, test_size=0.15, stratify=ft["golden_label"])
        print(f"{dataset} FT data: {ft.shape[0]}")
        print(f"Train: {train.shape[0]}; Val: {val.shape[0]}")
        os.makedirs(os.path.join(root_path, "data", "finetuning", dataset), exist_ok=True)
        test.to_csv(os.path.join(root_path, "data", "finetuning", dataset, "test.csv"), index=False)
        train.to_csv(os.path.join(root_path, "data", "finetuning", dataset, "train.csv"), index=False)
        val.to_csv(os.path.join(root_path, "data", "finetuning", dataset, "val.csv"), index=False)
    else:
        os.makedirs(os.path.join(root_path, "data", "finetuning", dataset), exist_ok=True)
        df = pd.read_csv(os.path.join(root_path, "data", "lila-equate", dataset, "train_val.csv"))
        ft_data = df[df['golden_label'] == df['reference_label']]
        print(f"{dataset} FT data: {ft_data.shape[0]}")
        train, val = train_test_split(ft_data, test_size=0.15, stratify=ft_data["golden_label"])
        print(f"Train: {train.shape[0]}; Val: {val.shape[0]}")
        train.to_csv(os.path.join(root_path, "data", "finetuning", dataset, "train.csv"), index=False)
        val.to_csv(os.path.join(root_path, "data", "finetuning", dataset, "val.csv"), index=False)
        total_ft += ft_data.shape[0]
print(f"Total: {total_ft}")

1734
{'StressTest': 1215, 'AWPNLI': 179, 'RTE_Quant': 40, 'RedditNLI': 61, 'NewsNLI': 239}
{'StressTest': 0.7, 'AWPNLI': 0.1, 'RTE_Quant': 0.02, 'RedditNLI': 0.04, 'NewsNLI': 0.14}
StressTest FT data: 3836
Train: 3260; Val: 576
AWPNLI FT data: 520
Train: 442; Val: 78
RTE_Quant FT data: 101
Train: 85; Val: 16
RedditNLI FT data: 132
Train: 112; Val: 20
NewsNLI FT data: 522
Train: 443; Val: 79
Total: 5111


### Generate prompt and completion features, which will form the dataset for fine-tuning

In [12]:
from prompts import format_prompt
import os
import pandas as pd

# dataset = "RTE_Quant"   # can also use `dataset` variable set above
for dataset in ["StressTest", "NewsNLI", "RTE_Quant", "RedditNLI", "AWPNLI"]:
    for split in ["train", "test", "val"]:
        print(f"{split.upper()} set")
        df = pd.read_csv(os.path.join(root_path, "data", "finetuning", dataset, f"{split}.csv"))
        missing_scripts = df[df["completion"].isna()]["sample_index"]
        print(f'Samples with no generated script: {missing_scripts.shape[0]}, ({missing_scripts.unique()})')
        print(f"Creating {split} file.")
        df.dropna(subset=["completion"], inplace=True)
        df["prompt"] = df.apply(lambda row: format_prompt(dataset.lower().replace("_", ""), {"premise": row["premise"], "hypothesis": row["hypothesis"]}), axis=1)
        df["completion"] = df["completion"].apply(lambda completion: f"```python\n{completion}```")
        os.makedirs(os.path.join(root_path, "data", "finetuning", dataset, "completion"), exist_ok=True)
        if split != "test":
            df[["completion", "prompt"]].to_csv(os.path.join(root_path, "data", "finetuning", dataset, "completion", f"{split}.csv"), index=False)
        else:
            df[["sample_index", "completion", "prompt"]].to_csv(os.path.join(root_path, "data", "finetuning", dataset, "completion", f"{split}.csv"), index=False)

TRAIN set
Samples with no generated script: 0, ([])
Creating train file.
TEST set
Samples with no generated script: 0, ([])
Creating test file.
VAL set
Samples with no generated script: 0, ([])
Creating val file.
TRAIN set
Samples with no generated script: 0, ([])
Creating train file.
TEST set
Samples with no generated script: 0, ([])
Creating test file.
VAL set
Samples with no generated script: 0, ([])
Creating val file.
TRAIN set
Samples with no generated script: 0, ([])
Creating train file.
TEST set
Samples with no generated script: 0, ([])
Creating test file.
VAL set
Samples with no generated script: 0, ([])
Creating val file.
TRAIN set
Samples with no generated script: 0, ([])
Creating train file.
TEST set
Samples with no generated script: 0, ([])
Creating test file.
VAL set
Samples with no generated script: 0, ([])
Creating val file.
TRAIN set
Samples with no generated script: 0, ([])
Creating train file.
TEST set
Samples with no generated script: 0, ([])
Creating test file.
VAL 

In [13]:
df[["completion", "prompt"]].head(1)["prompt"][0]

'### Instruction:\n\nYou must write Python code starting from 2 input sentences, based on these rules:\n- first you define variables with representative names for the numerical entities in both inputs (one variable per entity, per sentence);\n- extract all quantities as valid numbers (integers or floats). do not ignore any quantity or numerical information;\n- next, use brief comments to explain what comparison you do between the defined variables (do not use their values in the comments). Any comparison you do should be do through code as well.;\n- refrain from concluding in the comments the entailment/contradiction or neutral relation;\n- use the variables to perform calculations if necessary and finally compare them accordingly to infer one of the following: \n    - "entailment": the hypothesis can be fully and explicitly entailed from the premise\n    - "contradiction": at least one aspect in the hypothesis contradicts the premise\n- use the correct comparison operators (i.e., if w

## COMBINE ALL DATASETS INTO ONE SET FOR TRAIN/VAL/TEST

In [14]:
for split in ["train", "test", "val"]:
    all_df = pd.DataFrame()
    for dataset in ["StressTest", "AWPNLI", "NewsNLI", "RedditNLI", "RTE_Quant"]:
        samples = pd.read_csv(os.path.join(root_path, "data", "finetuning", dataset, "completion", f"{split}.csv"))
        if split == "test":
            samples["source"] = dataset.lower().replace("_", "")
        all_df = pd.concat([all_df, samples], ignore_index=True)
    print(f"{split} set size: {all_df.shape[0]}")
    os.makedirs(os.path.join(root_path, "data", "finetuning", "completion"), exist_ok=True)
    all_df = all_df.sample(frac=1).reset_index(drop=True)  # shuffle data
    all_df.to_csv(os.path.join(root_path, "data", "finetuning", "completion", f"{split}_all.csv"), index=False)

train set size: 4342
test set size: 1736
val set size: 769
