In [139]:
import pandas as pd
import json
import os

In [140]:
root_path = os.path.dirname(os.getcwd())
lila_path = os.path.join(root_path, "data", "lila", "all", "NumGLUE_Type_7_crowdsourced.json")

with open(lila_path, 'r') as f:
    lila_equate = json.load(f)

In [141]:
lila_equate.keys()

dict_keys(['Source', 'Categories', 'Instances', 'Metadata'])

In [142]:
lila_equate = lila_equate["Instances"]

In [143]:
len(lila_equate)

6325

In [144]:
lila_equate[0].keys()

dict_keys(['Input', 'Output Program', 'Output Answer', 'split'])

In [145]:
lila_equate[0]["Input"]

' "statement 1": In a deck of less than 72 cards , how many ways are there to select 13 Spade and 13 heart cards without repetition ?, "statement 2" :In a deck of 52 cards , how many ways are there to select 13 Spade and 13 heart cards without repetition ?, "options: " Entailment or contradiction or neutral?'

In [146]:
lila_equate[1]["Output Program"][0]

"RajeshHaveQuestionsS1 = 41 \nRajeshHaveQuestionsS2 = 31\nif RajeshHaveQuestionsS1 is None or RajeshHaveQuestionsS2 is None:\n   print('neutral')\nelif RajeshHaveQuestionsS1==RajeshHaveQuestionsS2:\n      print('Entailment')\nelif RajeshHaveQuestionsS1!=RajeshHaveQuestionsS2:\n     print('contradiction')"

In [147]:
train, val, test = [], [], []
for instance in lila_equate:
    new_instance = dict()
    inputs = instance["Input"]
    premise_hypothesis = inputs.split(", \"statement 2\" :")
    try:
        premise, hypothesis = premise_hypothesis[0], premise_hypothesis[1]
        premise = premise.split("\"statement 1\": ")[-1]
        hypothesis = hypothesis.split(", \"options: \"")[0]
        new_instance.update({"premise": premise,
                             "hypothesis": hypothesis,
                             "lila_label": instance["Output Answer"][0],
                             "lila_script": instance["Output Program"][0]})
        split = instance["split"]
        if split == "train":
            train.append(new_instance)
        elif split == "dev":
            val.append(new_instance)
        else:
            test.append(new_instance)
    except IndexError:
        print(f"ERROR extracting inputs:\n{inputs}")

In [148]:
print(len(train), len(val), len(test))

4302 806 1217


In [149]:
import re

def clean_text(text: str):
    return re.sub(r'\s+', ' ', text.lower().replace("\n", "")).strip()

In [150]:
lila_train = pd.DataFrame(train)
lila_test = pd.DataFrame(test)
lila_val = pd.DataFrame(val)
lila_val.head()

Unnamed: 0,premise,hypothesis,lila_label,lila_script
0,"In a deck of less than 72 cards , how many way...","In a deck of 52 cards , how many ways are ther...",neutral,DeckOfCardStatement1= None \nDeckOfCardStateme...
1,If out of 41 questions solved by Rajesh 37 que...,If out of 31 questions solved by Rajesh 37 que...,contradiction,RajeshHaveQuestionsS1 = 41 \nRajeshHaveQuestio...
2,Alice drives at a constant speed of 30 km per ...,Alice drives at a constant speed of 20 km per ...,contradiction,DrivesSpeedS1 = 30\nDrivesSpeedS2 = 20\nif Dri...
3,Mary is 22 years younger than Albert,Mary is 72 years younger than Albert,contradiction,AgeS1 = 22\nAgeS2 = 72\nif AgeS1 is None or Ag...
4,Assuming that Karen drives at an average speed...,Assuming that Karen drives at an average speed...,neutral,\nDriveS2 = 60\nDriveS1 = None\nif DriveS1 is ...


In [151]:
lila_val["lila_label"] = lila_val["lila_label"].apply(lambda label: label.lower())
lila_val["lila_label"] = lila_val["lila_label"].apply(lambda label: label.lower())
lila_val["lila_label"] = lila_val["lila_label"].apply(lambda label: label.lower())

In [152]:
for col in ["premise", "hypothesis"]:
    lila_train[col] = lila_train[col].apply(lambda text: clean_text(text))
    lila_test[col] = lila_test[col].apply(lambda text: clean_text(text))
    lila_val[col] = lila_val[col].apply(lambda text: clean_text(text))

In [153]:
lila_train = lila_train[~lila_train.duplicated(subset=['premise', 'hypothesis'])]
lila_test = lila_test[~lila_test.duplicated(subset=['premise', 'hypothesis'])]
lila_val = lila_val[~lila_val.duplicated(subset=['premise', 'hypothesis'])]

In [154]:
print(lila_train.shape[0], lila_test.shape[0], lila_val.shape[0])

4302 1217 806


In [None]:
lila_train.to_csv(os.path.join(root_path, "data", "lila", "lila_train.csv"), index=False)
lila_test.to_csv(os.path.join(root_path, "data", "lila", "lila_test.csv"), index=False)
lila_val.to_csv(os.path.join(root_path, "data", "lila", "lila_val.csv"), index=False)

In [109]:
lila_all = []
lila_all.extend(train)
lila_all.extend(test)
lila_all.extend(val)

In [114]:
lila_all_df = pd.DataFrame(lila_all)
lila_all_df.shape[0]

6325

In [113]:
lila_all_df[lila_all_df.duplicated(subset=["premise", "hypothesis"])].shape[0]

0

In [115]:
for col in ["premise", "hypothesis"]:
    lila_all_df[col] = lila_all_df[col].apply(lambda text: clean_text(text))

In [116]:
lila_all_df[lila_all_df.duplicated(subset=["premise", "hypothesis"])].shape[0]

0

## Merge LILA samples with EQUATE samples per EQUATE dataset

In [155]:
lila_train.shape

(4302, 4)

In [156]:
from qnli_datasets import read_data

datasets = ["NewsNLI", "RTE_Quant", "RedditNLI", "StressTest", "AWPNLI"]

for dataset in datasets:
    print(f"#######{dataset}#######")
    instances, _ = read_data(f"{dataset}.jsonl")
    equate_df = pd.DataFrame(instances)
    equate_df["sample_index"] = equate_df.index
    equate_df["premise"] = equate_df["premise"].apply(lambda text: clean_text(text))
    equate_df["hypothesis"] = equate_df["hypothesis"].apply(lambda text: clean_text(text))
    equate_df = equate_df[~equate_df.duplicated(subset=['premise', 'hypothesis'])]
    print(f"Total instances: {equate_df.shape[0]}")
    equate_lila_train = pd.merge(equate_df, lila_train, on=["premise", "hypothesis"], how="inner")
    equate_lila_test = pd.merge(equate_df, lila_test, on=["premise", "hypothesis"], how="inner")
    equate_lila_val = pd.merge(equate_df, lila_val, on=["premise", "hypothesis"], how="inner")
    print(f"Train: {equate_lila_train.shape[0]}\nTest: {equate_lila_test.shape[0]}\nVal: {equate_lila_val.shape[0]}")
    print(f"Train: {equate_lila_train['lila_label'].value_counts()}\nTest: {equate_lila_test['lila_label'].value_counts()}\nVal: {equate_lila_val['lila_label'].value_counts()}")
    output_path = os.path.join(root_path, "data", "lila-equate", dataset)
    os.makedirs(output_path, exist_ok=True)
    if equate_lila_train.shape[0] > 0:
        equate_lila_train.to_csv(os.path.join(output_path, "train.csv"), index=False)
    if equate_lila_test.shape[0] > 0:
        equate_lila_test.to_csv(os.path.join(output_path, "test.csv"), index=False)
    if equate_lila_val.shape[0] > 0:
        equate_lila_val.to_csv(os.path.join(output_path, "val.csv"), index=False)

#######NewsNLI#######
Total instances: 963
Train: 562
Test: 0
Val: 0
Train: lila_label
Entailment    312
neutral       250
Name: count, dtype: int64
Test: Series([], Name: count, dtype: int64)
Val: Series([], Name: count, dtype: int64)
#######RTE_Quant#######
Total instances: 165
Train: 103
Test: 0
Val: 0
Train: lila_label
neutral       52
Entailment    51
Name: count, dtype: int64
Test: Series([], Name: count, dtype: int64)
Val: Series([], Name: count, dtype: int64)
#######RedditNLI#######
Total instances: 247
Train: 0
Test: 0
Val: 0
Train: Series([], Name: count, dtype: int64)
Test: Series([], Name: count, dtype: int64)
Val: Series([], Name: count, dtype: int64)
#######StressTest#######
Total instances: 6947
Train: 3112
Test: 1217
Val: 806
Train: lila_label
neutral          1049
contradiction    1048
Entailment       1015
Name: count, dtype: int64
Test: lila_label
contradiction    466
Entailment       382
neutral          369
Name: count, dtype: int64
Val: lila_label
contradiction   