In [2]:
import pandas as pd
import os
import re

In [3]:
# The project root is two directory levels above the current working directory,
# i.e. on the same level as the "data" directory.
working_dir = os.getcwd()
root_path = os.path.dirname(os.path.dirname(working_dir))

# Folder data/equate/02_post_discarding below the project root.
datasets_path = os.path.join(
    root_path,
    "data",
    "equate",
    "02_post_discarding",
)
print(datasets_path)

/Users/ioanamazilu/PycharmProjects/quant_nli/data/equate/02_post_discarding


In [8]:
def add_dot_to_sentence_end(sentence: str) -> str:
    """Ensure the sentence ends with closing punctuation, appending "." if needed.

    Surrounding whitespace is stripped *before* the last character is inspected,
    so an input such as "Done. " is recognised as already terminated instead of
    being turned into "Done..".

    Args:
        sentence: The text to terminate.

    Returns:
        The stripped sentence, followed by "." unless it already ends in one of
        the characters ``. ! ? ) : ' ,``. Empty or whitespace-only input yields
        an empty string.
    """
    terminal_chars = (".", "!", "?", ")", ":", "'", ",")
    stripped = sentence.strip()
    if not stripped:
        # Nothing to terminate; also avoids indexing into an empty string.
        return stripped
    if stripped.endswith(terminal_chars):
        return stripped
    return f"{stripped}."

# Ordered (old, new) substitutions that undo tokenizer-style spacing around
# punctuation. They are applied strictly in this order, each one operating on
# the output of the previous one (e.g. " ' " -> " '" runs before " '" -> "'").
_SPACING_REPLACEMENTS = (
    (" , ", ", "),
    (" ' ", " '"),
    (" ` ", " `"),
    (" / ", "/"),
    (" - ", "-"),
    (" ( ", " ("),
    (" ) ", ") "),
    (" : ", ":"),
    (" % ", "% "),
    (" ?", "?"),
    (" ’ ", "’"),
    (" '", "'"),
    (" .", "."),
    ("$ ", "$"),
    (" )", ")"),
    ("( ", "("),
    ("`` ", "'' "),
)


def clean_sentence(sentence: str) -> str:
    """Normalise whitespace and remove stray spaces around punctuation.

    Every run of whitespace is first collapsed to a single space and the result
    is stripped; then the substitutions in ``_SPACING_REPLACEMENTS`` are applied
    one after another.

    Args:
        sentence: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    cleaned = re.sub(r"\s+", " ", sentence).strip()
    for old, new in _SPACING_REPLACEMENTS:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [14]:
# Resolve both directories from `root_path` right here instead of reading the
# shared `datasets_path` variable: that name is reassigned to the LILA folder
# further down in this notebook, so re-running this cell afterwards would
# otherwise look for the EQUATE files in the wrong place.
equate_input_dir = os.path.join(root_path, "data", "equate", "02_post_discarding")
equate_output_dir = os.path.join(root_path, "data", "equate", "03_cleaned")
os.makedirs(equate_output_dir, exist_ok=True)

for dataset in ["AWPNLI", "NewsNLI", "RedditNLI", "RTE_Quant", "StressTest"]:
    print(dataset)
    df = pd.read_csv(os.path.join(equate_input_dir, f"{dataset}.csv"))
    for column in ("premise", "hypothesis"):
        # Fix spacing first, then make sure the text ends with punctuation.
        df[column] = df[column].apply(clean_sentence).apply(add_dot_to_sentence_end)
    df["label"] = df["label"].str.lower()  # ensure case consistency

    df.to_csv(os.path.join(equate_output_dir, f"{dataset}.csv"), index=False)

AWPNLI
NewsNLI
RedditNLI
RTE_Quant
StressTest


### Apply the same cleaning functions to the LILA datasets, so we can match our samples with theirs on the premise-hypothesis subset.

**Important:** First, run the code in SECTION 1 of the "train_val_test_split.ipynb" notebook.
Note: The deduplication for LILA is done in that notebook, so there is no need to run step 01 of the preprocessing.

In [6]:
# From here on `datasets_path` points at data/lila under the project root.
datasets_path = os.path.join(root_path, "data", "lila")

# Make sure the "03_cleaned" sub-folder of data/lila exists.
lila_cleaned_dir = os.path.join(datasets_path, "03_cleaned")
os.makedirs(lila_cleaned_dir, exist_ok=True)

print(datasets_path)

/Users/ioanamazilu/PycharmProjects/quant_nli/data/lila


In [9]:
# Resolve the LILA directories from `root_path` in this cell so it does not
# depend on what `datasets_path` currently points to (the same name is used
# for the EQUATE folder earlier in the notebook) nor on the output folder
# having been created by another cell.
lila_input_dir = os.path.join(root_path, "data", "lila")
lila_output_dir = os.path.join(lila_input_dir, "03_cleaned")
os.makedirs(lila_output_dir, exist_ok=True)

for dataset in ["lila_test.csv", "lila_val.csv", "lila_train.csv"]:
    print(dataset)
    df = pd.read_csv(os.path.join(lila_input_dir, dataset))
    for column in ("premise", "hypothesis"):
        # Fix spacing first, then make sure the text ends with punctuation.
        df[column] = df[column].apply(clean_sentence).apply(add_dot_to_sentence_end)
    df["label"] = df["label"].str.lower()  # ensure case consistency

    df.to_csv(os.path.join(lila_output_dir, dataset), index=False)

lila_test.csv
lila_val.csv
lila_train.csv
