## References:

The JSON catalogs were retrieved from the following locations:

"fortune500.json": https://github.com/dariusk/corpora/blob/master/data/corporations/fortune500.json
(used for replacing organisation, i.e. ORG, entities)

"basic.json" and "names.json": https://github.com/marcotcr/checklist/tree/master/checklist/data
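("basic.json" is used for replacing location, i.e. LOC, entities; "names.json" for person, i.e. PER, entities)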

Most of the code was written with the help of generative AI (ChatGPT-4o).

Perturbation is only applied to the 90/10 split model, as this was determined to have the highest accuracy.

In [2]:
import json
import random
import csv
import pandas as pd

In [3]:
# Prepares up to n sentences (so we can choose the size of our invariance test set) for entity swapping
def prep_n_sentences(filepath, n):
    """Read up to ``n`` tagged sentences from a whitespace-separated tag file.

    Sentences are delimited by blank lines. On every non-blank line that has
    at least three whitespace-separated fields, the second field is taken as
    the token and the third as its tag; lines with fewer fields are ignored.

    Args:
        filepath: Path of the tagged file to read.
        n: Maximum number of sentences to return. Values <= 0 yield ``[]``.

    Returns:
        A list of sentences, each a list of ``(token, tag)`` tuples.
    """
    sentences = []  # all completed sentences

    # Without this guard the loop below would still collect one sentence
    # before it ever compares the count against n.
    if n <= 0:
        return sentences

    sentence = []  # the sentence currently being collected

    # Decode explicitly as UTF-8 rather than the platform default.
    # NOTE(review): assumes the tagged file is UTF-8, like the CSVs this
    # notebook writes - confirm against the tagging step.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()

            # A blank line closes the current sentence (if any).
            if not parts:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                    if len(sentences) >= n:
                        break
                continue

            if len(parts) >= 3:
                sentence.append((parts[1], parts[2]))

    # The file may end without a trailing blank line; keep the last sentence.
    if sentence:
        sentences.append(sentence)

    return sentences

In [4]:
def swap_named_entities(sentence, name_dict, location_dict, organisations_dict, rng=None):
    """Replace every B-/I- tagged entity span in a sentence with a random one.

    A span starts at a token whose tag begins with ``"B-"`` and extends over
    the directly following tokens tagged ``"I-<same type>"``. Replacement by
    entity type:

    * ``PER``: one first name drawn from ``name_dict["men"]`` or
      ``name_dict["women"]``, plus one entry of ``name_dict["last"]`` for
      each additional token of the original span.
    * ``ORG``: a random entry of ``organisations_dict["companies"]``, split
      on whitespace.
    * ``LOC``: a random entry of ``location_dict["city"]`` or
      ``location_dict["country"]``, split on whitespace.
    * any other type: the original tokens are kept.

    The replacement is re-tagged as ``B-<type>`` followed by ``I-<type>``,
    so the output may have a different length than the input.

    Args:
        sentence: List of ``(token, tag)`` tuples.
        name_dict: Mapping with the keys ``"men"``, ``"women"`` and ``"last"``.
        location_dict: Mapping with the keys ``"city"`` and ``"country"``.
        organisations_dict: Mapping with the key ``"companies"``.
        rng: Optional object providing ``choice`` (e.g. ``random.Random(seed)``)
            for reproducible swaps. Defaults to the module-level ``random``.

    Returns:
        A new list of ``(token, tag)`` tuples; ``sentence`` is not modified.
    """
    rng = random if rng is None else rng

    swapped_sentence = []
    total = len(sentence)
    i = 0
    while i < total:
        token, tag = sentence[i]

        # Tokens that do not open an entity are copied through unchanged.
        if not tag.startswith("B-"):
            swapped_sentence.append((token, tag))
            i += 1
            continue

        entity_type = tag[2:]  # e.g., 'PER', 'LOC', 'ORG'
        inside_tag = f"I-{entity_type}"

        # Find the end of the span: consecutive I- tags of the same type.
        j = i + 1
        while j < total and sentence[j][1] == inside_tag:
            j += 1
        entity_tokens = [tok for tok, _ in sentence[i:j]]

        # Pick the replacement tokens.
        if entity_type == "PER":
            firstname_sex = rng.choice(["men", "women"])
            new_entity = [rng.choice(name_dict[firstname_sex])]
            # One surname per extra token (none for single-token names).
            new_entity += [
                rng.choice(name_dict["last"]) for _ in range(len(entity_tokens) - 1)
            ]
        elif entity_type == "ORG":
            new_entity = rng.choice(organisations_dict["companies"]).split()
        elif entity_type == "LOC":
            loc_choice = rng.choice(["city", "country"])
            new_entity = rng.choice(location_dict[loc_choice]).split()
        else:
            new_entity = entity_tokens  # fallback: no swap

        # A blank catalog entry splits into no tokens at all; keep the
        # original entity instead of failing on new_entity[0] below.
        if not new_entity:
            new_entity = entity_tokens

        # Apply new entity with correct tags
        swapped_sentence.append((new_entity[0], f"B-{entity_type}"))
        for tok in new_entity[1:]:
            swapped_sentence.append((tok, inside_tag))

        i = j  # move past the original entity

    return swapped_sentence

In [14]:
def add_dir_sentiment(sentence, label, sentiment_dict):
    """Append a randomly chosen sentiment phrase that matches ``label``.

    A label equal to 1 draws the phrase from ``sentiment_dict["positive"]``;
    every other label draws it from ``sentiment_dict["negative"]``.

    Args:
        sentence: The text to extend.
        label: Sentiment label of ``sentence``.
        sentiment_dict: Mapping with the keys ``"positive"`` and ``"negative"``,
            each holding a sequence of phrases.

    Returns:
        ``sentence`` followed by a single space and the chosen phrase.
    """
    polarity = "positive" if label == 1 else "negative"
    phrase = random.choice(sentiment_dict[polarity])
    return "{} {}".format(sentence, phrase)

In [7]:
# Tagged version of the 90/10 test split; parsed further down with prep_n_sentences.
tagged_filepath = "../data/imdb_test_train_datasets/test_tagged/test_9010_tagged_output.iob2"


def load_json_catalog(path):
    """Load one JSON catalog, decoding it explicitly as UTF-8.

    JSON text is UTF-8 by specification, whereas open() without an encoding
    falls back to the platform's locale encoding.
    """
    with open(path, "r", encoding="utf-8") as js:
        return json.load(js)


# Person names used for PER replacements.
names_filepath = "../data/perturb_json_files/names.json"
names_dict = load_json_catalog(names_filepath)

# Cities / countries used for LOC replacements.
locations_filepath = "../data/perturb_json_files/basic.json"
locations_dict = load_json_catalog(locations_filepath)

# Company names used for ORG replacements.
org_filepath = "../data/perturb_json_files/fortune500.json"
orgs_dict = load_json_catalog(org_filepath)

# Positive / negative phrases used for the directional expectation test.
sentiment_filepath = "../data/perturb_json_files/sentiment_dict.json"
sentiment_dict = load_json_catalog(sentiment_filepath)

In [9]:
# Original reviews and their labels for the 90/10 test split (test_9010.csv)
label_df = pd.read_csv(
    "../data/imdb_test_train_datasets/test/test_9010.csv"
)
reviews, labels = (
    label_df[column].tolist() for column in ("review", "label")
)

# Number of reviews; used below as the cap on how many tagged sentences are parsed.
n = len(reviews)

In [11]:
# Seed the module-level RNG so that re-running this cell reproduces the same swaps.
random.seed(42)

# Parse a subset or the full file
sentences = prep_n_sentences(tagged_filepath, n)

# Labels are attached to sentences purely by position, so a count mismatch
# would silently pair reviews with the wrong labels. Fail loudly instead.
if len(sentences) != len(labels):
    raise ValueError(
        f"Parsed {len(sentences)} tagged sentences but found {len(labels)} labels; "
        "cannot pair them by position."
    )

# Swap entities
swapped_sentences = [swap_named_entities(sent, names_dict, locations_dict, orgs_dict) for sent in sentences]

# Pair each swapped sentence (tokens re-joined with spaces) with its corresponding label
output_path = "../data/imdb_test_train_datasets/perturbed/invariance_swapped_data.csv"
perturbed_with_labels = [
    (" ".join(token for token, _ in sent), label)
    for sent, label in zip(swapped_sentences, labels)
]

# Write to CSV
with open(output_path, mode='w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["review", "label"])  # header
    writer.writerows(perturbed_with_labels)

print(f"✅ Written {len(perturbed_with_labels)} swapped sentences to {output_path}")

✅ Written 4959 swapped sentences to ../data/imdb_test_train_datasets/perturbed/invariance_swapped_data.csv


In [15]:
# Directional expectation: append a phrase whose sentiment agrees with the label.
# Seed the module-level RNG so that re-running this cell picks the same phrases.
random.seed(42)

augmented_data = [
    (add_dir_sentiment(review, label, sentiment_dict), label)
    for review, label in zip(reviews, labels)
]

# Write to CSV
output_path = "../data/imdb_test_train_datasets/perturbed/directional_expectation_data.csv"
with open(output_path, mode='w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["review", "label"])  # header
    writer.writerows(augmented_data)

print(f"✅ Written {len(augmented_data)} sentiment-augmented reviews to {output_path}")

✅ Written 4959 sentiment-augmented reviews to ../data/imdb_test_train_datasets/perturbed/directional_expectation_data.csv
