## References:

The JSON catalogs were retrieved from the following locations:

"fortune500.json": https://github.com/dariusk/corpora/blob/master/data/corporations/fortune500.json

"basic.json" and "names.json": https://github.com/marcotcr/checklist/tree/master/checklist/data

Most of the code was generated with generative AI (ChatGPT 4o).

In [None]:
import json
import random

In [None]:
#preps n sentences (so we can choose the size of our invariance test set) for entity swapping
def prep_n_sentences(filepath, n=None):
    """Read up to ``n`` tagged sentences from a column-formatted file.

    Sentences are separated by blank lines. On every non-blank line the
    whitespace-separated columns are inspected: the second column is taken
    as the token and the third as its tag. Lines with fewer than three
    columns are skipped.

    Args:
        filepath: Path of the tagged file to read (decoded as UTF-8).
        n: Maximum number of sentences to return. ``None`` reads the whole
            file; zero or a negative number returns an empty list.

    Returns:
        A list of sentences, each being a list of ``(token, tag)`` tuples.
    """
    sentences = []  # all completed sentences
    if n is not None and n <= 0:
        # Nothing was requested; without this guard the limit would only be
        # checked after the first sentence had already been stored.
        return sentences

    sentence = []  # the sentence currently being assembled

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # A blank line closes the current sentence (if there is one).
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                    if n is not None and len(sentences) >= n:
                        break
                continue
            if len(parts) >= 3:
                sentence.append((parts[1], parts[2]))

    # The file may end without a trailing blank line; keep the last sentence.
    # After an early break ``sentence`` is empty, so the limit is respected.
    if sentence:
        sentences.append(sentence)

    return sentences

In [None]:
def swap_named_entities(sentence, name_dict, location_dict, organisations_dict):
    """Return a copy of ``sentence`` with its named entities replaced at random.

    An entity is a ``B-<type>`` token followed by any number of
    ``I-<type>`` tokens of the same type.

    * ``PER``: a random first name from ``name_dict["men"]`` or
      ``name_dict["women"]``, plus one random ``name_dict["last"]`` entry
      for every additional token of the original entity.
    * ``ORG``: a random entry of ``organisations_dict["companies"]``,
      split on whitespace.
    * ``LOC``: a random entry of ``location_dict["city"]`` or
      ``location_dict["country"]``, split on whitespace.
    * any other type: the original tokens are kept.

    The replacement is re-tagged with ``B-<type>`` on its first token and
    ``I-<type>`` on the rest, so the output may differ in length from the
    input. Tokens that do not start an entity are copied unchanged.

    Args:
        sentence: Sequence of ``(token, tag)`` pairs.
        name_dict: Mapping with ``"men"``, ``"women"`` and ``"last"`` lists.
        location_dict: Mapping with ``"city"`` and ``"country"`` lists.
        organisations_dict: Mapping with a ``"companies"`` list.

    Returns:
        A new list of ``(token, tag)`` tuples.
    """
    result = []
    total = len(sentence)
    pos = 0

    while pos < total:
        word, label = sentence[pos]

        # Guard clause: tokens that do not open an entity pass straight through.
        if not label.startswith("B-"):
            result.append((word, label))
            pos += 1
            continue

        kind = label[2:]  # e.g. 'PER', 'LOC', 'ORG'
        inside_label = f"I-{kind}"

        # Advance ``end`` to the first position after the entity span.
        end = pos + 1
        while end < total and sentence[end][1] == inside_label:
            end += 1
        original_words = [item[0] for item in sentence[pos:end]]

        if kind == "PER":
            first_name_pool = random.choice(["men", "women"])
            replacement = [random.choice(name_dict[first_name_pool])]
            # One surname per extra token; nothing is drawn for single-token names.
            replacement.extend(
                random.choice(name_dict["last"])
                for _ in range(len(original_words) - 1)
            )
        elif kind == "ORG":
            replacement = random.choice(organisations_dict["companies"]).split()
        elif kind == "LOC":
            place_pool = random.choice(["city", "country"])
            replacement = random.choice(location_dict[place_pool]).split()
        else:
            # Unknown entity type: keep the original wording.
            replacement = original_words

        # Re-tag: the first token opens the entity, the remainder continue it.
        result.append((replacement[0], f"B-{kind}"))
        result.extend((extra, inside_label) for extra in replacement[1:])

        pos = end  # skip everything that belonged to the original entity

    return result

In [None]:
def add_sentiment(sentence, sentiment_dict, sentiment="pos"):
    """Append a randomly chosen sentiment phrase to a tagged sentence.

    Args:
        sentence: Sequence of ``(token, tag)`` pairs; only the tokens are used.
        sentiment_dict: Mapping with ``"positive"`` and ``"negative"`` lists
            of add-on phrases.
        sentiment: ``"pos"`` selects a positive add-on; any other value
            selects a negative one.

    Returns:
        The space-joined tokens followed by a space and the chosen add-on.
    """
    # Rebuild the plain-text sentence from its tokens.
    text = " ".join([word for word, _ in sentence])

    # Anything other than "pos" is treated as a request for a negative phrase.
    pool_key = "positive" if sentiment == "pos" else "negative"
    suffix = random.choice(sentiment_dict[pool_key])

    return f"{text} {suffix}"

In [None]:
# Paths are relative to the directory the notebook is run from.
# All JSON catalogs are decoded explicitly as UTF-8 so that loading does not
# depend on the platform's default locale encoding.

# Tagged corpus that prep_n_sentences() parses.
tagged_filepath = "../data/imdb_tagged_output.iob2"

# Person names; swap_named_entities() reads the "men", "women" and "last" lists.
names_filepath = "../data/names.json"
with open(names_filepath, "r", encoding="utf-8") as js:
    names_dict = json.load(js)

# Locations; swap_named_entities() reads the "city" and "country" lists.
locations_filepath = "../data/basic.json"
with open(locations_filepath, "r", encoding="utf-8") as js:
    locations_dict = json.load(js)

# Organisations; swap_named_entities() reads the "companies" list.
org_filepath = "../data/fortune500.json"
with open(org_filepath, "r", encoding="utf-8") as js:
    orgs_dict = json.load(js)

# Sentiment add-ons; add_sentiment() reads the "positive" and "negative" lists.
sentiment_filepath = "../data/sentiment_dict.json"
with open(sentiment_filepath, "r", encoding="utf-8") as js:
    sentiment_dict = json.load(js)

In [None]:
# Load the first 1000 tagged sentences (raise the limit to cover more of the file)
sentences = prep_n_sentences(tagged_filepath, 1000)

# Replace the named entities of every sentence with random catalog entries
swapped_sentences = [
    swap_named_entities(sent, names_dict, locations_dict, orgs_dict)
    for sent in sentences
]

# Drop the tags and join the remaining tokens into plain text
sentence_strings = [
    " ".join(word for word, _ in tagged)
    for tagged in swapped_sentences
]

# Store the result, one sentence per line
with open("../data/invariance_swapped_data.txt", 'w', encoding='utf-8') as f:
    for line in sentence_strings:
        f.write(f"{line}\n")

print(f"✅ Written {len(sentence_strings)} swapped sentences to invariance_swapped_data.txt")

In [None]:
# This cell reuses `sentences` from the entity-swap cell above.
# To run it on its own, uncomment the following line:
#sentences = prep_n_sentences(tagged_filepath, 1000)

# Append a sentiment add-on to every sentence (add_sentiment defaults to positive)
augmented_sentences = [add_sentiment(sent, sentiment_dict) for sent in sentences]

# Write one sentence per line in a new file
with open("../data/dir_augmented_data.txt", 'w', encoding='utf-8') as f:
    for line in augmented_sentences:
        f.write(line + '\n')

# These sentences are sentiment-augmented, not entity-swapped; report them as such.
print(f"✅ Written {len(augmented_sentences)} sentiment-augmented sentences to dir_augmented_data.txt")