In [None]:
%pip install nltk

In [35]:
# Seed data for synthetic data generation

import pandas as pd

# Load the CSV that provides the seed stories (columns 'story1' and 'story2').
csv_file_path = 'data/coherence/coherence_human_data.csv'
coherence_human_data_df = pd.read_csv(csv_file_path)

# Pool the 'story1' and 'story2' columns into one flat list (duplicates are kept here).
combined_stories = coherence_human_data_df['story1'].tolist() + coherence_human_data_df['story2'].tolist()

# De-duplicate while keeping first-seen order. A plain set() iterates strings in
# an order that depends on per-process hash randomization, so the resulting list
# would be ordered differently after every kernel restart.
unique_stories = list(dict.fromkeys(combined_stories))
len(unique_stories)

414

## Coherence Errorifier

In [3]:
# Sample multi-paragraph story used as demo input for coherence_errorifier.
init_text = """
In the heart of the bustling city, there lived a man named Alex, whose days were a monotonous blend of work and solitude. The city’s vibrant lights couldn't illuminate the emptiness he felt inside. Love, a distant dream, flickered in his heart like a fading star. Alex longed for someone to share his hopes, dreams, and quiet dinners.

Each evening, Alex strolled through the city park, hoping to find a connection amidst the sea of strangers. The park, a canvas of life, was where laughter danced in the air, and hearts spoke without words. Yet, Alex walked alone, his gaze often lingering on happy couples, wondering when his moment would come.

One autumn evening, as amber leaves rustled underfoot, Alex’s eyes met Emma’s. She was sitting alone on a bench, lost in a book, her laughter echoing with the lines she read. Intrigued, Alex mustered his courage and approached her, asking about the book that brought such joy.

Their conversation flowed effortlessly, like a melody long rehearsed yet spontaneously played. Emma, with her warm smile and insightful eyes, saw through Alex’s guarded exterior. She listened to his dreams and shared her own, painting her world with words that resonated with his soul.
"""

In [11]:
# coherence_errorifier - takes in a good text (either human- or AI-generated) and:
# - removes random sentences (each sentence is dropped with probability p_remove)
# - reshuffles random sentences (each sentence is swapped with probability p_reshuffle; its swap partner is picked at random)
# - reshuffles random groups of consecutive sentences (with probability p_group_reshuffle, groups of up to max_group_size sentences)
# - adds random sentences (to be implemented later)
# Each of these should introduce a coherence error.

import random
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def coherence_errorifier(text, p_remove, p_reshuffle, p_group_reshuffle, max_group_size=2):
    """Corrupt the coherence of ``text`` by dropping and reordering sentences.

    The text is split into sentences with ``sent_tokenize`` and three random
    perturbations are applied in this order:

    1. every sentence is dropped independently with probability ``p_remove``;
    2. every remaining position is, with probability ``p_reshuffle``, swapped
       with a uniformly chosen position (possibly itself);
    3. walking left to right, with probability ``p_group_reshuffle`` a run of
       1..``max_group_size`` consecutive sentences starting at the current
       position is swapped element-wise with an equally long run starting at
       a uniformly chosen position. The walk then resumes after the run;
       when no swap is triggered it moves on by a single sentence.

    Inserting unrelated sentences is not implemented yet.

    Args:
        text: The source text to corrupt.
        p_remove: Probability of dropping each sentence.
        p_reshuffle: Probability of swapping each sentence with a random one.
        p_group_reshuffle: Probability of swapping a group of sentences that
            starts at the current position.
        max_group_size: Upper bound on the size of a swapped group. Must be
            at least 1, otherwise ``random.randint`` raises ``ValueError``
            as soon as a group swap is triggered.

    Returns:
        The surviving sentences joined by single spaces. Because every step
        is random, the result may equal the plain re-joined input when no
        perturbation happens to fire.
    """
    sentences = sent_tokenize(text)

    # random.random() lies in [0.0, 1.0); comparing with `>=` guarantees that
    # p_remove == 0 never drops a sentence and p_remove == 1 drops all of them.
    sentences = [sentence for sentence in sentences if random.random() >= p_remove]

    # Single-sentence swaps.
    for i in range(len(sentences)):
        if random.random() < p_reshuffle:
            swap_index = random.randrange(len(sentences))
            sentences[i], sentences[swap_index] = sentences[swap_index], sentences[i]

    # Group swaps. The step is recomputed on every iteration: it must be 1 when
    # no swap fires, otherwise the walk would either stall at the same index
    # (never terminating for p_group_reshuffle == 0) or skip ahead by the size
    # of some earlier, unrelated group.
    i = 0
    while i < len(sentences):
        step = 1
        if random.random() < p_group_reshuffle:
            group_size = random.randint(1, min(max_group_size, len(sentences) - i))
            # Both runs fit inside the list by construction:
            # i + group_size <= len and swap_index + group_size <= len.
            swap_index = random.randint(0, len(sentences) - group_size)
            for j in range(group_size):
                sentences[i + j], sentences[swap_index + j] = sentences[swap_index + j], sentences[i + j]
            step = group_size
        i += step

    # Add random sentences (to be implemented)

    return ' '.join(sentences)

# Demo: corrupt the sample story once and show the result.
text = init_text
modified_text = coherence_errorifier(
    text,
    p_remove=0.05,
    p_reshuffle=0.1,
    p_group_reshuffle=0.1,
    max_group_size=3,
)
print(modified_text)


Alex longed for someone to share his hopes, dreams, and quiet dinners. Each evening, Alex strolled through the city park, hoping to find a connection amidst the sea of strangers. 
In the heart of the bustling city, there lived a man named Alex, whose days were a monotonous blend of work and solitude. She was sitting alone on a bench, lost in a book, her laughter echoing with the lines she read. Intrigued, Alex mustered his courage and approached her, asking about the book that brought such joy. Yet, Alex walked alone, his gaze often lingering on happy couples, wondering when his moment would come. The park, a canvas of life, was where laughter danced in the air, and hearts spoke without words. The city’s vibrant lights couldn't illuminate the emptiness he felt inside. One autumn evening, as amber leaves rustled underfoot, Alex’s eyes met Emma’s. Their conversation flowed effortlessly, like a melody long rehearsed yet spontaneously played. Emma, with her warm smile and insightful eyes, 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mbondarenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
import itertools
import random

from tqdm import tqdm

# Number of (original, errorified) pairs to generate.
N_SAMPLES = 5000
# Fixed seed so that re-running this cell regenerates the same dataset.
SEED = 42

# Cycling over an empty list would yield nothing; fail loudly instead of
# silently producing an empty dataset.
if not combined_stories:
    raise ValueError("combined_stories is empty; there is nothing to errorify")

random.seed(SEED)

# Walk over the stories repeatedly until exactly N_SAMPLES pairs exist. Each
# pass corrupts a story differently because coherence_errorifier is random.
errorified_data = []
for text in tqdm(itertools.islice(itertools.cycle(combined_stories), N_SAMPLES), total=N_SAMPLES):
    modified_text = coherence_errorifier(text, p_remove=0.05, p_reshuffle=0.1, p_group_reshuffle=0.1, max_group_size=3)
    errorified_data.append({"errorified": modified_text, "original": text})


100%|██████████| 5000/5000 [00:03<00:00, 1428.18it/s]


In [33]:
import pandas as pd
import random

# One row per generated pair, with columns 'errorified' and 'original'.
data = pd.DataFrame(errorified_data)


def _to_preference_row(row):
    """Place the original and errorified story in random order.

    The coin is flipped once per row and drives all three fields, so story1
    and story2 are always the two different versions and ``preference`` is
    the position of the original story (0 -> story1, 1 -> story2).
    NOTE(review): "preference = index of the original" is inferred from the
    previous expression `0 if <original is story1> else 1`; confirm with the
    consumer of this file.
    """
    original_first = random.choice([True, False])
    if original_first:
        return pd.Series({
            "story1": row["original"],
            "story2": row["errorified"],
            "preference": 0,
        })
    return pd.Series({
        "story1": row["errorified"],
        "story2": row["original"],
        "preference": 1,
    })


# Format One: pairwise preference data.
preference_format = data.apply(_to_preference_row, axis=1)

# Format Two: one story per row, label 1 for originals and 0 for errorified texts.
classifier_format = pd.concat([
    pd.DataFrame({"story": data["original"], "label": 1}),
    pd.DataFrame({"story": data["errorified"], "label": 0})
]).reset_index(drop=True)

# Saving to files (adjust the file paths as needed)
preference_format.to_csv("synthetic_preference_coherence.csv", index=False)
classifier_format.to_csv("synthetic_classifier_coherence.csv", index=False)


## Relevance Errorifier

In [None]:
# relevance_errorifier - takes in a good text (either human- or AI-generated) and:
# - removes random sentences (each sentence is dropped with probability p_remove)
# - finds the N sentences closest in embedding space to the prompt and removes them
# - adds random sentences from other texts
# Each of these should introduce a relevance error.