In [None]:
%pip install nltk

In [1]:
# Reading in data
import pandas as pd
csv_file_path = 'coherence_human_data.csv'
# Read the CSV file back into a DataFrame
coherence_human_data_df = pd.read_csv(csv_file_path)

## Coherence Errorifier

In [3]:
init_text = """
In the heart of the bustling city, there lived a man named Alex, whose days were a monotonous blend of work and solitude. The city’s vibrant lights couldn't illuminate the emptiness he felt inside. Love, a distant dream, flickered in his heart like a fading star. Alex longed for someone to share his hopes, dreams, and quiet dinners.

Each evening, Alex strolled through the city park, hoping to find a connection amidst the sea of strangers. The park, a canvas of life, was where laughter danced in the air, and hearts spoke without words. Yet, Alex walked alone, his gaze often lingering on happy couples, wondering when his moment would come.

One autumn evening, as amber leaves rustled underfoot, Alex’s eyes met Emma’s. She was sitting alone on a bench, lost in a book, her laughter echoing with the lines she read. Intrigued, Alex mustered his courage and approached her, asking about the book that brought such joy.

Their conversation flowed effortlessly, like a melody long rehearsed yet spontaneously played. Emma, with her warm smile and insightful eyes, saw through Alex’s guarded exterior. She listened to his dreams and shared her own, painting her world with words that resonated with his soul.
"""

In [2]:
coherence_human_data_df

Unnamed: 0,premise,story1,story2,coherence_preference
0,The world ends in a cataclysmic event.,"One day, the world just ended.\n\n The sky ...",The world ended. The sky turned red and the gr...,story2
1,Brad realizes the error of his ways and comes ...,"Shannon drove down the road slowly, and her ha...","”\n\nBrad pulled Shannon close, wrapping his a...",story1
2,The group of survivors must face the enemy and...,"continued, “Thank you, we promise that we will...",Jake looked around and saw a group of building...,story2
3,Aimee Kincaid goes home after a long day at wo...,Aimee Kincaid came home to an empty apartment....,Aimee Kincaid came to the door of her apartmen...,story1
4,"Valerie Marx wakes up in a dark basement, boun...",Valerie Marx woke and found herself lying on t...,Valerie woke up sitting on the floor of a conc...,story1
...,...,...,...,...
546,Shannon Daniels loses her mother to cancer and...,"When Shannon’s mother died, it was her father ...",Shannon Daniels felt like a lost and lonely tw...,story2
547,Jenna wakes up in the hospital a week later to...,had leapt down from the fence and was now sitt...,<br><br>“How long have I been out?” Jenna aske...,story2
548,"Daisy's daughter, Lisa, becomes her primary ca...",and threw it on the ground. He glanced at a pi...,"<br><br>\t“No, it’s nothing,” I replied as I l...",story1
549,Natalie goes to the police and helps them appr...,"that month, Trevor Lockwood cleared his throat...",?<br><br>Magic</s>Covid restrictions ease in N...,story1


In [13]:
# Combine the 'story1' and 'story2' columns
combined_stories = coherence_human_data_df['story1'].tolist() + coherence_human_data_df['story2'].tolist()

# Get unique stories
unique_stories = list(set(combined_stories))
len(unique_stories)

414

In [11]:
# coherence_errorifier - takes in a good text (either human or AI generated)
# removes random sentences (with a prob of p_remove for each sentences)
# reshuffles random sentences (with a prob p_reshuffle for any given sentences, sentence to shuffle with is picked randomly)
# adds random sentences (to be implemented later)
# each of those should introduce a coherence error

import random
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def coherence_errorifier(text, p_remove, p_reshuffle, p_group_reshuffle, max_group_size=2):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Remove random sentences
    sentences = [s for s in sentences if random.random() > p_remove]

    # Reshuffle individual sentences
    for i in range(len(sentences)):
        if random.random() < p_reshuffle:
            swap_index = random.randint(0, len(sentences) - 1)
            sentences[i], sentences[swap_index] = sentences[swap_index], sentences[i]

    # Reshuffle groups of sentences
    i = 0
    group_size = 0
    while i < len(sentences):
        if random.random() < p_group_reshuffle:
            group_size = random.randint(1, min(max_group_size, len(sentences) - i))
            swap_index = random.randint(0, len(sentences) - group_size)
            # Swap groups of sentences
            for j in range(group_size):
                if i + j < len(sentences) and swap_index + j < len(sentences):
                    sentences[i + j], sentences[swap_index + j] = sentences[swap_index + j], sentences[i + j]

        i += group_size

    # Add random sentences (to be implemented)

    # Reconstruct the text
    modified_text = ' '.join(sentences)
    return modified_text

# Example usage
text = init_text
modified_text = coherence_errorifier(text, p_remove=0.05, p_reshuffle=0.1, p_group_reshuffle=0.1, max_group_size=3)
print(modified_text)


Alex longed for someone to share his hopes, dreams, and quiet dinners. Each evening, Alex strolled through the city park, hoping to find a connection amidst the sea of strangers. 
In the heart of the bustling city, there lived a man named Alex, whose days were a monotonous blend of work and solitude. She was sitting alone on a bench, lost in a book, her laughter echoing with the lines she read. Intrigued, Alex mustered his courage and approached her, asking about the book that brought such joy. Yet, Alex walked alone, his gaze often lingering on happy couples, wondering when his moment would come. The park, a canvas of life, was where laughter danced in the air, and hearts spoke without words. The city’s vibrant lights couldn't illuminate the emptiness he felt inside. One autumn evening, as amber leaves rustled underfoot, Alex’s eyes met Emma’s. Their conversation flowed effortlessly, like a melody long rehearsed yet spontaneously played. Emma, with her warm smile and insightful eyes, 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mbondarenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
from tqdm import tqdm


errorified_data = []
with tqdm(total=5000) as pbar:
    while len(errorified_data) < 5000:
        datapoint = {}
        for text in combined_stories:
            modified_text = coherence_errorifier(text, p_remove=0.05, p_reshuffle=0.1, p_group_reshuffle=0.1, max_group_size=3)
            datapoint["errorified"] = modified_text
            datapoint["original"] = text
            errorified_data.append(datapoint)

            # Update the progress bar
            pbar.update(1)

            # Check if we reached the desired length
            if len(errorified_data) >= 5000:
                break


100%|██████████| 5000/5000 [00:03<00:00, 1459.79it/s]


In [26]:
pd.DataFrame(errorified_data)

Unnamed: 0,errorified,original
0,Who knows what kind of poisonous stuff could b...,<br><br>Luke Williams was a man who lived in t...
1,Who knows what kind of poisonous stuff could b...,<br><br>Luke Williams was a man who lived in t...
2,Who knows what kind of poisonous stuff could b...,<br><br>Luke Williams was a man who lived in t...
3,Who knows what kind of poisonous stuff could b...,<br><br>Luke Williams was a man who lived in t...
4,Who knows what kind of poisonous stuff could b...,<br><br>Luke Williams was a man who lived in t...
...,...,...
4995,He opened the door to his room and immediately...,Gabriel’s parents helped him pack his bags. He...
4996,He opened the door to his room and immediately...,Gabriel’s parents helped him pack his bags. He...
4997,He opened the door to his room and immediately...,Gabriel’s parents helped him pack his bags. He...
4998,He opened the door to his room and immediately...,Gabriel’s parents helped him pack his bags. He...


## Relevance errorifier

In [None]:
# relevance_erorifier - takes in a good text (either human or AI generated)
# removes random sentences (with a prob of p_remove for each sentences)
# finds sentences closest in embedding to the prompt, removes them (remove N closest sentences)
# adds random sentences from other texts
# each of those should introduce a relevance error