# Hippocorpus converter

## Import

In [None]:
import pandas as pd

# Load the Hippocorpus stories table; the path is resolved relative to the working directory.
hippocorpus = pd.read_csv("hippocorpus/hcV3-stories.csv")

In [None]:
# A surprising number of participants seem to have left caps lock on during data collection.
# Those entries tend to be of lower-than-average quality and could not be fully restored
# without more complex methods, so rows whose "mainEvent" is entirely upper-case are dropped.
# NOTE(review): .str.isupper() presumably yields NaN for missing values, which `~` cannot
# invert cleanly - this assumes "mainEvent" has no missing entries; confirm against the CSV.
hippocorpus = hippocorpus[~hippocorpus["mainEvent"].str.isupper()]

## Convert

In [None]:
import re
from random import choice, random, randrange
import nltk
from nltk.tokenize import sent_tokenize

# Request the "punkt" resource from NLTK's downloader each time this cell runs.
# NOTE(review): presumably this is the sentence-tokenizer data that sent_tokenize needs; newer
# NLTK releases may look for "punkt_tab" instead - confirm against the installed version.
nltk.download("punkt")


def replace_my(string):
    """Replace the first standalone "my" with the matching indefinite article.

    Only a whole-word "my" that is followed by a single space and a word is
    rewritten, so words that merely contain "my" (e.g. "army", "tommy") are
    left alone. The article is "an" when the following word starts with a
    vowel (case-insensitively) and "a" otherwise.

    Args:
        string: Text that may contain a first-person possessive.

    Returns:
        The text with at most one "my" replaced; unchanged if none qualifies.
    """

    def _article(match):
        # group(1) is the first character of the word that follows "my".
        return "an" if match.group(1).lower() in "aeiou" else "a"

    # The lookahead inspects the next word without consuming it, so only the
    # pronoun itself is substituted. `count` is passed by keyword because the
    # positional form is deprecated in recent Python versions.
    return re.sub(r"\bmy(?= (\w))", _article, string, count=1)


def sure():
    """Return a randomly chosen acknowledgement followed by one punctuation mark."""
    openers = ["Sure", "Of course", "Alright", "Certainly"]
    marks = [",", "!", "."]
    # The opener is drawn before the mark, so seeded runs stay reproducible.
    return f"{choice(openers)}{choice(marks)}"


def _as_clause(text):
    """Prepare a free-text answer for embedding in the middle of a sentence.

    Surrounding whitespace and trailing sentence punctuation are removed
    (the templates add their own final period) and the first character is
    lower-cased. Slicing instead of indexing keeps this safe for empty text.
    """
    text = text.strip().rstrip("!.?;:").rstrip()
    return text[:1].lower() + text[1:]


def _surprise_exchange(row):
    """Build the follow-up exchange about the story's most surprising part."""
    surprise = _as_clause(row["mostSurprising"])
    was = choice(["was", "do you think was", "would you say was", "do you think someone would say was"])
    surprising = choice(
        ["the most surprising thing", "one of the most surprising things", "a surprising development"]
    )
    id_say = choice(["I'd say the", "I would have to say the", "The", "This story's"])
    return (
        f"User: What {was} {surprising} in that story?\n\n"
        f"Rosey: {id_say} most surprising development was {surprise}."
    )


def _summary_exchange(row):
    """Build the follow-up exchange that asks for a summary of the story."""
    preamble = choice(
        ["The story is a little long. ", "This is longer than I was expecting. ", "It needs to be shorter. ", ""]
    )
    verb = choice(
        ["shorten it to a sentence or two", "summarize it", "shrink it way down", "make it way more terse"]
    )
    request = choice(["Can you ", "I need you to ", "Please "])
    return (
        f"User: {preamble}{request}{verb}.\n\n"
        f"Rosey: {sure()} Here's a summary of the story:\n\n{row['summary']}"
    )


def convert_row(row):
    """Turn one Hippocorpus row into a multi-turn User/Rosey transcript.

    The transcript holds a story request (about half the time also asking
    for a specific sentence taken from the story), the story itself, and two
    follow-up exchanges - most surprising development and summary - in
    random order. Turns are separated by blank lines.

    Args:
        row: Mapping-like row providing the "mainEvent", "story",
            "mostSurprising" and "summary" string fields.

    Returns:
        The complete interaction as a single string.
    """
    main_event = replace_my(_as_clause(row["mainEvent"]))
    an_original = choice(["a", "an original"])
    write = choice(["Write", "Write me", "Please write"])
    user_turn = f"User: {write} {an_original} story about {main_event}."

    story = row["story"]
    # Roughly half of the requests also pin down one sentence of the story.
    # An empty tokenization simply skips that extra instruction instead of
    # letting randrange(0) raise.
    sentences = sent_tokenize(story) if random() > 0.5 else []
    sentence_response_section = ""
    if sentences:
        sentence_index = randrange(len(sentences))
        if sentence_index == 0:
            ask, echo = "Make the first sentence", "where the first sentence is"
        elif sentence_index == len(sentences) - 1:
            ask, echo = "Make the last sentence", "where the last sentence is"
        else:
            ask, echo = "Include the sentence", "which includes the sentence"
        quoted = f'"{sentences[sentence_index]}"'
        user_turn += f" {ask} {quoted}"
        sentence_response_section = f" {echo} {quoted}"

    rosey_turn = (
        f"Rosey: {sure()} Here's a story about {main_event}{sentence_response_section}."
        f"\n\n{story}"
    )

    # Shuffle the order of the two follow-up exchanges.
    follow_ups = [_surprise_exchange, _summary_exchange]
    if random() > 0.5:
        follow_ups.reverse()

    turns = [user_turn, rosey_turn]
    turns.extend(build(row) for build in follow_ups)
    return "\n\n".join(turns)


# Run convert_row on every row (axis=1); `hippocorpus` is rebound to the per-row results,
# i.e. one interaction string per remaining story.
hippocorpus = hippocorpus.apply(convert_row, axis=1)

## Export

In [None]:
# Write the converted interactions to hippocorpus.csv in the working directory.
hippocorpus.to_csv("hippocorpus.csv")