# Hippocorpus converter

## Import

In [None]:
import pandas as pd

# Load the Hippocorpus stories CSV into a DataFrame; every later cell works on
# (and rebinds) this `hippocorpus` variable.
hippocorpus = pd.read_csv("hippocorpus/hcV3-stories.csv")

In [None]:
# A surprising number of participants appear to have typed with caps lock on
# during data collection. Those entries tend to be of below-average quality and
# could not be fully restored without more complex methods, so every row whose
# "mainEvent" text is entirely upper-case is dropped.
written_in_all_caps = hippocorpus["mainEvent"].str.isupper()
hippocorpus = hippocorpus[~written_in_all_caps]

## Convert

In [None]:
# Columns of the raw table that are not carried over into the converted data.
cols_to_drop = [
    "WorkTimeInSeconds",
    "WorkerId",
    "annotatorAge",
    "annotatorGender",
    "annotatorRace",
    "distracted",
    "draining",
    "frequency",
    "importance",
    "logTimeSinceEvent",
    "memType",
    "mostSurprising",
    "openness",
    "recAgnPairId",
    "recImgPairId",
    "similarity",
    "similarityReason",
    "stressful",
    "summary",
    "timeSinceEvent",
]
hippocorpus = hippocorpus.drop(columns=cols_to_drop)
# Positional rename: this relies on exactly three columns being left, in this
# order (pandas raises on a length mismatch).
hippocorpus.columns = ["SOURCE", "INSTRUCTION", "RESPONSE"]

In [None]:
# Prefix every SOURCE value with the dataset name.
hippocorpus["SOURCE"] = "Hippocorpus: " + hippocorpus["SOURCE"]

In [None]:
import re
from random import choice, random, randrange
import nltk
from nltk.tokenize import sent_tokenize

# Sentence-tokenizer data needed by sent_tokenize. Recent NLTK releases (3.9+)
# load the Punkt models from the "punkt_tab" package, while older releases use
# the pickled "punkt" package, so both are fetched.
nltk.download("punkt")
nltk.download("punkt_tab")


def replace_my(string):
    """Replace the first standalone "my <word>" with an indefinite article.

    "my" becomes "an" when the following word starts with a vowel letter
    (checked case-insensitively) and "a" otherwise. Only the matched "my" is
    replaced, so "my" inside other words such as "army" or "myself" is left
    untouched.

    Args:
        string: Text to rewrite.

    Returns:
        The text with that one "my" replaced, or the unchanged text when no
        standalone "my" followed by a space and a word is present.
    """
    # \b keeps the pattern from matching the tail of words like "army".
    match = re.search(r"\bmy (\w+)", string)
    if match is None:
        return string
    article = "an" if match.group(1)[0].lower() in "aeiou" else "a"
    # Splice at the match position rather than substituting the first "my"
    # substring, which may belong to an earlier, unrelated word.
    my_start = match.start()
    return string[:my_start] + article + string[my_start + len("my"):]


def convert_row(row):
    """Rewrite one row into a story-writing instruction and a matching reply.

    The event summary in ``row["INSTRUCTION"]`` is turned into a
    "Write a(n original) story about ..." prompt, and the story in
    ``row["RESPONSE"]`` is prefixed with a short acknowledgement. About half
    of the time one randomly chosen sentence of the story is also quoted in
    both the prompt and the acknowledgement as the required first, last or
    included sentence.

    Args:
        row: Row with string "INSTRUCTION" and "RESPONSE" entries. It is
            modified in place.

    Returns:
        The same ``row`` object with both entries replaced.
    """
    orig_instruction = row["INSTRUCTION"].rstrip("!.?;:")
    # Lowercase the first character so the summary reads naturally after
    # "story about". Slicing keeps this safe for an empty summary, and a
    # leading pronoun "I" stays capitalised.
    if not (orig_instruction == "I" or orig_instruction.startswith(("I ", "I'"))):
        orig_instruction = orig_instruction[:1].lower() + orig_instruction[1:]
    orig_instruction = replace_my(orig_instruction)
    orig_response = row["RESPONSE"]

    n_original = choice(["", "n original"])
    instruction = f"Write a{n_original} story about {orig_instruction}."

    sentence_response_section = ""
    do_sentence_instruction = random() > 0.5
    # If the tokenizer returns no sentences (e.g. for an empty story) the
    # sentence constraint is skipped instead of failing in randrange(0).
    sentences = sent_tokenize(orig_response) if do_sentence_instruction else []
    if sentences:
        sentence_index = randrange(len(sentences))
        if sentence_index == 0:
            instruction += " Make the first sentence "
            sentence_response_section = " where the first sentence is "
        elif sentence_index == len(sentences) - 1:
            instruction += " Make the last sentence "
            sentence_response_section = " where the last sentence is "
        else:
            instruction += " Include the sentence "
            sentence_response_section = " which includes the sentence "
        quoted_sentence = f'"{sentences[sentence_index]}"'
        instruction += quoted_sentence
        sentence_response_section += quoted_sentence

    sure = choice(["Sure", "Of course", "Alright", "Certainly"])
    punctuation = choice([",", "!", "."])
    response = (
        f"{sure}{punctuation} Here's a story about {orig_instruction}{sentence_response_section}.\n\n{orig_response}"
    )
    row["INSTRUCTION"] = instruction
    row["RESPONSE"] = response
    return row


# DataFrame.apply returns the converted rows as a new frame. pandas does not
# support relying on in-place mutation of the rows passed to the function, so
# the result has to be assigned back for the later export to contain it.
hippocorpus = hippocorpus.apply(convert_row, axis=1)

## Export

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq

# Filtering out the all-caps rows leaves a non-contiguous index, which pyarrow
# would otherwise store as an extra "__index_level_0__" column. Only the data
# columns are written.
table = pa.Table.from_pandas(hippocorpus, preserve_index=False)
pq.write_table(table, "data.parquet", row_group_size=100)