In [83]:
import pandas as pd
import itertools, re

# Load the generated stories and discard the "Unnamed: 0" column
# (presumably an index that was written out when the CSV was saved - TODO confirm).
raw_stories_path = "NAI_story_data/NAI_story_data.csv"
df = pd.read_csv(raw_stories_path).drop(columns="Unnamed: 0")

In [84]:
# Put starting prompt and result together
df["full_story"] = df["prompt"] + df["result"]

# Check stories with incomplete sentences.
# The pattern matches whatever follows the last sentence-ending character
# (. ? ! or ") up to the end of the text, i.e. a trailing unfinished sentence.
incomp_sent_pattern = r"(?<=[\.\?\!\"])[^\.\?\!\"]*$"
matches_incomp_sent = df["full_story"].str.findall(incomp_sent_pattern)
# First match per story. .str[0] yields NaN when findall() returned an empty list
# (story without any sentence-ending character), where a plain x[0] would raise IndexError.
trailing_fragments = matches_incomp_sent.str[0]
# mask for incomplete sentences - skip stories that have nothing after the last
# sentence-ending character, and stories whose remainder is just ' (that is only the
# closing quote of direct speech)
mask_incomp_sent = trailing_fragments.notna() & ~trailing_fragments.isin(["", "'"])
corrected_stories = df.loc[mask_incomp_sent, "full_story"].str.replace(incomp_sent_pattern, "", regex = True)
# corrected_stories is a Series named "full_story", so update() only overwrites that column
df.update(corrected_stories)
df.reset_index(drop = True, inplace=True)

# mask_asterism = df["full_story"].str.contains(r"⁂.*", flags = re.DOTALL)
# df["full_story"][mask_asterism]

# Remove the ⁂ and everything after it, as it would indicate a new story
# (re.DOTALL lets .* run across line breaks to the very end of the text)
# Note to self: Ban the asterism token next time you dummy!
df["full_story"] = df["full_story"].str.replace(r"⁂.*", "", flags = re.DOTALL, regex = True)

In [85]:
# Some stories might have been cut substantially - best to remove extreme outliers
# We use Tukey's rule for extreme ("far out") outliers: a story is dropped when its word
# count lies more than 3 x IQR below Q1 or above Q3
# (the more common 1.5 x IQR fences would only mark mild outliers).
IQR_FENCE_MULTIPLIER = 3

df["word_count"] = df["full_story"].str.split().apply(len)
words_Q1 = df["word_count"].quantile(0.25)
words_Q3 = df["word_count"].quantile(0.75)
words_iqr = words_Q3-words_Q1
words_outlier_lower = words_Q1 - (words_iqr*IQR_FENCE_MULTIPLIER)
words_outlier_upper = words_Q3 + (words_iqr*IQR_FENCE_MULTIPLIER)
print("25th Percentile (Q1): {}\n75th Percentile (Q3): {}\nIQR: {}".format(words_Q1,words_Q3,words_iqr))
print("will sort out stories with less than {} or more than {} words.".format(words_outlier_lower, words_outlier_upper))

def determine_word_outlier(row):
    """Tell whether a story's word count lies outside the outlier fences.

    Reads row["word_count"] and compares it against the module-level
    words_outlier_lower / words_outlier_upper limits.
    Returns True when the count is strictly below the lower or strictly
    above the upper limit, otherwise False.
    """
    count = row["word_count"]
    too_short = count < words_outlier_lower
    too_long = count > words_outlier_upper
    return bool(too_short or too_long)
    
df["word_outlier"] = df.apply(determine_word_outlier, axis = 1)

# Select the outlier rows once and reuse them for the report and for the tally below
outlier_mask = df["word_outlier"].astype(bool)
outliers_df = df.loc[outlier_mask, ["preset_label", "prompt_label"]]
print("\nWord outliers:")
print(outliers_df)

# 9 outliers
# seems like Horror is especially problematic...
# will have to repeat some stories to make up for the deleted outliers...

# create dictionary with count of outliers to know how often to repeat stories later
# keys are (preset_label, prompt_label) tuples, values the number of removed stories
missing_stories_count = {}
for preset_genre in zip(outliers_df["preset_label"], outliers_df["prompt_label"]):
    missing_stories_count[preset_genre] = missing_stories_count.get(preset_genre, 0) + 1

# delete outliers
# .copy() makes df an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning
df = df[~outlier_mask].copy()

25th Percentile (Q1): 1118.0
75th Percentile (Q3): 1249.25
IQR: 131.25
will sort out stories with less than 724.25 or more than 1643.0 words.

Word outliers:
                     preset_label prompt_label
10      Ace of Spade (14/02/2022)       Horror
18      Ace of Spade (14/02/2022)       Horror
56       All-Nighter (14/02/2022)       Horror
91   Basic Coherence (14/02/2022)       Horror
95   Basic Coherence (14/02/2022)       Horror
98   Basic Coherence (14/02/2022)       Horror
130         Fandango (14/02/2022)       Horror
251           Morpho (14/02/2022)       Horror
259           Morpho (14/02/2022)       Horror


In [86]:
# Start building blocks for qualtrics survey

# Identifier used in the story IDs for each genre label found in "prompt_label"
GENRE_ID_CODES = {
    "High Fantasy": "HF",
    "Horror": "HOR",
    "Hard Sci-fi": "HSF",
    "Historical Romance": "HR",
}


def make_story_id_prefix(preset, genre):
    """Return the upper-cased ID prefix for one preset/genre combination.

    The prefix is built from the first 3 letters of the preset label, an
    underscore and the genre identifier from GENRE_ID_CODES. A genre without
    an identifier contributes nothing after the underscore.
    When adding more presets and/or genres make sure the prefixes stay unique.
    """
    return (preset[:3] + "_" + GENRE_ID_CODES.get(genre, "")).upper()


def format_qualtrics_question(question_id, story):
    """Return one [[Question:DB]] entry with the given ID for the advanced txt format.

    Line breaks inside the story are replaced by <br>.
    """
    return (
        "\n[[Question:DB]]"
        + "\n[[ID:" + question_id + "]]\n"
        + story.replace("\n","<br>")
        + "\n"
    )


# Create IDs for different prompt combinations
genre_li = df["prompt_label"].unique()
preset_li = df["preset_label"].unique()

genre_preset_li = list(itertools.product(preset_li, genre_li))

# one running counter per ID prefix
story_id_dict = {make_story_id_prefix(preset, genre): 0 for preset, genre in genre_preset_li}

# Collect the pieces of the Qualtrics advanced txt file and join them once at the end
qualtrics_parts = ["[[AdvancedFormat]]\n\n[[Block:Stories]]\n"]
story_ids = []

for preset, genre, story in zip(df["preset_label"], df["prompt_label"], df["full_story"]):

    # determine story id: prefix plus the 1-based running number within that prefix
    story_id_prefix = make_story_id_prefix(preset, genre)
    story_id_dict[story_id_prefix] += 1
    story_id = story_id_prefix + "_" + str(story_id_dict[story_id_prefix])
    story_ids.append(story_id)

    qualtrics_parts.append(format_qualtrics_question(story_id, story))

    # if there is a lack of stories of this type due to outliers, repeat stories
    remaining_repeats = missing_stories_count.get((preset, genre), 0)
    if remaining_repeats > 0:
        qualtrics_parts.append(format_qualtrics_question(story_id + "_rep", story))
        missing_stories_count[(preset, genre)] = remaining_repeats - 1

# String to later write into a file for Qualtrics' advanced txt format
qualtrics_str = "".join(qualtrics_parts)

# Record the story IDs in the dataframe and use them as index
df = df.assign(Story_ID=story_ids).set_index("Story_ID")

In [87]:
# Write the assembled survey text to the Qualtrics advanced txt file
qualtrics_txt_path = "stories_qualtrics_advanced_txt.txt"
with open(qualtrics_txt_path, mode="w", encoding="utf-8") as qualtrics_file:
    qualtrics_file.write(qualtrics_str)

# Store the story data, indexed by story ID, next to the raw data
qualtrics_csv_path = "NAI_story_data/NAI_story_data_for_qualtrics.csv"
df.to_csv(qualtrics_csv_path)