In [1]:
import pandas as pd
import copy

# Locations of the two raw survey exports and of the story data that is
# joined onto the responses further down.
community_csv = "data/raw/Story+Scale_February+24,+2022_05.59.csv"
surveyswap_csv = "data/raw/Story+Scale+-+SurveySwap_February+24,+2022_06.03.csv"
stories_csv = "survey/NAI_story_data/NAI_story_data_for_qualtrics.csv"

community_df = pd.read_csv(community_csv)
surveyswap_df = pd.read_csv(surveyswap_csv)
story_df = pd.read_csv(stories_csv)

In [2]:
# The question-order variable holds the ID of the story a respondent saw,
# so it is renamed to "story_id" in both samples.
# Mapping from the prompt abbreviation embedded in story_id to its full name.
prompt_rename_dict = {"HF": "High Fantasy", "HOR": "Horror",
                      "HR": "Historical Romance", "HSF": "Hard Sci-Fi"}

for survey_df in (community_df, surveyswap_df):
    survey_df.rename(
        columns={"Stories-Feb17,2022_DO": "story_id"}, inplace=True)

    # Extract the prompt abbreviation and expand it to the full prompt name.
    # The result is assigned back to the frame: calling
    # `df["col"].replace(..., inplace=True)` mutates a temporary column
    # selection, which does not update the frame under pandas Copy-on-Write.
    # NOTE(review): `.*` is greedy, so an id with two trailing numeric parts
    # (e.g. "X_HF_1_2") would yield "HF_1" rather than "HF" - confirm that
    # story ids never take that form.
    survey_df["prompt_label"] = (
        survey_df["story_id"]
        .str.extract(r"_(.*)_\d_?\d?", expand=False)
        .replace(prompt_rename_dict)
    )

In [3]:
# Row 0 carries the description text of every variable, including the items.
# Keep the item descriptions in their own Series before rows 0 and 1 are
# removed from both response frames.
item_column_positions = slice(18, 94)
items_descr = surveyswap_df.iloc[0, item_column_positions]

for survey_df in (surveyswap_df, community_df):
    survey_df.drop(index=[0, 1], inplace=True)

In [4]:
# Strip the question stem from the descriptions (wherever it occurs) so that
# only the text after it is left.
question_stem = "For the following questions, please think of the story you just read.\nIndicate how much you agree or disagree with each of the following statements about the story. - "
items_descr = items_descr.str.replace(question_stem, "", regex=False)

In [5]:
# SurveySwap survey contained quality checks
# Label if participants passed those
def evaluate_qual_check_1(row):
    """Return True if the respondent passed quality check 1.

    The check is passed when the answer stored in ``row["Qual_Check_1"]`` is
    the one that belongs to the prompt in ``row["prompt_label"]``. Any other
    answer, or a known answer paired with a different prompt, gives False.
    """
    # Answer text -> prompt label for which that answer is the correct one.
    prompt_for_answer = {
        '...someone working in a tavern.': "Historical Romance",
        '...a noise coming from a mirror.': "Horror",
        '...with a message from the president.': "Hard Sci-Fi",
        '...the description of a small village.': "High Fantasy",
    }

    answer = row["Qual_Check_1"]
    if answer not in prompt_for_answer:
        return False
    return bool(prompt_for_answer[answer] == row["prompt_label"])


# Quality check 1: did the participant correctly identify how the story began?
surveyswap_df["pass_qual_1"] = surveyswap_df.apply(
    evaluate_qual_check_1, axis=1)
# Quality check 2: participants were told which answer to give to this item.
surveyswap_df["pass_qual_2"] = (
    surveyswap_df["story_scale_74"] == "Somewhat disagree")

passed_first = surveyswap_df["pass_qual_1"]
passed_second = surveyswap_df["pass_qual_2"]

# Failing check 2 marks a bad respondent regardless of check 1.
# Failing check 1 while passing check 2 would need a closer look.
mask_inspect_qual = ~passed_first & passed_second
print("Do any cases need further investigation?\n" +
      str(mask_inspect_qual.value_counts()))

# No such cases, so keep an independent copy of the respondents who passed
# both checks.
mask_passed = passed_first & passed_second
surveyswap_passed_df = surveyswap_df[mask_passed].copy()

Do any cases need further investigation?
False    41
dtype: int64


In [6]:
# Summarise how many SurveySwap respondents passed or failed the quality
# checks.
good_resp = len(surveyswap_passed_df)
total_resp = len(surveyswap_df)
bad_respondents = total_resp - good_resp
# Share of filtered-out respondents in percent; guarded so an empty sample
# does not raise ZeroDivisionError.
bar_resp_per = (bad_respondents / total_resp) * 100 if total_resp else 0.0
print("Good respondents from SurveySwap:\t\t {}".format(good_resp))
print("Bad respondents from SurveySwap (filtered out):\t {}".format(bad_respondents))
# The percentage is now actually formatted into the message; previously the
# literal "{}" was printed and the sentence broke off mid-way.
print("That is {:.1f}% bad respondents".format(bar_resp_per))

Good respondents from SurveySwap:		 27
Bad respondents from SurveySwap (filtered out):	 14
That is {}% bad respondents - yay SurveySwap has really 


In [7]:
# Tag every response with the sample it came from, then stack the community
# responses on top of the quality-filtered SurveySwap responses.
labelled_samples = (
    (community_df, "Community"),
    (surveyswap_passed_df, "SurveySwap"),
)
for sample_df, sample_name in labelled_samples:
    sample_df["sample"] = sample_name

combined_df = pd.concat([sample_df for sample_df, _ in labelled_samples])

In [8]:
# Columns that identify respondents or are not needed downstream are removed
# from the combined frame.
cols_to_delete = [
    'Status',
    'IPAddress',
    'Progress',
    'Finished',
    'RecipientLastName',
    'RecipientFirstName',
    'RecipientEmail',
    'ExternalReference',
    'LocationLatitude',
    'LocationLongitude',
    'DistributionChannel',
    'UserLanguage',
    '1',
]

combined_df.drop(cols_to_delete, axis="columns", inplace=True)

In [9]:
# Give the remaining columns a more sensible, easier-to-use naming scheme.

# Metadata and quality-check columns: old name -> new name.
rename_cols_dict_1 = {
    'StartDate': "start",
    'EndDate': "end",
    'Duration (in seconds)': "duration_in_sec",
    'RecordedDate': "recorded",
    'ResponseId': "response_id",
    "story_scale_DO": "tts_order",
    "Qual_Check_1": "qual_check_1",
    "Qual_Check_1_DO": "qual_check_1_order",
}

# Scale items: each subscale covers a contiguous run of "story_scale_<n>"
# columns and is renumbered from 1 within the subscale.
# (new prefix, first item number, last item number)
# NOTE: the "tss_cre_" prefix (not "tts_") is kept as is, because the column
# reordering further down selects the columns under exactly these names.
subscale_ranges = [
    ("tts_coh_", 1, 12),
    ("tts_conch_", 13, 20),
    ("tss_cre_", 21, 32),
    ("tts_qua_", 33, 40),
    ("tts_rep_", 41, 52),
    ("tts_sty_", 53, 64),
    ("tts_pac_", 65, 73),
]

rename_cols_dict_2 = {}
for prefix, first_item, last_item in subscale_ranges:
    item_numbers = range(first_item, last_item + 1)
    for position, item_number in enumerate(item_numbers, start=1):
        rename_cols_dict_2["story_scale_" + str(item_number)] = prefix + str(position)

# Item 74 is the second quality check rather than a scale item.
rename_cols_dict_2["story_scale_74"] = "qual_check_2"

# Relabel the item descriptions with the new item names.
items_descr.rename(rename_cols_dict_2, inplace=True)

# Apply both mappings to the combined responses.
for column_mapping in (rename_cols_dict_1, rename_cols_dict_2):
    combined_df.rename(columns=column_mapping, inplace=True)

In [10]:
# The prompt_label already exists; a preset_label is derived here as well.
# Mapping from the preset abbreviation to its full name.
preset_rename_dict = {"ACE": "Ace of Spade", "ALL": "All-Nighter",
                      "BAS": "Basic Coherence", "FAN": "Fandango",
                      "GEN": "Genesis", "LOW": "Low Rider",
                      "MOR": "Morpho", "OUR": "Ouroboros"}

# The preset abbreviation is the leading run of capital letters in story_id.
# The renamed values are assigned back to the frame: calling
# `df["col"].replace(..., inplace=True)` mutates a temporary column selection,
# which does not update the frame under pandas Copy-on-Write.
combined_df["preset_label"] = (
    combined_df["story_id"]
    .str.extract(r"^([A-Z]*)", expand=False)
    .replace(preset_rename_dict)
)

# Recode Likert responses to 1-5. The replacement is applied to the whole
# frame, so any cell holding one of these exact strings is recoded.
likert_recode_dict = {'Strongly disagree': 1, 'Somewhat disagree': 2,
                      'Neither agree nor disagree': 3,
                      'Somewhat agree': 4,'Strongly agree': 5}
combined_df.replace(likert_recode_dict, inplace=True)

In [11]:
# Attach the story information to every response.
# Only the listed story columns are kept, and the key column is renamed so it
# matches the "story_id" column of the responses.
story_columns = ["Story_ID", "full_story", "prompt", "memory", "result"]
story_df = story_df[story_columns].rename(columns={"Story_ID": "story_id"})

# Inner join: responses whose story_id has no match in story_df are dropped.
stories_by_id = story_df.set_index("story_id")
combined_df = combined_df.join(stories_by_id, on="story_id", how="inner")

In [12]:
# Put the columns into their final order: identifiers, scale items grouped by
# subscale, story information, then timing and quality-check metadata.
id_cols = ['response_id', 'story_id', 'prompt_label', 'preset_label', 'sample']

# (column prefix, number of items) for each subscale, in output order.
subscale_sizes = [
    ('tts_coh_', 12),
    ('tts_conch_', 8),
    ('tss_cre_', 12),
    ('tts_qua_', 8),
    ('tts_rep_', 12),
    ('tts_sty_', 12),
    ('tts_pac_', 9),
]
item_cols = [
    prefix + str(item_number)
    for prefix, n_items in subscale_sizes
    for item_number in range(1, n_items + 1)
]

story_cols = ['full_story', 'prompt', 'memory', 'result']
meta_cols = [
    'start', 'end', 'duration_in_sec', 'recorded',
    'tts_order',
    'qual_check_1', 'qual_check_1_order', 'qual_check_2',
    'pass_qual_1', 'pass_qual_2',
]

cols_new_order = id_cols + item_cols + story_cols + meta_cols
combined_df = combined_df[cols_new_order]

In [13]:
# Write the combined responses and the item descriptions to CSV files.
csv_outputs = (
    (combined_df, "data/combined_data.csv"),
    (items_descr, "data/description_items.csv"),
)
for output_frame, output_path in csv_outputs:
    output_frame.to_csv(output_path)