In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import copy
import re

# Raw survey exports for the two samples
community_df = pd.read_csv("data/raw/story_scale_community.csv")
panel_df = pd.read_csv("data/raw/story_scale_panel.csv")

# Story data; joined onto the survey responses by story id further down
story_df = pd.read_csv("survey/NAI_story_data/NAI_story_data_for_qualtrics.csv")

In [2]:
# The first row holds the description of each variable, and therefore of the items.
# Keep the item descriptions (columns 18-93 of the panel export) in their own Series ...
items_descr = panel_df.iloc[0, 18:94]

# ... then remove rows 0 and 1 from both survey frames, in place
# (row 0 is the description row; row 1 is presumably export metadata too - TODO confirm)
for survey_frame in (panel_df, community_df):
    survey_frame.drop(index=[0, 1], inplace=True)

In [3]:
# Every description starts with the same question stem; remove it (literal match,
# not a regex) so that only the item wording itself is left
question_stem = (
    "For the following questions, please think of the story you just read.\n"
    "Indicate how much you agree or disagree with each of the following statements about the story. - "
)
items_descr = items_descr.str.replace(question_stem, "", regex=False)

In [4]:
# Column labels at positions 18-90 of the community export: the story scale items
story_items_cols = community_df.columns[18:91]

story_items_cols

Index(['story_scale_1', 'story_scale_2', 'story_scale_3', 'story_scale_4',
       'story_scale_5', 'story_scale_6', 'story_scale_7', 'story_scale_8',
       'story_scale_9', 'story_scale_10', 'story_scale_11', 'story_scale_12',
       'story_scale_13', 'story_scale_14', 'story_scale_15', 'story_scale_16',
       'story_scale_17', 'story_scale_18', 'story_scale_19', 'story_scale_20',
       'story_scale_21', 'story_scale_22', 'story_scale_23', 'story_scale_24',
       'story_scale_25', 'story_scale_26', 'story_scale_27', 'story_scale_28',
       'story_scale_29', 'story_scale_30', 'story_scale_31', 'story_scale_32',
       'story_scale_33', 'story_scale_34', 'story_scale_35', 'story_scale_36',
       'story_scale_37', 'story_scale_38', 'story_scale_39', 'story_scale_40',
       'story_scale_41', 'story_scale_42', 'story_scale_43', 'story_scale_44',
       'story_scale_45', 'story_scale_46', 'story_scale_47', 'story_scale_48',
       'story_scale_49', 'story_scale_50', 'story_scale_51', 

In [5]:
# Remove respondents who left every story item unanswered (both samples, in place)
for survey_frame in (community_df, panel_df):
    survey_frame.dropna(how="all", subset=story_items_cols, inplace=True)

In [6]:
# The question-order variable automatically records which story was shown,
# so it is renamed and used as the story id.
STORY_ORDER_COL = "Stories-Feb17,2022_DO"

# Prompt abbreviation (as embedded in the story id) -> full prompt name
prompt_rename_dict = {"HF": "High Fantasy", "HOR": "Horror",
                      "HR": "Historical Romance", "HSF": "Hard Sci-Fi"}


def add_story_labels(survey_df):
    """Return ``survey_df`` with a cleaned ``story_id`` and a ``prompt_label`` column.

    Parameters
    ----------
    survey_df : pandas.DataFrame
        Survey responses containing the question-order column
        (``STORY_ORDER_COL``) or an already renamed ``story_id`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    labelled = survey_df.rename(columns={STORY_ORDER_COL: "story_id"})

    # A few stories were repeated to make the design balanced;
    # cut the "_rep" marker from those ids (literal match, no regex needed)
    labelled["story_id"] = labelled["story_id"].str.replace(
        "_rep", "", regex=False)

    # expand=False returns a Series rather than a one-column DataFrame
    prompt_abbr = labelled["story_id"].str.extract(
        r"_(.*)_\d_?\d?", expand=False)

    # Assign the replaced values back instead of calling replace(inplace=True)
    # on a column selection: that chained form warns in recent pandas and has
    # no effect on the frame when Copy-on-Write is enabled.
    labelled["prompt_label"] = prompt_abbr.replace(prompt_rename_dict)
    return labelled


community_df = add_story_labels(community_df)
panel_df = add_story_labels(panel_df)

print("Initial case count")
print("Community sample: {}\nPanel sample: {}".format(community_df.shape[0],panel_df.shape[0]))

Initial case count
Community sample: 118
Panel sample: 97


In [7]:
# Attach the story information to every response, matched on story_id
story_df = (
    story_df[["Story_ID", "full_story", "prompt", "memory", "result"]]
    .rename(columns={"Story_ID": "story_id"})
)
story_lookup = story_df.set_index("story_id")

# Left joins: every response is kept, even if its story_id has no match
community_df = community_df.join(story_lookup, on="story_id", how="left")
panel_df = panel_df.join(story_lookup, on="story_id", how="left")

In [8]:
## Panel survey contained quality checks
# Label if participants passed those
def evaluate_qual_check_1(row):
    """Return True if the participant passed quality check 1, otherwise False.

    ``row`` is any mapping-like object (e.g. a DataFrame row) that provides
    "Qual_Check_1" (the chosen story opening) and "prompt_label" (the prompt
    of the story). The check is passed when the chosen opening is the one
    listed for that prompt.
    """
    # (accepted answer, prompt label it belongs to)
    accepted_pairs = (
        ('...someone working in a tavern.', "Historical Romance"),
        ('...a noise coming from a mirror.', "Horror"),
        ('...with a message from the president.', "Hard Sci-Fi"),
        ('...the description of a small village.', "High Fantasy"),
    )
    for accepted_answer, matching_prompt in accepted_pairs:
        if (row["Qual_Check_1"] == accepted_answer) and (row["prompt_label"] == matching_prompt):
            return True
    return False

# Quality check 1 - did the participant correctly identify the beginning of the story?
# astype(bool) guarantees a boolean column so it can be negated with ~ below
panel_df["pass_qual_1"] = panel_df.apply(
    evaluate_qual_check_1, axis=1).astype(bool)

# Quality check 2 - participants were asked to give one specific answer.
# Vectorised comparison; a missing answer compares unequal and therefore fails.
panel_df["pass_qual_2"] = panel_df["story_scale_74"].eq("Somewhat disagree")

# Failing quality check 2 marks a bad respondent either way.
# Failing check 1 while passing check 2 needs a closer, manual look.
mask_inspect_qual = (~panel_df["pass_qual_1"]) & panel_df["pass_qual_2"]

# sum() counts the True entries and is 0 when there are none;
# value_counts()[True] raised KeyError in that situation
inspect_count = int(mask_inspect_qual.sum())
print("{} cases need manual investigation. Displaying them now.".format(inspect_count))

# Final check column - starts as the result of quality check 2 (failed either way
# if check 2 was failed); the flagged cases are then overwritten by the manual verdict
panel_df["passed_manual_check"] = panel_df["pass_qual_2"]

for index,row in panel_df[mask_inspect_qual].iterrows():
    print("Participant said this story began with...")
    print(row["Qual_Check_1"], "\n")
    print("Actual story was:")
    print(row["full_story"], "\n\n")
    manual_qual_check = input("The participant was correct (yes/no): ")
    # Ignore surrounding whitespace and capitalisation so that e.g. "Yes " is not
    # silently counted as a failed check; anything other than "yes" means failed
    panel_df.at[index,'passed_manual_check'] = manual_qual_check.strip().lower() == "yes"

4 cases need manual investigation. Displaying them now.
Participant said this story began with...
...the description of a small village. 

Actual story was:
"I have a message for you from the president," said Dr. Sato, handing over an envelope to me. "He's asking that we meet with him at his office this afternoon." I took it and thanked her before walking out of my apartment building into the bright sun. It was already noon on Mars—the longest day in the year here on the planet. The air felt warm against my face as I walked down the street toward the presidential palace.
The streets were empty except for a few people going about their daily business. I saw one old woman pushing a cart filled with vegetables, another man carrying two large bags of grain across his shoulders, and a young couple holding hands while they walked past me. As I passed them, I could see the girl looking up at me curiously. She had dark brown skin like mine but wore a white dress with purple trim. Her hair was 

The participant was correct (yes/no): no
Participant said this story began with...
...someone waking up in a spaceship. 

Actual story was:
"I have a message for you from the president," said Dr. Sato, handing over an envelope to me. "He's asking that we meet with him at his office this afternoon." I took it and thanked her before walking out of my apartment building into the bright sun. It was already noon on Mars—the longest day in the year here on the planet. Today, however, would be shorter than usual because of the eclipse, which was due to happen later today. The next time we'd have a full solar eclipse on Mars, it would be in two years. I got into the aircar, which I'd driven to work all of one other time since moving here. There was a line of cars parked along the road waiting to take people across the crater to their homes or offices. I recognized some of the faces—some were crew who lived here, and others were students like me who had been recruited by the government. When yo

The participant was correct (yes/no): no


In [9]:
# Keep only the respondents that passed the quality checks.
# .copy() makes the filtered frame independent of the unfiltered one, so that
# adding columns to panel_df afterwards does not raise SettingWithCopyWarning.
mask_passed = panel_df["passed_manual_check"].eq(True)
panel_df = panel_df[mask_passed].copy()

print("\nCase count after deleting failed quality checks for panel data: {}".format(panel_df.shape[0]))


Case count after deleting failed quality checks for panel data: 64


In [10]:
# Tag every response with the sample it came from, then stack the community
# and the SurveySwap (panel) responses into a single frame
for survey_frame, sample_name in ((community_df, "Community"), (panel_df, "Panel")):
    survey_frame["sample"] = sample_name
combined_df = pd.concat([community_df, panel_df])

In [11]:
# Remove identifying or otherwise unnecessary columns (in place)
cols_to_delete = [
    'Status', 'IPAddress', 'Progress', 'Finished',
    'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
    'ExternalReference',
    'LocationLatitude', 'LocationLongitude',
    'DistributionChannel', 'UserLanguage',
    '1',  # column literally named "1"
]

combined_df.drop(cols_to_delete, axis="columns", inplace=True)

In [12]:
# Rename remaining columns to a more sensible and easier to use naming-scheme

# build mapping dict for renaming
# Survey metadata columns -> short snake_case names
rename_cols_dict_1 = {
    "StartDate": "start",
    "EndDate": "end",
    "Duration (in seconds)": "duration_in_sec",
    "RecordedDate": "recorded",
    "ResponseId": "response_id",
    "story_scale_DO": "tss_order",
    "Qual_Check_1": "qual_check_1",
    "Qual_Check_1_DO": "qual_check_1_order",
}

# The story_scale items are numbered consecutively; each run of items gets its
# own name stem: (stem, number of items), in item order
subscale_sizes = (
    ("coh", 12),    # story_scale_1  .. story_scale_12
    ("conch", 8),   # story_scale_13 .. story_scale_20
    ("cre", 12),    # story_scale_21 .. story_scale_32
    ("qua", 8),     # story_scale_33 .. story_scale_40
    ("rep", 12),    # story_scale_41 .. story_scale_52
    ("sty", 12),    # story_scale_53 .. story_scale_64
    ("pac", 9),     # story_scale_65 .. story_scale_73
)

# story_scale_<n> -> tss_<stem>_<position within the run>
rename_cols_dict_2 = {}
item_number = 0
for subscale, n_items in subscale_sizes:
    for position in range(1, n_items + 1):
        item_number += 1
        old_name = "story_scale_{}".format(item_number)
        rename_cols_dict_2[old_name] = "tss_{}_{}".format(subscale, position)

# Item 74 is the second quality check, not one of the numbered tss items
rename_cols_dict_2["story_scale_74"] = "qual_check_2"

# Give the item descriptions (a Series indexed by item name) the new item names
items_descr.rename(index=rename_cols_dict_2, inplace=True)

# Rename the combined frame's columns with both mappings in one pass
# (the two mappings share no keys, so merging them changes nothing)
combined_df.rename(
    columns={**rename_cols_dict_1, **rename_cols_dict_2}, inplace=True)

In [13]:
# We already have the prompt_label; now also create a preset_label.
# The preset abbreviation is the run of capital letters at the start of story_id.
# expand=False returns a Series rather than a one-column DataFrame.
preset_abbr = combined_df["story_id"].str.extract(
    r"^([A-Z]*)", expand=False)

# Rename preset abbreviation to full name
preset_rename_dict = {"ACE": "Ace of Spade", "ALL": "All-Nighter",
                      "BAS": "Basic Coherence", "FAN": "Fandango",
                      "GEN": "Genesis", "LOW": "Low Rider",
                      "MOR": "Morpho", "OUR": "Ouroboros"}
# Assign the replaced values back instead of calling replace(inplace=True) on a
# column selection: that chained form warns in recent pandas and has no effect
# on the frame when Copy-on-Write is enabled.
combined_df["preset_label"] = preset_abbr.replace(preset_rename_dict)

# Recode Likert responses to 1-5. The replacement runs over the whole frame,
# so every column holding these exact labels (qual_check_2 included) is recoded.
likert_recode_dict = {'Strongly disagree': 1, 'Somewhat disagree': 2,
                      'Neither agree nor disagree': 3,
                      'Somewhat agree': 4, 'Strongly agree': 5}
combined_df.replace(likert_recode_dict, inplace=True)

In [17]:
# Add word count per story (number of whitespace-separated tokens).
# .str.len() yields NaN for a missing story text (possible after the left join),
# whereas .apply(len) raised TypeError on such rows.
combined_df["word_count"] = combined_df["full_story"].str.split().str.len()

In [18]:
# reorder cols
# Column order of the final data set, assembled group by group
respondent_cols = ['response_id', 'story_id', 'prompt_label', 'preset_label', 'sample']

# tss items: (name stem, number of items) -> tss_<stem>_1 .. tss_<stem>_<n>
subscale_item_counts = (("coh", 12), ("conch", 8), ("cre", 12), ("qua", 8),
                        ("rep", 12), ("sty", 12), ("pac", 9))
item_cols = ["tss_{}_{}".format(subscale, item)
             for subscale, n_items in subscale_item_counts
             for item in range(1, n_items + 1)]

story_cols = ['full_story', 'word_count', 'prompt', 'memory', 'result']
timing_cols = ['start', 'end', 'duration_in_sec', 'recorded']
quality_cols = ['qual_check_1', 'qual_check_1_order', 'qual_check_2',
                'pass_qual_1', 'pass_qual_2']

cols_new_order = (respondent_cols + item_cols + story_cols + timing_cols
                  + ['tss_order'] + quality_cols)
# Keep exactly the listed columns, in the listed order; unlisted columns are dropped
combined_df = combined_df.loc[:, cols_new_order]

In [None]:
# The following code was an attempt to improve data quality by filtering out extreme response durations.
# This did not work too well and seems to filter out valid responses
# (probably responses from community members who opened the survey,
# closed it, then reopened and finished it at a later time).
# Importantly, the method failed to detect speeders.

# Archived here for now...

# # Sort out extreme long or short response times

# combined_df["duration_in_sec"] = pd.to_numeric(
#     combined_df["duration_in_sec"])

# %matplotlib inline

# fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# # Histograms with KDE
# ax1 = combined_df["duration_in_sec"].plot.hist(ax=axes[0], bins=15)
# ax1.set_xlabel("duration_in_sec")
# combined_df["duration_in_sec"].plot.kde(ax=axes[0], secondary_y=True)

# # Boxplots
# combined_df["duration_in_sec"].plot.box(ax=axes[1])

# fig.tight_layout()
# plt.show()

In [None]:
# # Use Tukey's rule for extreme outliers to sort out extreme response times (no more than IQRx3 from Q1 or Q3)
# # NOTE: as written below, the lower fence uses IQR*1.5 while the upper fence uses IQR*3
# def detect_outliers(df, check_col_label):
#     q1 = df[check_col_label].quantile(0.25)
#     q3 = df[check_col_label].quantile(0.75)
#     iqr = q3-q1
#     outlier_lower = q1 - (iqr*1.5)
#     outlier_upper = q3 + (iqr*3)
#     print("25th Percentile (Q1): {}\n75th Percentile (Q3): {}\nIQR: {}".format(q1, q3, iqr))
#     print("will detect outliers with values lower than {} or higher than {}".format(
#         outlier_lower, outlier_upper))

#     out_series = df[check_col_label].apply(lambda x: True if ((x < outlier_lower) or (x > outlier_upper)) else False)
#     count_out = df[out_series == True].shape[0]
    
#     print("\n{} Outliers".format(count_out))
#     if count_out > 0:
#         print(df[check_col_label][out_series == True])
    
#     return(out_series, count_out)

In [None]:
# # Sort out outliers until check comes up with no extreme outliers

# combined_df["Duration (in seconds)"] = pd.to_numeric(combined_df["Duration (in seconds)"])
# count_outliers = 99

# while count_outliers > 0:
#     out_result = detect_outliers(combined_df, "Duration (in seconds)")
#     count_outliers = out_result[1]
#     out_series = out_result[0]

#     mask_good_resp = out_series != True
#     combined_df = combined_df[mask_good_resp]

In [None]:
# # Plot new duration distribution
# %matplotlib inline

# fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# # Histograms with KDE
# ax1 = combined_df["duration_in_sec"].plot.hist(ax=axes[0], bins=15)
# ax1.set_xlabel("duration_in_sec")
# combined_df["duration_in_sec"].plot.kde(ax=axes[0], secondary_y=True)

# # Boxplots
# combined_df["duration_in_sec"].plot.box(ax=axes[1])

# fig.tight_layout()
# plt.show()

In [19]:
# Final number of cases per sample and overall
n_community = (combined_df["sample"] == "Community").sum()
n_panel = (combined_df["sample"] == "Panel").sum()

print("Case count after cleaning:")
print("Community sample: {}\nPanel Sample: {}\nTotal Sample: {}".format(
    n_community, n_panel, len(combined_df)))

Case count after cleaning:
Community sample: 118
Panel Sample: 64
Total Sample: 182


In [20]:
# Write the cleaned responses and the item descriptions to CSV
# (default to_csv settings, so the index is written as the first column)
for table, destination in (
    (combined_df, "data/combined_data.csv"),
    (items_descr, "data/description_items.csv"),
):
    table.to_csv(destination)