# Convert and evaluate raw data

Basic processing to convert data from its raw form
returned by the website to a format useful for statistical analysis.

In [1]:
import glob
import numpy as np
import pandas as pd

# Column layout of the stats-ready table (one row per trial), built up
# group by group; the resulting order is the order of the output columns.
columns = (
    # values shared by all trials of one participant
    ["StudyID", "ParticipantID", "Fluent", "Training"]
    # taken from the raw column name
    + ["TestID", "Type", "TrialID"]
    # stimulus shown in the trial
    + ["Font", "Sample", "Category"]
    # outcome of the trial
    + ["Response", "Correct", "Seen", "Foil", "RT", "RTnorm"]
    # per-test judgements and the date
    + ["JoM", "JoL", "Date"]
)
# empty frame that the conversion step fills in
d = pd.DataFrame(columns=columns)

## Convert data from the raw format to stats-ready format

The raw format has all responses from one participant in a single row.
This step breaks down the results for individual trials (saved in columns like “test_1_lexical_5”)
and saves them as individual rows.

It also deals with some minor differences in format, as the raw format evolved over time.

In [6]:
# Convert every raw participant row into one record per trial.
#
# Records are collected in a plain list and turned into a DataFrame once at
# the end: appending to a DataFrame one row at a time copies the data on
# every insert, which makes the conversion needlessly slow.


def _parse_response(trial_type, response):
    """Map the fields of one raw trial value onto output column names.

    `response` is the list of comma-separated fields stored in a column
    such as test_1_lexical_5. The first field is always the font and the
    last two are the response and the RT; the fields in between depend on
    the trial type and on how many fields the (legacy) format provided:

    lexical trials
        4 fields:   font, sample, response, RT
        otherwise:  font, category, sample, ..., response, RT
    all other trial types
        5 fields:   font, sample, seen, response, RT
        6 fields:   font, category, sample, seen, response, RT
        otherwise:  font, category, sample, seen, foil, ..., response, RT

    Returns a dict with the keys Font, Sample, Category, Seen, Foil,
    Response and RT; fields that a format does not provide are NaN.
    Raises IndexError or ValueError when fields are missing or the RT
    field is not a number.
    """
    fields = [field.strip() for field in response]
    parsed = {
        "Font": fields[0],
        "Response": fields[-2],
        "RT": float(fields[-1]),
        # defaults for values the legacy formats did not record
        "Category": np.nan,
        "Seen": np.nan,
        "Foil": np.nan,
    }
    if trial_type == "lexical":
        if len(fields) == 4:
            parsed["Sample"] = fields[1]
        else:
            parsed["Category"] = fields[1]
            parsed["Sample"] = fields[2]
    elif len(fields) == 5:
        parsed["Sample"] = fields[1]
        parsed["Seen"] = fields[2]
    else:
        parsed["Category"] = fields[1]
        parsed["Sample"] = fields[2]
        parsed["Seen"] = fields[3]
        if len(fields) != 6:
            parsed["Foil"] = fields[4]
    # fix legacy values
    if isinstance(parsed["Category"], str):
        parsed["Category"] = parsed["Category"].replace("nonword", "non-word")
    if isinstance(parsed["Seen"], str):
        parsed["Seen"] = parsed["Seen"].replace("non-seen", "not seen")
    parsed["Response"] = parsed["Response"].replace("non-seen", "not seen")
    return parsed


# output columns, in the order defined when `d` was set up
out_columns = list(d.columns)
# one dict per trial
records = []
# participant counter (Participant ID), running across all files
pid = 0
# glob returns files in file-system order; sort them so that the
# ParticipantID assigned to each raw row is the same on every run
for fn in sorted(glob.glob("data__*.csv")):
    raw = pd.read_csv(fn)
    for i, rraw in raw.iterrows():
        # collect data that will be shared across all rows
        # for one participant; anything not set below stays NaN
        shared = dict.fromkeys(out_columns, np.nan)
        shared["StudyID"] = rraw["studyid"] if "studyid" in rraw else 0  # 0 = pilot study
        shared["ParticipantID"] = pid
        if "Fluent" in rraw:
            shared["Fluent"] = rraw["Fluent"]
        # deal with legacy column names
        if "Native" in rraw:
            shared["Fluent"] = rraw["Native"]
        if "Designer" in rraw:
            shared["Training"] = rraw["Designer"]
        if "Design_skills" in rraw:
            shared["Training"] = rraw["Design_skills"]
        for c in rraw.index:
            # get values from columns like this: test_1_lexical_5
            # ignore values from columns like this: test_1_remember
            # or test_1_legibility
            if not c.startswith("test_") or c.endswith(("_remember", "_legibility")):
                continue
            # start from an independent copy of the shared data
            rd = dict(shared)
            # get Test ID, Type, and Trial ID from the column name
            _, rd["TestID"], rd["Type"], rd["TrialID"] = c.strip().split("_")
            # get the response fields from the value in this column
            rd.update(_parse_response(rd["Type"], rraw[c].strip().split(",")))
            # add the judgement stored for this test
            # in column test_<TestID>_remember
            rd["JoM"] = rraw["test_%s_remember" % rd["TestID"]]
            # add the judgement of legibility for this test
            # from column test_<TestID>_legibility
            rd["JoL"] = rraw["test_%s_legibility" % rd["TestID"]]
            # the date is taken from the last column of the raw row;
            # use .iloc because the row is indexed by column name
            rd["Date"] = rraw.iloc[-1]
            records.append(rd)
        pid += 1

# build the table in one go; rows are labelled 0..n-1
d = pd.DataFrame(records, columns=out_columns)
# running row counter, kept for any later cell that reads it
x = len(records)
# fix types
d["StudyID"] = d["StudyID"].astype(int)
d["ParticipantID"] = d["ParticipantID"].astype(int)
# add normalized RT (natural logarithm of RT)
d["RTnorm"] = np.log(d["RT"].astype(float))

print("Processed %d responses from %d participants." % (len(d), pid))

Processed 15768 responses from 219 participants.


## Add missing data & evaluate responses

Also add response time (RT) transformed using natural logarithm.

In [3]:
# Fill in missing categories and score every response.
#
# The work is done column by column instead of reading and writing back
# one row at a time, and the cell can be re-run safely: running it again
# on an already processed `d` gives the same result.

# get a list of words and non-words from txt files used for the website
# and map them to their category names (word, non-word)
categories = {}
for cat in ["words", "non-words"]:
    with open(cat + ".txt") as f:
        for w in f:
            categories[w.strip()] = cat[:-1]  # remove the final "s"


def _as_fluent_flag(value):
    """Return True for the raw answer "yes", False for anything else.

    Values that are already booleans are passed through unchanged, so
    converting the column a second time does not reset it to False.
    """
    if isinstance(value, (bool, np.bool_)):
        return bool(value)
    return value == "yes"


def _score_response(trial_type, response, category, seen):
    """Return 1 when the response matches the expected answer, else 0.

    The expected answer is the category for "lexical" trials and the
    seen value for "recognition" trials; both the "Sure ..." and the
    "Probably ..." variant of it count as correct. Any other trial type
    is scored 0.
    NOTE(review): a recognition trial without a seen value raises
    TypeError here (string + NaN) — confirm that cannot occur in the data.
    """
    if trial_type == "lexical":
        expected = category
    elif trial_type == "recognition":
        expected = seen
    else:
        return 0
    return int(response in ("Sure " + expected, "Probably " + expected))


# convert string "yes" to boolean
d["Fluent"] = d["Fluent"].map(_as_fluent_flag).astype(bool)

# assign the correct category where it is missing; a sample that is in
# neither word list raises KeyError rather than being silently skipped
d["Category"] = [
    categories[sample] if pd.isna(category) else category
    for sample, category in zip(d["Sample"], d["Category"])
]

# set Correct to 1 when the participant said sure or probably
# about the right answer, set to zero otherwise
d["Correct"] = [
    _score_response(trial_type, response, category, seen)
    for trial_type, response, category, seen in zip(
        d["Type"], d["Response"], d["Category"], d["Seen"]
    )
]

# add normalized RT (natural logarithm of RT)
d["RTnorm"] = np.log(d["RT"].astype(float))

In [5]:
# write the processed table to a CSV file in the working directory
d.to_csv(path_or_buf="data_processed.csv")
print("Successfully saved to a CSV file.")

Successfully saved to a CSV file.
