In [109]:
import pandas as pd
import numpy as np

# Column layout of the long-format table: one row per participant x trial.
columns = [
    "Study ID", "Participant ID", "Native", "Designer",
    "Test ID", "Test type", "Trial ID",
    "Font", "Sample", "Category",
    "Response", "Correct", "Seen", "Foil", "Response time",
    "JoM", "JoL", "Date",
]


def _parse_response(test_type, fields):
    """Map the comma-separated fields of one trial cell onto column names.

    `fields` are the already-stripped parts of a raw cell. The first field is
    always the font and the last two are the response and the response time.
    The fields in between depend on the (legacy) export format, which is
    recognised by the number of fields:

    lexical trials
        4 fields: font, sample, response, time
        otherwise: font, category, sample, response, time
    other (recognition) trials
        5 fields: font, sample, seen, response, time
        6 fields: font, category, sample, seen, response, time
        otherwise: font, category, sample, seen, foil, response, time

    Returns a dict holding only the columns present in this format; columns
    that are absent keep whatever default the caller supplies.
    """
    parsed = {
        "Font": fields[0],
        "Response": fields[-2],
        "Response time": fields[-1],
    }
    if test_type == "lexical":
        if len(fields) == 4:
            parsed["Sample"] = fields[1]
        else:
            parsed["Category"] = fields[1]
            parsed["Sample"] = fields[2]
    elif len(fields) == 5:
        parsed["Sample"] = fields[1]
        parsed["Seen"] = fields[2]
    else:
        parsed["Category"] = fields[1]
        parsed["Sample"] = fields[2]
        parsed["Seen"] = fields[3]
        if len(fields) != 6:
            parsed["Foil"] = fields[4]
    return parsed


raw = pd.read_csv("data.csv")

# Collect all trial rows first and build the frame once at the end; appending
# to a DataFrame row by row re-allocates it on every insert.
records = []
for i, rraw in raw.iterrows():
    # values shared by every trial of this participant
    participant = dict.fromkeys(columns, np.nan)
    participant["Study ID"] = "Pilot"
    participant["Participant ID"] = i
    # older exports name these two columns "Fluent" / "Design_skills"
    participant["Native"] = rraw["Native"] if "Native" in rraw else rraw["Fluent"]
    participant["Designer"] = rraw["Designer"] if "Designer" in rraw else rraw["Design_skills"]
    # the last column of the raw export is stored as the date
    # (.iloc because integer keys on a label-indexed Series are deprecated)
    participant["Date"] = rraw.iloc[-1]

    # save results for individual trials in rows
    for c in rraw.index:
        if not c.startswith("test_") or c.endswith(("_remember", "_legibility")):
            continue
        # e.g. test_1_lexical_5 -> Test ID 1, Test type lexical, Trial ID 5
        rd = dict(participant)  # fresh copy, so nothing leaks between trials
        _, rd["Test ID"], rd["Test type"], rd["Trial ID"] = c.strip().split("_")
        fields = [part.strip() for part in rraw[c].strip().split(",")]
        rd.update(_parse_response(rd["Test type"], fields))

        # fix legacy values
        if isinstance(rd["Category"], str):
            rd["Category"] = rd["Category"].replace("nonword", "non-word")
        rd["Response"] = rd["Response"].replace("non-seen", "not seen")

        # add the judgement of learning for this part (from test_1_remember)
        rd["JoM"] = rraw["test_%s_remember" % rd["Test ID"]]
        # add the judgement of legibility for this part (from test_1_legibility)
        rd["JoL"] = rraw["test_%s_legibility" % rd["Test ID"]]

        records.append(rd)

# dtype=object keeps every column generic, so string and numeric values can
# be filled in later without dtype upcasting.
d = pd.DataFrame(records, columns=columns, dtype=object)
display(d.head(10))

Unnamed: 0,Study ID,Participant ID,Native,Designer,Test ID,Test type,Trial ID,Font,Sample,Category,Response,Correct,Seen,Foil,Response time,JoM,JoL,Date
0,Pilot,0,yes,Non-designer,1,lexical,1,sansforgetica,neeld,non-word,Sure word,,,,2744,56,very easy to read,25-03-2019 19:37
1,Pilot,0,yes,Non-designer,1,lexical,2,sansforgetica,inverstand,non-word,Sure word,,,,841,56,very easy to read,25-03-2019 19:37
2,Pilot,0,yes,Non-designer,1,lexical,3,sansforgetica,sovess,non-word,Sure word,,,,527,56,very easy to read,25-03-2019 19:37
3,Pilot,0,yes,Non-designer,1,lexical,4,sansforgetica,guilty,word,Sure word,,,,536,56,very easy to read,25-03-2019 19:37
4,Pilot,0,yes,Non-designer,1,lexical,5,sansforgetica,actions,word,Sure word,,,,511,56,very easy to read,25-03-2019 19:37
5,Pilot,0,yes,Non-designer,1,lexical,6,sansforgetica,earth,word,Sure word,,,,481,56,very easy to read,25-03-2019 19:37
6,Pilot,0,yes,Non-designer,1,lexical,7,sansforgetica,partnestion,non-word,Sure word,,,,504,56,very easy to read,25-03-2019 19:37
7,Pilot,0,yes,Non-designer,1,lexical,8,sansforgetica,surcenters,non-word,Sure word,,,,512,56,very easy to read,25-03-2019 19:37
8,Pilot,0,yes,Non-designer,1,lexical,9,sansforgetica,status,word,Sure word,,,,496,56,very easy to read,25-03-2019 19:37
9,Pilot,0,yes,Non-designer,1,lexical,10,sansforgetica,miney,non-word,Sure word,,,,489,56,very easy to read,25-03-2019 19:37


In [110]:
# add missing data & evaluate responses

# Look-up of sample -> category ("word" / "non-word"), read from the word
# lists words.txt and non-words.txt (one sample per line).
categories = {}
for cat in ["words", "non-words"]:
    with open(cat + ".txt") as f:
        for w in f:
            categories[w.strip()] = cat[:-1] # remove the final "s"

# Graded score of a response, expressed relative to the "positive" answer
# (word / seen):
#   0 (totally incorrect)
#   1/3 (probably incorrect)
#   2/3 (probably correct)
#   1 (totally correct)
LEXICAL_SCORES = {
    "Sure word": 1,
    "Probably word": 2 / 3,
    "Probably non-word": 1 / 3,
    "Sure non-word": 0,
}
RECOGNITION_SCORES = {
    "Sure seen": 1,
    "Probably seen": 2 / 3,
    "Probably not seen": 1 / 3,
    "Sure not seen": 0,
}

for idx, rd in d.iterrows():
    # NOTE: the rows yielded by iterrows() may be copies, so every result is
    # written back through d.at[...] rather than into `rd`.
    category = rd["Category"]
    if pd.isna(category):
        # get missing category
        category = categories[rd["Sample"]]
        d.at[idx, "Category"] = category

    if rd["Test type"] == "lexical":
        scores, column, truth = LEXICAL_SCORES, "Category", category
        positive, negative = "word", "non-word"
    else: # recognition
        scores, column, truth = RECOGNITION_SCORES, "Seen", rd["Seen"]
        positive, negative = "seen", "non-seen"

    # an unrecognised response leaves the score undefined (NaN)
    correct = scores.get(rd["Response"], np.nan)
    if truth == negative:
        # the positive answer was the wrong one, so invert the score
        correct = 1 - correct
    elif truth != positive:
        # the score is kept as-is, but the row is reported for inspection
        print("Error! Row %s: unexpected %s value %r" % (idx, column, truth))
    d.at[idx, "Correct"] = correct

# save the processed data
d.to_csv("data-processed.csv")