In [11]:
import pandas as pd
import numpy as np

# Column layout of the per-trial table, assembled from groups of related fields.
_participant_cols = ["Study ID", "Participant ID", "Native", "Designer"]
_trial_cols = ["Test ID", "Test type", "Trial ID"]
_stimulus_cols = ["Font", "Sample", "Category"]
_response_cols = ["Response", "Correct", "Seen", "Foil", "Response time"]
_judgement_and_date_cols = ["JoM", "JoL", "Date"]

columns = (
    _participant_cols
    + _trial_cols
    + _stimulus_cols
    + _response_cols
    + _judgement_and_date_cols
)

# Empty frame with that layout; it only fixes the column names and their order.
d = pd.DataFrame(columns=columns)

raw = pd.read_csv("data.csv")

# Reshape the wide export (one row per participant, one test_<id>_<type>_<trial>
# column per trial) into one record per trial. The records are collected in a
# list and turned into a DataFrame once at the end: appending with
# d.loc[x] = ... makes pandas rebuild the growing frame on every insert.
records = []
for i, rraw in raw.iterrows():
    # values shared by every trial of this participant
    participant = {
        "Study ID": rraw["studyid"] if "studyid" in rraw else 0,  # 0 = pilot study
        "Participant ID": i,
        # exports without "Native" / "Designer" use the legacy column names
        "Native": rraw["Native"] if "Native" in rraw else rraw["Fluent"],
        "Designer": rraw["Designer"] if "Designer" in rraw else rraw["Design_skills"],
        # the date is taken from the last column of the export; .iloc is needed
        # because rraw[-1] is positional access on a label-indexed Series, which
        # pandas has deprecated
        "Date": rraw.iloc[-1],
    }

    # save results for individual trials in rows
    for c in rraw.index:
        # trial columns look like test_1_lexical_5; test_<id>_remember and
        # test_<id>_legibility hold per-test ratings and are read further down
        if not c.startswith("test_") or c.endswith(("_remember", "_legibility")):
            continue

        rd = dict(participant)
        _, rd["Test ID"], rd["Test type"], rd["Trial ID"] = c.strip().split("_")
        response = [field.strip() for field in rraw[c].strip().split(",")]

        # The font is always the first field; response and response time are
        # always the last two.
        rd["Font"] = response[0]
        rd["Response"] = response[-2].replace("non-seen", "not seen")  # fix legacy value
        rd["Response time"] = response[-1]

        # The fields in between depend on the test type and on how many fields
        # the (possibly legacy) export wrote. Fields that are absent stay NaN.
        category, seen, foil = np.nan, np.nan, np.nan
        if rd["Test type"] == "lexical":
            if len(response) == 4:
                sample = response[1]
            else:
                category, sample = response[1], response[2]
        elif len(response) == 5:
            sample, seen = response[1], response[2]
        elif len(response) == 6:
            category, sample, seen = response[1], response[2], response[3]
        else:
            category, sample, seen, foil = response[1], response[2], response[3], response[4]

        # fix legacy values
        if isinstance(category, str):
            category = category.replace("nonword", "non-word")
        if isinstance(seen, str):
            seen = seen.replace("non-seen", "not seen")

        rd["Sample"], rd["Category"], rd["Seen"], rd["Foil"] = sample, category, seen, foil

        # per-test ratings: JoM comes from test_<id>_remember,
        # JoL from test_<id>_legibility
        rd["JoM"] = rraw["test_%s_remember" % rd["Test ID"]]
        rd["JoL"] = rraw["test_%s_legibility" % rd["Test ID"]]

        records.append(rd)

# "Correct" is not set by any record, so it is created as an all-NaN column.
# dtype=object keeps every value exactly as parsed (IDs are not coerced to
# float) and lets later code write strings or booleans into any column.
d = pd.DataFrame(records, columns=columns, dtype=object)
display(d.tail(10))

Unnamed: 0,Study ID,Participant ID,Native,Designer,Test ID,Test type,Trial ID,Font,Sample,Category,Response,Correct,Seen,Foil,Response time,JoM,JoL,Date
1502,0.0,20.0,yes,Non-designer,2,recognition,7,inputsans,arges,,Probably seen,,not seen,,2121,52,easy to read,28-01-2019 10:59
1503,0.0,20.0,yes,Non-designer,2,recognition,8,inputsans,insumn,,Sure seen,,seen,,2663,52,easy to read,28-01-2019 10:59
1504,0.0,20.0,yes,Non-designer,2,recognition,9,inputsans,relatric,,Sure seen,,seen,,1607,52,easy to read,28-01-2019 10:59
1505,0.0,20.0,yes,Non-designer,2,recognition,10,inputsans,splectore,,Probably not seen,,seen,,2105,52,easy to read,28-01-2019 10:59
1506,0.0,20.0,yes,Non-designer,2,recognition,11,inputsans,mutgresy,,Sure not seen,,not seen,,2208,52,easy to read,28-01-2019 10:59
1507,0.0,20.0,yes,Non-designer,2,recognition,12,inputsans,gatimasher,,Probably seen,,seen,,2601,52,easy to read,28-01-2019 10:59
1508,0.0,20.0,yes,Non-designer,2,recognition,13,inputsans,physical,,Sure seen,,seen,,1455,52,easy to read,28-01-2019 10:59
1509,0.0,20.0,yes,Non-designer,2,recognition,14,inputsans,discritunds,,Probably not seen,,not seen,,2161,52,easy to read,28-01-2019 10:59
1510,0.0,20.0,yes,Non-designer,2,recognition,15,inputsans,menerwary,,Probably not seen,,not seen,,2607,52,easy to read,28-01-2019 10:59
1511,0.0,20.0,yes,Non-designer,2,recognition,16,inputsans,fighting,,Sure not seen,,not seen,,2184,52,easy to read,28-01-2019 10:59


In [17]:
# add missing data & evaluate responses

# Lookup table sample -> "word" / "non-word", read from words.txt and non-words.txt
categories = {}
for cat in ["words", "non-words"]:
    with open(cat + ".txt") as f:
        for line in f:
            word = line.strip()
            if word:  # ignore blank lines
                categories[word] = cat[:-1] # remove the final "s"

# "yes" -> True, everything else -> False. Values that are already True are
# kept, so running this cell a second time does not turn the column all-False.
native = d["Native"]
d["Native"] = native.eq("yes") | native.eq(True)

# get missing category from the word lists
category = d["Category"].where(d["Category"].notna(), d["Sample"].map(categories))
unknown_samples = d.loc[category.isna(), "Sample"]
if not unknown_samples.empty:
    raise KeyError(
        "samples missing from words.txt / non-words.txt: %s"
        % sorted(set(unknown_samples.astype(str)))
    )
d["Category"] = category

# evaluate responses: a lexical trial is judged against the sample's category,
# any other trial (recognition) against its "Seen" value
expected = d["Category"].where(d["Test type"] == "lexical", d["Seen"])
if expected.isna().any():
    raise ValueError(
        "trials without a value to evaluate the response against: rows %s"
        % list(d.index[expected.isna()])
    )

# a response is correct when it names the expected value, whether the
# participant answered "Sure ..." or "Probably ..."
response = d["Response"]
d["Correct"] = (
    response.eq("Sure " + expected) | response.eq("Probably " + expected)
).astype(int)

display(d.tail(10))

# save the processed data
d.to_csv("data-processed.csv")

Unnamed: 0,Study ID,Participant ID,Native,Designer,Test ID,Test type,Trial ID,Font,Sample,Category,Response,Correct,Seen,Foil,Response time,JoM,JoL,Date
1502,0.0,20.0,True,Non-designer,2,recognition,7,inputsans,arges,non-word,Probably seen,0.0,not seen,,2121,52,easy to read,28-01-2019 10:59
1503,0.0,20.0,True,Non-designer,2,recognition,8,inputsans,insumn,non-word,Sure seen,1.0,seen,,2663,52,easy to read,28-01-2019 10:59
1504,0.0,20.0,True,Non-designer,2,recognition,9,inputsans,relatric,non-word,Sure seen,1.0,seen,,1607,52,easy to read,28-01-2019 10:59
1505,0.0,20.0,True,Non-designer,2,recognition,10,inputsans,splectore,non-word,Probably not seen,0.0,seen,,2105,52,easy to read,28-01-2019 10:59
1506,0.0,20.0,True,Non-designer,2,recognition,11,inputsans,mutgresy,non-word,Sure not seen,1.0,not seen,,2208,52,easy to read,28-01-2019 10:59
1507,0.0,20.0,True,Non-designer,2,recognition,12,inputsans,gatimasher,non-word,Probably seen,1.0,seen,,2601,52,easy to read,28-01-2019 10:59
1508,0.0,20.0,True,Non-designer,2,recognition,13,inputsans,physical,word,Sure seen,1.0,seen,,1455,52,easy to read,28-01-2019 10:59
1509,0.0,20.0,True,Non-designer,2,recognition,14,inputsans,discritunds,non-word,Probably not seen,1.0,not seen,,2161,52,easy to read,28-01-2019 10:59
1510,0.0,20.0,True,Non-designer,2,recognition,15,inputsans,menerwary,non-word,Probably not seen,1.0,not seen,,2607,52,easy to read,28-01-2019 10:59
1511,0.0,20.0,True,Non-designer,2,recognition,16,inputsans,fighting,word,Sure not seen,1.0,not seen,,2184,52,easy to read,28-01-2019 10:59


array([nan, 'not seen', 'seen'], dtype=object)