In [1]:
import polars as pl
import sys

sys.path.append("../../")
from src.utils import *

In [2]:
pl.Config.load_from_file("../../polars_cfg.json")

<polars.config.Config at 0x104be8b30>

In [3]:
all_data_path = "../../data/clean_all_data_merged.csv"
errors_path = "../../data/clean_errors_merged.csv"
feedback_path = "../../data/clean_feedback_data_merged.csv"
incomplete_path = "../../data/clean_incomplete_data_merged.csv"
summary_path = "../../data/clean_summary_data_merged.csv"
demographics_path = "../../data/clean_demographics_merged.csv"

df_all = pl.read_csv(all_data_path)
df_errors = pl.read_csv(errors_path)
df_feedback = pl.read_csv(feedback_path)
df_incomplete = pl.read_csv(incomplete_path)
df_summary = pl.read_csv(summary_path)

In [4]:
reordered_columns = [
    # Experiment metadata
    "exp_name",
    "task_type",
    # Identifiers and metadata
    "hashed_id",
    "joint_id_task",
    # Task-related information
    "task_name",
    "task_number",
    "is_tutorial",
    # Time-related information
    "time",
    # Task performance
    "attempt_number",
    "num_actions",
    "solved",
    "done",
    # Input and output data
    "test_input_grid",
    "test_input_size_x",
    "test_input_size_y",
    "test_output_grid",
    "test_output_size_x",
    "test_output_size_y",
    # User actions
    "action",
    "action_x",
    "action_y",
    "select_loc",
    "selected_data",
    "selected_symbol",
    "selected_tool",
    "copy_paste_data",
    # User solutions
    "first_written_solution",
    "last_written_solution",
    # Withdrawal information
    "withdraw",
    "withdraw_reason",
    "withdraw_comment",
    # Demographic information
    "age",
    "gender",
    "race",
    "education_level",
    "household_income",
    "normal_vision",
    "color_blind",
    "fluent_english",
]

In [5]:
def standardize_test_input_size(df):
    df = (
        df.with_columns(
            pl.col("test_input_size")
            .str.strip_chars("[]")
            .str.split(",")
            .list.to_struct(fields=["test_input_size_x", "test_input_size_y"])
        )
        .unnest("test_input_size")
        .with_columns(
            [
                pl.col("test_input_size_x").str.strip_chars(" ").cast(pl.Int32),
                pl.col("test_input_size_y").str.strip_chars(" ").cast(pl.Int32),
            ]
        )
    )
    df = (
        df.with_columns(
            pl.col("test_output_size")
            .map_elements(lambda x: eval(x), return_dtype=pl.List(pl.String))
            .list.to_struct(fields=["test_output_size_x", "test_output_size_y"])
        )
        .unnest("test_output_size")
        .with_columns(
            [
                pl.col("test_output_size_x").str.strip_chars(" ").cast(pl.Int32),
                pl.col("test_output_size_y").str.strip_chars(" ").cast(pl.Int32),
            ]
        )
    )
    return df

In [6]:
df_incomplete = standardize_test_input_size(df_incomplete)
df_all = standardize_test_input_size(df_all)

In [7]:
# output time in standard format
df_all = df_all.with_columns(
    pl.col("time").map_elements(parse_mixed_datetime, return_dtype=pl.Datetime),
)
df_incomplete = df_incomplete.with_columns(
    pl.col("time").map_elements(parse_mixed_datetime, return_dtype=pl.Datetime),
)

In [8]:
df_incomplete = (
    df_incomplete.select(reordered_columns)
    .with_columns(
        pl.when(pl.col("withdraw").str.to_lowercase() == "true")
        .then(True)
        .when(pl.col("withdraw").str.to_lowercase() == "false")
        .then(False)
        .otherwise(None)
        .alias("withdraw")
        .cast(pl.Boolean),
        pl.col("age").cast(pl.Float32),
    )
    .with_columns(pl.lit(False).alias("complete"))
)
df_all = (
    df_all.select(reordered_columns)
    .with_columns(pl.col("withdraw").cast(pl.Boolean), pl.col("age").cast(pl.Float32))
    .with_columns(pl.lit(True).alias("complete"))
)

Got rid of:

- feedback
- indexOf
- phase
- template
- templates
- viewTime
- tutorial_response
- beginHit
- endHit


In [9]:
df_all.write_csv("../../data/clean_data.csv")
df_incomplete.write_csv("../../data/clean_data_incomplete.csv")

Clean errors


In [10]:
df_errors.head()

task_name,test_output_grid,hashed_output_grid,exp_name,task_type,count
str,str,str,str,str,i64
"""025d127b.json""","""|0000000000|0044444400|0040000040|0004000004|…","""dc6dff245394753e3001d855fff1ec09""","""expv0""","""training""",1
"""025d127b.json""","""|0000000000|0044444400|0040000040|0004000004|…","""da376bc568b03d7f5702d4ebb8e4f0ed""","""expv0""","""training""",2
"""025d127b.json""","""|0000000000|0044444400|0040000040|0040000004|…","""15768d2cdee1eba0a55d9d400d9037ad""","""expv0""","""training""",3
"""025d127b.json""","""|0000000000|0444444000|0400000400|0040000040|…","""e75d4ea97fffc3d9801520e1d724ff4c""","""expv0""","""training""",1
"""025d127b.json""","""|0000000000|4444440000|4000004000|0400000400|…","""6e8528c1fd343e31980688552cae311b""","""expv0""","""training""",1


In [11]:
df_errors = (
    df_errors.group_by(["task_name", "test_output_grid"])
    .agg(
        pl.first("hashed_output_grid"),
        pl.first("task_type"),
        pl.sum("count").alias("count"),
    )
    .sort("count", descending=True)
)

In [12]:
df_errors.write_csv("../../data/clean_errors.csv")

In [13]:
df_errors_incomplete = get_errors(df_incomplete)
df_errors_incomplete.select(df_errors.columns).head()

task_name,test_output_grid,hashed_output_grid,task_type,count
str,str,str,str,u32
"""25094a63.json""","""|683338183881631231113122682386|1668813363882…","""044024c693f80e3993a94ec260b4aa83""","""evaluation""",1
"""9b4c17c4.json""","""|11111188888|11111182888|11111188888|11112288…","""2578d66c85002b64a79ad7d66f55c74a""","""evaluation""",5
"""0a2355a6.json""","""|00000000000000000|00888800000002220|00800800…","""2bf8e14b411bd9dad27708d7bd19b318""","""evaluation""",1
"""b457fec5.json""","""|00000000000000000000000|00000013420000000111…","""d00959947318b386cf23b73d6d145f49""","""evaluation""",1
"""c62e2108.json""","""|000000000400400000000000|0000000004004000000…","""3a034462aa6b1293509f43c0aa9aafa3""","""evaluation""",1


In [14]:
df_errors_incomplete.write_csv("../../data/clean_errors_incomplete.csv")

Clean summary


In [15]:
df_summary.head()

hashed_id,task_name,joint_id_task,task_number,attempt_number,solved,test_output_grid,first_written_solution,last_written_solution,num_actions,exp_name,condition,task_type
str,str,str,i64,i64,bool,str,str,str,i64,str,str,str
"""195f0749824286cb4b24f431004a3b87""","""b8cdaf2b.json""","""195f0749824286cb4b24f431004a3b87_b8cdaf2b.jso…",1,1,True,"""|000000000|000000000|000000000|000000000|2000…","""To add the opposite color diagonally""","""To add the opposite color diagonally""",20,"""expv0""",,"""training"""
"""195f0749824286cb4b24f431004a3b87""","""a87f7484.json""","""195f0749824286cb4b24f431004a3b87_a87f7484.jso…",2,1,True,"""|606|660|606|""","""To duplicate the odd one""","""To duplicate the odd one""",10,"""expv0""",,"""training"""
"""195f0749824286cb4b24f431004a3b87""","""6773b310.json""","""195f0749824286cb4b24f431004a3b87_6773b310.jso…",4,1,False,"""|006|600|006|""","""I have no idea""","""Lines across that has only 1 colored square i…",8,"""expv0""",,"""training"""
"""195f0749824286cb4b24f431004a3b87""","""6773b310.json""","""195f0749824286cb4b24f431004a3b87_6773b310.jso…",4,2,False,"""|006|066|600|""","""I have no idea""","""Lines across that has only 1 colored square i…",15,"""expv0""",,"""training"""
"""195f0749824286cb4b24f431004a3b87""","""6773b310.json""","""195f0749824286cb4b24f431004a3b87_6773b310.jso…",4,3,False,"""|006|066|600|""","""I have no idea""","""Lines across that has only 1 colored square i…",16,"""expv0""",,"""training"""


In [16]:
reordered_columns_summary = [
    # Experiment metadata
    "exp_name",
    "task_type",
    # Identifiers and metadata
    "hashed_id",
    "joint_id_task",
    # Task-related information
    "task_name",
    "task_number",
    # Task performance
    "attempt_number",
    "num_actions",
    "solved",
    # Output data
    "test_output_grid",
    # User solutions
    "first_written_solution",
    "last_written_solution",
]

In [17]:
df_summary = df_summary.select(reordered_columns_summary)
df_summary = df_summary.with_columns(pl.lit(True).alias("complete"))
df_summary.head()

exp_name,task_type,hashed_id,joint_id_task,task_name,task_number,attempt_number,num_actions,solved,test_output_grid,first_written_solution,last_written_solution,complete
str,str,str,str,str,i64,i64,i64,bool,str,str,str,bool
"""expv0""","""training""","""195f0749824286cb4b24f431004a3b87""","""195f0749824286cb4b24f431004a3b87_b8cdaf2b.jso…","""b8cdaf2b.json""",1,1,20,True,"""|000000000|000000000|000000000|000000000|2000…","""To add the opposite color diagonally""","""To add the opposite color diagonally""",True
"""expv0""","""training""","""195f0749824286cb4b24f431004a3b87""","""195f0749824286cb4b24f431004a3b87_a87f7484.jso…","""a87f7484.json""",2,1,10,True,"""|606|660|606|""","""To duplicate the odd one""","""To duplicate the odd one""",True
"""expv0""","""training""","""195f0749824286cb4b24f431004a3b87""","""195f0749824286cb4b24f431004a3b87_6773b310.jso…","""6773b310.json""",4,1,8,False,"""|006|600|006|""","""I have no idea""","""Lines across that has only 1 colored square i…",True
"""expv0""","""training""","""195f0749824286cb4b24f431004a3b87""","""195f0749824286cb4b24f431004a3b87_6773b310.jso…","""6773b310.json""",4,2,15,False,"""|006|066|600|""","""I have no idea""","""Lines across that has only 1 colored square i…",True
"""expv0""","""training""","""195f0749824286cb4b24f431004a3b87""","""195f0749824286cb4b24f431004a3b87_6773b310.jso…","""6773b310.json""",4,3,16,False,"""|006|066|600|""","""I have no idea""","""Lines across that has only 1 colored square i…",True


In [18]:
df_summary.write_csv("../../data/clean_summary_data.csv")

Clean feedback


In [19]:
reordered_columns_feedback = [
    # Experiment metadata
    "exp_name",
    "task_type",
    # Identifiers and metadata
    "hashed_id",
    # feeedback
    "feedback",
]

In [20]:
df_feedback = df_feedback.select(reordered_columns_feedback)
df_feedback.head()

exp_name,task_type,hashed_id,feedback
str,str,str,str
"""expv0""","""training""","""195f0749824286cb4b24f431004a3b87""","""Interesting. I can't really explain my strate…"
"""expv0""","""training""","""bf66a164234034d4709666dc364c8d55""","""this was fun - thanks! """
"""expv0""","""training""","""2b06355cea411af7a6fa212c90eb3c6d""","""Thank you."""
"""expv0""","""training""","""19f69ac203258689f6823e2c49cea6b7""","""Very interesting."""
"""expv0""","""training""","""37289334396727fffe878187ca738a35""","""The first 5 were jacked up. The controls didn…"


In [21]:
df_feedback.write_csv("../../data/clean_feedback_data.csv")

make df_summary_incomplete


In [22]:
df_summary_incomplete = get_summary(df_incomplete)
df_summary_incomplete = df_summary_incomplete.select(reordered_columns_summary)
df_summary_incomplete = df_summary_incomplete.with_columns(
    pl.lit(False).alias("complete")
)
df_summary_incomplete.head()

exp_name,task_type,hashed_id,joint_id_task,task_name,task_number,attempt_number,num_actions,solved,test_output_grid,first_written_solution,last_written_solution,complete
str,str,str,str,str,i64,i64,i64,bool,str,str,str,bool
"""expv1""","""training""","""1468bc4d0ad517332fe1e682f039e9c7""","""1468bc4d0ad517332fe1e682f039e9c7_4093f84a.jso…","""4093f84a.json""",1,1,99,True,"""|00000550000000|00005555000000|00000550000000…","""I put the gray rows going down in the 6th and…","""I put the gray rows going down in the 6th and…",False
"""expv1""","""training""","""92c7f74d6cf7b775003571db2840a855""","""92c7f74d6cf7b775003571db2840a855_1bfc4729.jso…","""1bfc4729.json""",1,1,82,True,"""|2222222222|2000000002|2222222222|2000000002|…","""On the same size grid as the test input grid …","""On the same size grid as the test input grid …",False
"""expv1""","""training""","""92c7f74d6cf7b775003571db2840a855""","""92c7f74d6cf7b775003571db2840a855_48d8fb45.jso…","""48d8fb45.json""",2,1,10,True,"""|030|330|033|""","""Make a copy of the the shape that has a grey …","""Make a copy of the the shape that has a grey …",False
"""expv1""","""training""","""4be4044cfed6f294a7f25d61c8736329""","""4be4044cfed6f294a7f25d61c8736329_6aa20dc0.jso…","""6aa20dc0.json""",1,1,9,False,"""|3333333333333333333333|333333333344433333333…","""I can't do it because the grid is 22x22 and I…","""On the test input there is a complete small i…",False
"""expv1""","""training""","""4be4044cfed6f294a7f25d61c8736329""","""4be4044cfed6f294a7f25d61c8736329_6aa20dc0.jso…","""6aa20dc0.json""",1,2,157,False,"""|3333333333333333333333|333333333344488888833…","""I can't do it because the grid is 22x22 and I…","""On the test input there is a complete small i…",False


In [23]:
df_summary_incomplete.write_csv("../../data/clean_summary_data_incomplete.csv")

Make df demographics


In [24]:
df_demographics_complete = (
    df_all.select(
        "exp_name",
        "task_type",
        "hashed_id",
        "age",
        "gender",
        "race",
        "education_level",
        "household_income",
        "normal_vision",
        "color_blind",
        "fluent_english",
    )
    .with_columns(pl.lit(True).alias("complete"))
    .unique("hashed_id")
)
df_demographics_incomplete = (
    df_incomplete.select(
        "exp_name",
        "task_type",
        "hashed_id",
        "age",
        "gender",
        "race",
        "education_level",
        "household_income",
        "normal_vision",
        "color_blind",
        "fluent_english",
    )
    .with_columns(pl.lit(False).alias("complete"))
    .unique("hashed_id")
)
df_demographics = pl.concat([df_demographics_complete, df_demographics_incomplete])
df_demographics.head()

exp_name,task_type,hashed_id,age,gender,race,education_level,household_income,normal_vision,color_blind,fluent_english,complete
str,str,str,f32,str,str,str,str,str,str,str,bool
"""expv4""","""training""","""481ce76744c8f53d50e08cb4370d38da""",27.0,"""Woman""","""Black/African American""","""Undergraduate Degree (BA/BS/Other)""","""Prefer not to specify""","""Yes""","""No""","""Yes""",True
"""expv2""","""training""","""64dc330327b5ee130bf34b5e32c36d3f""",27.0,"""Man""","""Black/African American""","""Undergraduate Degree (BA/BS/Other)""","""$60,000–$79,999""","""Yes""","""No""","""Yes""",True
"""expv6_3""","""evaluation""","""de83e970874255a0c9bef2b767dbb753""",32.0,"""Woman""","""White""","""Undergraduate Degree (BA/BS/Other)""","""$80,000–$99,999""","""Yes""","""No""","""Yes""",True
"""expv4""","""training""","""1123c2a5c52bff42c22de6fc5715b9ae""",26.0,"""Woman""","""White""","""Undergraduate Degree (BA/BS/Other)""","""$40,000–$59,999""","""Yes""","""No""","""Yes""",True
"""expv6_1""","""evaluation""","""f8bd63c74cf0dc6f371ab972c0e425a4""",54.0,"""Woman""","""White""","""High School Diploma (A-levels)""","""Less than $20,000""","""Yes""","""No""","""Yes""",True


In [25]:
df_demographics.write_csv("../../data/clean_demographics_data.csv")

make df_withdraw


In [26]:
reordered_columns_withdraw = [
    # Experiment metadata
    "exp_name",
    "task_type",
    # Identifiers and metadata
    "hashed_id",
    "withdraw",
    "withdraw_reason",
    "withdraw_comment",
]

In [27]:
df_withdraw_incomplete = df_incomplete.select(reordered_columns_withdraw).with_columns(
    pl.lit(False).alias("complete")
)
df_withdraw_complete = df_all.select(reordered_columns_withdraw).with_columns(
    pl.lit(True).alias("complete")
)
df_withdraw = pl.concat([df_withdraw_incomplete, df_withdraw_complete])
df_withdraw = df_withdraw.unique("hashed_id")
df_withdraw.head()

exp_name,task_type,hashed_id,withdraw,withdraw_reason,withdraw_comment,complete
str,str,str,bool,str,str,bool
"""expv4""","""training""","""ef31fe508f1ed204a67a1cf0856c1954""",False,,,True
"""expv6_2""","""evaluation""","""ca48922121ec9d3b3882d92da45dd1f1""",True,"""I do not understand what I am supposed to do.""","""This is not clear. What are the additional e…",False
"""expv6_4""","""evaluation""","""6968ac1f90e3a9420cf04e1068570777""",False,,,False
"""expv6_1""","""evaluation""","""f7d75bfe95153df73f32082ebd500aba""",False,,,False
"""expv6_1""","""evaluation""","""2184ab698209e9f20e99a8a7fbaec9df""",False,,,True


In [28]:
df_withdraw.write_csv("../../data/clean_withdraw_data.csv")