In [1]:
# Import experimental data for model fitting
import numpy as np
import pandas as pd
import torch
from pyro import distributions as dist

In [2]:
# Load the experiment responses and the stimulus table from CSV
df_experiment = pd.read_csv("huashan.csv")
df_stimuli = pd.read_csv("stimuli_table_no_NA.csv")

In [4]:
# Left-join the stimulus table onto the responses; every response row is kept
merge_keys = ['list', 'conditions', 'item']
df = df_experiment.merge(df_stimuli, how='left', on=merge_keys)
# Both lengths are printed so an unexpected row multiplication is visible
print(f"Length of merged df: {len(df)}")
print(f"Length of data: {len(df_experiment)}")

Length of merged df: 27000
Length of data: 27000


# Preprocess data
## NA values

In [11]:
# Mark slider value of -1 as NaN.
# Both the string "-1" and the number -1 are matched: which form occurs depends
# on the dtype pandas inferred when reading the CSV, and matching only the
# string would silently leave numeric sentinels in the data.
df['slider_value'] = df['slider_value'].replace(["-1", -1], np.nan)

# show proportion of NaN values
print(f"Proportion of NaN values of slider value: {df['slider_value'].isna().sum()/len(df)}")

Proportion of NaN values: 0.08503703703703704


In [13]:
### Which columns have missing values?
has_missing = df.isna().any()
print(f"Columns with missing values: {df.columns[has_missing].tolist()}")
### How many missing values per column?
missing_counts = df.isna().sum()
print(f"Number of missing values per column: {missing_counts}")
### Show the rows whose "list" entry is missing (displayed as the cell output)
df.loc[df['list'].isna()]


Columns with missing values: ['slider_value', 'list', 'trials', 'D', 'C', 'F', 'size_A', 'size_B', 'size_C', 'size_D', 'size_E', 'size_F', 'color_A', 'color_B', 'color_C', 'color_D', 'color_E', 'color_F', 'form_A', 'form_B', 'form_C', 'form_D', 'form_E', 'form_F', 'noun', 'sharpness']
Number of missing values per column: id                    0
item                  0
conditions            0
read_time             0
image_error           0
slider_value       2296
leftright_trial       0
list                 89
trials               89
D                  9060
C                  9057
F                  9061
size_A               89
size_B               89
size_C               89
size_D               89
size_E               89
size_F               89
color_A              89
color_B              89
color_C              89
color_D              89
color_E              89
color_F              89
form_A               89
form_B               89
form_C               89
form_D               89
form_

Unnamed: 0,id,item,conditions,read_time,image_error,slider_value,leftright_trial,list,trials,D,...,color_E,color_F,form_A,form_B,form_C,form_D,form_E,form_F,noun,sharpness
1547,6241ee95e86a3,9,brcf,[6492],False,1right,5,,,,...,,,,,,,,,,
1569,6241ee95e86a3,11,ercf,[4226],False,1left,5,,,,...,,,,,,,,,,
1573,6241ee95e86a3,34,fzrdc,[4390],False,1left,5,,,,...,,,,,,,,,,
1911,6241f3f13c93c,35,fbrcf,[4973],False,1right,1,,,,...,,,,,,,,,,
2647,6241fe6a604b4,31,fzrdc,[3582],False,1right,6,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24237,626167b76e314,30,fzrdf,[6333],False,1right,4,,,,...,,,,,,,,,,
24238,626167b76e314,36,fbrdc,[2977],False,1right,4,,,,...,,,,,,,,,,
24239,626167b76e314,30,fzrcf,[4059],False,1left,4,,,,...,,,,,,,,,,
24419,626168378ca1c,35,fzrdf,[3966],False,1right,4,,,,...,,,,,,,,,,


In [15]:
### Drop every row that lacks a "list" or a "slider_value" entry
df = df.dropna(subset=['list', 'slider_value'])
### Check again how many missing values per column
print(f"Number of missing values per column: {df.isna().sum()}")

Number of missing values per column: id                    0
item                  0
conditions            0
read_time             0
image_error           0
slider_value          0
leftright_trial       0
list                  0
trials                0
D                  8245
C                  8245
F                  8125
size_A                0
size_B                0
size_C                0
size_D                0
size_E                0
size_F                0
color_A               0
color_B               0
color_C               0
color_D               0
color_E               0
color_F               0
form_A                0
form_B                0
form_C                0
form_D                0
form_E                0
form_F                0
noun                  0
sharpness             0
dtype: int64


## Encode slider values

In [18]:
### Make slider value integer
df['slider_value'] = df['slider_value'].astype(int)

### Create 'prefer_first_1st': the rating is mirrored (100 - value) on '1left'
### trials and taken as-is on all other trials
is_left_trial = df['leftright_trial'] == '1left'
df['prefer_first_1st'] = np.where(is_left_trial, 100 - df['slider_value'], df['slider_value'])

### Note: centering the scale (subtracting 50) is currently not applied

In [19]:
# Summary statistics of the recoded rating, shown as a check on its value range
df['prefer_first_1st'].describe()  

count    24615.000000
mean        61.763721
std         42.717204
min          0.000000
25%          7.000000
50%         87.000000
75%        100.000000
max        100.000000
Name: prefer_first_1st, dtype: float64

## Drop unnecessary columns

In [20]:
# Columns dropped here are not used by any later cell of this notebook
unused_columns = ['slider_value', 'leftright_trial', 'D', 'C', 'F', 'read_time', 'image_error', 'noun']
df = df.drop(columns=unused_columns)

## Exclude participants (from R script)

In [21]:
# Participant ids to remove from the slider data.
# Several ids occur more than once in this list (e.g. "626167b76e314");
# repeats have no effect on the `isin` filter below.
exclude = ["6242d7b615bf4", "6242d82c12542", "6242e450b3b5c", "624d80a5bdbf2", "62557f3d79780", 
           "626167b76e314", "6242bd7514401", "624da2cd1ef0b", "624b22243fd06", "624b303d6bbb7", 
           "6242d016c0e11", "624b0affa1dad", "624b0b35b7990", "62420e37579a8", "626167b76e314", 
           "624d80a5bdbf2", "62420e37579a8", "6242d7b615bf4", "6241fc4a4e8ee", "6242bd7514401", 
           "6242197550443", "624b22243fd06", "6242cc346032c", "6242173eeab01", "6242d1a5bc481", 
           "624b10a84ecda", "624d710db494a", "625596dfdfa50", "626167b76e314", "626179bd7e471", 
           "6242d0cc141b3", "6242cc346032c", "6242c9a5256b1", "6242c6ebdcebd", "6242197550443", 
           "6242140018a74", "6241eec9bf12f", "62408e90eaa0f", "6241fe6a604b4", "6241f580649f8"]
## Print the number of rows before excluding participants
print(f"Length of df before excluding pps: {len(df)}")
## Keep only the rows whose participant id is not in the exclusion list
df = df[~df['id'].isin(exclude)]
## Print the number of rows after excluding participants
print(f"Length of df after excluding pps: {len(df)}")

Length of df before excluding pps: 24615
Length of df after excluding pps: 19692


## Subset filler items

In [22]:
# A condition code starting with 'f' marks a filler row
is_filler = df['conditions'].str.startswith('f', na=False)

# Keep the filler rows separately in 'data_filler' ...
data_filler = df[is_filler]

# ... and continue with the non-filler rows only, renumbered from 0
df = df[~is_filler].reset_index(drop=True)

## Encode conditions

In [23]:
# The filler rows were already split off into 'data_filler' in the previous cell.
# Repeating the split here would run on the filtered 'df' and overwrite
# 'data_filler' with an empty DataFrame, so this cell only verifies the state.
assert not df['conditions'].str[0:1].eq('f').any(), "filler rows should already be removed from df"

In [24]:
# 'combination' is derived from characters 3-4 of the condition code:
# 'cf' -> color_form, 'dc' -> dimension_color, anything else -> dimension_form
combination_labels = {'cf': 'color_form', 'dc': 'dimension_color'}
pair_code = df['conditions'].str[2:4]
df['combination'] = pair_code.map(
    lambda code: combination_labels.get(code, 'dimension_form')
).astype('category')

# 'relevant_property' is derived from the first character of the condition code:
# 'e' -> first, 'z' -> second, anything else -> both
relevance_labels = {'e': 'first', 'z': 'second'}
relevance_code = df['conditions'].str[0:1]
df['relevant_property'] = relevance_code.map(
    lambda code: relevance_labels.get(code, 'both')
).astype('category')

# Also merge the production data to dataset

In [25]:
# The annotated production data is read with latin-1 encoding
df_production = pd.read_csv("taishan_full_annotiert.csv", encoding="latin-1")

## Preprocess data
Exclude pps (see R script for more details)

In [26]:
# Participant ids to remove from the production data.
# Note that this reuses (and overwrites) the name `exclude` from the slider data.
# Some ids occur twice in this list (e.g. "62751c4a2c50b"); repeats have no
# effect on the `isin` filter below.
exclude = ["62753469f3d18", "627d25f944eaf", "628fd4a002548", "626eec4126d0c", "62751c4a2c50b", 
             "627521d32039a", "6275239eeb5ca", "627525d930314", "627526c6b01cb", "627d1a2015145",
             "627d1bebe6501", "627d210c46d69", "627d21877f013", "627d22e2344d8", "627d2378e498c", 
             "627d244d542c2", "627d2801402f6", "627d29f45e98c", "627d2fe17b2db", "627d39d3523dd",
             "628f8ceb3bd87", "628f9ef753cfc", "628fc7a170851", "628fca7dd18c6", "629081127abf8", 
             "6290c9a8383a8", "6290ce2a98253", "6292529d182fe", "62751c4a2c50b", "62751d07a8065",
             "62752339dc09f", "6275239eeb5ca", "6275271cac7f8", "627d228d2d766", "628f9d398519e",
             "629220f39875e"]
## Print the number of production rows before excluding participants
print(f"Length of df before excluding pps: {len(df_production)}")
# Keep only the rows whose participant id is not in the exclusion list
df_production = df_production[~df_production['id'].isin(exclude)]
## Print the number of production rows after excluding participants
print(f"Length of df after excluding pps: {len(df_production)}")


Length of df before excluding pps: 22140
Length of df after excluding pps: 16335


Drop the column "id" in both dfs to avoid duplicates

In [27]:
# Remove the participant id column from the slider and the production data
id_column = ['id']
df = df.drop(columns=id_column)
df_production = df_production.drop(columns=id_column)

In [28]:
# Left-join the stimulus table onto the production data.
# (The slider data 'df' is not part of this merge; its length is only printed.)
print(f"Length of df before merge: {len(df)}")
print(f"Length of df_production before merge: {len(df_production)}")
df_merged = df_production.merge(
    df_stimuli,
    how='left',
    on=['list', 'conditions', 'item'],
    copy=False,
)
print(f"Length of merged df: {len(df_merged)}")
print(f"Length of df_production after merge: {len(df_production)}")

Length of df before merge: 9469
Length of df_production before merge: 16335
Length of merged df: 16335
Length of df_production after merge: 16335


In [29]:
# A condition code starting with 'f' marks a filler row
is_filler = df_merged['conditions'].str.startswith('f', na=False)

# Keep the production filler rows in 'data_filler' (this replaces its earlier content) ...
data_filler = df_merged[is_filler]

# ... and continue with the non-filler rows only, renumbered from 0
df_merged = df_merged[~is_filler].reset_index(drop=True)

# 'combination' is derived from characters 3-4 of the condition code:
# 'cf' -> color_form, 'dc' -> dimension_color, anything else -> dimension_form
combination_labels = {'cf': 'color_form', 'dc': 'dimension_color'}
pair_code = df_merged['conditions'].str[2:4]
df_merged['combination'] = pair_code.map(
    lambda code: combination_labels.get(code, 'dimension_form')
).astype('category')

# 'relevant_property' is derived from the first character of the condition code:
# 'e' -> first, 'z' -> second, anything else -> both
relevance_labels = {'e': 'first', 'z': 'second'}
relevance_code = df_merged['conditions'].str[0:1]
df_merged['relevant_property'] = relevance_code.map(
    lambda code: relevance_labels.get(code, 'both')
).astype('category')

In [30]:
# Columns dropped here are not used by any later cell of this notebook
unused_columns = ['response', 'D', 'C', 'F', 'read_time', 'image_error', 'noun']
df_merged = df_merged.drop(columns=unused_columns)


In [31]:
# Re-encode the data such that there are only two categories for colors and forms while keeping the same pattern (experimental manipulation)
# ... in order for simplicity of the model
def encode_color(line):
    """Overwrite color_A ... color_F of one row with a two-color (blue/red) coding.

    Intended for row-wise use, e.g. ``df.apply(encode_color, axis=1)``.

    Parameters
    ----------
    line : pandas.Series
        One row holding a ``conditions`` entry and the entries ``color_A`` to
        ``color_F``.

    Returns
    -------
    pandas.Series
        The same ``line`` object, modified in place. Rows whose condition is
        not listed below are returned unchanged.

    Notes
    -----
    * 'ercf', 'zrdc', 'erdc', 'zrcf', 'brdc' and 'brcf' receive a fixed pattern.
    * 'erdf', 'zrdf' and 'brdf' receive "blue" for object A; objects B-F are
      drawn independently ("blue" with probability 0.7, otherwise "red") from
      the random number generator behind ``dist.Bernoulli``. Seed that
      generator before calling if the result has to be reproducible.
    """
    slots = ("A", "B", "C", "D", "E", "F")
    # Fixed color pattern for the objects A-F, per condition
    fixed_patterns = {
        "ercf": ("blue", "red", "red", "red", "red", "red"),
        "zrdc": ("blue", "red", "red", "red", "red", "red"),
        "erdc": ("red", "red", "red", "red", "red", "blue"),
        "zrcf": ("red", "red", "red", "red", "red", "blue"),
        "brdc": ("blue", "red", "blue", "blue", "red", "red"),
        "brcf": ("blue", "blue", "red", "red", "blue", "red"),
    }
    randomized_conditions = ("erdf", "zrdf", "brdf")

    condition = line["conditions"]
    if condition in fixed_patterns:
        colors = fixed_patterns[condition]
    elif condition in randomized_conditions:
        # One independent draw per object, in the order B, C, D, E, F
        colors = ["blue"] + [
            "blue" if dist.Bernoulli(0.7).sample() == 1 else "red"
            for _ in slots[1:]
        ]
    else:
        return line

    # Label-based assignment: attribute-style assignment (line.color_A = ...)
    # would only set a Python attribute if the label were missing.
    for slot, color in zip(slots, colors):
        line[f"color_{slot}"] = color
    return line

def encode_form(line):
    """Overwrite form_A ... form_F of one row with a two-form (circle/square) coding.

    Intended for row-wise use, e.g. ``df.apply(encode_form, axis=1)``.

    Parameters
    ----------
    line : pandas.Series
        One row holding a ``conditions`` entry and the entries ``form_A`` to
        ``form_F``.

    Returns
    -------
    pandas.Series
        The same ``line`` object, modified in place. Rows whose condition is
        not listed below are returned unchanged.

    Notes
    -----
    * 'zrdf', 'zrcf', 'erdf', 'ercf', 'brdf' and 'brcf' receive a fixed pattern.
    * 'erdc', 'zrdc' and 'brdc' receive "circle" for object A; objects B-F are
      drawn independently ("circle" with probability 0.7, otherwise "square")
      from the random number generator behind ``dist.Bernoulli``. Seed that
      generator before calling if the result has to be reproducible.
    """
    slots = ("A", "B", "C", "D", "E", "F")
    # Fixed form pattern for the objects A-F, per condition
    fixed_patterns = {
        "zrdf": ("circle", "square", "square", "square", "square", "square"),
        "zrcf": ("circle", "square", "square", "square", "square", "square"),
        "erdf": ("circle", "circle", "circle", "circle", "circle", "square"),
        "ercf": ("circle", "circle", "circle", "circle", "circle", "square"),
        "brdf": ("circle", "square", "circle", "circle", "square", "square"),
        "brcf": ("circle", "square", "circle", "circle", "square", "square"),
    }
    randomized_conditions = ("erdc", "zrdc", "brdc")

    condition = line["conditions"]
    if condition in fixed_patterns:
        forms = fixed_patterns[condition]
    elif condition in randomized_conditions:
        # One independent draw per object, in the order B, C, D, E, F
        forms = ["circle"] + [
            "circle" if dist.Bernoulli(0.7).sample() == 1 else "square"
            for _ in slots[1:]
        ]
    else:
        return line

    # Label-based assignment: attribute-style assignment (line.form_A = ...)
    # would only set a Python attribute if the label were missing.
    for slot, form in zip(slots, forms):
        line[f"form_{slot}"] = form
    return line

In [32]:
# apply the functions, comment out if not needed
# encode_color / encode_form draw random values via dist.Bernoulli, which uses
# torch's global random number generator. Seeding it here makes this cell give
# the same result on every run, so the exported dataset is reproducible.
torch.manual_seed(0)
df_merged_new = df_merged.apply(encode_color, axis=1)
df_merged_new = df_merged_new.apply(encode_form, axis=1)
df_merged_new.head()

Unnamed: 0,item,conditions,list,annotation,trials,size_A,size_B,size_C,size_D,size_E,...,color_F,form_A,form_B,form_C,form_D,form_E,form_F,sharpness,combination,relevant_property
0,5,ercf,1,C,4,2,2,9,9,9,...,red,circle,circle,circle,circle,circle,square,sharp,color_form,first
1,15,zrdc,1,C,8,9,10,10,10,10,...,red,circle,square,circle,circle,circle,square,sharp,dimension_color,second
2,18,zrdc,1,C,8,9,10,9,10,9,...,red,circle,circle,circle,circle,square,circle,sharp,dimension_color,second
3,1,brdc,1,DCF,2,10,10,3,2,10,...,red,circle,square,circle,circle,square,square,sharp,dimension_color,both
4,10,brcf,1,CF,1,2,10,10,9,3,...,red,circle,square,circle,circle,square,square,sharp,color_form,both


In [33]:
# also apply the functions to the slider values
# encode_color / encode_form draw random values via dist.Bernoulli, which uses
# torch's global random number generator. Seeding it here makes this cell give
# the same result on every run, so the exported dataset is reproducible.
torch.manual_seed(1)
df_slider = df.apply(encode_color, axis=1)
df_slider = df_slider.apply(encode_form, axis=1)
df_slider.head()

Unnamed: 0,item,conditions,list,trials,size_A,size_B,size_C,size_D,size_E,size_F,...,form_A,form_B,form_C,form_D,form_E,form_F,sharpness,prefer_first_1st,combination,relevant_property
0,12,zrdf,5.0,9.0,10.0,9.0,9.0,9.0,9.0,1.0,...,circle,square,square,square,square,square,blurred,99,dimension_form,second
1,1,erdf,5.0,6.0,10.0,4.0,4.0,2.0,2.0,5.0,...,circle,circle,circle,circle,circle,square,blurred,96,dimension_form,first
2,19,zrcf,5.0,7.0,3.0,10.0,2.0,10.0,3.0,10.0,...,circle,square,square,square,square,square,blurred,98,color_form,second
3,10,zrdc,5.0,8.0,9.0,9.0,10.0,9.0,9.0,3.0,...,circle,circle,circle,circle,square,circle,blurred,97,dimension_color,second
4,13,erdc,5.0,5.0,10.0,5.0,3.0,1.0,1.0,2.0,...,circle,circle,circle,square,circle,circle,blurred,98,dimension_color,first


In [26]:
# Remove out-of-range values in df_slider.prefer_first_1st.
# The recoded rating is either the slider value or 100 minus the slider value,
# so a leftover slider value of -1 appears as -1 on some trials and as 101 on
# '1left' trials. Checking the full 0-100 range removes both; a "< 0" check
# alone would keep the 101 rows.
out_of_range = ~df_slider['prefer_first_1st'].between(0, 100)
# ... first show how many are there
print(df_slider[out_of_range].shape)
# and proportion of the total
print(df_slider[out_of_range].shape[0] / df_slider.shape[0])
# remove them
df_slider = df_slider[~out_of_range].reset_index(drop=True)

(84, 26)
0.008724553385957623


# Export dataframe for further analysis

In [34]:
# Write both preprocessed tables to CSV, leaving out the row index
df_slider.to_csv("dataset_slider.csv", index=False)
df_merged_new.to_csv("dataset_production.csv", index=False)