## Readme

* This code has two purposes: 
    1. Clean rating data collected from Prolific workers 
    2. Build rating matrices from the valid data that can be read by TR Hayes' matlab code. 
    
* Workers who had a duplicate IP address, had incomplete data, or rated the catch trials too high (mean catch rating > 1.5) were filtered out.
* Then, for each picture, a 768\*1024 matrix was constructed for each rating scale (fine, coarse). Each cell of this matrix stores the rating scores of the patches that cover that pixel.

In [2]:
import sys
print (sys.version)
import pandas as pd

2.7.16 |Anaconda, Inc.| (default, Mar 14 2019, 15:42:17) [MSC v.1500 64 bit (AMD64)]


## Extract patch assignment info

In [45]:
# Build the patch-assignment table by scanning the generated survey texts:
# one row per patch URL found, in the order the URLs appear in each survey.

import re
import os
from glob import glob

images_path = '..\\scene_images\\*' # path to original scene images
survey_path = '..\\surveys\\*\\*.txt' # path to generated survey texts
url_pattern = r'https://raw.githubusercontent.com/HanZhang-psych/SceneMeaningMapping/master/patch_stimuli/\S+' # the url pattern to search for in the survey texts

# Scene names are the image file names without their extension.
# Sorted so the result does not depend on the order glob happens to return.
image_list = sorted(os.path.splitext(os.path.basename(p))[0]
                    for p in glob(images_path))

# compile once instead of looking the pattern up for every line
url_regex = re.compile(url_pattern)

survey_frames = []
for sur in sorted(glob(survey_path)):
    # the survey number follows a 6-character prefix in the file name
    # (presumably 'survey<N>.txt' -- TODO confirm)
    survey_name = int(os.path.splitext(os.path.basename(sur))[0][6:])
    # the survey type is the name of the folder that holds the survey file
    survey_type = os.path.basename(os.path.dirname(sur))

    pics = []
    with open(sur, 'r') as f:
        for line in f:
            s = url_regex.search(line)
            if s is not None:
                # keep the file name only; the \S+ match carries one trailing
                # character that is not part of the name (presumably closing
                # markup in the survey text -- TODO confirm)
                pics.append(s.group(0).split('/')[-1][:-1])

    survey_frames.append(pd.DataFrame({'patch': pics,
                                       'idx': list(range(len(pics))),
                                       'survey_type': [survey_type] * len(pics),
                                       'survey_name': [survey_name] * len(pics)},
                                      columns=['patch', 'idx', 'survey_type', 'survey_name']))

# Concatenate once at the end: appending inside the loop copies the whole
# accumulated table on every iteration. pd.concat rejects an empty list, so
# fall back to an empty frame when no survey file was found.
patch_assign_df = pd.concat(survey_frames) if survey_frames else pd.DataFrame()

In [46]:
# from patch name extract the original pic name
def scene_name_from_patch(row):
    """Attach the source scene and the numeric patch id to one patch row.

    Every name in ``image_list`` that occurs inside ``row.patch`` sets
    ``scene_name`` to that name and ``patch_id`` to the integer left in the
    file name once the scene name is removed (text up to the first '.').
    If several names occur, the last one in ``image_list`` wins. A row whose
    patch matches no scene is returned without the two extra keys.
    """
    patch_file = row.patch
    for scene in image_list:
        if scene not in patch_file:
            continue
        row['scene_name'] = scene
        # strip the scene name, then keep what precedes the file extension
        leftover = patch_file.replace(scene.split('.')[0], '')
        row['patch_id'] = int(leftover.split('.')[0])
    return row
patch_assign_df = patch_assign_df.apply(scene_name_from_patch, axis=1)

In [47]:
# sort values 
patch_assign_df.sort_values(by=['survey_type','survey_name'], inplace=True)

In [48]:
patch_assign_df.head()

Unnamed: 0,idx,patch,patch_id,scene_name,survey_name,survey_type
0,0,after_IMG_021317.png,17.0,after_IMG_0213,1,coarse
1,1,after_sun_awkhwzafhbdrlgkt93.png,93.0,after_sun_awkhwzafhbdrlgkt,1,coarse
2,2,after_IMG_158153.png,53.0,after_IMG_1581,1,coarse
3,3,after_img_10078.png,78.0,after_img_100,1,coarse
4,4,after_sun_awkhwzafhbdrlgkt26.png,26.0,after_sun_awkhwzafhbdrlgkt,1,coarse


## Read survey: coarse

In [49]:
# survey coarse
survey_coarse = pd.read_csv('../data/qualtrics/Scene Patch Rating_Coarse_January 20, 2020_11.14.csv',skiprows=[1,2])

In [50]:
# remove those who did not sign consent form
survey_coarse = survey_coarse[survey_coarse.QID3==1].reset_index(drop=True)

In [51]:
# identify block assignment
var_assign = survey_coarse.columns[-1]
print (var_assign)
survey_coarse['survey_type'] = survey_coarse[var_assign].apply(lambda x: 'coarse' if 'coarse' in x else 'fine')
survey_coarse['survey_name'] = survey_coarse[var_assign].apply(lambda x: int(re.search(r'\d+', x).group(0)))

FL_30_DO


In [52]:
# check ip address duplicates
survey_coarse[survey_coarse.IPAddress.duplicated()].PROLIFIC_PID

Series([], Name: PROLIFIC_PID, dtype: object)

In [53]:
# Split workers by completeness: a coarse survey has 320 answered items,
# except coarse survey 13, which has 308.
qid_cols = survey_coarse.columns[survey_coarse.columns.str.contains('QID')]
# the first QID column is left out of the rating columns
# (presumably the consent item -- TODO confirm)
surv_rating = survey_coarse[qid_cols[1:]]

# number of non-missing answers per worker
n_rated = surv_rating.count(axis=1)
complete_mask = n_rated.isin([320, 308])
incomplete_mask = ~complete_mask

completed_subs = survey_coarse.loc[complete_mask, 'PROLIFIC_PID'].tolist()
incompleted_subs = survey_coarse.loc[incomplete_mask, 'PROLIFIC_PID'].tolist()

In [54]:
incompleted_subs 

[]

In [55]:
# rename survey column names
# coarse 1-13
new_colnames = patch_assign_df[patch_assign_df.survey_type=='coarse'].survey_name.astype(str)+':'+ patch_assign_df[patch_assign_df.survey_type=='coarse'].patch

In [56]:
column_dict = dict(zip(surv_rating.columns,new_colnames.tolist()))
survey_coarse.rename(columns=column_dict, inplace=True)

In [57]:
# compute catch trial ratings to identify spammers
survey_coarse['catch_ratings'] = survey_coarse[survey_coarse.columns[survey_coarse.columns.str.contains('catch')]].mean(axis=1)
spammers = survey_coarse[survey_coarse['catch_ratings']>1.5].PROLIFIC_PID.tolist()
print (spammers)

['599bf0606c165e000113fe27']


In [58]:
# approved workers: completed the survey and not flagged by the catch trials
approved_subs = [pid for pid in completed_subs if pid not in spammers]
        
#for i in approved_subs:
#    print i

In [59]:
# rolling back qualtrics randomizer
survey_coarse[survey_coarse.PROLIFIC_PID.isin(approved_subs)].groupby(['survey_type','survey_name']).PROLIFIC_PID.count()

survey_type  survey_name
coarse       1              3
             2              3
             3              3
             4              3
             5              3
             6              3
             7              3
             8              3
             9              3
             10             3
             11             3
             12             4
             13             3
Name: PROLIFIC_PID, dtype: int64

In [60]:
# get valid subjects
valid_survey_coarse = survey_coarse[survey_coarse.PROLIFIC_PID.isin(approved_subs)].reset_index(drop=True)
# get useful cols
valid_survey_coarse = valid_survey_coarse[['PROLIFIC_PID','survey_type','survey_name']+ list(column_dict.values())]
# sort by survey
valid_survey_coarse.sort_values(by=['survey_type','survey_name'], inplace=True)

## Read survey: Fine (1-24)

In [62]:
# survey fine
survey_fine1 = pd.read_csv('../data/qualtrics/Scene Patch Rating - Fine (1 - 24)_January 23, 2020_10.22.csv',skiprows=[1,2])

In [63]:
# remove those who did not sign consent form
survey_fine1 = survey_fine1[survey_fine1.QID325==1].reset_index(drop=True)

In [64]:
# identify block assignment
var_assign = survey_fine1.columns[-1]
print (var_assign)
survey_fine1['survey_type'] = survey_fine1[var_assign].apply(lambda x: 'coarse' if 'coarse' in x else 'fine')
survey_fine1['survey_name'] = survey_fine1[var_assign].apply(lambda x: int(re.search(r'\d+', x).group(0)))

FL_45_DO


In [65]:
# check ip address duplicates
survey_fine1[survey_fine1.IPAddress.duplicated()].PROLIFIC_PID

Series([], Name: PROLIFIC_PID, dtype: object)

In [66]:
# Split workers by completeness: every fine survey has 320 answered items.
qid_cols = survey_fine1.columns[survey_fine1.columns.str.contains('QID')]
# the first QID column is left out of the rating columns
# (presumably the consent item -- TODO confirm)
surv_rating = survey_fine1[qid_cols[1:]]

# number of non-missing answers per worker
n_rated = surv_rating.count(axis=1)
complete_mask = n_rated.eq(320)
incomplete_mask = n_rated.ne(320)

completed_subs = survey_fine1.loc[complete_mask, 'PROLIFIC_PID'].tolist()
incompleted_subs = survey_fine1.loc[incomplete_mask, 'PROLIFIC_PID'].tolist()

In [67]:
incompleted_subs 

['5d118de2c80b57001721f93e',
 '5d404bc74db54a0001fecb92',
 '5d93f4f22bde0400161aca3f',
 '5d3108926f0585001a7d0cc1']

In [68]:
# rename survey column names

# fine < 25
new_colnames = patch_assign_df[(patch_assign_df.survey_name<25)&(patch_assign_df.survey_type=='fine')].survey_name.astype(str)+':'+ patch_assign_df[(patch_assign_df.survey_name<25)&(patch_assign_df.survey_type=='fine')].patch

In [69]:
column_dict = dict(zip(surv_rating.columns,new_colnames.tolist()))
survey_fine1.rename(columns=column_dict, inplace=True)

In [70]:
survey_fine1['catch_ratings'] = survey_fine1[survey_fine1.columns[survey_fine1.columns.str.contains('catch')]].mean(axis=1)

In [71]:
spammers = survey_fine1[survey_fine1['catch_ratings']>1.5].PROLIFIC_PID.tolist()
print (spammers)

['5d118de2c80b57001721f93e', '59c6305e6a287600011c388d', '5dc59df162ec5041c3e8807a', '5d6394518df2870001a74b42']


In [72]:
# approved workers: completed the survey and not flagged by the catch trials
approved_subs = [pid for pid in completed_subs if pid not in spammers]
        
#for i in approved_subs:
#    print i

In [73]:
# rolling back qualtrics randomizer
survey_fine1[survey_fine1.PROLIFIC_PID.isin(approved_subs)].groupby(['survey_type','survey_name']).PROLIFIC_PID.count()

survey_type  survey_name
fine         1              3
             2              3
             3              3
             4              4
             5              3
             6              3
             7              3
             8              3
             9              3
             10             3
             11             3
             12             3
             13             4
             14             4
             15             3
             16             3
             17             3
             18             3
             19             3
             20             3
             21             3
             22             3
             23             3
             24             4
Name: PROLIFIC_PID, dtype: int64

In [74]:
# get valid subjects
valid_survey_fine1 = survey_fine1[survey_fine1.PROLIFIC_PID.isin(approved_subs)].reset_index(drop=True)
# get useful cols
valid_survey_fine1 = valid_survey_fine1[['PROLIFIC_PID','survey_type','survey_name']+list(column_dict.values())]
# sort by survey
valid_survey_fine1.sort_values(by=['survey_type','survey_name'], inplace=True)

## Read survey: Fine 25-36

In [76]:
# survey fine
survey_fine2 = pd.read_csv('../data/qualtrics/Scene Patch Rating - Fine (25-36)_January 21, 2020_01.05.csv',skiprows=[1,2])

In [77]:
# remove those who did not sign consent form
survey_fine2 = survey_fine2[survey_fine2.QID3==1].reset_index(drop=True)

In [78]:
# identify block assignment
var_assign = survey_fine2.columns[-1]
print (var_assign)
survey_fine2['survey_type'] = survey_fine2[var_assign].apply(lambda x: 'coarse' if 'coarse' in x else 'fine')
survey_fine2['survey_name'] = survey_fine2[var_assign].apply(lambda x: int(re.search(r'\d+', x).group(0)))

FL_28_DO


In [79]:
# check ip address duplicates
survey_fine2[survey_fine2.IPAddress.duplicated()].PROLIFIC_PID

Series([], Name: PROLIFIC_PID, dtype: object)

In [80]:
# Split workers by completeness: every fine survey has 320 answered items.
qid_cols = survey_fine2.columns[survey_fine2.columns.str.contains('QID')]
# the first QID column is left out of the rating columns
# (presumably the consent item -- TODO confirm)
surv_rating = survey_fine2[qid_cols[1:]]

# number of non-missing answers per worker
n_rated = surv_rating.count(axis=1)
complete_mask = n_rated.eq(320)
incomplete_mask = n_rated.ne(320)

completed_subs = survey_fine2.loc[complete_mask, 'PROLIFIC_PID'].tolist()
incompleted_subs = survey_fine2.loc[incomplete_mask, 'PROLIFIC_PID'].tolist()

In [81]:
incompleted_subs 

['5aea351112ed990001f873c3',
 '5a11e6eb30ad7f0001b6ca86',
 '5b8f08cbb007ed0001aaa0bd',
 '5c340181938e730001af10b5',
 '5abfe64d436f550001ffad93',
 '5ad0172da010190001b35148',
 '5c10e8870c0a040001fe7b86',
 '5b9b4cc406114100010660f0',
 '5b7d28b6c4e6ad0001d37b68',
 '5d98fd7c3e7d4c03d2a533ae']

In [82]:
# rename survey column names

# fine >= 25
new_colnames = patch_assign_df[(patch_assign_df.survey_name>=25)&(patch_assign_df.survey_type=='fine')].survey_name.astype(str)+':'+ patch_assign_df[(patch_assign_df.survey_name>=25)&(patch_assign_df.survey_type=='fine')].patch

In [83]:
column_dict = dict(zip(surv_rating.columns,new_colnames.tolist()))
survey_fine2.rename(columns=column_dict, inplace=True)

In [84]:
survey_fine2['catch_ratings'] = survey_fine2[survey_fine2.columns[survey_fine2.columns.str.contains('catch')]].mean(axis=1)

In [85]:
spammers = survey_fine2[survey_fine2['catch_ratings']>1.5].PROLIFIC_PID.tolist()
print (spammers)

['5b9b4cc406114100010660f0']


In [86]:
# approved workers: completed the survey and not flagged by the catch trials
approved_subs = [pid for pid in completed_subs if pid not in spammers]
        
#for i in approved_subs:
#    print i

In [87]:
# rolling back qualtrics randomizer
survey_fine2[survey_fine2.PROLIFIC_PID.isin(approved_subs)].groupby(['survey_type','survey_name']).PROLIFIC_PID.count()

survey_type  survey_name
fine         25             3
             26             3
             27             3
             28             3
             29             3
             30             3
             31             3
             32             3
             33             3
             34             3
             35             3
             36             3
Name: PROLIFIC_PID, dtype: int64

In [88]:
# get valid subjects
valid_survey_fine2 = survey_fine2[survey_fine2.PROLIFIC_PID.isin(approved_subs)].reset_index(drop=True)
# get useful cols
valid_survey_fine2 = valid_survey_fine2[['PROLIFIC_PID','survey_type','survey_name']+list(column_dict.values())]
# sort by survey
valid_survey_fine2.sort_values(by=['survey_type','survey_name'], inplace=True)

In [90]:
# merge the two fine surveys together
valid_survey_fine = pd.merge(valid_survey_fine1, valid_survey_fine2,how='outer')

In [91]:
# note that there are some cases in which a survey was completed 4 times
# so we delete those extra subjects
#extra_subs = valid_survey_fine.groupby(['survey_type','survey_name']).PROLIFIC_PID.nth(3).values
#valid_survey_fine = valid_survey_fine[~valid_survey_fine.PROLIFIC_PID.isin(extra_subs)]

#extra_subs = valid_survey_coarse.groupby(['survey_type','survey_name']).PROLIFIC_PID.nth(3).values
#valid_survey_coarse = valid_survey_coarse[~valid_survey_coarse.PROLIFIC_PID.isin(extra_subs)]

## Construct matrix for build_meaning_map.m

In [92]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [93]:
def extract_ratings(df, n_raters=3, n_id_cols=3):
    """Collapse a worker-by-patch survey table into one row per patch.

    Args:
        df: one row per worker. The first ``n_id_cols`` columns are
            identifiers and are ignored; every remaining column is labelled
            ``'<survey_name>:<patch>'`` and holds that patch's ratings, with
            NaN for workers who did not rate it.
        n_raters: number of ratings to keep per patch (default 3).
        n_id_cols: number of leading identifier columns to skip (default 3).

    Returns:
        DataFrame with columns ``rate1`` .. ``rate<n_raters>``,
        ``survey_name`` (int64) and ``patch``, one row per rating column, in
        column order. Only the first ``n_raters`` non-missing ratings (in row
        order) are kept; ratings from additional workers are ignored.

    Raises:
        IndexError: a patch has fewer than ``n_raters`` ratings.
        ValueError: a column label does not contain ':'.
    """
    rating_block = df.iloc[:, n_id_cols:]
    rate_names = ['rate%d' % (k + 1) for k in range(n_raters)]

    scores = []
    survey_names = []
    patches = []
    # Walk the columns by position so duplicated column labels still work.
    for pos, label in enumerate(rating_block.columns):
        # drop the missing values once per patch, then take the leading ones
        observed = rating_block.iloc[:, pos].dropna().tolist()
        if len(observed) < n_raters:
            raise IndexError('column %r has %d rating(s) but %d are required'
                             % (label, len(observed), n_raters))
        scores.append(observed[:n_raters])

        # split on the first ':' only, so the patch part is kept whole
        survey_name, sep, patch = label.partition(':')
        if not sep:
            raise ValueError("column label %r is not of the form "
                             "'<survey_name>:<patch>'" % (label,))
        survey_names.append(int(survey_name))
        patches.append(patch)

    result = pd.DataFrame(scores, columns=rate_names)
    result['survey_name'] = pd.Series(survey_names, index=result.index, dtype='int64')
    result['patch'] = pd.Series(patches, index=result.index, dtype=object)
    return result

In [94]:
coarse_ratings = extract_ratings(valid_survey_coarse)
coarse_ratings['survey_type'] = 'coarse'

In [95]:
fine_ratings = extract_ratings(valid_survey_fine)
fine_ratings['survey_type'] = 'fine'

In [96]:
ratings = fine_ratings.append(coarse_ratings).reset_index(drop=True)

In [97]:
ratings = patch_assign_df.merge(ratings, on=['survey_type','survey_name','patch'],how='inner')

In [98]:
# drop catch trials
ratings = ratings.dropna().reset_index(drop=True)

In [99]:
ratings.sort_values(by=['survey_type','scene_name','patch_id'], inplace=True)

In [100]:
ratings.head()

Unnamed: 0,idx,patch,patch_id,scene_name,survey_name,survey_type,rate1,rate2,rate3
2693,312,after_IMG_02131.png,1.0,after_IMG_0213,9,coarse,1.0,1.0,1.0
355,59,after_IMG_02132.png,2.0,after_IMG_0213,2,coarse,3.0,3.0,2.0
669,73,after_IMG_02133.png,3.0,after_IMG_0213,3,coarse,2.0,2.0,1.0
1413,229,after_IMG_02134.png,4.0,after_IMG_0213,5,coarse,3.0,1.0,1.0
2269,180,after_IMG_02135.png,5.0,after_IMG_0213,8,coarse,3.0,5.0,4.0


In [101]:
# load matlab mat that store the masks
import scipy.io as sio
coarse_masks = sio.loadmat('../data/masks/coarse_masks.mat')['masks_all'].astype('float')
fine_masks = sio.loadmat('../data/masks/fine_masks.mat')['masks_all'].astype('float')
#plt.matshow(coarse_masks[:,:,0]) # plot the first mask

In [102]:
def elementwise_concat(A, B):
    """Add score B to the tuple of scores collected so far in cell A.

    A score of 0 leaves A untouched. If A is not yet a tuple (nothing has
    been collected for this cell), a new one-element tuple is started.
    """
    if B == 0:
        return A
    if not isinstance(A, tuple):
        # first score for this cell: whatever A held is discarded
        return (B,)
    return A + (B,)
# apply the scalar rule cell by cell; the result is an object array
elementwise_concat = np.vectorize(elementwise_concat,otypes=[object])

In [203]:
def _build_rating_matrix(scene_ratings, masks, rater_columns=('rate1', 'rate2', 'rate3')):
    """Collect, for every pixel, the scores of all patches that cover it.

    Args:
        scene_ratings: rows of ``ratings`` for one scene and one survey type;
            must hold exactly one row per patch_id from 1 to the largest id.
        masks: array indexed as ``masks[:, :, i]``; slice ``i`` is multiplied
            by the scores of patch ``i + 1``, so it is expected to be zero
            outside that patch.
        rater_columns: names of the columns that hold the individual scores.

    Returns:
        Object array shaped like one mask slice. A pixel covered by at least
        one patch holds a tuple of scores; an uncovered pixel holds 0.0.

    Raises:
        ValueError: a patch id has no row, or more than one row.
    """
    # Start from zeros, not np.empty: pixels no patch covers keep their
    # initial value, and np.empty would leave arbitrary memory there.
    rating_mat = np.zeros(masks.shape[:2])
    patch_id_max = int(scene_ratings.patch_id.max())
    # iterate through patch_ids to create rating maps
    for i in range(patch_id_max):
        # patch ids are 1-based, mask slices are 0-based
        patch_rows = scene_ratings[scene_ratings.patch_id == i + 1]
        if len(patch_rows) != 1:
            raise ValueError('expected exactly one row for patch_id %d, found %d'
                             % (i + 1, len(patch_rows)))
        mask = masks[:, :, i]
        for column in rater_columns:
            score = int(patch_rows[column].iloc[0])
            rating_mat = elementwise_concat(rating_mat, mask * score)
    return rating_mat


for scene_name in ratings.scene_name.unique():
    print (scene_name)
    # select the current picture
    scene_ratings = ratings[ratings.scene_name == scene_name]

    rating_mat_coarse = _build_rating_matrix(
        scene_ratings[scene_ratings.survey_type == 'coarse'], coarse_masks)
    rating_mat_fine = _build_rating_matrix(
        scene_ratings[scene_ratings.survey_type == 'fine'], fine_masks)

    # layer 0 = coarse, layer 1 = fine
    rating_mat_all = np.dstack((rating_mat_coarse, rating_mat_fine))
    sio.savemat('../data/rating_matrices/'+scene_name+'.mat',{'rating_matrix':rating_mat_all}, do_compression=True)

after_IMG_0213
after_IMG_0626
after_IMG_1333
after_IMG_1581
after_IMG_2350
after_IMG_3124
after_IMG_4101
after_IMG_4122
after_IMG_7096
after_dsc07169
after_img_025
after_img_089
after_img_094
after_img_095
after_img_096
after_img_100
after_img_107
after_img_111
after_img_114
after_img_125
after_sun_aafqetztbkmzxsif
after_sun_aalbxgjwijppgzjr
after_sun_aawvoklsoluipehz
after_sun_abzaicgobhjswlxg
after_sun_aegixgtgmmvqheiw-0
after_sun_ahvszpwjtodgwusx
after_sun_ajgebwheenyurlit
after_sun_ajzsgfiolbbxlgxc
after_sun_apueeecjfdprzyjl
after_sun_asrhohyqzajswxsa
after_sun_atyasoydrjavyfux-0
after_sun_aweoztuheglqxpoe-0
after_sun_awkhwzafhbdrlgkt
after_sun_awqxiwmqccwmgejq
after_sun_axarcshsnqsefemx
after_sun_axftyltjljtworuj
