In [51]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import seaborn as sns
import random

### Clean the data for all the Ego sessions separately from all the stereotype sessions. I start with ego sessions.

In [52]:
# Ego treatment: one wide-format CSV export per session (sessions 1-4).
ego_files = [
    "../Data/session1_ego.csv",
    "../Data/session2_ego.csv",
    "../Data/session3_ego.csv",
    "../Data/session4_ego.csv",
]
session1, session2, session3, session4 = [pd.read_csv(path) for path in ego_files]

# Stack the ego sessions row-wise (column order untouched), tag the treatment,
# then move the per-session row labels out of the index into a regular column.
ego_wide = (
    pd.concat([session1, session2, session3, session4], axis=0, sort=False)
    .assign(treatment='ego')
    .reset_index()
)

# Stereotype treatment: one wide-format CSV export per session (sessions 5-8).
stereo_files = [
    "../Data/session5_stereo.csv",
    "../Data/session6_stereo.csv",
    "../Data/session7_stereo.csv",
    "../Data/session8_stereo.csv",
]
session5, session6, session7, session8 = [pd.read_csv(path) for path in stereo_files]

# Stack the stereotype sessions and tag the treatment.
# Unlike ego_wide, the index is NOT reset here, so row labels repeat across sessions.
stereo_wide = pd.concat([session5, session6, session7, session8], axis=0, sort=False).assign(
    treatment='stereotype'
)



  ego_wide['treatment'] = 'ego'
  ego_wide.reset_index(inplace=True)
  stereo_wide['treatment'] = 'stereotype'


In [53]:
# Column lists used to split ego_wide into one frame per group of variables.
# The match is a SUBSTRING test ("in"), not a prefix test. Consequences:
#   * 'Signals' also matches every 'SignalsOther...' column,
#   * 'session' also matches 'participant.id_in_session' and anything containing 'subsession'.
# Later cells rely on this (ego_session is indexed with 'participant.id_in_session'),
# so the substring test is kept on purpose.

# quiz columns, plus the participant code and the treatment label
quiz_cols = [col for col in ego_wide.columns if 'Quizzes' in col] + ['participant.code', 'treatment']

# participant-level columns. 'participant.code' already contains 'participant', so appending it
# again would produce a duplicated column label; dict.fromkeys drops the repeat and keeps order.
participant_cols = list(dict.fromkeys(
    [col for col in ego_wide.columns if 'participant' in col] + ['participant.code']
))

# session-level columns (see the substring caveat above), plus the participant code
session_cols = [col for col in ego_wide.columns if 'session' in col] + ['participant.code']

# signal-stage columns (includes the 'SignalsOther' ones), plus the participant code
signal_cols = [col for col in ego_wide.columns if 'Signals' in col] + ['participant.code']

# columns of the 'SignalsOther' app only, plus the participant code
signal_other_cols = [col for col in ego_wide.columns if 'SignalsOther' in col] + ['participant.code']

# questionnaire columns, plus the participant code
questionnaire_cols = [col for col in ego_wide.columns if 'Questionnaire' in col] + ['participant.code']


In [54]:
# Slice ego_wide into six frames, one per column list built in the previous cell.
ego_quiz = ego_wide.loc[:, quiz_cols]
ego_participant = ego_wide.loc[:, participant_cols]
ego_session = ego_wide.loc[:, session_cols]
ego_signal = ego_wide.loc[:, signal_cols]
# this has no information for the ego treatment
ego_signal_other = ego_wide.loc[:, signal_other_cols]
ego_questionnaire = ego_wide.loc[:, questionnaire_cols]

## Quiz Scores
First I will clean the data for the first part of the experiment to get only the true scores.

In [55]:
# Strip the 'Quizzes.' and 'player.' fragments so quiz columns read '<round>.<variable>'.
ego_quiz = ego_quiz.rename(columns=lambda col: col.replace('Quizzes.', '').replace('player.', ''))
# Drop the group- and subsession-level columns; only player-level variables are kept.
ego_quiz = ego_quiz[[col for col in ego_quiz.columns if 'group.' not in col and 'subsession.' not in col]]
# Wide -> long. 'treatment' is a per-participant constant, not a quiz measurement, so it is
# carried as an id variable. Melting it as a value used to create a bogus round called
# 'treatment' (and a NaN-named column after the pivot) for every participant.
ego_quiz_long = pd.melt(ego_quiz, id_vars=['participant.code', 'treatment'])
# '<round>.<variable>' -> two columns: round_number (still text here) and variable_name.
ego_quiz_long[['round_number', 'variable_name']] = ego_quiz_long['variable'].str.split('.', expand=True)
# The combined name is no longer needed.
ego_quiz_long = ego_quiz_long.drop('variable', axis=1)
# Long -> wide again: one row per (participant, round), one column per quiz variable.
ego_quiz_wide = ego_quiz_long.pivot(index=['participant.code', 'round_number'], columns=['variable_name'], values='value')
# Keep only topic and score, with participant code and round number as ordinary columns.
ego_scores = ego_quiz_wide[['topic', 'score']].reset_index()

### Participant level variables

In [56]:
# Remove the 'participant.' fragment from every column name of the participant-level table.
ego_participant.columns = ego_participant.columns.str.replace('participant.', '', regex=False)


## The variables from the main part of the experiment
This table has all the effort choices and signal realizations round by round. It also has the participants' beliefs about their score and the buttons clicked for the type matrices.

In [57]:
# The 'Signals' substring match also picked up the 'SignalsOther.' columns; discard them.
own_signal_cols = [name for name in ego_signal.columns if 'SignalsOther.' not in name]
ego_signal = ego_signal[own_signal_cols]

# Strip, in this order, the 'Signals.', 'participant.' and 'player.' fragments so that
# columns read '<round>.<variable>' and the id column is simply 'code'.
for fragment in ('Signals.', 'participant.', 'player.'):
    ego_signal.columns = [name.replace(fragment, '') for name in ego_signal.columns]

# Keep only player-level variables (no group- or subsession-level columns).
player_level_cols = [
    name for name in ego_signal.columns
    if not ('group.' in name or '.subsession' in name)
]
ego_signal = ego_signal[player_level_cols]

# Wide -> long with the participant code as the only id column.
ego_signal_long = ego_signal.melt(id_vars='code')

In [58]:
# Break '<round>.<variable>' into its two parts: the round and the variable name.
round_and_name = ego_signal_long['variable'].str.split('.', expand=True)
ego_signal_long[['round_number', 'variable_name']] = round_and_name
# The combined name is redundant once it has been split.
ego_signal_long = ego_signal_long.drop(columns='variable')

# Long -> wide: one row per (code, round), one column per variable,
# with code and round_number turned back into ordinary columns.
ego_signal_wide = ego_signal_long.pivot(
    index=['code', 'round_number'],
    columns=['variable_name'],
    values='value',
).reset_index()

In [59]:
# Rows that carry a science belief (>= 0) are taken as the rows holding the belief elicitation;
# from those keep the participant code plus the belief and certainty for each of the six topics.
has_beliefs = ego_signal_wide['science_belief'] >= 0
belief_columns = ['science_belief', 'us_belief', 'math_belief', 'verbal_belief', 'pop_belief', 'sports_belief']
certainty_columns = ['science_certainty', 'us_certainty', 'math_certainty', 'verbal_certainty', 'pop_certainty', 'sports_certainty']
beliefs = ego_signal_wide.loc[has_beliefs, ['code'] + belief_columns + certainty_columns]

In [60]:
# Round numbers above 11 (12..66) are folded onto 1..11: 12 -> 1, 13 -> 2, ..., 22 -> 11,
# 23 -> 1, ... 66 -> 11. Values 1..11 and anything above 66 are left as they are.
# (presumably 6 topics x 11 rounds each; confirm against the experiment design)
ROUNDS_PER_TOPIC = 11
LAST_ROUND = 66

# The pivot left round_number as text; convert it before doing arithmetic on it.
ego_signal_wide['round_number'] = ego_signal_wide['round_number'].astype(int)

# Assign the result back instead of calling .replace(..., inplace=True) on the selected column:
# the chained in-place form emits a FutureWarning on pandas 2.x and leaves the frame
# unchanged when Copy-on-Write is enabled.
round_fold = {
    overall: (overall - 1) % ROUNDS_PER_TOPIC + 1
    for overall in range(ROUNDS_PER_TOPIC + 1, LAST_ROUND + 1)
}
ego_signal_wide['round_number'] = ego_signal_wide['round_number'].replace(round_fold)


### Split the signal stage by topic. 
For all participants, they got the same exogenous rate and the same order of signals for each topic.
I will then merge the topics that have the same exchange rate and the same order of signals.

There are nine combinations of type and rate that each participant could have had in each topic. Those nine pairs are the ones I am interested in.

In [61]:
def _ego_topic_updates(topic, prefix):
    """Build the round-by-round table of one quiz topic for the ego treatment.

    Parameters
    ----------
    topic : str
        Value of the 'topic' column in ego_signal_wide (e.g. 'Math').
    prefix : str
        Short topic name used in the column names '<prefix>_score',
        '<prefix>_belief' and '<prefix>_certainty' (e.g. 'math').

    Returns
    -------
    pandas.DataFrame
        The rows of ego_signal_wide with effort >= 0 for that topic, left-merged on 'code'
        with the topic's belief and certainty from `beliefs`, plus a string column 'type':
        '0' if score < 6, '1' if 6 <= score <= 15, '2' if score > 15 (left missing when
        the score is missing).
    """
    score_col = f'{prefix}_score'
    played = (ego_signal_wide['effort'] >= 0) & (ego_signal_wide['topic'] == topic)
    table = ego_signal_wide.loc[
        played,
        ['code', 'round_number', 'effort', 'fails', 'last_button', score_col, 'topic', 'signal'],
    ]
    # beliefs are stored on separate rows, so they are brought in per participant
    table = table.merge(
        beliefs[['code', f'{prefix}_belief', f'{prefix}_certainty']], on='code', how='left'
    )
    # type by score band; the last assignment overrides '1' for scores below 6
    table.loc[table[score_col] > 15, 'type'] = '2'
    table.loc[table[score_col] <= 15, 'type'] = '1'
    table.loc[table[score_col] < 6, 'type'] = '0'
    return table


# one table per topic, all built the same way
ego_science = _ego_topic_updates('Science and Technology', 'science')
ego_math = _ego_topic_updates('Math', 'math')
ego_verbal = _ego_topic_updates('Verbal', 'verbal')
ego_pop = _ego_topic_updates('Pop-Culture and Art', 'pop')
ego_sports = _ego_topic_updates('Sports and Video Games', 'sports')
ego_us = _ego_topic_updates('US Geography', 'us')

## Get the rate values for each topic
The rate values are saved at the session level and are the same across all rounds because the seed was set at the beginning

In [62]:
# Narrow ego_session to the identifiers and the per-topic rate columns (w_*) used below.
session_id_cols = ['participant.id_in_session', 'session.code']
session_rate_cols = [
    'session.w_verbal',
    'session.w_math',
    'session.w_pop',
    'session.w_science',
    'session.w_sports',
    'session.w_us',
]
ego_session = ego_session.loc[:, session_id_cols + session_rate_cols]


In [63]:
# Strip the 'participant.' and then the 'session.' fragment from the ego_session column names.
for fragment in ('participant.', 'session.'):
    ego_session.columns = [name.replace(fragment, '') for name in ego_session.columns]

# Give every topic table a constant 'rate' column taken from the row labelled 0 of ego_session.
# NOTE(review): only that single row is read, so this assumes every ego session shares the
# same w_* values; confirm against the session configuration.
for topic_table, rate_column in (
    (ego_science, 'w_science'),
    (ego_math, 'w_math'),
    (ego_verbal, 'w_verbal'),
    (ego_pop, 'w_pop'),
    (ego_sports, 'w_sports'),
    (ego_us, 'w_us'),
):
    topic_table['rate'] = ego_session[rate_column].loc[0]

## name homogenization
Make all the topic tables have the same column names so that they can be merged into one table.

In [64]:
# Remove each table's topic fragment ('science_', 'math_', ...) from its column names so
# that all six tables share the names 'score', 'belief' and 'certainty'.
for topic_table, fragment in (
    (ego_science, 'science_'),
    (ego_math, 'math_'),
    (ego_verbal, 'verbal_'),
    (ego_pop, 'pop_'),
    (ego_sports, 'sports_'),
    (ego_us, 'us_'),
):
    topic_table.columns = [name.replace(fragment, '') for name in topic_table.columns]

# Stack the six topic tables into ego_updates with a fresh 0..n-1 row index.
ego_updates = pd.concat(
    [ego_science, ego_math, ego_verbal, ego_pop, ego_sports, ego_us],
    ignore_index=True,
)


## separate into 9 types:
low type, low rate

low type, medium rate

low type, high rate

medium type, low rate

medium type, medium rate

medium type, high rate

high type, low rate

high type, medium rate

high type, high rate

In [65]:
# Label every row with its treatment.
ego_updates['treatment'] = 'ego'

# 'type' was stored as the strings '0'/'1'/'2'; make it numeric so it can be compared.
ego_updates['type'] = ego_updates['type'].astype(int)

# 0/1 indicators comparing the true type with the stated belief:
#   overconfident  -> belief above the true type
#   underconfident -> belief below the true type
#   correct        -> belief equal to the true type
true_type = ego_updates['type']
stated_belief = ego_updates['belief']
ego_updates['overconfident'] = (true_type < stated_belief).astype('int64')
ego_updates['underconfident'] = (true_type > stated_belief).astype('int64')
ego_updates['correct'] = (true_type == stated_belief).astype('int64')

# One table per (type, rate) pair; in the suffix the first letter is the type and the
# second the rate, with l = 0, m = 1, h = 2.
# NOTE(review): the stereotype cell further down assigns the same updates_* names.
type_low, type_mid, type_high = (ego_updates['type'] == level for level in (0, 1, 2))
rate_low, rate_mid, rate_high = (ego_updates['rate'] == level for level in (0, 1, 2))

# low types
updates_ll = ego_updates.loc[type_low & rate_low, :]
updates_lm = ego_updates.loc[type_low & rate_mid, :]
updates_lh = ego_updates.loc[type_low & rate_high, :]

# mid types
updates_ml = ego_updates.loc[type_mid & rate_low, :]
updates_mm = ego_updates.loc[type_mid & rate_mid, :]
updates_mh = ego_updates.loc[type_mid & rate_high, :]

# high types
updates_hl = ego_updates.loc[type_high & rate_low, :]
updates_hm = ego_updates.loc[type_high & rate_mid, :]
updates_hh = ego_updates.loc[type_high & rate_high, :]

In [66]:
# Collapse the three 0/1 indicators into one text label per row
# (rows where no indicator is 1 keep a missing label).
for flag_column, label in (
    ('overconfident', 'over'),
    ('underconfident', 'under'),
    ('correct', 'correct'),
):
    ego_updates.loc[ego_updates[flag_column] == 1, 'misspecification'] = label
ego_updates

Unnamed: 0,code,round_number,effort,fails,last_button,score,topic,signal,belief,certainty,type,rate,treatment,overconfident,underconfident,correct,misspecification
0,0m2xzxgv,1,1,9,2,9.0,Science and Technology,1,0,75,1,0,ego,0,1,0,under
1,0m2xzxgv,2,1,7,1,9.0,Science and Technology,3,0,75,1,0,ego,0,1,0,under
2,0m2xzxgv,3,2,10,2,9.0,Science and Technology,0,0,75,1,0,ego,0,1,0,under
3,0m2xzxgv,4,0,7,0,9.0,Science and Technology,3,0,75,1,0,ego,0,1,0,under
4,0m2xzxgv,5,0,8,1,9.0,Science and Technology,2,0,75,1,0,ego,0,1,0,under
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2965,zap4p51l,7,1,9,0,5.0,US Geography,1,0,75,0,1,ego,0,0,1,correct
2966,zap4p51l,8,0,9,0,5.0,US Geography,1,0,75,0,1,ego,0,0,1,correct
2967,zap4p51l,9,2,10,0,5.0,US Geography,0,0,75,0,1,ego,0,0,1,correct
2968,zap4p51l,10,1,5,0,5.0,US Geography,5,0,75,0,1,ego,0,0,1,correct


# The Stereotypes treatment
Do the same for all tables, but use SignalsOther instead of Signals. Then merge the two tables together with the column that indicates the treatment.

In [67]:
# Column lists used to split stereo_wide into one frame per group of variables.
# The match is a SUBSTRING test ("in"), not a prefix test. Consequences:
#   * 'Signals' also matches every 'SignalsOther...' column,
#   * 'session' also matches 'participant.id_in_session' and anything containing 'subsession'.
# Later code relies on this (stereo_session is indexed with 'participant.id_in_session'),
# so the substring test is kept on purpose.

# quiz columns, plus the participant code and the treatment label
quiz_cols = [col for col in stereo_wide.columns if 'Quizzes' in col] + ['participant.code', 'treatment']

# participant-level columns. 'participant.code' already contains 'participant', so appending it
# again would produce a duplicated column label; dict.fromkeys drops the repeat and keeps order.
participant_cols = list(dict.fromkeys(
    [col for col in stereo_wide.columns if 'participant' in col] + ['participant.code']
))

# session-level columns (see the substring caveat above), plus the participant code
session_cols = [col for col in stereo_wide.columns if 'session' in col] + ['participant.code']

# signal-stage columns (includes the 'SignalsOther' ones), plus the participant code
signal_cols = [col for col in stereo_wide.columns if 'Signals' in col] + ['participant.code']

# columns of the 'SignalsOther' app only, plus the participant code
signal_other_cols = [col for col in stereo_wide.columns if 'SignalsOther' in col] + ['participant.code']

# questionnaire columns, plus the participant code
questionnaire_cols = [col for col in stereo_wide.columns if 'Questionnaire' in col] + ['participant.code']

# split the data frame stereo_wide into 6 data frames using the columns we just selected
stereo_quiz = stereo_wide[quiz_cols]
stereo_participant = stereo_wide[participant_cols]
stereo_session = stereo_wide[session_cols]
stereo_signal = stereo_wide[signal_cols] # this has no information for the stereotype treatment
stereo_signal_other = stereo_wide[signal_other_cols]
stereo_questionnaire = stereo_wide[questionnaire_cols]

# Strip the 'Quizzes.' and 'player.' fragments so quiz columns read '<round>.<variable>'.
stereo_quiz = stereo_quiz.rename(columns=lambda col: col.replace('Quizzes.', '').replace('player.', ''))

# Drop the group- and subsession-level columns; only player-level variables are kept.
stereo_quiz = stereo_quiz[[col for col in stereo_quiz.columns if 'group.' not in col and 'subsession.' not in col]]
# Wide -> long. 'treatment' is a per-participant constant, not a quiz measurement, so it is
# carried as an id variable. Melting it as a value used to create a bogus round called
# 'treatment' (and a NaN-named column after the pivot) for every participant.
stereo_quiz_long = pd.melt(stereo_quiz, id_vars=['participant.code', 'treatment'])
# '<round>.<variable>' -> two columns: round_number (still text here) and variable_name.
stereo_quiz_long[['round_number', 'variable_name']] = stereo_quiz_long['variable'].str.split('.', expand=True)
# The combined name is no longer needed.
stereo_quiz_long = stereo_quiz_long.drop('variable', axis=1)
# Long -> wide again: one row per (participant, round), one column per quiz variable.
stereo_quiz_wide = stereo_quiz_long.pivot(index=['participant.code', 'round_number'], columns=['variable_name'], values='value')
# Keep only topic and score, with participant code and round number as ordinary columns.
stereo_scores = stereo_quiz_wide[['topic', 'score']].reset_index()

# Remove the 'participant.' fragment from the column names of the participant-level table.
stereo_participant.columns = [col.replace('participant.', '') for col in stereo_participant.columns]

### Main part of the experiment for the sterotype treatment

# Strip, in this order, the 'SignalsOther.', 'participant.' and 'player.' fragments so that
# columns read '<round>.<variable>' and the id column is simply 'code'.
for fragment in ('SignalsOther.', 'participant.', 'player.'):
    stereo_signal_other.columns = [name.replace(fragment, '') for name in stereo_signal_other.columns]

# Keep only player-level variables (no group- or subsession-level columns).
player_level_cols = [
    name for name in stereo_signal_other.columns
    if not ('group.' in name or '.subsession' in name)
]
stereo_signal_other = stereo_signal_other[player_level_cols]

# Wide -> long with the participant code as the only id column.
stereo_signal_other_long = stereo_signal_other.melt(id_vars='code')

# Break '<round>.<variable>' into its two parts, then discard the combined name.
round_and_name = stereo_signal_other_long['variable'].str.split('.', expand=True)
stereo_signal_other_long[['round_number', 'variable_name']] = round_and_name
stereo_signal_other_long = stereo_signal_other_long.drop(columns='variable')

# Long -> wide: one row per (code, round), one column per variable,
# with code and round_number turned back into ordinary columns.
stereo_signal_other_wide = stereo_signal_other_long.pivot(
    index=['code', 'round_number'],
    columns=['variable_name'],
    values='value',
).reset_index()

# Rows that carry a science belief about the other participant (>= 0): keep the code plus
# the belief and certainty columns for each of the six topics.
has_beliefs = stereo_signal_other_wide['science_belief_other'] >= 0
belief_other_columns = ['science_belief_other', 'us_belief_other', 'math_belief_other', 'verbal_belief_other',
                        'pop_belief_other', 'sports_belief_other']
certainty_other_columns = ['science_certainty_other', 'us_certainty_other', 'math_certainty_other', 'verbal_certainty_other',
                           'pop_certainty_other', 'sports_certainty_other']
beliefs_other = stereo_signal_other_wide.loc[has_beliefs, ['code'] + belief_other_columns + certainty_other_columns]

# Round numbers above 11 (12..66) are folded onto 1..11: 12 -> 1, 13 -> 2, ..., 22 -> 11,
# 23 -> 1, ... 66 -> 11. Values 1..11 and anything above 66 are left as they are.
# (presumably 6 topics x 11 rounds each; confirm against the experiment design)
ROUNDS_PER_TOPIC = 11
LAST_ROUND = 66

# The pivot left round_number as text; convert it before doing arithmetic on it.
stereo_signal_other_wide['round_number'] = stereo_signal_other_wide['round_number'].astype(int)

# Assign the result back instead of calling .replace(..., inplace=True) on the selected column:
# the chained in-place form emits a FutureWarning on pandas 2.x and leaves the frame
# unchanged when Copy-on-Write is enabled.
round_fold = {
    overall: (overall - 1) % ROUNDS_PER_TOPIC + 1
    for overall in range(ROUNDS_PER_TOPIC + 1, LAST_ROUND + 1)
}
stereo_signal_other_wide['round_number'] = stereo_signal_other_wide['round_number'].replace(round_fold)

# split by topic

def _stereo_topic_updates(topic, prefix):
    """Build the round-by-round table of one quiz topic for the stereotype treatment.

    Parameters
    ----------
    topic : str
        Value of the 'topic' column in stereo_signal_other_wide (e.g. 'Math').
    prefix : str
        Short topic name used in the column names '<prefix>_other',
        '<prefix>_belief_other' and '<prefix>_certainty_other' (e.g. 'math').

    Returns
    -------
    pandas.DataFrame
        The rows of stereo_signal_other_wide with effort >= 0 for that topic, left-merged on
        'code' with the topic's belief and certainty from `beliefs_other`, plus a string
        column 'type': '0' if the '<prefix>_other' score < 6, '1' if 6 <= score <= 15,
        '2' if score > 15 (left missing when the score is missing).
    """
    score_col = f'{prefix}_other'
    played = (stereo_signal_other_wide['effort'] >= 0) & (stereo_signal_other_wide['topic'] == topic)
    table = stereo_signal_other_wide.loc[
        played,
        ['code', 'round_number', 'effort', 'fails', 'last_button', score_col, 'topic', 'signal'],
    ]
    # beliefs are stored on separate rows, so they are brought in per participant
    table = table.merge(
        beliefs_other[['code', f'{prefix}_belief_other', f'{prefix}_certainty_other']],
        on='code', how='left',
    )
    # type by score band; the last assignment overrides '1' for scores below 6
    table.loc[table[score_col] > 15, 'type'] = '2'
    table.loc[table[score_col] <= 15, 'type'] = '1'
    table.loc[table[score_col] < 6, 'type'] = '0'
    return table


# one table per topic, all built the same way
st_science = _stereo_topic_updates('Science and Technology', 'science')
st_math = _stereo_topic_updates('Math', 'math')
st_verbal = _stereo_topic_updates('Verbal', 'verbal')
st_pop = _stereo_topic_updates('Pop-Culture and Art', 'pop')
st_sports = _stereo_topic_updates('Sports and Video Games', 'sports')
st_us = _stereo_topic_updates('US Geography', 'us')

# Narrow stereo_session to the identifiers and the per-topic rate columns (w_*) used below.
session_id_cols = ['participant.id_in_session', 'session.code']
session_rate_cols = [
    'session.w_verbal',
    'session.w_math',
    'session.w_pop',
    'session.w_science',
    'session.w_sports',
    'session.w_us',
]
stereo_session = stereo_session.loc[:, session_id_cols + session_rate_cols]


In [68]:
# Display-only check: show the 'Science and Technology' rows of stereo_signal_other_wide
# whose round_number is below 12. Nothing is assigned; no filter on 'effort' is applied here.
stereo_signal_other_wide.loc[(stereo_signal_other_wide['round_number']<12) & (stereo_signal_other_wide['topic']=='Science and Technology'), :]

variable_name,code,round_number,effort,fails,gender_other,high_button,id_in_group,last_button,low_button,math_belief_other,...,us_belief_other,us_belief_self,us_certainty_other,us_certainty_self,us_other,verbal_belief_other,verbal_belief_self,verbal_certainty_other,verbal_certainty_self,verbal_other
0,0kxsrg73,1,0,8,Female,2,4,2,2,2,...,1,0,50,75,11,1,0,75,75,4
1,0kxsrg73,10,0,8,,0,4,0,1,,...,,,,,,,,,,
2,0kxsrg73,11,0,6,,1,4,2,0,,...,,,,,,,,,,
11,0kxsrg73,2,0,4,,0,4,0,1,,...,,,,,,,,,,
22,0kxsrg73,3,2,10,,1,4,0,1,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,zwzznpo6,7,0,3,,1,7,2,0,,...,,,,,,,,,,
2122,zwzznpo6,8,0,5,,1,7,2,0,,...,,,,,,,,,,
2124,zwzznpo6,9,0,6,,1,7,2,0,,...,,,,,,,,,,
2125,zwzznpo6,10,1,7,,0,7,1,0,,...,,,,,,,,,,


In [69]:
# Strip the 'participant.' and then the 'session.' fragment from the stereo_session column names.
for fragment in ('participant.', 'session.'):
    stereo_session.columns = [name.replace(fragment, '') for name in stereo_session.columns]

# Give every topic table a constant 'rate' column: the first distinct value found in the
# matching w_* column of stereo_session.
# NOTE(review): any further distinct values are ignored, so this assumes every stereotype
# session shares the same w_* values; confirm against the session configuration.
for topic_table, rate_column in (
    (st_science, 'w_science'),
    (st_math, 'w_math'),
    (st_verbal, 'w_verbal'),
    (st_pop, 'w_pop'),
    (st_sports, 'w_sports'),
    (st_us, 'w_us'),
):
    first_rate = stereo_session[rate_column].unique()[0]
    topic_table['rate'] = first_rate

In [70]:
# Give the six topic tables common column names so they can be stacked: the '<topic>_other'
# score becomes 'score', and the topic's belief / certainty columns become 'belief' / 'certainty'.
st_science = st_science.rename(columns={
    'science_other': 'score',
    'science_belief_other': 'belief',
    'science_certainty_other': 'certainty',
})
st_math = st_math.rename(columns={
    'math_other': 'score',
    'math_belief_other': 'belief',
    'math_certainty_other': 'certainty',
})
st_verbal = st_verbal.rename(columns={
    'verbal_other': 'score',
    'verbal_belief_other': 'belief',
    'verbal_certainty_other': 'certainty',
})
st_pop = st_pop.rename(columns={
    'pop_other': 'score',
    'pop_belief_other': 'belief',
    'pop_certainty_other': 'certainty',
})
st_sports = st_sports.rename(columns={
    'sports_other': 'score',
    'sports_belief_other': 'belief',
    'sports_certainty_other': 'certainty',
})
st_us = st_us.rename(columns={
    'us_other': 'score',
    'us_belief_other': 'belief',
    'us_certainty_other': 'certainty',
})

# Stack the six topic tables into st_updates with a fresh 0..n-1 row index.
st_updates = pd.concat(
    [st_science, st_math, st_verbal, st_pop, st_sports, st_us],
    ignore_index=True,
)

In [71]:
# Label every row with its treatment.
st_updates['treatment'] = 'stereotype'

# 'type' was stored as the strings '0'/'1'/'2'; make it numeric so it can be compared.
st_updates['type'] = st_updates['type'].astype(int)

# 0/1 indicators comparing the true type with the stated belief:
#   overconfident  -> belief above the true type
#   underconfident -> belief below the true type
#   correct        -> belief equal to the true type
true_type = st_updates['type']
stated_belief = st_updates['belief']
st_updates['overconfident'] = (true_type < stated_belief).astype('int64')
st_updates['underconfident'] = (true_type > stated_belief).astype('int64')
st_updates['correct'] = (true_type == stated_belief).astype('int64')

# One table per (type, rate) pair; in the suffix the first letter is the type and the
# second the rate, with l = 0, m = 1, h = 2.
# NOTE(review): these updates_* names are the same ones the ego cell assigned earlier,
# so from here on they hold the stereotype rows.
type_low, type_mid, type_high = (st_updates['type'] == level for level in (0, 1, 2))
rate_low, rate_mid, rate_high = (st_updates['rate'] == level for level in (0, 1, 2))

# low types
updates_ll = st_updates.loc[type_low & rate_low, :]
updates_lm = st_updates.loc[type_low & rate_mid, :]
updates_lh = st_updates.loc[type_low & rate_high, :]

# mid types
updates_ml = st_updates.loc[type_mid & rate_low, :]
updates_mm = st_updates.loc[type_mid & rate_mid, :]
updates_mh = st_updates.loc[type_mid & rate_high, :]

# high types
updates_hl = st_updates.loc[type_high & rate_low, :]
updates_hm = st_updates.loc[type_high & rate_mid, :]
updates_hh = st_updates.loc[type_high & rate_high, :]

In [72]:
# Collapse the three 0/1 indicator columns into one text label per row.
# Rows where none of the indicators equals 1 are left without a label.
for _flag, _label in (
    ('overconfident', 'over'),
    ('underconfident', 'under'),
    ('correct', 'correct'),
):
    st_updates.loc[st_updates[_flag] == 1, 'misspecification'] = _label


In [73]:
# Stack the ego and stereotype observations into a single table called updates.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of ego_updates and st_updates are carried over and repeat in the
# stacked table, so label-based lookups on updates could return several rows.
updates = pd.concat([ego_updates, st_updates], ignore_index=True)

In [74]:
# Participant-level fields pulled from each treatment's wide table.
_characteristic_fields = ['participant.code', 'participant.gender', 'participant.nationality']

characteristics_ego = ego_wide[_characteristic_fields]
characteristics_stereo = stereo_wide[_characteristic_fields]

# Stack both treatments and drop the 'participant.' prefix from the column
# names, leaving 'code', 'gender' and 'nationality'.
characteristics = pd.concat(
    [characteristics_ego, characteristics_stereo], axis=0, sort=False
).rename(columns=lambda name: name.replace('participant.', ''))

In [75]:
# Attach the participant characteristics to every row of updates, matching on
# 'code'. how='left' keeps every row of updates, including rows whose code has
# no match in characteristics.
# validate='many_to_one' makes pandas raise a MergeError if a code appears more
# than once in characteristics; without the check such a repeat would silently
# duplicate the matching rows of updates.
# NOTE(review): this cell overwrites updates with its own merged result, so
# running it a second time without re-running the cell that builds updates
# merges the characteristics columns in again - confirm it is only run once.
updates = updates.merge(characteristics, on='code', how='left', validate='many_to_one')

In [76]:
# Numeric version of the misspecification label: 1 for 'over', -1 for 'under'
# and 0 for every other row.
# NOTE(review): "every other row" includes rows whose label is missing, so they
# get the same code as 'correct' - confirm that is intended.
_label = updates['misspecification']
updates['misspecification_num'] = (_label == 'over').astype('int64') - (_label == 'under').astype('int64')

# Write the combined table and the two per-treatment tables to csv files,
# leaving the row index out of the files.
for _frame, _path in (
    (updates, '../Clean/updates.csv'),
    (ego_updates, '../Clean/ego_updates.csv'),
    (st_updates, '../Clean/stereo_updates.csv'),
):
    _frame.to_csv(_path, index=False)

In [77]:
# Distinct participant codes present in the stereotype table, in order of first appearance.
pd.unique(st_updates['code'])

array(['0kxsrg73', '1zb27tea', '36b7nxpg', '3cv0gsso', '4d7lvql9',
       '4kecnisx', '56op1uib', '6sybrxjn', '6zd6yym2', '78sdphga',
       'a22g0du7', 'byf6tjt0', 'cioxyqvq', 'cl0818oa', 'dk93ewke',
       'ffd2bxk4', 'g2p53iqv', 'idzf7yro', 'ilooy3i8', 'j3r6tuv8',
       'jk70axat', 'jka2nr9v', 'k43107gm', 'ko3jhnbj', 'lmiiifw0',
       'n86dcmzu', 'ncvtgyzn', 'oqf1d07l', 'q9jar1of', 'uz5m20q1',
       'v3yunf9n', 'wursesku', 'zwzznpo6'], dtype=object)

In [78]:
# Number of distinct participant codes among the stereotype rows of the merged table.
_stereo_codes = updates.loc[updates['treatment'] == 'stereotype', 'code']
len(_stereo_codes.unique())

33