In [1]:
# Render a "Show Code?" button that toggles the visibility of the notebook's
# code-input cells (elements matching 'div.input') in the rendered page.
# The embedded script uses jQuery's `$`, so jQuery must be available in the page.
print('Choose whether to display code or not.')
from IPython.display import HTML

# code_toggle() runs once when the document is ready: code_show starts as true, so the
# inputs are shown and the flag flips to false; the first button click therefore hides them.
# The HTML object is the cell's last expression, so it is displayed as the cell output.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').show();
 } else {
 $('div.input').hide();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Show Code?"></form>''')

Choose whether to display code or not.


In [2]:
# Print the version of the Python interpreter running this notebook.
from platform import python_version

print(python_version())

3.6.8


In [3]:
#load packages
import os
from tabulate import tabulate  # Used to display text-based tables.
import pandas as pd
import numpy as np
import math
from matplotlib.pylab import plt
from scipy.stats import chi2
from scipy.stats import chi2_contingency
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from statsmodels.discrete.discrete_model import MNLogit
from sklearn import metrics
import statsmodels.api as sm
from tabulate import tabulate
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# The important Kallysto imports.
from kallysto.publication import Publication
from kallysto.export import Export
from kallysto.formatter import Latex, Markdown
import kallysto.markdown

#williams correction from same folder
from williams_correction import williams_correction

In [4]:
#set working directory to top level project folder
# NOTE(review): '..' is resolved against the *current* working directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
os.chdir('..')
# Display the resulting working directory as the cell output.
os.getcwd()

'/Users/mollyq/Documents/GitHub/Control_and_Valence_in_Unexpected_Events'

In [5]:
# Create a link between this notebook and the target publication ('control_report').
# NOTE(review): the working directory was changed with os.chdir('..') in an earlier cell;
# confirm whether Kallysto resolves pub_path against the notebook or the current directory.
latex_report = Publication(
    notebook='4_Experiment1_Analysis.ipynb',  # Current notebook name
    title='control_report',     # Publication name
    pub_path='3_output/',         # Location of publication project relative to this notebook.
    formatter=Latex,          # Default publication format; Markdown can also be used.
    
    overwrite=True,           # Overwrite any existing exports if they exist.
    fresh_start=False          # True: Delete log and include files if they exist.
)

# Methods

## Design
The study design is a between-subjects design with two binary categorical independent variables, Valence (Positive or Negative) and Means Availability (Present or Absent). The two dependent variables are proportions of outcomes that fall into a specific category: Valence (proportion negative) and Controllability (proportion uncontrollable).

## Participants
An a priori power analysis was conducted to determine the necessary sample size for a binomial logistic regression with two dichotomous predictor variables. A small effect size of OR = 2.04 was found for the valence predictor in previous research (Chen, Cohen, & Chen, 2010; Quinn & Keane, 2020), and a very small effect size (OR = 1.17) was found for the interaction effect in the pilot test of this study; as such, we estimate a small effect size (OR = 2.04) to maximize the likelihood of correctly rejecting the null hypothesis for the main effects. Following the guidelines set out by Verma and Verma (2020), G*Power was used to determine that a sample size of N = 218 would be sufficient to observe an effect of this size at an alpha of 0.05 and power of 0.80 for a one-tailed test with a binomial distribution in which the probability of a negative outcome is 0.56 when the predictor variable of valence is negative and when the predictor variables of valence and means availability are not correlated. For ease, 220 participants were recruited through the Prolific.co crowdsourcing platform. One participant did not complete the study and was not included, leaving 219 participants.  

Incomplete or incorrect responses were assessed upon receipt of the participants' survey answers, and any excluded participant was replaced by another participant before analyses were conducted.


In [6]:
# Load the raw main-study data (path is relative to the current working directory).
raw_data = pd.read_csv('2_pipeline/2_main_study/master_raw_data.csv')
# Show the available column names.
raw_data.columns

Index(['Unnamed: 0', 'session_id', 'participant_id', 'status',
       'started_datetime', 'completed_date_time', 'time_taken', 'age',
       'reviewed_at_datetime', 'entered_code', 'Nationality', 'Sex',
       'Response ID', 'Time Started', 'Date Submitted', 'Status', 'Contact ID',
       'Legacy Comments', 'Comments', 'Language', 'Referer', 'SessionID',
       'User Agent', 'Tags', 'valence_condition', 'means_condition', 'user_id',
       'Sentiment - user_id', 'sarah_parents.response',
       'Sentiment - sarah_parents.response', 'blue_team.response',
       'Sentiment - blue_team.response', 'bill_holiday.response',
       'Sentiment - bill_holiday.response', 'john_party.response',
       'Sentiment - john_party.response', 'rebecca_swimming.response',
       'Sentiment - rebecca_swimming.response', 'sally_wine.response',
       'Sentiment - sally_wine.response', 'belinda_meeting.response',
       'Sentiment - belinda_meeting.response', 'lucy_loan.response',
       'Sentiment - lucy_l

In [7]:
# Sample size: number of distinct 'user_id' values in the raw data.
N = raw_data['user_id'].nunique()
N

219

In [8]:
# Wrap the sample size as a named Kallysto value export ('N') ...
total_n = Export.value('N', N) 
# ... and transfer it to the LaTeX report (Kallysto uses `>` for this transfer).
total_n > latex_report

Value('N', 219)

In [9]:
#Example Material for the Paper
# Build a one-column table ('Example') whose rows are the three parts of a sample
# material: the goal sentence, the goal step, and the manipulated condition sentence.
sally_material = pd.DataFrame.from_dict(
    
    {'Example': {"1. Goal Sentence": "Sally wants to buy a bottle of wine for her Friday evening dinner.", 
                 "2. Goal Step": "Sally decides to go to the corner-shop near her house.",
                 "3. Condition": "The shop is (shut/open) and Sally has (an argument/a good chat) with a neighbour walking home."
                },
    }
)


# Plain-text preview of the table, using the DataFrame's column names as headers.
print(tabulate(sally_material, headers="keys"))

                  Example
----------------  ----------------------------------------------------------------------------------------------
1. Goal Sentence  Sally wants to buy a bottle of wine for her Friday evening dinner.
2. Goal Step      Sally decides to go to the corner-shop near her house.
3. Condition      The shop is (shut/open) and Sally has (an argument/a good chat) with a neighbour walking home.


In [10]:
# Create a table export object named 'ExampleMaterial' from the example material
# and store it in example_material_table.
example_material_table = Export.table('ExampleMaterial', 
             data=sally_material,
             caption='Example material.'
) 

# Transfer the export to the latex report.
example_material_table > latex_report

Table('ExampleMaterial',                                                             Example
1. Goal Sentence  Sally wants to buy a bottle of wine for her Fr...
2. Goal Step      Sally decides to go to the corner-shop near he...
3. Condition      The shop is (shut/open) and Sally has (an argu..., 'Example material.')

### Demographics. 
The mean age of participants was 34.75 (SD = 12.56). Participants were 63.76% female and 36.24% male. Participants were all native English speakers from the UK (89.04%), US (5.48%), or Ireland (5.48%).

In [11]:
# Age of Participants
# Summary statistics (count, mean, std, min, quartiles, max) of the 'age' column.
age = raw_data['age'].describe()
age

count    219.000000
mean      34.753425
std       12.562264
min       18.000000
25%       25.000000
50%       32.000000
75%       42.500000
max       83.000000
Name: age, dtype: float64

In [12]:
# Export the mean and standard deviation of age, rounded to 2 decimals and stored
# as strings, under the names 'meanage' and 'stdage'.
for a in ["mean","std"]:
    value = Export.value(str(a.replace(" ", "") + "age"), str(round(age[a], 2))) 
    value > latex_report

In [13]:
# Sex of Participants
# Percentage of rows in each 'Sex' category (normalized counts scaled to 0-100).
sex = raw_data['Sex'].value_counts(normalize=True)*100
sex

Female    63.761468
Male      36.238532
Name: Sex, dtype: float64

In [14]:
# Export each sex category's percentage (2 decimals, as a string); the export name is
# the category label with spaces removed.
for s in sex.index:
    value = Export.value(s.replace(" ", ""), str(round(sex[s], 2))) 
    value > latex_report

In [15]:
# Nationality of Participants
# Percentage of rows in each 'Nationality' category (normalized counts scaled to 0-100).
nationality = raw_data['Nationality'].value_counts(normalize=True)*100
nationality

United Kingdom    89.041096
United States      5.479452
Ireland            5.479452
Name: Nationality, dtype: float64

In [16]:
# Export each nationality's percentage (2 decimals, as a string); the export name is
# the nationality label with spaces removed (e.g. 'UnitedKingdom').
for n in nationality.index:
    value = Export.value(n.replace(" ",""), str(round(nationality[n],2)))
    value > latex_report

## Materials
Materials were created based on materials used in previous work (Foster & Keane, 20xx; Quinn, Campbell & Keane, 2021), manipulating the valence of a scenario (positive or negative) and the controllability of the scenario by changing the availability of the means to achieve the goal (means present or means absent). To isolate these factors, we do not vary the intention (goal) nor the first action taken towards the goal. Instead, we manipulate the means so that the means are either present or absent (changing the overall controllability of reaching the goal) and that the events are either positive or negative in both present and absent conditions (varying the valence). This allows us to test the interaction between controllability and valence which prior literature has not addressed.

Eight materials (vetted by a pre-test) were chosen, meeting the criteria of each of the four condition combinations. 

## Procedure
Participants were recruited via the Prolific.co platform and completed only one combination of the two IVs. After completing informed consent and agreeing to participate in the study, participants were randomly assigned to one of the four condition pairs (Positive – Means Present, Positive – Means Absent, Negative – Means Present, or Negative – Means Absent). Participants were then presented with two practice materials, followed by eight target materials.


### Chi-Square Functions

In [17]:
def cramers_V(chi, n, min_dim):
    """Return Cramer's V for a chi-square statistic.

    Computes ``sqrt(chi / (n * (min_dim - 1)))``.

    Parameters
    ----------
    chi : float
        Chi-square statistic.
    n : int
        Total number of observations in the contingency table.
    min_dim : int
        The smaller of the table's number of rows and number of columns.
        Must be greater than 1, otherwise the denominator is zero.

    Returns
    -------
    float
        The effect size V.
    """
    denominator = n * (min_dim - 1)
    return math.sqrt(chi / denominator)

In [18]:
def print_chi(obs):
    """Run a chi-square test on a contingency table and print the results.

    The unadjusted test comes from ``scipy.stats.chi2_contingency``. Its statistic
    is then passed, together with the table, to ``williams_correction`` (imported
    from the project's ``williams_correction`` module), and Cramer's V is computed
    from the corrected statistic.

    Parameters
    ----------
    obs : numpy.ndarray
        Two-dimensional table of observed counts. It must expose ``.shape`` and
        support ``sum(sum(obs))`` for the grand total.

    Returns
    -------
    tuple
        ``(pvalue, williams_chi, williams_pvalue, williams_deffre, effect_size)``:
        the unadjusted p-value, followed by the corrected statistic, p-value and
        degrees of freedom as returned by ``williams_correction``, and Cramer's V.
    """
    # The expected-frequency table returned by scipy is not used.
    raw_chi, raw_p, raw_dof, _expected = chi2_contingency(obs)

    corrected_chi, corrected_p, corrected_dof = williams_correction(obs, raw_chi)

    # Effect size is based on the corrected statistic and the grand total of counts.
    grand_total = sum(sum(obs))
    smallest_dim = min(obs.shape)
    v_effect = cramers_V(corrected_chi, grand_total, smallest_dim)

    print("Unadjusted Chi-Square")
    print("X^2 = ", raw_chi)
    print("degrees of freedom: ", raw_dof)
    print("p = ", format(raw_p, ".16f"), '\n')

    print("William's Corrected Chi-Square")
    print("X^2 = ", corrected_chi)
    print("degrees of freedom: ", corrected_dof)
    print("p = ", format(corrected_p, ".16f"), '\n')

    print("Cramer's v effect size = ", format(v_effect, ".3f"), '\n')

    return raw_p, corrected_chi, corrected_p, corrected_dof, v_effect

In [19]:
def export_value_chi_apa(value_name, chi, defre, N, p, v):
    """Export a chi-square result, formatted as a LaTeX snippet, to the report.

    The exported text has the form ``<chi>^2(<defre>, N=<N>)=<chi value>, <p>, v=<v>``
    where ``<chi>`` is the LaTeX chi command and the statistic is shown with two
    decimals.

    Parameters
    ----------
    value_name : str
        Name under which the value is exported.
    chi : float
        Chi-square statistic (formatted with ``:.2f``).
    defre : int
        Degrees of freedom.
    N : int
        Total number of observations.
    p : str
        Pre-formatted p-value text, inserted verbatim (e.g. ``"p < .001"``).
    v : str
        Pre-formatted effect size, inserted verbatim after ``v=``.

    Returns
    -------
    None
        The value is transferred to the module-level ``latex_report``.
    """
    # Raw f-string: "\c" is not a valid escape sequence, so a plain f"\chi" triggers an
    # invalid-escape warning at compile time. The resulting text is unchanged.
    apa_text = rf"\chi^2({defre}, N={N})={chi:.2f}, {p}, v={v}"
    value = Export.value(value_name, apa_text)
    value > latex_report

In [20]:
def chi_square(data):
    """Print and export the chi-square test across all four condition combinations.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of counts. Rows are selected with ``data.loc[(valence, means)]`` for
        the four combinations of negative/positive and absent/present, so the index
        is assumed to be a (valence, means) MultiIndex -- TODO confirm with callers.
        The first column name is used to build the export name.

    Returns
    -------
    None
        The Williams-corrected result is exported via ``export_value_chi_apa`` as
        ``<first column name>chi`` with underscores removed.
    """
    print("Chi square of all conditions:")
    print(tabulate(data, headers="keys", tablefmt='fancy_grid'))

    condition_rows = [('negative', 'absent'), ('negative', 'present'),
                      ('positive', 'absent'), ('positive', 'present')]
    obs = np.array([data.loc[row_key] for row_key in condition_rows])
    _, corrected_chi, corrected_p, corrected_dof, v_effect = print_chi(obs)

    # Report very small p-values as an inequality, otherwise to three decimals.
    p_text = "p < .001" if corrected_p < 0.001 else f"p = {corrected_p:.3f}"

    export_name = str(data.columns[0] + "chi").replace("_", "")
    export_value_chi_apa(export_name, corrected_chi, corrected_dof,
                         sum(sum(obs)), p_text, "{:.3f}".format(v_effect))
    

In [21]:
def val_cond_chi_square(data):
    """Print and export the chi-square test comparing the two valence conditions.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of counts with rows labelled ``'negative'`` and ``'positive'``.
        The first column name is used to build the export name.

    Returns
    -------
    None
        The Williams-corrected result is exported via ``export_value_chi_apa`` as
        ``<first column name>byvalencecondchi`` with underscores removed.
    """
    print("Chi square of all conditions:")
    print(tabulate(data, headers="keys", tablefmt='fancy_grid'))

    obs = np.array([data.loc[valence] for valence in ('negative', 'positive')])
    _, corrected_chi, corrected_p, corrected_dof, v_effect = print_chi(obs)

    # Report very small p-values as an inequality, otherwise to three decimals.
    p_text = "p < .001" if corrected_p < 0.001 else f"p = {corrected_p:.3f}"

    export_name = str(data.columns[0] + "byvalencecondchi").replace("_", "")
    export_value_chi_apa(export_name, corrected_chi, corrected_dof,
                         sum(sum(obs)), p_text, "{:.3f}".format(v_effect))
        

In [22]:
def means_cond_chi_square(data):
    """Print and export the chi-square test comparing the two means conditions.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of counts with rows labelled ``'absent'`` and ``'present'``.
        The first column name is used to build the export name.

    Returns
    -------
    None
        The Williams-corrected result is exported via ``export_value_chi_apa`` as
        ``<first column name>bymeanscondchi`` with underscores removed.
    """
    print("Chi square of all conditions:")
    print(tabulate(data, headers="keys", tablefmt='fancy_grid'))

    obs = np.array([data.loc[means] for means in ('absent', 'present')])
    _, corrected_chi, corrected_p, corrected_dof, v_effect = print_chi(obs)

    # Report very small p-values as an inequality, otherwise to three decimals.
    p_text = "p < .001" if corrected_p < 0.001 else f"p = {corrected_p:.3f}"

    export_name = str(data.columns[0] + "bymeanscondchi").replace("_", "")
    export_value_chi_apa(export_name, corrected_chi, corrected_dof,
                         sum(sum(obs)), p_text, "{:.3f}".format(v_effect))
        

In [23]:
def cond_chi_square(data, alpha = .001):
    """Run, print, and export a chi-square test for every pair of condition combinations.

    The four (valence, means) combinations are compared two at a time (six
    unordered pairs). For each pair the two rows of ``data`` are printed, tested
    with ``print_chi``, and the Williams-corrected result is exported through
    ``export_value_chi_apa`` under the name
    ``<cond1><cond2><first column name>chi`` with underscores removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of counts. Rows are selected with ``data.loc[(valence, means)]``, so
        the index is assumed to be a (valence, means) MultiIndex -- TODO confirm.
    alpha : float, optional
        Significance threshold used for the two summary counts printed at the end.
        It does not affect the exported p-value text. Defaults to .001.

    Returns
    -------
    None
    """
    cond_2_list = [('negative', 'absent'), ('negative', 'present'),
                   ('positive', 'absent'), ('positive', 'present')]

    count_pvalue = 0
    count_williams_pvalue = 0

    # Visit each unordered pair of conditions exactly once.
    for first_idx, cond1 in enumerate(cond_2_list):
        for cond2 in cond_2_list[first_idx + 1:]:
            print(cond1 + cond2)
            obs = np.array([data.loc[cond1], data.loc[cond2]])
            table_to_print = data.loc[[cond1, cond2]]
            print(tabulate(table_to_print, headers="keys", tablefmt='fancy_grid'))

            pvalue, williams_chi, williams_pvalue, williams_deffre, effect_size = print_chi(obs)

            #count significant pvalues
            if pvalue < alpha:
                count_pvalue += 1
            print()

            if williams_pvalue < alpha:
                count_williams_pvalue += 1

            # The exported label must describe the p-value itself, not the chosen alpha:
            # "p < .001" is only reported when the value really is below .001.
            if williams_pvalue < 0.001:
                p = "p < .001"
            else:
                p = f"p = {williams_pvalue:.3f}"

            #save williams chi(df) and pvalue
            export_name = str(''.join(cond1) + ''.join(cond2) + data.columns[0] + "chi").replace("_", "")
            export_value_chi_apa(export_name, williams_chi, williams_deffre,
                                 sum(sum(obs)), p, "{:.3f}".format(effect_size))
            print()

    print("The number of Instruction Conditions showing a Chi-Square with p < {} is ".format(alpha), count_pvalue)

    print("The number of Instruction Conditions showing a William's Corrected Chi-Square with p < {} is ".format(alpha), count_williams_pvalue)

In [24]:
def mat_chi_square(data, alpha = 0.001):
    """Run, print, and export a chi-square test separately for each material.

    Iterates over the module-level ``materials`` sequence (defined in a later
    notebook cell, so it must exist before this function is called). For each
    material, ``data.loc[mat]`` is printed, tested with ``print_chi``, and the
    Williams-corrected result is exported through ``export_value_chi_apa`` under
    the name ``<material><first column name><index name>chi`` with underscores
    removed. The index name is omitted when it is not a string.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of counts whose outer index level is the material name
        (presumably with condition levels beneath it -- TODO confirm).
    alpha : float, optional
        Significance threshold used for the summary counts and for the list of
        non-significant materials. It does not affect the exported p-value text.
        Defaults to 0.001.

    Returns
    -------
    None
    """
    count_pvalue = 0
    count_williams_pvalue = 0
    bad_mats = []

    for mat in materials:
        df1 = data.loc[mat]
        print(mat)
        print(tabulate(df1, headers="keys", tablefmt='fancy_grid'))
        obs = np.array(df1)

        pvalue, williams_chi, williams_pvalue, williams_deffre, effect_size = print_chi(obs)

        if pvalue < alpha:
            count_pvalue += 1
        print()

        # Significance bookkeeping follows the caller's alpha.
        if williams_pvalue < alpha:
            count_williams_pvalue += 1
        else:
            bad_mats.append(mat)

        # The exported label must describe the p-value itself, not the chosen alpha:
        # "p < .001" is only reported when the value really is below .001.
        if williams_pvalue < 0.001:
            p = "p < .001"
        else:
            p = f"p = {williams_pvalue:.3f}"

        #check if it is overall chi for the mat in which case index.name = None,
        #or if it's valence or means condition
        name = df1.index.name if isinstance(df1.index.name, str) else ""

        #save williams chi(df) and pvalue
        export_name = str(mat + df1.columns[0] + name + "chi").replace("_", "")
        export_value_chi_apa(export_name, williams_chi, williams_deffre,
                             sum(sum(obs)), p, "{:.3f}".format(effect_size))
        print()

    print("The number of Materials showing a Chi-Square with p < {} is ".format(alpha), count_pvalue)

    print("The number of Materials showing a William's Corrected Chi-Square with p < {} is ".format(alpha), count_williams_pvalue)

    print("Non-significant mats are ", bad_mats)

# Results
Responses were labelled by two independent raters on answer category, valence, goal-related words, and outcome controllability.

In [25]:
# Load the coded responses (path is relative to the current working directory).
master_codes = pd.read_csv('2_pipeline/2_main_study/master_data_codes.csv')

In [26]:
# Attach participant-level columns from raw_data to each coded response.
# The right join on 'user_id' keeps every row of master_codes.
master_data = raw_data[['session_id', 'participant_id', 'status',
       'started_datetime', 'completed_date_time', 'time_taken', 'age',
       'reviewed_at_datetime', 'entered_code', 
       'Nationality', 'Sex', 'Response ID',
       'Time Started', 'Date Submitted', 'Status', 'Contact ID',
       'Legacy Comments', 'Comments', 'Language', 'Referer', 'SessionID',
       'valence_condition', 'means_condition', 'user_id',]].merge(
    master_codes[['user_id', 'material', 'response', 'ans_code', 'val_code', 'goal_code', 'control_code']], 
                                             on=['user_id'], how = 'right')
# Check the merged size (rows, columns).
master_data.shape

(1752, 30)

In [27]:
#add labels for answer categories
ans_code_labels = pd.read_csv("2_pipeline/2_main_study/labels_and_descriptions.csv")
# display(ans_code_labels)
master_data = master_data.merge(ans_code_labels, on='ans_code', how = 'left')
master_data.shape

(1752, 31)

In [28]:
#fix where there are doubles because of the kappa combinations
master_data['response'] = master_data['response'].apply(
    lambda x: x.split('-->')[1] if '-->' in x else x)

In [29]:
# Sanity check: print any response that still contains '-->' together with the segment
# after its first '-->'. No output means none remain.
for response in master_data['response']:
    if "-->" in response:
        
        print(response, response.split('-->')[1])

In [30]:
# Spot check: print every response that contains the phrase 'strongest swimmer'.
for response in master_data['response']:
    if 'strongest swimmer' in response:
        print(response)

The waves are stronger than Rebecca expected, she is not the strongest swimmer, so decides to head back inland. Just as she turns around to do so, she notices the lifeguard shouting at her... SHARK


In [31]:
# Count rows (non-null 'user_id' values) per material, answer code and category description.
answer_categories = master_data.groupby(['material', 'ans_code', 'category_description']).agg({'user_id': 'count'})

In [32]:
# Turn the counts into within-material proportions: group by the first index level
# (material), divide each count by the group's total, then sort by material and
# proportion, both descending.
# NOTE(review): float(x.sum()) relies on each group frame having exactly one column, and
# the index produced by groupby().apply() differs across pandas versions -- confirm the
# result if pandas is upgraded.
ans_cats_prop = answer_categories.groupby(level=0).apply(lambda x: x / float(x.sum())).reset_index().sort_values(by=['material','user_id'],ascending=False)

In [33]:
# Distinct material names, in order of first appearance; mat_chi_square reads this
# module-level name.
materials = master_data['material'].unique()
materials

array(['john_party', 'bill_holiday', 'rebecca_swimming', 'sally_wine',
       'belinda_meeting', 'michael_breakfast', 'lucy_loan', 'sean_call'],
      dtype=object)

In [34]:
# The four (valence, means) condition pairs, built by zipping the two parallel lists.
zipped_cond_pairs = list(zip(
    ['negative','positive','negative','positive'],
    ['absent','absent','present','present']
))

In [35]:
# Display the condition pairs to confirm their order.
zipped_cond_pairs

[('negative', 'absent'),
 ('positive', 'absent'),
 ('negative', 'present'),
 ('positive', 'present')]

In [36]:
# Display titles for the two means conditions: 'absent' -> 'Uncontrollable', 'present' -> 'Controllable'.
condition_titles={'absent':'Uncontrollable', 'present':'Controllable'}

In [37]:
# Preview a title-cased, space-separated label for each condition pair.
[' '.join(cond_pair).title() for cond_pair in zipped_cond_pairs]

['Negative Absent', 'Positive Absent', 'Negative Present', 'Positive Present']

In [38]:
# One colour per condition pair, listed in the same order as zipped_cond_pairs.
colors = ['rgb(228,26,28)','rgb(179,222,105)','rgb(251,180,174)','rgb(77,175,74)']

# Map each (valence, means) pair to the colour at the same position.
log_odds_subplot_colors = {}
for idx, tup in enumerate(zipped_cond_pairs):
    log_odds_subplot_colors[tup] = colors[idx]
    
log_odds_subplot_colors

{('negative', 'absent'): 'rgb(228,26,28)',
 ('positive', 'absent'): 'rgb(179,222,105)',
 ('negative', 'present'): 'rgb(251,180,174)',
 ('positive', 'present'): 'rgb(77,175,74)'}

# overall

In [39]:
# Proportion of all coded responses falling in each valence code.
master_data['val_code'].value_counts(normalize=True)

neg        0.567352
pos        0.351027
neither    0.081621
Name: val_code, dtype: float64

In [40]:
# Proportion of all coded responses falling in each controllability code.
master_data['control_code'].value_counts(normalize=True)

uncontrollable     0.550228
controllable       0.254566
neither_control    0.195205
Name: control_code, dtype: float64