In [1]:
print('Choose whether to display code or not.')
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').show();
 } else {
 $('div.input').hide();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Show Code?"></form>''')

Choose whether to display code or not.


In [2]:
#load packages
import os
from tabulate import tabulate  # Used to display text-based tables.
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

# The important Kallysto imports.
from kallysto.publication import Publication
from kallysto.export import Export
from kallysto.formatter import Latex, Markdown
import kallysto.markdown

In [3]:
#set working directory to top level project folder
os.chdir("..")
os.getcwd()

'/Users/mollyq/Documents/GitHub/Control_and_Valence_in_Unexpected_Events'

In [4]:
#Create a link between this notebook and the target publication (a_report)
latex_report = Publication(
    notebook='3_Labelled_Files_to_Kappa_to_Master.ipynb',  # Current notebook name
    title='control_report',     # Publication name
    pub_path='3_output/',         # Location of publication project relative to this notebook.
    formatter=Latex,          # Default publictaion format; Markdown can also be used.
    
    overwrite=True,           # Overwrite any existing exports if they exist.
    fresh_start=False          # Delete log and include files if they exist.
)

# Load in All Files


In [5]:
# paths to folders holding first 15 for calibrating and defining labelling criteria
folder_mq = "2_pipeline/2_main_study/1_MQ_labels/"
folder_cf = "2_pipeline/2_main_study/2_CF_labels/"

In [6]:
#raters
mq = 'MQ'
cf = 'CF'

In [7]:
#list of materials to read in
materials = ['john_party', 'bill_holiday', 'rebecca_swimming', 'sally_wine',
             'belinda_meeting', 'michael_breakfast', 'lucy_loan', 'sean_call'
]

In [8]:
#constant cols in order to id the answer categories
constant_cols = ['random', 'session_id','participant_id','status','started_datetime',
                 'completed_date_time','time_taken','age',
                 'reviewed_at_datetime','entered_code',
                 'Nationality','Sex','Response ID','Time Started','Date Submitted','Status',
                 'Contact ID','Legacy Comments','Comments','Language','Referer','SessionID',
                 'valence_condition',
                 'means_condition','user_id','material.variable','material','variable','response',
                 'pos','neg','neither','non_goal_object','goal_object','both_objects', 'neither_object',
                 'controllable','uncontrollable','neither_control','Unnamed: 0']

## First, check for missing values.

In [9]:
#check if the code has 219 entries,
def check_codes(dict_of_codes, code, df, material):
    df['sum'] = df[dict_of_codes[code]].sum(axis=1, skipna = True)
    rows_to_fix = df[df['sum'] != 1]
    if not rows_to_fix.empty:
        # and if not, print the material, which code to check, and the rows with errors
        print(material.upper())
        print("Found %d error(s) in the %s" % (len(rows_to_fix), code))
        print('row - response')
        print(rows_to_fix['response'])
        print()
    else:
        print(material, code, "COMPLETE")

        
#read in the files and send them to the checker
def check_files(folder, material):
    
    df = pd.read_csv(folder + material + "_labelled.csv", # name of file
                     header = 1, # file starts at line 2 with simple headers
                    )

    #get headers existing in df
    headers = df.columns.values
    #get unique headers as answer category
    ans_codes = []
    ans_codes = [item for item in headers if item not in constant_cols]
    
    #make a dict of codes
    dict_of_codes = {
                     'Answer_Categories': ans_codes,
                     'Valence_Categories': ['pos','neg','neither'],
                     'Goal_Categories': ['non_goal_object','goal_object'],
                     'Control_Categories': ['controllable','uncontrollable','neither_control']
                    }
    
    
    for code in dict_of_codes:
        check_codes(dict_of_codes, code, df, material)
    
    
    
    

In [10]:
#check for errors in all files:
for mat in materials:
    print("Molly Fixes")
    check_files(folder_mq, mat)
    print()
    print("-------------------------------------------------------")
    print()
    print("Courtney Fixes")
    check_files(folder_cf, mat)
    print()
    print("-------------------------------------------------------")
    print()

Molly Fixes
john_party Answer_Categories COMPLETE
john_party Valence_Categories COMPLETE
john_party Goal_Categories COMPLETE
john_party Control_Categories COMPLETE

-------------------------------------------------------

Courtney Fixes
john_party Answer_Categories COMPLETE
john_party Valence_Categories COMPLETE
john_party Goal_Categories COMPLETE
john_party Control_Categories COMPLETE

-------------------------------------------------------

Molly Fixes
bill_holiday Answer_Categories COMPLETE
bill_holiday Valence_Categories COMPLETE
bill_holiday Goal_Categories COMPLETE
bill_holiday Control_Categories COMPLETE

-------------------------------------------------------

Courtney Fixes
bill_holiday Answer_Categories COMPLETE
bill_holiday Valence_Categories COMPLETE
bill_holiday Goal_Categories COMPLETE
bill_holiday Control_Categories COMPLETE

-------------------------------------------------------

Molly Fixes
rebecca_swimming Answer_Categories COMPLETE
rebecca_swimming Valence_Categorie

# When missing values are fixed, load in files, and compare answers

In [11]:
mq_dict_of_files = {} #to keep dataframes MQ labelled
cf_dict_of_files = {} #to keep dataframes CF labelled


#function to process the material DFs
def process_mats(dict_of_codes, code, df, material):
    ###melt data by answer code
    data_w_codes = df.melt(id_vars=['user_id','response'], 
                  value_vars = dict_of_codes[code],
                  var_name=code, value_name= str(code + '_count'))
    #remove null
    data_w_codes = data_w_codes[(data_w_codes[str(code + '_count')] == 1) | 
                                (data_w_codes[str(code + '_count')] == 'X')]

    #print out the damage
    print("after cleaning the rows with %s !=(1 or X), %d rows are left" 
          % (code, data_w_codes.shape[0]))

    #set index of data for merge
    data_w_codes = data_w_codes.set_index(
        [
            'user_id', 
            'response'
        ]
    )


    return data_w_codes



# function to read each csv
def read_material_csvs(folder, material, rater):
    
    df = pd.read_csv(folder + material + "_labelled" + ".csv", # name of file
                     header = 1, # file starts at line 2 with simple headers
                    )
    #print(df.head())
    
    #get headers existing in df
    headers = df.columns.values
    #get unique headers as answer category
    ans_code = []
    ans_code = [item for item in headers if item not in constant_cols]
    
    #make a dict of codes
    dict_of_codes = {
                     'Answer_Categories': ans_code,
                     'Valence_Categories': ['pos','neg','neither'],
                     'Goal_Categories': ['non_goal_object','goal_object'],
                     'Control_Categories': ['controllable','uncontrollable','neither_control']
                    }
    
    #signal the start of material df processing
    print("Processing ", material)
    
    #Make an empty datframe to store new dfs
    data_w_both_codes = pd.DataFrame()
    
    # loop through the code types and get the new df
    for code in dict_of_codes:
        data_w_a_code = process_mats(dict_of_codes, code, df, material)
        #print(data_w_a_code.head())
        #merge new data
        if data_w_both_codes.empty:
            data_w_both_codes = data_w_a_code
        else:
            data_w_both_codes = pd.merge(
            left = data_w_both_codes, right = data_w_a_code, how = 'outer',
            left_index = True, right_index = True
        )
    
    #print(data_w_both_codes.head())
    #data_w_both_codes = data_w_both_codes.sort_index(level=1).reset_index(level=1, drop=True).reset_index()

    #add material name
    data_w_both_codes['material'] = material
    data_w_both_codes['rater'] = rater
    
    return data_w_both_codes
    

In [12]:
mq_ratings = pd.DataFrame()

for material in materials:
    material_df = read_material_csvs(folder_mq, material, mq)
    mq_ratings = mq_ratings.append(material_df)

mq_ratings.head(2)

Processing  john_party
after cleaning the rows with Answer_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Valence_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Goal_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Control_Categories !=(1 or X), 219 rows are left
Processing  bill_holiday
after cleaning the rows with Answer_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Valence_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Goal_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Control_Categories !=(1 or X), 219 rows are left
Processing  rebecca_swimming
after cleaning the rows with Answer_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Valence_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Goal_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Control_Categories !=(1 or X), 219 rows are left

Unnamed: 0_level_0,Unnamed: 1_level_0,Answer_Categories,Answer_Categories_count,Valence_Categories,Valence_Categories_count,Goal_Categories,Goal_Categories_count,Control_Categories,Control_Categories_count,material,rater
user_id,response,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5360c37cfdf99b02ccc08fcb,John cuts his hand with the peeler.,jp_neg_ans1,1.0,neg,1.0,goal_object,1.0,uncontrollable,1.0,john_party,MQ
55a14418fdf99b6ec83c244f,"His wife surprised him and announced that the dinner party was to celebrate their divorce, which John knew nothing about.",jp_neg_ans4,1.0,neg,1.0,goal_object,1.0,uncontrollable,1.0,john_party,MQ


In [13]:
# Get Rater 2 File
cf_ratings = pd.DataFrame()

for material in materials:
    material_df = read_material_csvs(folder_cf, material, cf)
    cf_ratings = cf_ratings.append(material_df)
        

cf_ratings.head(2)

Processing  john_party
after cleaning the rows with Answer_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Valence_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Goal_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Control_Categories !=(1 or X), 219 rows are left
Processing  bill_holiday
after cleaning the rows with Answer_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Valence_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Goal_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Control_Categories !=(1 or X), 219 rows are left
Processing  rebecca_swimming
after cleaning the rows with Answer_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Valence_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Goal_Categories !=(1 or X), 219 rows are left
after cleaning the rows with Control_Categories !=(1 or X), 219 rows are left

Unnamed: 0_level_0,Unnamed: 1_level_0,Answer_Categories,Answer_Categories_count,Valence_Categories,Valence_Categories_count,Goal_Categories,Goal_Categories_count,Control_Categories,Control_Categories_count,material,rater
user_id,response,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5360c37cfdf99b02ccc08fcb,John cuts his hand with the peeler.,jp_neg_ans1,1.0,neg,1.0,goal_object,1.0,uncontrollable,1.0,john_party,CF
55a14418fdf99b6ec83c244f,"His wife surprised him and announced that the dinner party was to celebrate their divorce, which John knew nothing about.",jp_neg_ans4,1.0,neg,1.0,goal_object,1.0,uncontrollable,1.0,john_party,CF


## Cohen's Kappa Inter-rater Agreement

In [14]:
for mat in materials:
    print()
    print("-------------------------------------------------------")
    print()
    print(mat.upper())
    print()
    cf_mat_ratings = cf_ratings[cf_ratings['material']==mat]
    mq_mat_ratings = mq_ratings[mq_ratings['material']==mat]
    
    cf_valence_ratings = cf_mat_ratings['Valence_Categories'].tolist()
    mq_valence_ratings = mq_mat_ratings['Valence_Categories'].tolist()
    print('Agreement between CF and MQ on valence:', cohen_kappa_score(cf_valence_ratings, mq_valence_ratings))

    cf_answer_ratings = cf_mat_ratings['Answer_Categories'].tolist()
    mq_answer_ratings = mq_mat_ratings['Answer_Categories'].tolist()
    print('Agreement between CF and MQ on answers:', cohen_kappa_score(cf_answer_ratings, mq_answer_ratings))

    cf_goal_ratings = cf_mat_ratings['Goal_Categories'].tolist()
    mq_goal_ratings = mq_mat_ratings['Goal_Categories'].tolist()
    print('Agreement between CF and MQ on goals:', cohen_kappa_score(cf_goal_ratings, mq_goal_ratings))

    cf_control_ratings = cf_mat_ratings['Control_Categories'].tolist()
    mq_control_ratings = mq_mat_ratings['Control_Categories'].tolist()
    print('Agreement between CF and MQ on control:', cohen_kappa_score(cf_control_ratings, mq_control_ratings))


-------------------------------------------------------

JOHN_PARTY

Agreement between CF and MQ on valence: 0.8025771555838371
Agreement between CF and MQ on answers: 0.7977372431309167
Agreement between CF and MQ on goals: 0.8942131195053618
Agreement between CF and MQ on control: 0.5284171138096583

-------------------------------------------------------

BILL_HOLIDAY

Agreement between CF and MQ on valence: 0.7509355051727933
Agreement between CF and MQ on answers: 0.8413080573151632
Agreement between CF and MQ on goals: 0.8714285714285714
Agreement between CF and MQ on control: 0.6389300134589502

-------------------------------------------------------

REBECCA_SWIMMING

Agreement between CF and MQ on valence: 0.6106919875130072
Agreement between CF and MQ on answers: 0.8140302873048073
Agreement between CF and MQ on goals: 0.9434157264662819
Agreement between CF and MQ on control: 0.7448385142377778

-------------------------------------------------------

SALLY_WINE

Agreement 

In [15]:
#overall
for cat in ['Valence_Categories','Answer_Categories','Goal_Categories','Control_Categories']:
    cf_valence_ratings = cf_mat_ratings[cat].tolist()
    mq_valence_ratings = mq_mat_ratings[cat].tolist()
    same = 0
    for i,j in zip(cf_valence_ratings, mq_valence_ratings):
        if i==j:
            same += 1
    percent_diff = 1 - same/len(cf_valence_ratings)
    K = cohen_kappa_score(cf_valence_ratings, mq_valence_ratings)
    print('Agreement between CF and MQ on ' , cat, ":", K, " percent diff = ", percent_diff)
    value = Export.value(str(cat.replace("_", "") + "K"), str(round(K, 2))) 
    value > latex_report

    
    # cf_answer_ratings = cf_mat_ratings['Answer_Categories'].tolist()
# mq_answer_ratings = mq_mat_ratings['Answer_Categories'].tolist()
# print('Agreement between CF and MQ on answers:', cohen_kappa_score(cf_answer_ratings, mq_answer_ratings))

# cf_goal_ratings = cf_mat_ratings['Goal_Categories'].tolist()
# mq_goal_ratings = mq_mat_ratings['Goal_Categories'].tolist()
# print('Agreement between CF and MQ on goals:', cohen_kappa_score(cf_goal_ratings, mq_goal_ratings))

# cf_control_ratings = cf_mat_ratings['Control_Categories'].tolist()
# mq_control_ratings = mq_mat_ratings['Control_Categories'].tolist()
# print('Agreement between CF and MQ on control:', cohen_kappa_score(cf_control_ratings, mq_control_ratings))


Agreement between CF and MQ on  Valence_Categories : 0.8746291959406713  percent diff =  0.05022831050228316
Agreement between CF and MQ on  Answer_Categories : 0.8332924074255967  percent diff =  0.14155251141552516
Agreement between CF and MQ on  Goal_Categories : 0.968807862127902  percent diff =  0.0091324200913242
Agreement between CF and MQ on  Control_Categories : 0.6622604097818903  percent diff =  0.1278538812785388


# Print out the things for the consensus.

In [16]:
# for material in materials:

#     df_cf = pd.read_csv(folder_cf + material + "_labelled.csv", skiprows = [0]).fillna(0)
#     df_mq = pd.read_csv(folder_mq + material + "_labelled.csv", skiprows = [0]).fillna(0)

#     dfDiff = df_cf.copy()
#     disagreements = 0
#     for row in range(dfDiff.shape[0]):
#         for col in range(dfDiff.shape[1]):
#             value_cf = df_cf.iloc[row,col]
#             try:
#                 value_mq = df_mq.iloc[row,col]
#                 if value_cf == value_mq:
#                     dfDiff.iloc[row,col] = df_mq.iloc[row,col]
#                 else:
#                     dfDiff.iloc[row,col] = ('{}-->{}').format(value_cf,value_mq)
#                     disagreements += 1
#             except:
#                 dfDiff.iloc[row,col] = ('{}-->{}').format(value_cf, 'NaN')

#     print(material, disagreements)
#     dfDiff.to_csv("2_pipeline/2_main_study/3_comparison_files/" + material + "_compare.csv", header=True)

# Read in the Final Consensus Files

In [17]:
def read_material_csvs(material):
    df = pd.read_csv("2_pipeline/2_main_study/4_Final/" + material + "_final.csv")
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    headers = df.columns.values
    constant_cols = ['random','participant_id','answer_text','material','ans_code','pos','neg', 'neither',
                     'goal_object','non_goal_object','response','controllable','uncontrollable','neither_control',
                    'session_id','status','started_datetime','completed_date_time',
                     'time_taken','age','num_approvals','num_rejections','prolific_score',
                     'reviewed_at_datetime','entered_code','Country of Birth',
                     'Current Country of Residence','Employment Status','First Language','First language',
                     'Nationality','Sex','Student Status','Response ID','Time Started','Date Submitted',
                     'Status','Contact ID','Legacy Comments','Comments','Language','Referer',
                     'SessionID','User Agent','Tags','IP Address','Longitude','Latitude','Country',
                     'City','State/Region','Postal','valence_condition','means_condition','variable',
                     'user_id','material.variable']
    ans_code = [item for item in headers if item not in constant_cols]

    #melt data by answer code
    data_w_answer_codes = df.melt(id_vars=['user_id','response'], 
                  value_vars = ans_code,
                  var_name='ans_code', value_name='ans_count')
    print("the data with answer codes for '%s' has a shape %s" % (material, str(data_w_answer_codes.shape)))
    
    #remove null
    data_w_answer_codes = data_w_answer_codes[data_w_answer_codes['ans_count'] == 1]
    print("after cleaning the rows with ans_count!=1 %d rows are left" % data_w_answer_codes.shape[0])
    
    
    #melt data by valence code
    data_w_valence_codes = df.melt(id_vars=['user_id','response'], 
                  value_vars=['pos','neg','neither'], 
                  var_name='val_code', value_name='val_count')
    print("the data with valence codes for '%s' has a shape %s" % (material, str(data_w_valence_codes.shape)))
    
    #remove null
    data_w_valence_codes = data_w_valence_codes[data_w_valence_codes['val_count'] == 1]
    print("after cleaning the rows with val_count!=1 %d rows are left" % data_w_valence_codes.shape[0])

    #melt data by goal code
    data_w_goal_codes = df.melt(id_vars=['user_id','response'], 
                  value_vars=['goal_object','non_goal_object'], 
                  var_name='goal_code', value_name='goal_count')
    print("the data with goal codes for '%s' has a shape %s" % (material, str(data_w_goal_codes.shape)))
    
    #remove null
    data_w_goal_codes = data_w_goal_codes[data_w_goal_codes['goal_count'] == 1]
    print("after cleaning the rows with goal_count!=1 %d rows are left" % data_w_goal_codes.shape[0])

    #melt data by control code
    data_w_control_codes = df.melt(id_vars=['user_id','response'], 
                  value_vars=['controllable','uncontrollable','neither_control'], 
                  var_name='control_code', value_name='control_count')
    print("the data with goal codes for '%s' has a shape %s" % (material, str(data_w_control_codes.shape)))
    
    #remove null
    data_w_control_codes = data_w_control_codes[data_w_control_codes['control_count'] == 1]
    print("after cleaning the rows with control_count!=1 %d rows are left" % data_w_control_codes.shape[0])

    
    
    #set index of both data for concat
    data_w_answer_codes = data_w_answer_codes.set_index(
        [
            'user_id', 
            'response'
        ]
    )
    
    data_w_valence_codes = data_w_valence_codes.set_index(
        [
            'user_id',
            'response' 
        ]
    )

    data_w_goal_codes = data_w_goal_codes.set_index(
        [
            'user_id', 
            'response'
        ]
    )
    
    data_w_control_codes = data_w_control_codes.set_index(
        [
            'user_id',
            'response' 
        ]
    )
    
    
    
    #concat data
    data_w_all_codes = pd.DataFrame()
    try:
        data_w_all_codes = pd.concat(
            [data_w_answer_codes, data_w_valence_codes, data_w_goal_codes, data_w_control_codes],
            axis=1
        )
        #print(data_w_both_codes.head())
#         data_w_both_codes = data_w_both_codes.sort_index(level=1).reset_index(level=1, drop=True).reset_index()
    except Exception as e:
        print("It didn't work for", material)
        print(e)
        print()
    
    #add material name
    data_w_all_codes['material'] = material
    
    return data_w_all_codes


In [18]:
read_material_csvs('belinda_meeting')

the data with answer codes for 'belinda_meeting' has a shape (3723, 4)
after cleaning the rows with ans_count!=1 219 rows are left
the data with valence codes for 'belinda_meeting' has a shape (657, 4)
after cleaning the rows with val_count!=1 219 rows are left
the data with goal codes for 'belinda_meeting' has a shape (438, 4)
after cleaning the rows with goal_count!=1 219 rows are left
the data with goal codes for 'belinda_meeting' has a shape (657, 4)
after cleaning the rows with control_count!=1 219 rows are left


Unnamed: 0_level_0,Unnamed: 1_level_0,ans_code,ans_count,val_code,val_count,goal_code,goal_count,control_code,control_count,material
user_id,response,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5360c37cfdf99b02ccc08fcb,She gets a message to say the meeting has changed venue.,bm_neg_ans6,1,neg,1,goal_object,1,uncontrollable,1,belinda_meeting
55a14418fdf99b6ec83c244f,Her phone rings and she is told that the meeting has been cancelled.,bm_neg_ans4,1,neither,1,goal_object,1,uncontrollable,1,belinda_meeting
55bbf175fdf99b47c7d10a22,"She ends up rushing across town to get to the correct office. Luckily, she arrives just on time",bm_pos_ans7,1,pos,1,goal_object,1,controllable,1,belinda_meeting
56163e6e7ffc8a0005812df2,A bomb explodes in the office block and kills her,bm_neg_ans7,1,neg,1,goal_object,1,uncontrollable,1,belinda_meeting
56671468d87e9100052f7f8b,"She miscalculated the time difference between Los Angeles and where she began her journey, and she's actully on time.",bm_pos_ans4,1,pos,1,goal_object,1,controllable,1,belinda_meeting
...,...,...,...,...,...,...,...,...,...,...
5f5158c1573b9938ed97d812,A fire alarm goes off,bm_neg_ans9,1,neg,1,non_goal_object,1,uncontrollable,1,belinda_meeting
5f523ba1c2e4fb4a54dc9fa0,The meeting has changed to a dial in conference call,bm_pos_ans3,1,neither,1,goal_object,1,uncontrollable,1,belinda_meeting
5f52c34dbc9a931b2df43460,She was taken to a wrong office block,bm_neg_ans6,1,neg,1,goal_object,1,uncontrollable,1,belinda_meeting
5f53cfde4e91947067d2435a,Perhaps she forgot the keys to her office,bm_neg_ans5,1,neg,1,goal_object,1,controllable,1,belinda_meeting


In [19]:
consensus_data = pd.DataFrame()
for material in materials:
    material_df = read_material_csvs(material)
    print(material_df.shape)
    consensus_data = consensus_data.append(material_df)
    
print(consensus_data.shape)
consensus_data.head()

the data with answer codes for 'john_party' has a shape (3066, 4)
after cleaning the rows with ans_count!=1 219 rows are left
the data with valence codes for 'john_party' has a shape (657, 4)
after cleaning the rows with val_count!=1 219 rows are left
the data with goal codes for 'john_party' has a shape (438, 4)
after cleaning the rows with goal_count!=1 219 rows are left
the data with goal codes for 'john_party' has a shape (657, 4)
after cleaning the rows with control_count!=1 219 rows are left
(219, 9)
the data with answer codes for 'bill_holiday' has a shape (3066, 4)
after cleaning the rows with ans_count!=1 219 rows are left
the data with valence codes for 'bill_holiday' has a shape (657, 4)
after cleaning the rows with val_count!=1 219 rows are left
the data with goal codes for 'bill_holiday' has a shape (438, 4)
after cleaning the rows with goal_count!=1 219 rows are left
the data with goal codes for 'bill_holiday' has a shape (657, 4)
after cleaning the rows with control_coun

Unnamed: 0_level_0,Unnamed: 1_level_0,ans_code,ans_count,val_code,val_count,goal_code,goal_count,control_code,control_count,material
user_id,response,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5360c37cfdf99b02ccc08fcb,John cuts his hand with the peeler.,jp_neg_ans1,1,neg,1,goal_object,1,uncontrollable,1,john_party
55a14418fdf99b6ec83c244f,"His wife surprised him and announced that the dinner party was to celebrate their divorce, which John knew nothing about.",jp_neg_ans4,1,neg,1,goal_object,1,uncontrollable,1,john_party
55bbf175fdf99b47c7d10a22,His wife tells him to go and watch TV while she does it properly,jp_pos_ans4,1,neg,1,goal_object,1,uncontrollable,1,john_party
56163e6e7ffc8a0005812df2,John stabbed her with the potato peeler,jp_neg_ans7,1,neg,1,goal_object,1,controllable,1,john_party
56671468d87e9100052f7f8b,"John's wife takes over peeling the potatoes, but does even worse at it than John did.",jp_pos_ans4,1,neg,1,goal_object,1,uncontrollable,1,john_party


In [20]:
consensus_data[(consensus_data['material']=='bill_holiday')&
              (consensus_data['ans_code']=='bh_neg_1')]['ans_code']='bh_neg_ans1'
consensus_data[(consensus_data['material']=='bill_holiday')&
              (consensus_data['ans_code']=='bh_neg_2')]['ans_code']='bh_neg_ans2'
consensus_data[(consensus_data['material']=='bill_holiday')&
              (consensus_data['ans_code']=='bh_neg_3')]['ans_code']='bh_neg_ans3'
consensus_data[(consensus_data['material']=='bill_holiday')&
              (consensus_data['ans_code']=='bh_neg_4')]['ans_code']='bh_neg_ans4'
consensus_data[(consensus_data['material']=='bill_holiday')&
              (consensus_data['ans_code']=='bh_neg_5')]['ans_code']='bh_neg_ans5'
consensus_data[(consensus_data['material']=='bill_holiday')&
              (consensus_data['ans_code']=='bh_neg_6')]['ans_code']='bh_neg_ans6'
consensus_data[(consensus_data['material']=='bill_holiday')&
              (consensus_data['ans_code']=='bh_neg_7')]['ans_code']='bh_neg_ans7'
consensus_data[(consensus_data['material']=='bill_holiday')&
              (consensus_data['ans_code']=='bh_neg_8')]['ans_code']='bh_neg_ans8'

consensus_data[(consensus_data['material']=='bill_holiday')&
              (consensus_data['ans_code']=='other')]['ans_code']='bh_other'
consensus_data[(consensus_data['material']=='belinda_meeting')&
              (consensus_data['ans_code']=='other')]['ans_code']='bm_other'
# consensus_data[consensus_data['material']=='bill_holiday']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [21]:
#consensus_data.to_csv("2_pipeline/2_main_study/master_data_codes.csv")