This script is meant to restructure the results files to be more usable in the data analysis scripts. Important changes are listed below:
* Condenses the results for the different orderings of the same prompt (same pair of groups and same dependent variable) into a single row
    e.g. the 4 rows

        <parameterized-prompt>, ..., X, Y, DV, ..., <resultsXY>
        <parameterized-prompt>, ..., Y, X, DV, ..., <resultsXX>
        <parameterized-prompt>, ..., Y, X, DV, ..., <resultsYX>
        <parameterized-prompt>, ..., Y, Y, DV, ..., <resultsYY>
        
become the single row

        <parameterized-prompt>, ..., X, Y, DV, ..., <resultsXY>, <resultsXX>, <resultsYX>, <resultsYY>

Two versions of this exist, one for trolley problems and one for prisoner's dilemmas; they differ only in the names and number of the result columns.

In [1]:
import pandas as pd

In [4]:
def restruct_data_shared(source_file, dest_file, new_cols, col_map):
    """Condense per-ordering result rows into one row per group pair and DV.

    Reads a '|'-separated CSV from ``source_file``. Each input row must have
    the columns ``prompt``, ``prompt_cat``, ``group1``, ``group2``,
    ``group_type``, ``dependent_var`` and ``dv_prob``. Rows that share the
    same ``dependent_var`` and the same unordered pair of groups are merged
    into a single output row whose ``group1``/``group2`` are the two groups in
    sorted order. The ``prompt`` and ``group_type`` of the first such row are
    kept.

    Each input row's ``dv_prob`` is stored in the column
    ``res_<slot>_<col_map[prompt_cat]>``, where ``slot`` is 1 when the row's
    ``group1`` is the sorted-first group and 2 otherwise. If several input
    rows map to the same cell, the last one wins.

    Args:
        source_file: Path of the '|'-separated results CSV to read.
        dest_file: Path the restructured '|'-separated CSV is written to
            (overwritten if it exists).
        new_cols: Names of the result columns to create in the output.
        col_map: Mapping from ``prompt_cat`` values to result-column suffixes.

    Returns:
        The restructured DataFrame that was written to ``dest_file``.

    Raises:
        KeyError: If a required column is missing from the input, or a
            ``prompt_cat`` value is not a key of ``col_map``.
    """
    in_df = pd.read_csv(source_file, sep='|')

    # Output layout: every input column except the two that get folded into
    # the result columns, followed by the result columns themselves.
    out_cols = in_df.columns.drop(['dv_prob', 'prompt_cat']).tolist() + list(new_cols)

    # One output row per (dependent_var, sorted group pair). A dict lookup
    # replaces re-scanning and re-concatenating a DataFrame for every input
    # row; insertion order keeps rows in first-seen order.
    merged_rows = {}
    for _, row in in_df.iterrows():
        group1 = row['group1']
        group2 = row['group2']
        res_group1, res_group2 = sorted([group1, group2])
        key = (row['dependent_var'], res_group1, res_group2)

        merged = merged_rows.get(key)
        if merged is None:
            # Only the caller-supplied result columns are created here, so the
            # output never carries columns that belong to another experiment
            # type.
            merged = {
                'prompt': row['prompt'],
                'group1': res_group1,
                'group2': res_group2,
                'group_type': row['group_type'],
                'dependent_var': row['dependent_var'],
                **dict.fromkeys(new_cols),
            }
            merged_rows[key] = merged

        # Slot 1 holds results where the row's group1 is the sorted-first
        # group; slot 2 holds the swapped ordering.
        slot = 1 if group1 == res_group1 else 2
        merged[f'res_{slot}_{col_map[row["prompt_cat"]]}'] = row['dv_prob']

    restructured_df = pd.DataFrame(list(merged_rows.values()))
    # Keep any result column produced by col_map but not listed in new_cols,
    # appended after the expected layout, instead of silently dropping it.
    extra_cols = [c for c in restructured_df.columns if c not in out_cols]
    restructured_df = restructured_df.reindex(columns=out_cols + extra_cols)

    restructured_df.to_csv(dest_file, sep='|', index=False, mode='w')
    print('Successfully saved restructured data to: ', dest_file)
    return restructured_df

def restruct_data_PD(source_file, dest_file):
    """Restructure a prisoner's-dilemma results file.

    Supplies the prisoner's-dilemma result columns and prompt-category
    mapping, then delegates to ``restruct_data_shared``.
    """
    # prompt_cat value -> suffix of the result column that stores its dv_prob
    col_map = dict(zip(
        ('in-betray', 'in-coop', 'out-betray', 'out-coop'),
        ('i_b', 'i_c', 'o_b', 'o_c'),
    ))
    # One result column per (group slot, category suffix), slot 1 first.
    new_cols = [
        f'res_{slot}_{suffix}'
        for slot in (1, 2)
        for suffix in col_map.values()
    ]
    restruct_data_shared(source_file, dest_file, new_cols, col_map)

def restruct_data_TP(source_file, dest_file):
    """Restructure a trolley-problem results file.

    Supplies the trolley-problem result columns and prompt-category mapping,
    then delegates to ``restruct_data_shared``.
    """
    # Result-column suffixes, in the order they appear in the output.
    suffixes = ('c', 'lr_l', 'rl_l', 'lr_r', 'rl_r')
    new_cols = [f'res_{slot}_{suffix}' for slot in (1, 2) for suffix in suffixes]

    # prompt_cat value -> suffix of the result column that stores its dv_prob
    col_map = dict([
        ('control', 'c'),
        ('lin-rout', 'lr_l'),
        ('rin-lout', 'rl_r'),
        ('lout-rin', 'lr_r'),
        ('rout-lin', 'rl_l'),
    ])
    restruct_data_shared(source_file, dest_file, new_cols, col_map)

In [24]:
# Experiment folder names, split by which restructuring function handles them.
# Trolley-problem experiments (restruct_data_TP):
tp_experiments = ['Exp1-THS', 'Exp2-TLSP', 'Exp3-TLSN', 'Exp4-TRB']
# Prisoner's-dilemma experiments (restruct_data_PD):
pd_experiments = ['Exp5-PDLS', 'Exp6-PDHS']

# Model names; each is used to build a results file name of the form
# results_<model>.csv.
models = [
    'BERT', 'LLaMa2-7B', 'Llama3-8B', 'Llama3-8B-Instruct',
    'GPT2', 'Mistral-7B', 'OpenLLaMa', 'Beaver-7B',
    'Alpaca-7B', 'Falcon-7B', 'Falcon-7B-instruct', 'RoBERTa',
    'RoBERTa-large', 'SOLAR-10.7B', 'debiased_bert-race', 'debiased_bert-gender',
]

In [None]:
# Results are read from <base>/<experiment>/results_<model>.csv and the
# restructured counterpart is named results_<model>_restruct.csv.
experiment_base_folder = '..'

# Pair each experiment list with the function that restructures it; for every
# model the trolley-problem experiments are processed before the
# prisoner's-dilemma ones.
restructurers = (
    (tp_experiments, restruct_data_TP),
    (pd_experiments, restruct_data_PD),
)

for mod in models:
    for experiments, restructure in restructurers:
        for exp in experiments:
            file_base = f'{experiment_base_folder}/{exp}/results_{mod}'
            restructure(f'{file_base}.csv', f'{file_base}_restruct.csv')