This script is meant to restructure the results files to be more usable in the data analysis scripts. Important changes are listed below:
* condensed results for multiple orderings of the same prompt with the same group assignments into a single row
    e.g. the 6 rows
        
        <parameterized-prompt>, ..., X, Y, ..., <controlX>
        <parameterized-prompt>, ..., X, Y, ..., <controlY>
        <parameterized-prompt>, ..., X, Y, ..., <resultsXY>
        <parameterized-prompt>, ..., Y, X, ..., <resultsXX>
        <parameterized-prompt>, ..., Y, X, ..., <resultsYX>
        <parameterized-prompt>, ..., Y, Y, ..., <resultsYY>
        
becomes the single row

        <parameterized-prompt>, ..., X, Y, ..., <controlX>, <controlY>, <resultsXY>, <resultsXX>, <resultsYX>, <resultsYY>

In [2]:
import pandas as pd

In [4]:
def restruct_data(source_file, dest_file):
    """Condense a results file into one row per (dependent_var, group pair).

    The '|'-separated file ``source_file`` is read and its rows are grouped by
    ``dependent_var`` together with the *sorted* pair of ``group1``/``group2``.
    Each group becomes one output row whose ``group1``/``group2`` hold the
    sorted pair and whose ``prompt``, ``group_type`` and (when the column
    exists) ``sentiment`` are taken from the first input row of that group.
    The ``dv_prob`` of every input row is stored in one of six result columns,
    chosen from ``prompt_cat`` and from whether the row's ``group1`` is the
    first or the second member of the sorted pair:

        control -> res_c1 / res_c2
        in      -> res_11 / res_22
        out     -> res_12 / res_21

    Rows with any other ``prompt_cat`` still create their group but fill no
    result column.  If several rows target the same column, the last one wins.
    The result is written to ``dest_file`` ('|'-separated, no index) and a
    confirmation line is printed.

    Args:
        source_file: path of the input CSV; must contain the columns
            ``prompt``, ``prompt_cat``, ``group1``, ``group2``, ``group_type``,
            ``dependent_var`` and ``dv_prob``.
        dest_file: path the restructured CSV is written to (overwritten).

    Raises:
        KeyError: if one of the required columns is missing.
        TypeError: if ``group1`` and ``group2`` of a row cannot be ordered
            against each other (they are passed to ``sorted``).
    """
    in_df = pd.read_csv(source_file, sep='|')

    result_columns = ['res_c1', 'res_c2', 'res_11', 'res_12', 'res_21', 'res_22']
    new_columns = in_df.columns.drop(['dv_prob', 'prompt_cat']).tolist() + result_columns

    # (prompt_cat, position of the row's group1 inside the sorted pair) -> column.
    # Position 0 is tested first, so a row with group1 == group2 maps to the
    # "first member" columns.
    column_for = {
        ('control', 0): 'res_c1',
        ('control', 1): 'res_c2',
        ('in', 0): 'res_11',
        ('in', 1): 'res_22',
        ('out', 0): 'res_12',
        ('out', 1): 'res_21',
    }

    # One dict per output row, keyed by (dependent_var, sorted group pair).
    # Dicts preserve insertion order, so output rows appear in the order their
    # group was first seen.  Collecting plain dicts and building the frame once
    # avoids growing a DataFrame with pd.concat inside the loop.
    records = {}
    for _, row in in_df.iterrows():
        group1 = row['group1']
        group2 = row['group2']
        res_group1, res_group2 = sorted([group1, group2])
        key = (row['dependent_var'], res_group1, res_group2)

        record = records.get(key)
        if record is None:
            record = {
                'prompt': row['prompt'],
                'group1': res_group1,
                'group2': res_group2,
                'group_type': row['group_type'],
                'dependent_var': row['dependent_var'],
            }
            record.update(dict.fromkeys(result_columns))
            # 'sentiment' is optional; only carry it over when the column exists.
            sentiment = row.get('sentiment', None)
            if sentiment is not None:
                record['sentiment'] = sentiment
            records[key] = record

        if group1 == res_group1:
            position = 0
        elif group1 == res_group2:
            position = 1
        else:
            position = None

        column = column_for.get((row['prompt_cat'], position))
        if column is not None:
            record[column] = row['dv_prob']

    # Input columns that are never filled (anything besides the ones handled
    # above) stay in the output as empty columns because of ``columns=``.
    restructured_df = pd.DataFrame(list(records.values()), columns=new_columns)

    restructured_df.to_csv(dest_file, sep='|', index=False, mode='w')
    print('Successfully saved restructured data to: ', dest_file)

In [5]:
# Root folder that holds one sub-folder per experiment.
experiment_base_folder = '../data'

# Experiment sub-folders to process.
experiments = [
    'Exp1-basic', 'Exp2-DG', 'Exp3-PGG',
    'Exp4-CYD', 'Exp5-FAA', 'Exp6-WM',
]

# Model identifiers; each experiment folder is expected to contain a
# results_<model>.csv file for every entry.
models = [
    'bert-base-uncased',
    'debiased_model_bert-base-uncased_gender',
    'debiased_model_bert-base-uncased_race',
    'PKU-Alignment_alpaca-7b-reproduced',
    'PKU-Alignment_beaver-7b-v1.0',
]

# Restructure every (experiment, model) results file, writing the output next
# to its source with a "_restruct" suffix.
for exp in experiments:
    for mod in models:
        file_base = f'{experiment_base_folder}/{exp}/results_{mod}'
        restruct_data(
            source_file=f'{file_base}.csv',
            dest_file=f'{file_base}_restruct.csv',
        )

Successfully saved restructured data to:  ../data/Exp1-basic/results_bert-base-uncased_restruct.csv
Successfully saved restructured data to:  ../data/Exp1-basic/results_debiased_model_bert-base-uncased_gender_restruct.csv
Successfully saved restructured data to:  ../data/Exp1-basic/results_debiased_model_bert-base-uncased_race_restruct.csv
Successfully saved restructured data to:  ../data/Exp1-basic/results_PKU-Alignment_alpaca-7b-reproduced_restruct.csv
Successfully saved restructured data to:  ../data/Exp1-basic/results_PKU-Alignment_beaver-7b-v1.0_restruct.csv
Successfully saved restructured data to:  ../data/Exp2-DG/results_bert-base-uncased_restruct.csv
Successfully saved restructured data to:  ../data/Exp2-DG/results_debiased_model_bert-base-uncased_gender_restruct.csv
Successfully saved restructured data to:  ../data/Exp2-DG/results_debiased_model_bert-base-uncased_race_restruct.csv
Successfully saved restructured data to:  ../data/Exp2-DG/results_PKU-Alignment_alpaca-7b-reprodu