#TODO: This description is outdated.
This script is meant to restructure the results files to be more usable in the data analysis scripts. Important changes are listed below:
* condensed results for multiple orderings of the same prompt with the same group assignments into a single row
    e.g. the 6 rows
        
        <parameterized-prompt>, ..., X, Y, ..., <controlX>
        <parameterized-prompt>, ..., X, Y, ..., <controlY>
        <parameterized-prompt>, ..., X, Y, ..., <resultsXY>
        <parameterized-prompt>, ..., X, X, ..., <resultsXX>
        <parameterized-prompt>, ..., Y, X, ..., <resultsYX>
        <parameterized-prompt>, ..., Y, Y, ..., <resultsYY>
        
becomes the single row

        <parameterized-prompt>, ..., X, Y, ..., <controlX>, <controlY>, <resultsXY>, <resultsXX>, <resultsYX>, <resultsYY>

In [1]:
import pandas as pd

In [31]:
#TODO: Does not account for the extra result columns for mixed grouping in experiment PGG
def restruct_data(source_file, dest_file):
    """Condense per-condition result rows into one row per group pair.

    Reads a '|'-separated CSV from ``source_file``. Every input row must
    provide the columns ``prompt``, ``prompt_cat``, ``group1``, ``group2``,
    ``group_type``, ``dependent_var`` and ``dv_prob``; ``sentiment`` is
    optional. Rows that share the same ``dependent_var`` and the same
    unordered pair of groups are merged into a single output row whose
    ``group1``/``group2`` hold the pair in sorted order.

    Each input row's ``dv_prob`` is stored in the column
    ``res_<slot>_<suffix>``, where ``slot`` is 1 when the row's ``group1`` is
    the first group of the sorted pair (2 otherwise) and ``suffix`` depends
    on ``prompt_cat``:

        control  -> c
        lin-rout -> lr_l
        rin-lout -> rl_r
        lout-rin -> lr_r
        rout-lin -> rl_l

    Rows with any other ``prompt_cat`` contribute no result value. The
    ``prompt``, ``group_type`` and ``sentiment`` of an output row come from
    the first input row seen for that pair.

    The result is written to ``dest_file`` as a '|'-separated CSV without the
    index (overwriting any existing file) and a confirmation is printed.

    Args:
        source_file: Path of the raw results CSV.
        dest_file: Path the restructured CSV is written to.

    Returns:
        None.
    """
    in_df = pd.read_csv(source_file, sep='|')

    suffix_by_category = {
        'control': 'c',
        'lin-rout': 'lr_l',
        'rin-lout': 'rl_r',
        'lout-rin': 'lr_r',
        'rout-lin': 'rl_l',
    }
    result_columns = [
        'res_1_c', 'res_1_lr_l', 'res_1_rl_l', 'res_1_lr_r', 'res_1_rl_r',
        'res_2_c', 'res_2_lr_l', 'res_2_rl_l', 'res_2_lr_r', 'res_2_rl_r'
    ]
    new_columns = in_df.columns.drop(['dv_prob', 'prompt_cat']).tolist() + result_columns

    # Output rows are collected as plain dicts and indexed by their merge key,
    # so each input row costs one dict lookup instead of a scan/concat of the
    # whole output frame.
    records = []
    record_by_key = {}

    for _, row in in_df.iterrows():
        group1 = row['group1']
        group2 = row['group2']
        dependent_var = row['dependent_var']
        prompt_cat = row['prompt_cat']
        dv_prob = row['dv_prob']
        # Sorting makes (X, Y) and (Y, X) land on the same output row.
        res_group1, res_group2 = sorted([group1, group2])

        key = (dependent_var, res_group1, res_group2)
        # A missing value never compares equal to anything, so a row with a
        # missing key component is never merged with another row.
        has_missing_key = any(pd.isna(part) for part in key)
        record = None if has_missing_key else record_by_key.get(key)

        is_new = record is None
        if is_new:
            record = {
                'prompt': row['prompt'],
                'group1': res_group1,
                'group2': res_group2,
                'group_type': row['group_type'],
                'dependent_var': dependent_var,
            }
            record.update(dict.fromkeys(result_columns))
            sentiment = row.get('sentiment', None)
            if sentiment is not None:
                record['sentiment'] = sentiment
            records.append(record)
            if not has_missing_key:
                record_by_key[key] = record

        suffix = suffix_by_category.get(prompt_cat)
        if suffix is None:
            # Unknown category: the row may open a new output row but
            # carries no result value.
            continue

        if is_new and suffix == 'c':
            # A control row that opens a new output row fills every control
            # slot its group matches (both when group1 == group2).
            if group1 == res_group1:
                record['res_1_c'] = dv_prob
            if group1 == res_group2:
                record['res_2_c'] = dv_prob
        else:
            # Also reached for a non-control row that opens a new output row,
            # so its value is kept regardless of input row order.
            slot = 1 if group1 == res_group1 else 2
            record[f'res_{slot}_{suffix}'] = dv_prob

    # dtype=object keeps every value exactly as it was read.
    restructured_df = pd.DataFrame(records, columns=new_columns, dtype=object)
    restructured_df.to_csv(dest_file, sep='|', index=False, mode='w')
    print('Successfully saved restructured data to: ', dest_file)

In [33]:
# Root folder that holds one sub-folder per experiment.
experiment_base_folder = '../data'

# Experiments to restructure in this run; commented-out entries are skipped.
experiments = [
    # 'Exp1-basic',
    # 'Exp2-DG',
    # 'Exp3-PGG',
    # 'Exp4-CYD',
    # 'Exp5-FAA',
    # 'Exp6-WM',
    'Exp7-THS',
    'Exp8-TLS-N',
    'Exp8-TLS-P',
    'Exp9-TRB',
]

# Models whose result files are restructured; commented-out entries are
# skipped. Mistral-7B and SOLAR-10.7B are the last two entries.
models = [
    'bert-base-uncased',
    'openai-community_gpt2',
    # 'debiased_model_bert-base-uncased_gender',
    # 'debiased_model_bert-base-uncased_race',
    'openlm-research_open_llama_7b',
    'PKU-Alignment_alpaca-7b-reproduced',
    'PKU-Alignment_beaver-7b-v1.0',
    'tiiuae_falcon-7b',
    'tiiuae_falcon-7b-instruct',
    'mistralai_Mistral-7B-v0.1',
    'upstage_SOLAR-10.7B-v1.0',
]

# One path stem per (experiment, model) pair, experiments varying slowest.
result_file_bases = [
    f'{experiment_base_folder}/{exp}/results_{mod}'
    for exp in experiments
    for mod in models
]

# Each '<stem>.csv' is read and its condensed form written next to it as
# '<stem>_restruct.csv'.
for file_base in result_file_bases:
    restruct_data(f'{file_base}.csv', f'{file_base}_restruct.csv')

Successfully saved restructured data to:  ../data/Exp7-THS/results_bert-base-uncased_restruct.csv
Successfully saved restructured data to:  ../data/Exp7-THS/results_openai-community_gpt2_restruct.csv
Successfully saved restructured data to:  ../data/Exp7-THS/results_openlm-research_open_llama_7b_restruct.csv
Successfully saved restructured data to:  ../data/Exp7-THS/results_PKU-Alignment_alpaca-7b-reproduced_restruct.csv
Successfully saved restructured data to:  ../data/Exp7-THS/results_PKU-Alignment_beaver-7b-v1.0_restruct.csv
Successfully saved restructured data to:  ../data/Exp7-THS/results_tiiuae_falcon-7b_restruct.csv
Successfully saved restructured data to:  ../data/Exp7-THS/results_tiiuae_falcon-7b-instruct_restruct.csv
Successfully saved restructured data to:  ../data/Exp7-THS/results_mistralai_Mistral-7B-v0.1_restruct.csv
Successfully saved restructured data to:  ../data/Exp7-THS/results_upstage_SOLAR-10.7B-v1.0_restruct.csv
Successfully saved restructured data to:  ../data/Ex