<a href="https://colab.research.google.com/github/Neovalle/H4rmony/blob/main/Imploding_and_Exploding_H4rmony_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Converting the dataset to one row per unique Prompt with its ranked Completions (Imploding)

In [None]:
!pip install datasets
from datasets import load_dataset
import pandas as pd

# Fetch the H4rmony dataset and view its 'train' split as a pandas DataFrame.
dataset = load_dataset("neovalle/H4rmony")
df = pd.DataFrame(dataset['train'])

# Only the 'R1-R2' and 'R1-R3' comparisons are needed to recover the three
# ranked completions of each prompt.
df_r1_r2 = df[df['ComparedRanks'] == 'R1-R2']
df_r1_r3 = df[df['ComparedRanks'] == 'R1-R3']


def _first_value_or_none(frame, prompt_id, column):
    """Return `column` from the first row of `frame` whose PromptID equals
    `prompt_id`, or None when no row matches."""
    matches = frame[frame['PromptID'] == prompt_id]
    if matches.empty:
        return None
    return matches[column].iloc[0]


# Column-oriented accumulator for the imploded dataset (one entry per prompt).
new_data = {
    'PromptID': [],
    'Prompt': [],
    'BetterAnswer': [],
    'Ambivalent': [],
    'WorseAnswer': [],
}

# Visit each distinct PromptID in order of first appearance.
for prompt_id in df['PromptID'].unique():
    # All values are computed before anything is appended, so the columns
    # can never end up with different lengths.
    record = (
        prompt_id,
        df[df['PromptID'] == prompt_id]['Prompt'].iloc[0],
        _first_value_or_none(df_r1_r2, prompt_id, 'BetterCompletion'),
        _first_value_or_none(df_r1_r2, prompt_id, 'WorseCompletion'),
        _first_value_or_none(df_r1_r3, prompt_id, 'WorseCompletion'),
    )
    # new_data keeps its insertion order, which lines up with `record`.
    for column, value in zip(new_data, record):
        new_data[column].append(value)

# Assemble the accumulated columns into the imploded DataFrame.
new_df = pd.DataFrame(new_data)

# new_df.to_csv('transformed_dataset.csv', index=False)


### Rebuilding the pairwise dataset from the imploded rows (Exploding)

In [None]:
# Rebuild the pairwise dataset from new_df: every row of new_df produces three
# comparison rows, in the order 'R1-R2', 'R1-R3', 'R2-R3'.
#
# Rows are gathered in a plain list and converted to a DataFrame once at the
# end. The previous per-row DataFrame.append calls no longer work on
# pandas >= 2.0 (the method was removed) and copied the whole frame each time.
rebuilt_rows = []

# Iterate through new_df to rebuild the original structure
for _, row in new_df.iterrows():
    # Completions of this prompt keyed by rank: R1 is the best, R3 the worst.
    by_rank = {
        'R1': row['BetterAnswer'],
        'R2': row['Ambivalent'],
        'R3': row['WorseAnswer'],
    }
    for better_rank, worse_rank in (('R1', 'R2'), ('R1', 'R3'), ('R2', 'R3')):
        rebuilt_rows.append({
            'PromptID': row['PromptID'],
            'Prompt': row['Prompt'],
            'BetterCompletion': by_rank[better_rank],
            'WorseCompletion': by_rank[worse_rank],
            'ComparedRanks': f'{better_rank}-{worse_rank}',
        })

# Passing `columns` fixes the column order and keeps the expected (empty)
# layout even when new_df has no rows.
rebuilt_df = pd.DataFrame(
    rebuilt_rows,
    columns=['PromptID', 'Prompt', 'BetterCompletion', 'WorseCompletion', 'ComparedRanks'],
)


# rebuilt_df.to_csv('rebuilt_dataset.csv', index=False)
