In [1]:
import pandas as pd

# load the utterances table (JSON Lines, one record per line) from the
# HannahRoseKirk/prism-alignment dataset via the hf:// filesystem path
data = pd.read_json(
    "hf://datasets/HannahRoseKirk/prism-alignment/utterances.jsonl",
    lines=True,
)

In [2]:
def reconstruct_conversations(desired_model, df, output_dir='data'):
    """Rebuild chat transcripts for one model and save them as a CSV file.

    For every conversation of `desired_model`, the turns are walked in
    ascending order and the response flagged `if_chosen` is kept for each
    turn (the first one in row order if several are flagged). Each kept turn
    contributes a user message followed by an assistant message. Turns
    without a chosen response are skipped; a conversation with no chosen
    response at all is still emitted, with an empty message list.

    Args:
        desired_model: value of `model_name` to select; also used (with '/'
            replaced by '-') to build the output file name.
        df: DataFrame with the columns `model_name`, `conversation_id`,
            `turn`, `if_chosen`, `user_prompt` and `model_response`.
        output_dir: directory the CSV is written to; created if missing.

    Returns:
        DataFrame with one row per conversation and the columns
        `conversation_id` and `messages` (list of role/content dicts),
        i.e. the same table that is written to disk.
    """
    import os  # local import so the function does not rely on other cells

    # filter data by model
    filtered_data = df[df['model_name'] == desired_model]

    reconstructed_conversations = []

    # one pass over the conversations; sort=False keeps first-appearance order
    for conversation_id, conversation_data in filtered_data.groupby('conversation_id', sort=False):
        # select only chosen responses (.eq(True) also treats missing flags as not chosen)
        chosen_rows = conversation_data[conversation_data['if_chosen'].eq(True)]

        # sort by turn; the stable sort keeps row order inside a turn, so
        # drop_duplicates deterministically keeps the first chosen row per turn
        chosen_rows = (
            chosen_rows
            .sort_values(by='turn', kind='stable')
            .drop_duplicates(subset='turn', keep='first')
        )

        # append user prompt and model response for every kept turn
        conversation_messages = []
        for user_prompt, model_response in zip(chosen_rows['user_prompt'], chosen_rows['model_response']):
            conversation_messages.append({'role': 'user', 'content': user_prompt})
            conversation_messages.append({'role': 'assistant', 'content': model_response})

        # append reconstructed conversation
        reconstructed_conversations.append({'conversation_id': conversation_id, 'messages': conversation_messages})

    # create dataframe; explicit columns keep the header even when nothing matched
    reconstructed_df = pd.DataFrame(reconstructed_conversations, columns=['conversation_id', 'messages'])

    # make sure the target directory exists, otherwise to_csv raises OSError
    os.makedirs(output_dir, exist_ok=True)

    # '/' in model names would be read as a path separator, so replace it
    file_name_clean = desired_model.replace('/', '-')
    file_name = f"{file_name_clean}_reconstructed_conversations.csv"

    # NOTE: to_csv stores each `messages` list as its string representation;
    # it has to be parsed back (e.g. with ast.literal_eval) after reading the file
    reconstructed_df.to_csv(os.path.join(output_dir, file_name), index=False)

    return reconstructed_df

In [3]:
# distinct model names, in order of first appearance in the utterances table
models = data['model_name'].unique().tolist()

# write one reconstructed-conversations file per model
for model in models:
    reconstruct_conversations(desired_model=model, df=data)