In [1]:
import pandas as pd
import warnings
# Install a global filter that ignores every warning raised after this line.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# from the cells below; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
def count_equivalent_rows(df1, df2):
    """Count the distinct prompt/response-pair combinations shared by two frames.

    Two rows are treated as equivalent when they have the same ``prompt`` and
    the same two responses, regardless of which response is stored in
    ``response_a`` and which in ``response_b``.

    Args:
        df1: DataFrame with ``prompt``, ``response_a`` and ``response_b`` columns.
        df2: DataFrame with the same three columns.

    Returns:
        int: Number of distinct canonical keys that occur in both frames.
        Rows repeated inside one frame are counted once.

    Raises:
        KeyError: If one of the three columns is missing from a frame.
        TypeError: If the two responses of a row cannot be ordered against
            each other (for example a string and ``None``) or a value is
            not hashable.
    """

    def _canonical_keys(df):
        # Keys are tuples rather than a delimiter-joined string: joining with
        # "," makes ("a,b", "c", "d") and ("a", "b,c", "d") indistinguishable.
        # Zipping the columns reads values positionally, so the result does
        # not depend on the frame's index and the inputs are never modified.
        return {
            (prompt, *sorted((resp_a, resp_b)))
            for prompt, resp_a, resp_b in zip(
                df["prompt"], df["response_a"], df["response_b"]
            )
        }

    return len(_canonical_keys(df1) & _canonical_keys(df2))

# Load both datasets, then report how many prompt/response combinations they share.
df1, df2 = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "./argilla-dpo-mix-7k.parquet",
        "./mlabonne-orpo-dpo-mix-40k.parquet",
    )
)
common_count = count_equivalent_rows(df1=df1, df2=df2)
print(f"There are {common_count} rows where all three columns are identical, including cases where `response_a` and `response_b` are interchangeable.")

There are 220 rows where all three columns are identical, including cases where `response_a` and `response_b` are interchangeable.


In [3]:
def merge_and_deduplicate(dataframes, random_state=42):
    """Concatenate preference datasets, drop mirrored duplicates, balance winners.

    Steps:
      1. Concatenate the frames under a fresh ``0..n-1`` index.
      2. Put every row into canonical order (``response_a <= response_b``) so
         a pair and its mirrored copy become identical; ``winner`` is flipped
         for each row whose sides are swapped.
      3. Keep the first row of each ``(prompt, response_a, response_b)`` group.
      4. Swap sides on ``diff // 2`` randomly sampled rows of the majority
         label so ``model_a`` and ``model_b`` wins differ by at most one.
         These rows are intentionally no longer in canonical order.

    Args:
        dataframes: Iterable of DataFrames that each provide ``prompt``,
            ``response_a``, ``response_b`` and ``winner`` columns.
        random_state: Seed forwarded to ``DataFrame.sample`` when choosing
            the rows to swap in step 4. The default of 42 matches the value
            that used to be hard-coded.

    Returns:
        pandas.DataFrame: The de-duplicated, balanced rows. Index labels are
        those of the concatenated frame (the index is not reset). The input
        frames are not modified.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If ``dataframes`` is empty (raised by ``pd.concat``).
    """
    # Column pairs that describe the two sides of a comparison.
    # NOTE(review): ``model_a``/``model_b`` are assumed to name the producers
    # of ``response_a``/``response_b``; they are swapped together with the
    # responses only when both columns exist -- confirm against the schema.
    side_columns = (("response_a", "response_b"), ("model_a", "model_b"))
    flipped_winner = {"model_a": "model_b", "model_b": "model_a"}

    def _swap_sides(df, labels):
        # Swap the A/B sides in place for the rows identified by `labels`.
        if len(labels) == 0:
            return
        for left, right in side_columns:
            if left in df.columns and right in df.columns:
                df.loc[labels, [left, right]] = df.loc[labels, [right, left]].to_numpy()
        # Only the two side labels are flipped; any other value (e.g. a tie)
        # stays unchanged instead of being rewritten to "model_b".
        df.loc[labels, "winner"] = df.loc[labels, "winner"].map(
            lambda label: flipped_winner.get(label, label)
        )

    combined_df = pd.concat(dataframes, ignore_index=True)

    # Rows with a missing response cannot be ordered; they are left as-is.
    responses = combined_df[["response_a", "response_b"]]
    orderable = responses[responses.notna().all(axis=1)]
    out_of_order = (orderable["response_a"] > orderable["response_b"]).to_numpy(dtype=bool)
    _swap_sides(combined_df, orderable.index[out_of_order])

    # Explicit copy so the balancing step below writes to an independent frame
    # rather than to a possible view of `combined_df`.
    deduplicated_df = combined_df.drop_duplicates(
        subset=["prompt", "response_a", "response_b"]
    ).copy()

    model_a_wins = deduplicated_df[deduplicated_df["winner"] == "model_a"]
    model_b_wins = deduplicated_df[deduplicated_df["winner"] == "model_b"]

    # Moving diff // 2 rows from the majority to the minority label evens
    # the two counts out (to within one row when diff is odd).
    rows_needed = abs(len(model_a_wins) - len(model_b_wins)) // 2
    if rows_needed:
        majority = model_a_wins if len(model_a_wins) > len(model_b_wins) else model_b_wins
        rows_to_swap = majority.sample(rows_needed, random_state=random_state)
        _swap_sides(deduplicated_df, rows_to_swap.index)

    return deduplicated_df

# Read the three source datasets, echoing each shape right after it is loaded.
df1 = pd.read_parquet("./argilla-dpo-mix-7k.parquet")
print(df1.shape)

df2 = pd.read_parquet("./mlabonne-orpo-dpo-mix-40k.parquet")
print(df2.shape)

df3 = pd.read_parquet("./Intel-orca_dpo_pairs.parquet")
print(df3.shape)

# Combine, de-duplicate and balance, then report how many rows were dropped.
final_df = merge_and_deduplicate(dataframes=[df1, df2, df3])
print(final_df.shape)
print("The number of duplicate rows is: ", sum(map(len, (df1, df2, df3))) - len(final_df))
print(final_df["winner"].value_counts())

# Persist the merged dataset; the trailing expression previews its first rows.
final_df.to_parquet(path="hf_dpo_combined.parquet")
final_df.head(n=5)

(1291, 8)
(39758, 8)
(12859, 8)
(52617, 8)
The number of duplicate rows is:  1291
winner
model_b    26309
model_a    26308
Name: count, dtype: int64


Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,cXKrCdseUJ8va4PKf4QrT,Q:Question: how old julio cesar chavez when he...,"Yes, the information you found on Google is c...","No, the answer provided does not directly answ...",model_a,Unknown,Unknown,English
1,yEaDBnhKgKRZdOtZajBX9,"Definition: Given a sentence in German, provid...",Sure! Here's how I used the definition to com...,一般的に葉は長さが1.5mmから4mmで、幅は0.2mmから0.7mmです。\n\nTo c...,model_b,Unknown,Unknown,English
2,pwJXHWcTw8GcA3NxgtIQa,How does the sentence end? See options at the ...,"Sure, I'd be happy to help you with the task!...","To find out how the sentence ends, we need to ...",model_b,Unknown,Unknown,English
3,eeVFSlegHdvcUC1mYj1mu,Q:Suggest a movie title for the following movi...,"Based on the provided movie plot, here are so...","""Rings of Destiny: The Battle for Middle-earth""",model_a,Unknown,Unknown,English
4,amALFkWY1JHZZoDhenwjw,Two brothers had bike repair shops in neighbor...,"Sure, I'd be happy to help! Based on the info...",Bob's Town\nB: Rob's Town\n\nTo answer this qu...,model_b,Unknown,Unknown,English
