In [1]:
import os
import pandas as pd

In [2]:
def select_source_files(file_names, prefix="self", suffix=".parquet"):
    """Return the file names to merge, in a deterministic (sorted) order.

    Only names that start with ``prefix`` and end with ``suffix`` are kept, so
    other parquet files in the folder (for example the merged output written by
    an earlier run of this cell) are never fed back into the merge.

    Args:
        file_names: Iterable of bare file names (no directory part).
        prefix: Required start of a source file name.
        suffix: Required end of a source file name.

    Returns:
        A sorted list of the matching names; empty when nothing matches.
    """
    return sorted(
        name
        for name in file_names
        if name.startswith(prefix) and name.endswith(suffix)
    )


current_dir = os.getcwd()

# Every parquet file in the working directory, and the subset that is merged.
parquet_files = [f for f in os.listdir(current_dir) if f.endswith(".parquet")]
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the merged frame identical from run to run.
source_files = select_source_files(parquet_files)
output_file = os.path.join(current_dir, "O1-OPEN-OpenO1-SFT-zh-ru-vi-merged.parquet")

# Always bind merged_df so a later cell can never pick up a stale frame left in
# the kernel by a previous run.
merged_df = pd.DataFrame()

if not parquet_files:
    print("No .parquet files found in the current directory.")
elif not source_files:
    print("No .parquet files starting with 'self' found in the current directory.")
else:
    print(
        f"Found {len(source_files)} source .parquet files "
        f"(of {len(parquet_files)} in the directory). Merging..."
    )
    frames = []
    failed_files = []

    for file in source_files:
        file_path = os.path.join(current_dir, file)
        try:
            df = pd.read_parquet(file_path)
            frames.append(df)
            print(f"Merged: {file}")
        except Exception as e:
            # Best effort: keep going, but remember the failure so the final
            # message does not claim a complete merge.
            failed_files.append(file)
            print(f"Error reading {file}: {e}")

    if not frames:
        print("No source files could be read; nothing was written.")
    else:
        # A single concat over the collected frames avoids re-copying the
        # accumulated rows on every iteration.
        merged_df = pd.concat(frames, ignore_index=True)
        try:
            merged_df.to_parquet(output_file, index=False)
            if failed_files:
                print(
                    f"Merged {len(frames)} files; "
                    f"skipped {len(failed_files)} unreadable: {failed_files}"
                )
            else:
                print("All files merged successfully.")
        except Exception as e:
            print(f"Error saving merged file: {e}")

Found 14 .parquet files. Merging...
Merged: self-generation-zh-ru-vi-10.parquet
Merged: self-generation-zh-ru-vi-11.parquet
Merged: self-generation-zh-ru-vi-12.parquet
Merged: self-generation-zh-ru-vi-13.parquet
Merged: self-generation-zh-ru-vi-2.parquet
Merged: self-generation-zh-ru-vi-3.parquet
Merged: self-generation-zh-ru-vi-4.parquet
Merged: self-generation-zh-ru-vi-5.parquet
Merged: self-generation-zh-ru-vi-6.parquet
Merged: self-generation-zh-ru-vi-7.parquet
Merged: self-generation-zh-ru-vi-8.parquet
Merged: self-generation-zh-ru-vi-9.parquet
Merged: self-generation-zh-ru-vi.parquet
All files merged successfully.


In [3]:
# Drop rows that repeat the same prompt together with both responses, printing
# the frame's shape before and after so the number of removed rows is visible.
# NOTE: the merged parquet file was already written in the previous cell, so
# the copy on disk does not reflect this de-duplication.
dedup_columns = ["prompt", "response_a", "response_b"]
shape_before = merged_df.shape
print(shape_before)
merged_df.drop_duplicates(subset=dedup_columns, inplace=True)
shape_after = merged_df.shape
print(shape_after)

(11500, 8)
(11500, 8)
