In [1]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from huggingface_hub import HfApi, logging
import glob

# Convert every JSONL file under `jsonl_dir` to Parquet parts of `chunk_size`
# rows and upload each part to the Hugging Face dataset repository.
# Rows left over from one file are carried into the next file so that every
# part except the very last one holds exactly `chunk_size` rows.
jsonl_dir = "out_data/"
jsonl_list = glob.glob(f"{jsonl_dir}/*.jsonl")
jsonl_list.sort()

logging.set_verbosity_debug()
hf = HfApi()

chunk_size = 100000  # number of rows per uploaded Parquet part

# Rows that have been read but not uploaded yet: at most the remainder of the
# previous files plus the file currently being processed.
temp_data = []

# Defined up front so the final flush below is safe when no JSONL file exists.
combined_df = None
dataset_name = None
i = 0


def _upload_part(frame, name, part_no):
    """Write `frame` to `<jsonl_dir>/<name>_part<part_no>.parquet` and upload it."""
    table = pa.Table.from_pandas(frame)
    parquet_path = f"{jsonl_dir}/{name}_part{part_no}.parquet"
    pq.write_table(table, parquet_path)

    hf.upload_file(path_or_fileobj=parquet_path,
                   path_in_repo=f"data/{name}_part{part_no}.parquet",
                   repo_id="kanhatakeyama/SyntheticText",
                   repo_type="dataset")


for path in jsonl_list:
    # Part numbering restarts for every input file; the file's base name keeps
    # the resulting Parquet names unique.
    i = 0
    filename = path.split("/")[-1]
    dataset_name = filename.split(".")[0]

    # Load the JSONL file (one JSON object per line).
    df = pd.read_json(path, lines=True)

    # Queue the new rows behind whatever is still pending.
    temp_data.append(df)
    combined_df = pd.concat(temp_data, ignore_index=True)

    # While a full chunk is available, cut it off and upload it.
    while len(combined_df) >= chunk_size:
        chunk = combined_df.iloc[:chunk_size]
        combined_df = combined_df.iloc[chunk_size:]

        _upload_part(chunk, dataset_name, i + 1)
        i += 1

    # Keep ONLY the rows that were not uploaded. Without this reset the next
    # iteration would re-concatenate (and re-upload) rows already sent.
    temp_data = [combined_df]

# Upload whatever is left over as the final (possibly short) part.
if combined_df is not None and len(combined_df) > 0:
    _upload_part(combined_df, dataset_name, i + 1)


  from .autonotebook import tqdm as notebook_tqdm
About to commit to the hub: 1 addition(s), 0 copie(s) and 0 deletion(s).
Request 16bec65e-627b-488f-bef0-92d999b01866: POST https://huggingface.co/api/datasets/kanhatakeyama/SyntheticText/preupload/main (authenticated: True)
Request beb54deb-5e99-44fa-a0a7-b48612ddcfe5: POST https://huggingface.co/datasets/kanhatakeyama/SyntheticText.git/info/lfs/objects/batch (authenticated: True)
Uploading 1 LFS file to the Hub
model_20240601103632_part1.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]Request b7086d28-6378-47e8-8350-ffdd308aae13: PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/81/c1/81c1e5e83b4f19f50fde8f3262cb4ae5f1e0b7a61ae309c8c3507a3c45ef1184/12d83cec787f0d5af89e22b543f94c893f0ae424e21c35e59b225af016b0db68?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQFN2FTF47%2F20240604%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240604T050124Z&X-Amz-Expires=86400&X-Amz-