In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json
from tqdm import tqdm
import os
from google.cloud import storage

def json_to_parquet(json_file, parquet_file, chunk_size=100000):
    """Convert a JSON file containing a top-level array into a Parquet file.

    The JSON document is parsed fully into memory with ``json.load`` and then
    sliced into runs of ``chunk_size`` elements. Each run becomes a DataFrame
    that is written with its own ``write_table`` call. The Arrow schema is
    inferred from the first chunk and reused for every later chunk.

    Per-chunk normalisation:
      * an ``id`` column, when present, is cast to ``str``;
      * an ``image`` column is added (filled with ``''``) when the chunk has none.

    If the JSON array is empty, no writer is created and no file is produced.

    Args:
        json_file: Path of the JSON file to read. The code slices the parsed
            value, so it is assumed to be a list (presumably of record dicts).
        parquet_file: Destination path handed to ``pq.ParquetWriter``.
        chunk_size: Number of elements per chunk; must be a positive integer.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A negative step would make range() yield nothing and silently skip
        # the conversion; zero would fail later with a less helpful message.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Binary mode lets json.load detect the UTF-8/16/32 encoding itself instead
    # of relying on the locale default, and the handle is released before the
    # (long) conversion loop starts.
    with open(json_file, 'rb') as f:
        data = json.load(f)

    schema = None
    writer = None

    try:
        # Iterating a range (rather than a generator) gives tqdm a known total.
        for start in tqdm(range(0, len(data), chunk_size), desc="Converting to Parquet"):
            df = pd.DataFrame(data[start:start + chunk_size])

            # Convert 'id' column to string
            if 'id' in df.columns:
                df['id'] = df['id'].astype(str)

            # If 'image' column is missing, add it as an empty string column
            if 'image' not in df.columns:
                df['image'] = ''

            if writer is None:
                # The first chunk fixes the schema for the whole file.
                schema = pa.Schema.from_pandas(df)
                writer = pq.ParquetWriter(parquet_file, schema)

            table = pa.Table.from_pandas(df, schema=schema)
            writer.write_table(table)
    finally:
        # Always close the writer, even when a chunk fails to convert, so the
        # file handle is not leaked.
        if writer is not None:
            writer.close()

def upload_to_gcs(local_file, gcs_path):
    """Upload a local file to the Google Cloud Storage object named by ``gcs_path``.

    Args:
        local_file: Path of the file on local disk to upload.
        gcs_path: Destination in the form ``gs://<bucket>/<object name>``.

    Raises:
        ValueError: If ``gcs_path`` does not start with ``gs://`` or lacks a
            bucket or object name.
    """
    scheme = "gs://"
    # Validate before creating a client so malformed paths fail fast instead of
    # silently selecting the wrong path segment as the bucket.
    if not gcs_path.startswith(scheme):
        raise ValueError(f"Expected a path of the form gs://bucket/object, got {gcs_path!r}")

    # Everything up to the first '/' is the bucket; the rest is the object name.
    bucket_name, _, blob_name = gcs_path[len(scheme):].partition('/')
    if not bucket_name or not blob_name:
        raise ValueError(f"Expected a path of the form gs://bucket/object, got {gcs_path!r}")

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    blob.upload_from_filename(local_file)
    print(f"File {local_file} uploaded to {gcs_path}")

# Source JSON, scratch Parquet location, and final GCS destination
json_file = "/mnt/disks/storage/data/finetune_data/clean_9784k.json"
local_parquet_file = "/tmp/cambrian_dataset_10M.parquet"
gcs_path = "gs://us-central2-storage/tensorflow_datasets/tensorflow_datasets/cambrian_dataset/cambrian_dataset_10M.parquet"

# Step 1: write the JSON contents out as a local Parquet file
json_to_parquet(json_file=json_file, parquet_file=local_parquet_file)

# Step 2: push the Parquet file to the bucket
upload_to_gcs(local_file=local_parquet_file, gcs_path=gcs_path)

# Step 3: drop the scratch copy once the upload call has returned
os.unlink(local_parquet_file)
print("Local Parquet file removed")

Converting to Parquet: 98it [02:57,  1.81s/it]


File /tmp/cambrian_dataset_10M.parquet uploaded to gs://us-central2-storage/tensorflow_datasets/tensorflow_datasets/cambrian_dataset/cambrian_dataset_10M.parquet
Local Parquet file removed


In [None]:
import pandas as pd

# The conversion cell deletes the local Parquet file after uploading it, so on a
# fresh top-to-bottom run it no longer exists here. Fetch it back from GCS when
# it is missing. Uses os, storage, gcs_path and local_parquet_file from the
# first cell.
if not os.path.exists(local_parquet_file):
    bucket_name, _, blob_name = gcs_path[len("gs://"):].partition("/")
    storage.Client().bucket(bucket_name).blob(blob_name).download_to_filename(local_parquet_file)

df = pd.read_parquet(local_parquet_file)