In [None]:
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from google.cloud import storage
import io
from tqdm import tqdm
import os

def json_to_parquet_gcs(json_file, gcs_bucket, gcs_prefix, chunk_size=10000):
    """Split a JSON array file into Parquet parts and upload each part to GCS.

    The file is parsed in one go with ``json.load`` (so it has to fit in
    memory) and then sliced into runs of at most ``chunk_size`` records. Each
    run is written to an in-memory Parquet buffer and uploaded as
    ``<gcs_prefix>/cambrian_dataset_10M_part_<index:05d>.parquet``.

    Args:
        json_file: Path to a local JSON file whose top-level value is a list
            of records.
        gcs_bucket: Name of the destination bucket.
        gcs_prefix: Object-name prefix inside the bucket. Leading and trailing
            slashes are ignored; an empty prefix uploads to the bucket root.
        chunk_size: Maximum number of records per Parquet part (>= 1).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        TypeError: If the top-level JSON value is not a list.
    """
    # A negative step would make the range below empty and silently upload
    # nothing; zero would fail with an unrelated range() error.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # Initialize GCS client
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcs_bucket)

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    # The handle is released as soon as parsing is done instead of staying
    # open for the whole upload loop.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise TypeError(
            f"Expected a JSON array at the top level of {json_file}, "
            f"got {type(data).__name__}"
        )

    # Avoid "prefix//name" (trailing slash) and "/name" (empty prefix).
    prefix = gcs_prefix.strip('/')

    # Iterating a range (which has a length) lets tqdm show a real total.
    chunk_starts = range(0, len(data), chunk_size)
    for i, start in enumerate(tqdm(chunk_starts, desc="Converting to Parquet")):
        df = pd.DataFrame(data[start:start + chunk_size])

        # Convert 'id' column to string
        if 'id' in df.columns:
            df['id'] = df['id'].astype(str)

        # A missing image is represented by an empty string. Apply that both
        # when the whole chunk lacks the column and when only some rows do,
        # so every part encodes "no image" the same way.
        if 'image' not in df.columns:
            df['image'] = ''
        else:
            df['image'] = df['image'].fillna('')

        # Convert DataFrame to PyArrow Table
        table = pa.Table.from_pandas(df)

        # Write to Parquet in memory
        buf = io.BytesIO()
        pq.write_table(table, buf)
        buf.seek(0)

        # Upload to GCS
        part_name = f"cambrian_dataset_10M_part_{i:05d}.parquet"
        blob_name = f"{prefix}/{part_name}" if prefix else part_name
        blob = bucket.blob(blob_name)
        blob.upload_from_file(buf, content_type='application/octet-stream')

        # tqdm.write keeps the progress bar intact, unlike a bare print().
        tqdm.write(f"Uploaded {blob_name}")

# File paths and GCS details
json_file = "/mnt/disks/storage/data/finetune_data/clean_9784k.json"
gcs_bucket = "us-central2-storage"
# Object-name prefix *inside* the bucket. It must not repeat the bucket name,
# otherwise the parts land under "us-central2-storage/us-central2-storage/..."
# and the lookup cell, which reads from
# "tensorflow_datasets/tensorflow_datasets/downloads/manual_cambrian_dataset",
# cannot find them.
gcs_prefix = "tensorflow_datasets/tensorflow_datasets/downloads/manual_cambrian_dataset"

# Convert JSON to Parquet and upload to GCS
json_to_parquet_gcs(json_file, gcs_bucket, gcs_prefix)

print("Conversion and upload complete")

In [1]:
from google.cloud import storage
import pyarrow.parquet as pq
import pandas as pd
from io import BytesIO

def read_sample_by_id(bucket_name, prefix, sample_id):
    """Return the first record whose 'id' equals ``sample_id``, or None.

    Every object under ``prefix`` in ``bucket_name`` whose name ends in
    '.parquet' is downloaded in full and loaded into a DataFrame, in the
    order the bucket listing yields them. The search stops at the first file
    that contains a match.

    Args:
        bucket_name: Name of the GCS bucket to search.
        prefix: Object-name prefix used to list candidate blobs.
        sample_id: ID to look for; compared as a string.

    Returns:
        The first matching row as a dict (with 'id' as a string), or None
        when no scanned file contains the ID.
    """
    wanted_id = str(sample_id)
    client = storage.Client()

    for candidate in client.bucket(bucket_name).list_blobs(prefix=prefix):
        # Skip anything under the prefix that is not a Parquet part.
        if not candidate.name.endswith('.parquet'):
            continue

        payload = BytesIO(candidate.download_as_bytes())
        frame = pq.read_table(payload).to_pandas()

        # Compare IDs as strings regardless of the stored column type.
        frame['id'] = frame['id'].astype(str)
        matches = frame[frame['id'] == wanted_id]

        if matches.empty:
            continue
        return matches.iloc[0].to_dict()

    # No scanned file contained the requested ID.
    return None

# Look up a single record in the uploaded Parquet parts and print its fields.
bucket_name = "us-central2-storage"
prefix = "tensorflow_datasets/tensorflow_datasets/downloads/manual_cambrian_dataset"
sample_id = 12378  # Replace with the desired sample ID

sample = read_sample_by_id(bucket_name, prefix, sample_id)

if not sample:
    print(f"Sample with ID {sample_id} not found.")
else:
    print(f"Sample with ID {sample_id}:")
    # One "column: value" line per field of the matching row.
    for key, value in sample.items():
        print(f"{key}: {value}")

Sample with ID 12378:
id: 12378
image: ai2d/ai2d/images/1310.png
conversations: [{'from': 'human', 'value': '<image>\nWhat phase is shown above'}
 {'from': 'gpt', 'value': 'photosynthesis'}]
source: ai2d_15k.json
