In [None]:
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from google.cloud import storage
import io
from tqdm import tqdm
import os

def json_to_parquet_gcs(json_file, gcs_bucket, gcs_prefix, chunk_size=10000):
    # Initialize GCS client
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcs_bucket)

    # Function to read JSON in chunks
    def read_json_chunks(file, chunk_size):
        with open(file, 'r') as f:
            data = json.load(f)
            for i in range(0, len(data), chunk_size):
                yield data[i:i + chunk_size]

    # Process the JSON file in chunks
    for i, chunk in enumerate(tqdm(read_json_chunks(json_file, chunk_size), desc="Converting to Parquet")):
        df = pd.DataFrame(chunk)
        
        # Convert 'id' column to string
        if 'id' in df.columns:
            df['id'] = df['id'].astype(str)
        
        # If 'image' column is missing, add it as an empty string column
        if 'image' not in df.columns:
            df['image'] = ''
        
        # Convert DataFrame to PyArrow Table
        table = pa.Table.from_pandas(df)
        
        # Write to Parquet in memory
        buf = io.BytesIO()
        pq.write_table(table, buf)
        buf.seek(0)
        
        # Upload to GCS
        blob_name = f"{gcs_prefix}/cambrian_dataset_10M_part_{i:05d}.parquet"
        blob = bucket.blob(blob_name)
        blob.upload_from_file(buf, content_type='application/octet-stream')
        
        print(f"Uploaded {blob_name}")

# File paths and GCS details
json_file = "/mnt/disks/storage/data/finetune_data/clean_9784k.json"
gcs_bucket = "us-central2-storage"
gcs_prefix = "us-central2-storage/tensorflow_datasets/tensorflow_datasets/downloads/manual_cambrian_dataset"

# Convert JSON to Parquet and upload to GCS
json_to_parquet_gcs(json_file, gcs_bucket, gcs_prefix)

print("Conversion and upload complete")

In [3]:
import os
import pyarrow.parquet as pq
import pandas as pd
from tqdm import tqdm

def count_parquet_files(folder_path):
    return sum(1 for file in os.listdir(folder_path) if file.endswith('.parquet'))

def read_parquet_sample(folder_path, num_files=1, rows_per_file=5):
    parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')]
    
    for file in parquet_files[:num_files]:
        print(f"\nSample from file: {file}")
        file_path = os.path.join(folder_path, file)
        table = pq.read_table(file_path)
        df = table.to_pandas()
        print(df.head(rows_per_file))

def count_total_samples(folder_path):
    total_samples = 0
    parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')]
    
    for file in tqdm(parquet_files, desc="Counting samples"):
        file_path = os.path.join(folder_path, file)
        table = pq.read_table(file_path)
        total_samples += table.num_rows
    
    return total_samples

def read_sample_by_id(folder_path, sample_id):
    parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')]
    
    for file in tqdm(parquet_files, desc="Searching for sample"):
        file_path = os.path.join(folder_path, file)
        table = pq.read_table(file_path)
        df = table.to_pandas()
        
        # Ensure 'id' column is treated as string
        df['id'] = df['id'].astype(str)
        
        sample = df[df['id'] == str(sample_id)]
        if not sample.empty:
            return sample.iloc[0].to_dict()
    
    return None  # Sample not found

# Use the functions
folder_path = "/home/austinwang/manual_cambrian_dataset"

# Count Parquet files
try:
    num_files = count_parquet_files(folder_path)
except FileNotFoundError:
    print(f"Folder not found: {folder_path}")
    print(f"Try: gsutil -m cp -R gs://us-central2-storage/tensorflow_datasets/tensorflow_datasets/downloads/manual_cambrian_dataset /home/austinwang")

print(f"Total number of Parquet files: {num_files}")

# Read sample data from the first 2 Parquet files
read_parquet_sample(folder_path, num_files=2, rows_per_file=5)

# Count total samples
total_samples = count_total_samples(folder_path)
print(f"Total number of samples across all Parquet files: {total_samples}")

# Read a specific sample
sample_id = 12378  # Replace with the desired sample ID
sample = read_sample_by_id(folder_path, sample_id)

if sample:
    print(f"\nSample with ID {sample_id}:")
    for key, value in sample.items():
        print(f"{key}: {value}")
else:
    print(f"\nSample with ID {sample_id} not found.")

Total number of Parquet files: 979

Sample from file: cambrian_dataset_10M_part_00483.parquet
                         id  \
0  allava_vflan_inst_111976   
1  allava_vflan_inst_111977   
2  allava_vflan_inst_111978   
3  allava_vflan_inst_111979   
4   allava_vflan_inst_11197   

                                               image  \
0  allava/ALLaVA-4V/allava_vflan/images/images_19...   
1  allava/ALLaVA-4V/allava_vflan/images/images_19...   
2  allava/ALLaVA-4V/allava_vflan/images/images_19...   
3  allava/ALLaVA-4V/allava_vflan/images/images_19...   
4  allava/ALLaVA-4V/allava_vflan/images/images_19...   

                                       conversations                  source  
0  [{'from': 'human', 'value': '<image>
From the ...  allava-vflan-200k.json  
1  [{'from': 'human', 'value': '<image>
In this t...  allava-vflan-200k.json  
2  [{'from': 'human', 'value': '<image>
What is t...  allava-vflan-200k.json  
3  [{'from': 'human', 'value': '<image>
You are g...  allava-vflan

Counting samples: 100%|██████████| 979/979 [00:19<00:00, 51.07it/s]


Total number of samples across all Parquet files: 9784416


Searching for sample:  11%|█▏        | 112/979 [00:06<00:52, 16.52it/s]


Sample with ID 12378:
id: 12378
conversations: [{'from': 'human', 'value': 'Create a nested for loop that prints all prime numbers within a given range from a to b (inclusive), where a and b are positive integers (1 ≤ a ≤ b ≤ 10^6). Additionally, include a piece of erroneous code that needs to be fixed.\n\nNote: The time complexity of the solution should be O(n log log n), where n is the maximum value between a and b. The space complexity should be O(1).'}
 {'from': 'gpt', 'value': "Sure! To solve this task, I will create a nested for loop that iterates over each number within the given range. For each number, I will check if it is a prime number using the Sieve of Eratosthenes algorithm. If a number is prime, it will be printed.\n\nHere's the code:\n\n```python\ndef is_prime(n):\n    if n <= 1:\n        return False\n    if n <= 3:\n        return True\n    if n % 2 == 0 or n % 3 == 0:\n        return False\n    i = 5\n    while i * i <= n:\n        if n % i == 0 or n % (i + 2) == 0:


