diff --git a/pipeline/mimicit_utils/mimicit_dataset.py b/pipeline/mimicit_utils/mimicit_dataset.py
index 2b9dabdc..a4d9b3e1 100755
--- a/pipeline/mimicit_utils/mimicit_dataset.py
+++ b/pipeline/mimicit_utils/mimicit_dataset.py
@@ -217,10 +217,26 @@ def __init__(self, args, dataset_info, task_group=""):
                 ]
             )
 
-            if cur_images_path != "" and cur_images_path.endswith(".parquet") and cur_images_path not in loaded_images_path:
-                cur_df = pd.read_parquet(cur_images_path, columns=None)  # not in memory
-                self.images.append(cur_df)
-                loaded_images_path.add(cur_images_path)
+            if cur_images_path != "" and cur_images_path not in loaded_images_path:
+                if cur_images_path.endswith(".parquet"):
+                    parquet_file = pq.ParquetFile(cur_images_path)
+                    dfs = []  # List to hold the DataFrames of each batch
+                    for batch in parquet_file.iter_batches(batch_size=1000):  # Adjust batch_size as needed
+                        batch_df = batch.to_pandas()
+                        dfs.append(batch_df)
+                    cur_df = pd.concat(dfs, ignore_index=True)  # Concatenate all DataFrames
+                    self.images.append(cur_df)
+                    loaded_images_path.add(cur_images_path)
+                elif cur_images_path.endswith(".json"):
+                    with open(cur_images_path, "rb") as f:
+                        cur_df = pd.DataFrame(orjson.loads(f.read()))
+                    self.images.append(cur_df)
+                    loaded_images_path.add(cur_images_path)
+                else:
+                    master_print(f"Error: {cur_images_path} is not supported!")
+                    import pdb
+                    pdb.set_trace()
+                del cur_df
             self.train_data_list.extend(resampled_train)
             self.train_config.update(cache_train_config)
 
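
For reference, a minimal standalone sketch of the loading pattern this hunk switches to: parquet files are read in record batches via pyarrow's `ParquetFile.iter_batches` and then concatenated, and JSON files are parsed with `orjson`. The helper name `load_image_frame` and the default `batch_size` are hypothetical, not part of the actual `MimicitDataset` code; this assumes a pyarrow version that provides `iter_batches`.

```python
# Standalone sketch of the batched loading pattern above (hypothetical helper).
import orjson
import pandas as pd
import pyarrow.parquet as pq


def load_image_frame(path: str, batch_size: int = 1000) -> pd.DataFrame:
    """Load an images table from a .parquet or .json file into a DataFrame."""
    if path.endswith(".parquet"):
        parquet_file = pq.ParquetFile(path)
        # Convert each RecordBatch to pandas as it is read, instead of
        # materializing the whole Arrow table at once with pd.read_parquet.
        dfs = [batch.to_pandas() for batch in parquet_file.iter_batches(batch_size=batch_size)]
        return pd.concat(dfs, ignore_index=True)
    elif path.endswith(".json"):
        # orjson parses the raw bytes; the resulting mapping becomes the DataFrame.
        with open(path, "rb") as f:
            return pd.DataFrame(orjson.loads(f.read()))
    raise ValueError(f"{path} is not supported!")
```

In the diff itself, the resulting DataFrame is appended to `self.images` and the path is recorded in `loaded_images_path`, so each parquet/JSON file is only loaded once even when multiple dataset entries point at it.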