Skip to content

Commit

Permalink
Update mimicit_dataset.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
Luodian committed Dec 10, 2023
1 parent 5a7d017 commit 3a74688
Showing 1 changed file with 20 additions and 4 deletions.
24 changes: 20 additions & 4 deletions pipeline/mimicit_utils/mimicit_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,10 +217,26 @@ def __init__(self, args, dataset_info, task_group=""):
]
)

if cur_images_path != "" and cur_images_path.endswith(".parquet") and cur_images_path not in loaded_images_path:
cur_df = pd.read_parquet(cur_images_path, columns=None) # not in memory
self.images.append(cur_df)
loaded_images_path.add(cur_images_path)
if cur_images_path != "" and cur_images_path not in loaded_images_path:
if cur_images_path.endswith(".parquet"):
parquet_file = pq.ParquetFile(cur_images_path)
dfs = [] # List to hold the DataFrames of each batch
for batch in parquet_file.iter_batches(batch_size=1000): # Adjust batch_size as needed
batch_df = batch.to_pandas()
dfs.append(batch_df)
cur_df = pd.concat(dfs, ignore_index=True) # Concatenate all DataFrames
self.images.append(cur_df)
loaded_images_path.add(cur_images_path)
elif cur_images_path.endswith(".json"):
with open(cur_images_path, "rb") as f:
cur_df = pd.DataFrame(orjson.loads(f.read()))
self.images.append(cur_df)
loaded_images_path.add(cur_images_path)
else:
master_print(f"Error: {cur_images_path} is not supported!")
import pdb
pdb.set_trace()
del cur_df

self.train_data_list.extend(resampled_train)
self.train_config.update(cache_train_config)
Expand Down

0 comments on commit 3a74688

Please sign in to comment.