From af5db07a3fb03ebcb7eaefea497d2615671bcfe6 Mon Sep 17 00:00:00 2001 From: Hao Shen Date: Thu, 10 Oct 2024 14:07:42 -0400 Subject: [PATCH] correct the chunk size by adding header size --- src/litdata/streaming/item_loader.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/litdata/streaming/item_loader.py b/src/litdata/streaming/item_loader.py index 17a2d5342..296720902 100644 --- a/src/litdata/streaming/item_loader.py +++ b/src/litdata/streaming/item_loader.py @@ -141,11 +141,17 @@ def load_item_from_chunk( del self._chunk_filepaths[chunk_filepath] if chunk_filepath not in self._chunk_filepaths: - exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= chunk_bytes + # Get size of chunk header + # The number of items + the number of offsets (number of items in the chunk + 1) + # multiplied by the item size of the header encoding dtype (np.uint32, i.e. 4 bytes) + chunk_header_bytes = (1 + self._chunks[chunk_index]["chunk_size"] + 1) * 4 + filesize_bytes = chunk_bytes + chunk_header_bytes + + exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= filesize_bytes while not exists: sleep(0.1) - exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= chunk_bytes + exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= filesize_bytes self._chunk_filepaths[chunk_filepath] = True