Skip to content

Commit af5db07

Browse files
author
Hao Shen
committed
Correct the expected chunk file size by adding the chunk header size
1 parent b9aa903 commit af5db07

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

src/litdata/streaming/item_loader.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,17 @@ def load_item_from_chunk(
141141
del self._chunk_filepaths[chunk_filepath]
142142

143143
if chunk_filepath not in self._chunk_filepaths:
144-
exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= chunk_bytes
144+
# Compute the size of the chunk header in bytes.
145+
# The header holds the number of items (1 value) plus the offsets (number of items in the chunk + 1 values),
146+
# each encoded with the header dtype (np.uint32, i.e. 4 bytes per value).
147+
chunk_header_bytes = (1 + self._chunks[chunk_index]["chunk_size"] + 1) * 4
148+
filesize_bytes = chunk_bytes + chunk_header_bytes
149+
150+
exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= filesize_bytes
145151

146152
while not exists:
147153
sleep(0.1)
148-
exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= chunk_bytes
154+
exists = os.path.exists(chunk_filepath) and os.stat(chunk_filepath).st_size >= filesize_bytes
149155

150156
self._chunk_filepaths[chunk_filepath] = True
151157

0 commit comments

Comments
 (0)