LabStrangeLoop · ayeganov · Sep 4, 2025 · Aug 29, 2025 · Aug 29, 2025 · Sep 1, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,7 +28,6 @@ dev = [
 ]
 
 [tool.pytest.ini_options]
-asyncio_mode = "auto"
 
 [tool.mypy]
 python_version = "3.12"

diff --git a/scratchgpt/dataloader.py b/scratchgpt/dataloader.py
@@ -2,12 +2,16 @@
 from pathlib import Path
 from typing import Literal, override
 
+import numpy as np
 import torch
 from torch import Tensor
 from torch.utils.data import Dataset
+from tqdm import tqdm
 
 from .tokenizer.base_tokenizer import Tokenizer
 
+DEFAULT_DTYPE = np.dtype(np.uint16)
+
 
 class TextProvider(ABC):
     @abstractmethod
@@ -40,17 +44,13 @@ def __init__(self, dir_path: Path) -> None:
             raise ValueError(f"Directory path {dir_path} is not a directory")
 
         self._data = ""
+        file_paths = list(dir_path.rglob("*"))
         print(f"Loading data from {dir_path}")
-        total_read: int = 0
-        for idx, file_path in enumerate(dir_path.rglob("*")):
+        for file_path in tqdm(file_paths, desc="Reading data files", unit="file"):
             if file_path.is_file() and not file_path.name.startswith("."):
                 with open(file_path, encoding="utf-8") as f:
                     self._data += f.read() + "\n"
 
-            if idx % 500 == 1:
-                total_read += 500
-                print(f"Read {total_read} files")
-
         print("Data Loaded")
 
     @override
@@ -93,3 +93,26 @@ def __getitem__(self, idx: int) -> tuple[Tensor, Tensor]:
         block = self.data[idx : idx + self.block_size]
         target = self.data[idx + 1 : idx + self.block_size + 1]
         return block, target
+
+
+class PretokenizedDataset(Dataset[tuple[Tensor, Tensor]]):
+    def __init__(
+        self,
+        token_file: Path,
+        block_size: int,
+        dtype: np.dtype = DEFAULT_DTYPE,
+    ) -> None:
+        super().__init__()
+        self.block_size = block_size
+
+        all_tokens = np.memmap(token_file, dtype=dtype, mode="c")
+        self.data = torch.from_numpy(all_tokens)
+
+    def __len__(self) -> int:
+        return max(0, len(self.data) - self.block_size)
+
+    def __getitem__(self, idx: int) -> tuple[Tensor, Tensor]:
+        block = self.data[idx : idx + self.block_size]
+        target = self.data[idx + 1 : idx + self.block_size + 1]
+
+        return block.long(), target.long()
diff --git a/scratchgpt/preprocess.py b/scratchgpt/preprocess.py
@@ -0,0 +1,126 @@
+import io
+from pathlib import Path
+from typing import Any, Protocol
+
+import numpy as np
+from numpy.typing import DTypeLike
+from tqdm import tqdm
+
+from .tokenizer.base_tokenizer import Tokenizer
+
+
+class SupportsUpdate(Protocol):
+    def update(self, n: int) -> Any: ...
+
+
+class Preprocessor(Protocol):
+    """
+    Preprocessor protocol for handling dataset conversion using a specific tokenizer.
+    """
+
+    def __call__(
+        self,
+        source: io.TextIOBase,
+        sink: io.BufferedIOBase,
+        chunk_size: int,
+        pbar: SupportsUpdate | None = None,
+    ) -> None:
+        """
+        Process the input text source and write the result to the binary sink.
+        Optionally updates a tqdm progress bar.
+        """
+
+
+class TokenizerPreprocessor(Preprocessor):
+    """
+    Default pre-processor. Tokenizes a text stream and writes the output
+    to a binary stream, managing progress updates internally.
+    """
+
+    def __init__(self, tokenizer: Tokenizer) -> None:
+        self.tokenizer = tokenizer
+        vocab_size = self.tokenizer.vocab_size
+        if vocab_size < 2**8:
+            self.dtype: DTypeLike = np.uint8
+        elif vocab_size < 2**16:
+            self.dtype = np.uint16
+        elif vocab_size < 2**32:
+            self.dtype = np.uint32
+        else:
+            self.dtype = np.uint64
+        print(f"Preprocessor initialized. Selected {np.dtype(self.dtype).name} for token storage.")
+
+    def __call__(
+        self,
+        source: io.TextIOBase,
+        sink: io.BufferedIOBase,
+        chunk_size: int = 10 * 1024 * 1024,
+        pbar: SupportsUpdate | None = None,
+    ) -> None:
+        """
+        Reads from the source stream, tokenizes content in chunks, writes to the
+        sink stream, and updates the provided progress bar.
+        """
+        while chunk := source.read(chunk_size):
+            tokens = self.tokenizer.encode(chunk)
+            token_array = np.array(tokens, dtype=self.dtype)
+            sink.write(token_array.tobytes())
+            if pbar:
+                pbar.update(len(chunk.encode("utf-8", errors="ignore")))
+
+
+class File2FileTokenizerPreprocessor:
+    """
+    Orchestrates preprocessing for a single source file to a single destination file.
+    """
+
+    def __init__(self, tokenizer: Tokenizer) -> None:
+        self._preprocessor = TokenizerPreprocessor(tokenizer)
+
+    def __call__(self, input_path: Path, output_path: Path, chunk_size: int = 10 * 1024 * 1024) -> None:
+        if not input_path.is_file():
+            raise ValueError(f"Input path must be a file: {input_path}")
+        if output_path.exists():
+            raise FileExistsError(f"Output path already exists: {output_path}")
+
+        total_size = input_path.stat().st_size
+
+        with (
+            open(input_path, encoding="utf-8", errors="ignore") as source,
+            open(output_path, "wb") as sink,
+            tqdm(total=total_size, unit="B", unit_scale=True, desc=f"Tokenizing {input_path.name}") as pbar,
+        ):
+            self._preprocessor(source, sink, chunk_size, pbar)
+
+        print(f"Successfully preprocessed '{input_path}' to '{output_path}'")
+
+
+class Folder2FileTokenizerPreprocessor:
+    """
+    Orchestrates preprocessing for a directory of source files to a single destination file.
+    """
+
+    def __init__(self, tokenizer: Tokenizer) -> None:
+        self._preprocessor = TokenizerPreprocessor(tokenizer)
+
+    def __call__(self, input_path: Path, output_path: Path, chunk_size: int = 10 * 1024 * 1024) -> None:
+        if not input_path.is_dir():
+            raise ValueError(f"Input path must be a directory: {input_path}")
+        if output_path.exists():
+            raise FileExistsError(f"Output path already exists: {output_path}")
+
+        files_to_process = [p for p in input_path.rglob("*") if p.is_file() and not p.name.startswith(".")]
+        total_size = sum(p.stat().st_size for p in files_to_process)
+
+        print(f"Found {len(files_to_process)} files to process.")
+
+        with (
+            open(output_path, "wb") as sink,
+            tqdm(total=total_size, unit="B", unit_scale=True, desc=f"Tokenizing Folder '{input_path.name}'") as pbar,
+        ):
+            for file_path in files_to_process:
+                pbar.set_postfix_str(f"Processing: {file_path.name}", refresh=True)
+                with open(file_path, encoding="utf-8", errors="ignore") as source:
+                    self._preprocessor(source, sink, chunk_size, pbar)
+
+        print(f"\nSuccessfully preprocessed folder '{input_path}' to '{output_path}'")