LabStrangeLoop · ayeganov · Sep 12, 2025 · Sep 12, 2025 · Sep 12, 2025 · Sep 12, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,18 +8,26 @@ authors = [
 ]
 readme = "README.md"
 requires-python = ">=3.12"
+
+keywords = ["transformer", "language-model", "deep-learning", "pytorch", "gpt", "tokenizer"]
+
 classifiers = [
     "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Development Status :: 3 - Alpha",  # alternatives as the project matures: "4 - Beta", "5 - Production/Stable"
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
 ]
+license = {file = "LICENSE"}
+
 dependencies = [
     "numpy>=2.3.2",
     "ptflops>=0.7.5",
     "pydantic-settings>=2.10.1",
     "pydantic-yaml>=1.6.0",
-    "tiktoken>=0.11.0",
     "torch>=2.8.0",
     "tqdm>=4.67.1",
     "types-tqdm>=4.67.0.20250809",
@@ -93,4 +101,3 @@ build-backend = "hatchling.build"
 [project.scripts]
 train = "scratchgpt.train:main"
 infer = "scratchgpt.infer:main"
-tiktoken = "scratchgpt.tokenizer.tiktoken:main"
diff --git a/scratchgpt/__init__.py b/scratchgpt/__init__.py
@@ -20,7 +20,6 @@
     TokenizerLoadFailedError,
     get_best_model_weights_path,
     get_latest_model_weights_path,
-    get_tokenizer,
     get_tokenizer_path,
     load_model,
     load_tokenizer,
@@ -33,7 +32,6 @@
 )
 from scratchgpt.tokenizer.char_tokenizer import CharTokenizer, Utf8Tokenizer
 from scratchgpt.tokenizer.hf_tokenizer import HuggingFaceTokenizer
-from scratchgpt.tokenizer.tiktoken import TiktokenWrapper
 from scratchgpt.training.trainer import Trainer, get_dtype_for_vocab_size
 
 __all__ = [
@@ -52,7 +50,6 @@
     "load_model",
     "load_tokenizer",
     "save_tokenizer",
-    "get_tokenizer",
     "get_best_model_weights_path",
     "get_latest_model_weights_path",
     "get_tokenizer_path",
@@ -65,7 +62,6 @@
     "CharTokenizer",
     "Utf8Tokenizer",
     "HuggingFaceTokenizer",
-    "TiktokenWrapper",
     # Training
     "Trainer",
     "get_dtype_for_vocab_size",

diff --git a/scratchgpt/tokenizer/tiktoken.py b/scratchgpt/tokenizer/tiktoken.py