LoicGrobol · LoicGrobol · Jul 28, 2023 · Jul 28, 2023 · Jul 28, 2023 · Jul 28, 2023
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,7 +19,7 @@ keywords = ["nlp", "transformers", "language-model"]
 requires-python = ">=3.8"
 dependencies = [
     "click >= 8.0.4, < 9.0.0",
-    "datasets >= 2.2, < 2.14",
+    "datasets >= 2.2, < 2.15",
     "filelock",
     "jsonlines",
     "loguru",
@@ -68,6 +68,12 @@ zeldarose-transformer = "zeldarose.train_transformer:main"
 line-length = 100
 select = ["A", "B", "C90", "E", "F", "N", "NPY", "PT", "PTH", "PYI", "S", "W"]
 
+[tool.ruff.per-file-ignores]
+"tests/**/*.py" = [
+    "S101", # asserts allowed in tests
+    "ARG", # Unused function args -> fixtures nevertheless are functionally relevant
+]
+
 
 [tool.mypy]
 warn_unreachable = true
@@ -78,6 +84,3 @@ line-length = 100
 
 [tool.isort]
 profile = "black"
-
-[tool.bandit]
-assert_used.skips = ["tests/test_*.py"]
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
@@ -25,14 +25,16 @@ def test_train_tokenizer(
     tmp_path: pathlib.Path,
 ):
     ret = script_runner.run(
-        "zeldarose-tokenizer",
-        "--vocab-size",
-        "4096",
-        "--out-path",
-        str(tmp_path / "tokenizer"),
-        "--model-name",
-        "my-muppet",
-        str(raw_text_path),
+        [
+            "zeldarose-tokenizer",
+            "--vocab-size",
+            "4096",
+            "--out-path",
+            str(tmp_path / "tokenizer"),
+            "--model-name",
+            "my-muppet",
+            str(raw_text_path),
+        ]
     )
     assert ret.success
 
@@ -57,25 +59,27 @@ def test_train_mbart(
         extra_args.extend(["--num-devices", str(devices)])
 
     ret = script_runner.run(
-        "zeldarose-transformer",
-        "--accelerator",
-        accelerator,
-        "--config",
-        str(mbart_task_config),
-        "--tokenizer",
-        str(mbart_model_config),
-        "--model-config",
-        str(mbart_model_config),
-        "--device-batch-size",
-        "2",
-        "--out-dir",
-        str(tmp_path / "train-out"),
-        "--cache-dir",
-        str(tmp_path / "cache"),
-        "--val-text",
-        str(translation_dataset_path),
-        str(translation_dataset_path),
-        *extra_args,
+        [
+            "zeldarose-transformer",
+            "--accelerator",
+            accelerator,
+            "--config",
+            str(mbart_task_config),
+            "--tokenizer",
+            str(mbart_model_config),
+            "--model-config",
+            str(mbart_model_config),
+            "--device-batch-size",
+            "2",
+            "--out-dir",
+            str(tmp_path / "train-out"),
+            "--cache-dir",
+            str(tmp_path / "cache"),
+            "--val-text",
+            str(translation_dataset_path),
+            str(translation_dataset_path),
+            *extra_args,
+        ],
         env={"TORCH_DISTRIBUTED_DEBUG": "DETAIL", **os.environ},
     )
     assert ret.success
@@ -102,25 +106,27 @@ def test_train_mlm(
         extra_args.extend(["--num-devices", str(devices)])
 
     ret = script_runner.run(
-        "zeldarose-transformer",
-        "--accelerator",
-        accelerator,
-        "--config",
-        str(mlm_task_config),
-        "--tokenizer",
-        str(tokenizer_name_or_path),
-        "--model-config",
-        str(mlm_model_config),
-        "--device-batch-size",
-        "8",
-        "--out-dir",
-        str(tmp_path / "train-out"),
-        "--cache-dir",
-        str(tmp_path / "tokenizer-cache"),
-        "--val-text",
-        str(raw_text_path),
-        str(raw_text_path),
-        *extra_args,
+        [
+            "zeldarose-transformer",
+            "--accelerator",
+            accelerator,
+            "--config",
+            str(mlm_task_config),
+            "--tokenizer",
+            str(tokenizer_name_or_path),
+            "--model-config",
+            str(mlm_model_config),
+            "--device-batch-size",
+            "8",
+            "--out-dir",
+            str(tmp_path / "train-out"),
+            "--cache-dir",
+            str(tmp_path / "tokenizer-cache"),
+            "--val-text",
+            str(raw_text_path),
+            str(raw_text_path),
+            *extra_args,
+        ],
         env={"TORCH_DISTRIBUTED_DEBUG": "DETAIL", **os.environ},
     )
     assert ret.success
@@ -151,25 +157,27 @@ def test_train_rtd(
     if num_devices is not None:
         extra_args.extend(["--num-devices", str(num_devices)])
     ret = script_runner.run(
-        "zeldarose-transformer",
-        "--accelerator",
-        accelerator,
-        "--config",
-        str(rtd_task_config),
-        "--tokenizer",
-        str(tokenizer_name_or_path),
-        "--model-config",
-        str(rtd_model_config),
-        "--device-batch-size",
-        "8",
-        "--out-dir",
-        str(tmp_path / "train-out"),
-        "--cache-dir",
-        str(tmp_path / "tokenizer-cache"),
-        "--val-text",
-        str(raw_text_path),
-        str(raw_text_path),
-        *extra_args,
+        [
+            "zeldarose-transformer",
+            "--accelerator",
+            accelerator,
+            "--config",
+            str(rtd_task_config),
+            "--tokenizer",
+            str(tokenizer_name_or_path),
+            "--model-config",
+            str(rtd_model_config),
+            "--device-batch-size",
+            "8",
+            "--out-dir",
+            str(tmp_path / "train-out"),
+            "--cache-dir",
+            str(tmp_path / "tokenizer-cache"),
+            "--val-text",
+            str(raw_text_path),
+            str(raw_text_path),
+            *extra_args,
+        ],
         env={"TORCH_DISTRIBUTED_DEBUG": "DETAIL", **os.environ},
     )
     assert ret.success
@@ -182,25 +190,27 @@ def test_train_mlm_with_remote_dataset(
     tmp_path: pathlib.Path,
 ):
     ret = script_runner.run(
-        "zeldarose-transformer",
-        "--strategy",
-        "ddp_spawn",
-        "--num-devices",
-        "2",
-        "--config",
-        str(mlm_task_config),
-        "--tokenizer",
-        "lgrobol/roberta-minuscule",
-        "--model-config",
-        "lgrobol/roberta-minuscule",
-        "--device-batch-size",
-        "8",
-        "--out-dir",
-        str(tmp_path / "train-out"),
-        "--cache-dir",
-        str(tmp_path / "tokenizer-cache"),
-        "--val-text",
-        remote_raw_text,
-        remote_raw_text,
+        [
+            "zeldarose-transformer",
+            "--strategy",
+            "ddp_spawn",
+            "--num-devices",
+            "2",
+            "--config",
+            str(mlm_task_config),
+            "--tokenizer",
+            "lgrobol/roberta-minuscule",
+            "--model-config",
+            "lgrobol/roberta-minuscule",
+            "--device-batch-size",
+            "8",
+            "--out-dir",
+            str(tmp_path / "train-out"),
+            "--cache-dir",
+            str(tmp_path / "tokenizer-cache"),
+            "--val-text",
+            remote_raw_text,
+            remote_raw_text,
+        ]
     )
     assert ret.success
diff --git a/zeldarose/datasets/transform.py b/zeldarose/datasets/transform.py
@@ -26,7 +26,10 @@ def encode_dataset(
     logger.info(f"Loading data from {text_path}")
     try:
         full_dataset = datasets.load_dataset("text", data_files=str(text_path), split="train")
-    except FileNotFoundError as e:
+    # So far this is the cleanest way we have found to detect that a dataset is remote.
+    # With datasets < 2.14 the failed load raised FileNotFoundError; with 2.14 it raises
+    # datasets.builder.DatasetGenerationError instead, and later versions may change it again.
+    except (FileNotFoundError, datasets.builder.DatasetGenerationError) as e:
         if isinstance(text_path, str):
             dataset_name, dataset_config, dataset_split = text_path.split(":")
             full_dataset = datasets.load_dataset(