This repository has been archived by the owner on Nov 21, 2022. It is now read-only.

sparseml integration #197

Merged
merged 74 commits on Sep 30, 2021
Changes from 25 commits
74 commits
a5b499f
initial commit for sparseml callback
mathemusician Sep 14, 2021
c1864de
typos
mathemusician Sep 14, 2021
0ae5818
typo
mathemusician Sep 14, 2021
5c6ce32
path for models
mathemusician Sep 14, 2021
b13aa51
move sparseml as callback
mathemusician Sep 14, 2021
4ceedd6
simplified change to one file for sparseml callback
mathemusician Sep 14, 2021
8d94e63
small bug with import sparseml
mathemusician Sep 14, 2021
1b5fc82
test change
mathemusician Sep 14, 2021
40bf8e3
override callback with sparseml
mathemusician Sep 15, 2021
6e5010b
revert to not overriding callback
mathemusician Sep 15, 2021
2532c8e
added default
mathemusician Sep 15, 2021
487bbe6
added self to defaults
mathemusician Sep 15, 2021
54e7934
still trying to get hydra to play nice
mathemusician Sep 15, 2021
276706b
finally got sparseml to work
mathemusician Sep 15, 2021
061ceba
added sparseml to plugins
mathemusician Sep 15, 2021
2f88b52
sparseml can now appropriately handle large inputs
mathemusician Sep 16, 2021
db4097d
wandb logging
mathemusician Sep 16, 2021
8cf2dd6
logging is a lot more complicated than I thought it would be
mathemusician Sep 16, 2021
a50ab24
finally got loggin to work
mathemusician Sep 16, 2021
7db8793
typo
mathemusician Sep 16, 2021
31244ff
reformatted with black, add export to small model
mathemusician Sep 16, 2021
1ca7488
allow saving of full model
mathemusician Sep 16, 2021
d3d77f1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 17, 2021
e35884f
get rid of automatic addition of sparseml from deepspeed and sharded …
mathemusician Sep 17, 2021
2bdf62e
Merge branch 'master' of https://github.com/mathemusician/lightning-t…
mathemusician Sep 17, 2021
57d4c15
wandb to wab, LightningBoltsSparseMLCallback to TransformerSparseMLCa…
mathemusician Sep 20, 2021
c38be9b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 20, 2021
c0ca012
get rid of wandblogger
mathemusician Sep 20, 2021
f49debf
Merge branch 'master' of https://github.com/mathemusician/lightning-t…
mathemusician Sep 20, 2021
e303dba
compute metrics only if labels are non -1
mathemusician Sep 20, 2021
afe2198
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 20, 2021
97b88c2
get rid of dependency on environment variables
mathemusician Sep 20, 2021
916e998
Merge branch 'master' of https://github.com/mathemusician/lightning-t…
mathemusician Sep 20, 2021
3863bef
add dependency to sparseml
mathemusician Sep 21, 2021
622b5ca
add logger to sparseml trainer
mathemusician Sep 21, 2021
b4bb8cc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 22, 2021
c227247
Fix imports
Sep 22, 2021
aeb3b6e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 22, 2021
0bc39ca
Fix
Sep 22, 2021
73a2331
Indent
Sep 22, 2021
5de4dd1
Merge branch 'master' into sparseml
Sep 22, 2021
461a6ef
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 22, 2021
f9080b3
Add requirement for bolts
Sep 22, 2021
8eee3e3
Update torch
Sep 22, 2021
b162d4a
add license
mathemusician Sep 23, 2021
f8d53cd
made WABlogger test
mathemusician Sep 23, 2021
7be9464
delete unnecessary folders that were made during testing phase
mathemusician Sep 23, 2021
6b8a398
added callback unit test
mathemusician Sep 23, 2021
f18648a
handling of pure tensors from datamodule into model during onnx expor…
mathemusician Sep 23, 2021
d3fa2f5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 23, 2021
6805e55
typo
mathemusician Sep 23, 2021
60a9ca2
Merge branch 'master' of https://github.com/mathemusician/lightning-t…
mathemusician Sep 23, 2021
cff9a44
PEP8 type checking to isinstance
mathemusician Sep 23, 2021
d872b0e
get rid of unused import, comparison using unittest.assertequals
mathemusician Sep 23, 2021
220cab6
skip callbacks test if the wrong torch version is downloaded
mathemusician Sep 23, 2021
93b2014
correct way to check for instance of collections.ordereddict
mathemusician Sep 23, 2021
275398c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 23, 2021
372dd5c
skip test if wandb does not exist
mathemusician Sep 23, 2021
d334d61
Merge branch 'master' of https://github.com/mathemusician/lightning-t…
mathemusician Sep 23, 2021
0efcb29
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 23, 2021
e05235b
skip loggers test if wandb exists
mathemusician Sep 23, 2021
9abde72
Merge branch 'master' of https://github.com/mathemusician/lightning-t…
mathemusician Sep 23, 2021
635bfd9
PEP8
mathemusician Sep 23, 2021
3a8595b
Add WANDB to available list and test install
Sep 27, 2021
88708cf
Add version
Sep 28, 2021
dcc4f49
Skip tests for windows
Sep 29, 2021
f1af3a0
Revert "Skip tests for windows"
Sep 29, 2021
31c900e
Skip for windows
Sep 29, 2021
dd3d8dd
fixed skipifs
mathemusician Sep 30, 2021
556d4b0
sparseml documentation
mathemusician Sep 30, 2021
f10bc34
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 30, 2021
1f0fe55
add sparseml documentation to rst tree
mathemusician Sep 30, 2021
92244f1
Merge branch 'master' of https://github.com/mathemusician/lightning-t…
mathemusician Sep 30, 2021
2788bea
Merge branch 'master' into sparseml
Sep 30, 2021
3 changes: 3 additions & 0 deletions conf/trainer/callbacks/sparseml.yaml
@@ -0,0 +1,3 @@
_target_: lightning_transformers.core.callback.LightningBoltsSparseMLCallback
output_dir: ${env:MODELS_PATH}
recipe_path: ${env:RECIPE_PATH}
1 change: 1 addition & 0 deletions conf/trainer/logger/sparsewandb.yaml
@@ -0,0 +1 @@
_target_: lightning_transformers.core.loggers.WANDBLogger
3 changes: 3 additions & 0 deletions conf/trainer/sparseml.yaml
@@ -0,0 +1,3 @@
defaults:
- default # inherit from default trainer conf
- callbacks: sparseml
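For context, conf/trainer/callbacks/sparseml.yaml above resolves to an ordinary callback instantiation. Below is a minimal sketch of the equivalent plain Python, assuming the MODELS_PATH and RECIPE_PATH environment variables point at an output directory and a SparseML recipe file (the example values in the comments are placeholders):

import os

from lightning_transformers.core.callback import LightningBoltsSparseMLCallback

# Equivalent of conf/trainer/callbacks/sparseml.yaml: the ${env:...} resolver
# reads the two environment variables when the config is composed.
callback = LightningBoltsSparseMLCallback(
    output_dir=os.environ["MODELS_PATH"],    # e.g. "./models" (placeholder)
    recipe_path=os.environ["RECIPE_PATH"],   # e.g. "./recipes/pruning.yaml" (placeholder)
)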
112 changes: 112 additions & 0 deletions lightning_transformers/core/callback.py
@@ -11,11 +11,123 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import inspect
import os
import time
from typing import Any, Dict, List, Optional, Union

import numpy
import onnxruntime
import torch
from pl_bolts.callbacks import SparseMLCallback
from pytorch_lightning import Callback
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.utilities import rank_zero_info
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from sparseml.pytorch.utils import ModuleExporter
from sparseml.pytorch.utils.logger import WANDBLogger
from torch import Tensor


class LightningBoltsSparseMLCallback(SparseMLCallback):

def __init__(self, output_dir, recipe_path):
self.output_dir = output_dir
super().__init__(recipe_path=recipe_path)

def on_init_end(self, trainer: "pl.Trainer") -> None:
if isinstance(trainer.logger, WANDBLogger):
trainer.logger.__init__(init_kwargs={"project": "lightning-transformers"})

def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
optimizer = trainer.optimizers

if len(optimizer) > 1:
raise MisconfigurationException("SparseML only supports training with one optimizer.")
optimizer = optimizer[0]

loggers = trainer.logger

if not isinstance(loggers, list):
loggers = [loggers]

self.manager.initialize(pl_module, epoch=0.0, logger=loggers)
self.manager.initialize_loggers(loggers)

optimizer = self.manager.modify(
pl_module, optimizer, steps_per_epoch=self._num_training_steps_per_epoch(trainer), epoch=0
)

trainer.optimizers = [optimizer]

@staticmethod
def export_to_sparse_onnx(
model: "LightningModule", output_dir: str, sample_batch: Optional[Tensor] = None, **kwargs
) -> None:
"""Exports the model to ONNX format."""
with model._prevent_trainer_and_dataloaders_deepcopy():
exporter = ModuleExporter(model.model, output_dir=output_dir)
sample_batch = sample_batch if sample_batch is not None else model.example_input_array
if sample_batch is None:
raise MisconfigurationException(
"To export the model, a sample batch must be passed via "
"``SparseMLCallback.export_to_sparse_onnx(model, output_dir, sample_batch=sample_batch)`` "
"or an ``example_input_array`` property within the LightningModule"
)

# the following is adapted from @natuan and @spacemanidol
sess = None
num_samples = 0

sample_inputs = os.path.join(output_dir, "sample-inputs")
sample_outputs = os.path.join(output_dir, "sample-outputs")
os.makedirs(sample_inputs, exist_ok=True)
os.makedirs(sample_outputs, exist_ok=True)

if sess is None:
forward_args_spec = inspect.getfullargspec(exporter._module.__class__.forward)
one_sample_input = collections.OrderedDict([(f, sample_batch[f][0].long().reshape(1, -1))
for f in forward_args_spec.args if f in sample_batch])

try:
exporter.export_onnx(sample_batch=one_sample_input, convert_qat=True, **kwargs)
exporter.export_onnx(
sample_batch=one_sample_input,
name="small_model.onnx",
convert_qat=True,
export_params=False,
**kwargs,
)
onnx_file = os.path.join(output_dir, "model.onnx")

except Exception:
raise RuntimeError("Error exporting ONNX models and/or inputs/outputs")

sess = onnxruntime.InferenceSession(onnx_file)

# add additional files for testing since this feature is very new
input_names = list(sample_batch.keys())
output_names = [o.name for o in sess.get_outputs()]
for input_vals in zip(*sample_batch.values()):
input_feed = {k: v.long().numpy() for k, v in zip(input_names, input_vals)}
output_vals = sess.run(output_names, {k: input_feed[k].reshape(1, -1) for k in input_feed})
output_dict = {name: numpy.squeeze(val) for name, val in zip(output_names, output_vals)}
file_idx = f"{num_samples}".zfill(4)
numpy.savez(f"{sample_inputs}/inp-{file_idx}.npz", **input_feed)
numpy.savez(f"{sample_outputs}/out-{file_idx}.npz", **output_dict)
num_samples += 1

def teardown(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None:
sample_batch = next(iter(trainer.train_dataloader))
# if asked for output names, bert's ModelOutput gives two names
# but when run, the model only gives one output
# workaround is just to force onnx to realize there is only one output
output_names = ["logits"]
self.export_to_sparse_onnx(
output_dir=self.output_dir, model=pl_module, sample_batch=sample_batch, output_names=output_names
)


class CUDACallback(Callback):
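As a rough usage sketch of the LightningBoltsSparseMLCallback above (the task module, recipe path, and output directory below are placeholders, not part of this PR): the callback attaches to a Lightning Trainer like any other callback; on_fit_start wraps the optimizer with the SparseML manager, and teardown exports the trained model to ONNX along with sample inputs and outputs.

import pytorch_lightning as pl

from lightning_transformers.core.callback import LightningBoltsSparseMLCallback

# MyTextClassificationTask stands in for any LightningModule-based task;
# define or import your own task here.
model = MyTextClassificationTask()

trainer = pl.Trainer(
    max_epochs=3,
    callbacks=[
        LightningBoltsSparseMLCallback(
            output_dir="models/sparse",          # placeholder output directory
            recipe_path="recipes/pruning.yaml",  # placeholder SparseML recipe
        )
    ],
)
trainer.fit(model)
# teardown() then writes model.onnx and small_model.onnx plus sample-inputs/
# and sample-outputs/ under output_dir for verification.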
92 changes: 92 additions & 0 deletions lightning_transformers/core/loggers.py
@@ -0,0 +1,92 @@
import time
from typing import Dict, Optional, Union

from pytorch_lightning.loggers import WandbLogger
from sparseml.pytorch.utils.logger import LambdaLogger


class WANDBLogger(WandbLogger):
"""
Modifier logger that handles outputting values to Weights and Biases.

:param init_kwargs: the args to call into wandb.init with;
ex: wandb.init(**init_kwargs). If not supplied, then init will not be called
:param name: name given to the logger, used for identification;
defaults to wandb
:param enabled: True to log, False otherwise
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.enabled = True

def _lambda_func(
self,
tag: Optional[str],
value: Optional[float],
values: Optional[Dict[str, float]],
step: Optional[int],
wall_time: Optional[float],
) -> bool:
params = {}

if value is not None:
params[tag] = value

if values:
if tag:
values = {f"{tag}/{key}": val for key, val in values.items()}
params.update(values)

try:
self.log_metrics(params, step=step)
except Exception as e:
print(params, e)

return True

def log_scalar(
self,
tag: str,
value: float,
step: Union[None, int] = None,
wall_time: Union[None, float] = None,
):
"""
:param tag: identifying tag to log the value with
:param value: value to save
:param step: global step for when the value was taken
:param wall_time: global wall time for when the value was taken,
defaults to time.time()
:return: True if logged, False otherwise.
"""
if not self.enabled:
return False

if not wall_time:
wall_time = time.time()

return self._lambda_func(tag, value, None, step, wall_time)

def log_scalars(
self,
tag: str,
values: Dict[str, float],
step: Union[None, int] = None,
wall_time: Union[None, float] = None,
):
"""
:param tag: identifying tag to log the values with
:param values: values to save
:param step: global step for when the values were taken
:param wall_time: global wall time for when the values were taken,
defaults to time.time()
:return: True if logged, False otherwise.
"""
if not self.enabled:
return False

if not wall_time:
wall_time = time.time()

return self._lambda_func(tag, None, values, step, wall_time)
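
A short usage sketch for the WANDBLogger above; the project name, tags, and metric values are illustrative only.

from lightning_transformers.core.loggers import WANDBLogger

# Keyword arguments pass straight through to pytorch_lightning's WandbLogger;
# "lightning-transformers" is an illustrative project name.
logger = WANDBLogger(project="lightning-transformers")

# Log a single scalar under a tag; wall_time defaults to time.time().
logger.log_scalar(tag="train/loss", value=0.42, step=100)

# Log several related scalars at once; keys are prefixed with the tag.
logger.log_scalars(tag="eval", values={"accuracy": 0.91, "f1": 0.89}, step=100)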