diff --git a/.actions/assistant.py b/.actions/assistant.py index 870e46d0a1202..bd134e2464dd0 100644 --- a/.actions/assistant.py +++ b/.actions/assistant.py @@ -431,6 +431,42 @@ def copy_replace_imports( source_dir, source_imports, target_imports, target_dir=target_dir, lightning_by=lightning_by ) + @staticmethod + def pull_docs_files( + gh_user_repo: str, + target_dir: str = "docs/source-pytorch/XXX", + checkout: str = "tags/1.0.0", + source_dir: str = "docs/source", + ) -> None: + """Pull docs pages from external source and append to local docs.""" + import zipfile + + zip_url = f"https://github.com/{gh_user_repo}/archive/refs/{checkout}.zip" + + with tempfile.TemporaryDirectory() as tmp: + zip_file = os.path.join(tmp, "repo.zip") + urllib.request.urlretrieve(zip_url, zip_file) + + with zipfile.ZipFile(zip_file, "r") as zip_ref: + zip_ref.extractall(tmp) + + zip_dirs = [d for d in glob.glob(os.path.join(tmp, "*")) if os.path.isdir(d)] + # check that the extracted archive has only repo folder + assert len(zip_dirs) == 1 + repo_dir = zip_dirs[0] + + ls_pages = glob.glob(os.path.join(repo_dir, source_dir, "*.rst")) + ls_pages += glob.glob(os.path.join(repo_dir, source_dir, "**", "*.rst")) + for rst in ls_pages: + rel_rst = rst.replace(os.path.join(repo_dir, source_dir) + os.path.sep, "") + rel_dir = os.path.dirname(rel_rst) + os.makedirs(os.path.join(_PROJECT_ROOT, target_dir, rel_dir), exist_ok=True) + new_rst = os.path.join(_PROJECT_ROOT, target_dir, rel_rst) + if os.path.isfile(new_rst): + logging.warning(f"Page {new_rst} already exists in the local tree so it will be skipped.") + continue + shutil.copy(rst, new_rst) + if __name__ == "__main__": import jsonargparse diff --git a/docs/source-pytorch/accelerators/hpu_basic.rst b/docs/source-pytorch/accelerators/hpu_basic.rst deleted file mode 100644 index 0302841f9b82e..0000000000000 --- a/docs/source-pytorch/accelerators/hpu_basic.rst +++ /dev/null @@ -1,109 +0,0 @@ -:orphan: - -.. _hpu_basics: - -Accelerator: HPU training -========================= -**Audience:** Users looking to save money and run large models faster using single or multiple Gaudi devices. - -.. warning:: This is an :ref:`experimental ` feature. - ----- - -What is an HPU? ---------------- - -`Habana® Gaudi® AI Processor (HPU) `__ training processors are built on a heterogeneous architecture with a cluster of fully programmable Tensor Processing Cores (TPC) along with its associated development tools and libraries, and a configurable Matrix Math engine. - -The TPC core is a VLIW SIMD processor with an instruction set and hardware tailored to serve training workloads efficiently. -The Gaudi memory architecture includes on-die SRAM and local memories in each TPC and, -Gaudi is the first DL training processor that has integrated RDMA over Converged Ethernet (RoCE v2) engines on-chip. - -On the software side, the PyTorch Habana bridge interfaces between the framework and SynapseAI software stack to enable the execution of deep learning models on the Habana Gaudi device. - -Gaudi offers a substantial price/performance advantage -- so you get to do more deep learning training while spending less. - -For more information, check out `Gaudi Architecture `__ and `Gaudi Developer Docs `__. - ----- - -Run on Gaudi ------------- - -To enable PyTorch Lightning to utilize the HPU accelerator, simply provide ``accelerator="hpu"`` parameter to the Trainer class. - -.. 
code-block:: python - - # run on as many Gaudi devices as available by default - trainer = Trainer(accelerator="auto", devices="auto", strategy="auto") - # equivalent to - trainer = Trainer() - - # run on one Gaudi device - trainer = Trainer(accelerator="hpu", devices=1) - # run on multiple Gaudi devices - trainer = Trainer(accelerator="hpu", devices=8) - # choose the number of devices automatically - trainer = Trainer(accelerator="hpu", devices="auto") - - -The ``devices>1`` parameter with HPUs enables the Habana accelerator for distributed training. -It uses :class:`~lightning.pytorch.strategies.hpu_parallel.HPUParallelStrategy` internally which is based on DDP -strategy with the addition of Habana's collective communication library (HCCL) to support scale-up within a node and -scale-out across multiple nodes. - ----- - -Scale-out on Gaudis -------------------- - -To train a Lightning model using multiple HPU nodes, set the ``num_nodes`` parameter with the available nodes in the ``Trainer`` class. - -.. code-block:: python - - trainer = Trainer(accelerator="hpu", devices=8, strategy="hpu_parallel", num_nodes=2) - -In addition to this, the following environment variables need to be set to establish communication across nodes. Check out the documentation on :doc:`Cluster Environment <../clouds/cluster>` for more details. - -- *MASTER_PORT* - required; has to be a free port on machine with NODE_RANK 0 -- *MASTER_ADDR* - required (except for NODE_RANK 0); address of NODE_RANK 0 node -- *WORLD_SIZE* - required; how many workers are in the cluster -- *NODE_RANK* - required; id of the node in the cluster - -The trainer needs to be instantiated on every node participating in the training. - -On Node 1: - -.. code-block:: bash - - MASTER_ADDR= MASTER_PORT= NODE_RANK=0 WORLD_SIZE=16 - python -m some_model_trainer.py (--arg1 ... train script args...) - -On Node 2: - -.. code-block:: bash - - MASTER_ADDR= MASTER_PORT= NODE_RANK=1 WORLD_SIZE=16 - python -m some_model_trainer.py (--arg1 ... train script args...) - ----- - -How to access HPUs ------------------- - -To use HPUs, you must have access to a system with HPU devices. - -AWS -^^^ -You can either use `Gaudi-based AWS EC2 DL1 instances `__ or `Supermicro X12 Gaudi server `__ to get access to HPUs. - -Check out the `PyTorch Model on AWS DL1 Instance Quick Start `__. - ----- - -.. _known-limitations_hpu: - -Known limitations ------------------ - -* `Habana dataloader `__ is not supported. diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst deleted file mode 100644 index 0391209a438eb..0000000000000 --- a/docs/source-pytorch/accelerators/hpu_intermediate.rst +++ /dev/null @@ -1,101 +0,0 @@ -:orphan: - -.. _hpu_intermediate: - -Accelerator: HPU training -========================= -**Audience:** Gaudi chip users looking to save memory and scale models with mixed-precision training. - -.. warning:: This is an :ref:`experimental ` feature. - ----- - -Enable Mixed Precision ----------------------- - -Lightning also allows mixed precision training with HPUs. -By default, HPU training will use 32-bit precision. To enable mixed precision, set the ``precision`` flag. - -.. code-block:: python - - trainer = Trainer(devices=1, accelerator="hpu", precision=16) - ----- - -Customize Mixed Precision -------------------------- - -Internally, :class:`~lightning.pytorch.plugins.precision.hpu.HPUPrecisionPlugin` uses the Habana Mixed Precision (HMP) package to enable mixed precision training. 
- -You can execute the ops in FP32 or BF16 precision. The HMP package modifies the Python operators to add the appropriate cast operations for the arguments before execution. -The default settings enable users to enable mixed precision training with minimal code easily. - -In addition to the default settings in HMP, users also have the option of overriding these defaults and providing their -BF16 and FP32 operator lists by passing them as parameter to :class:`~lightning.pytorch.plugins.precision.hpu.HPUPrecisionPlugin`. - -The below snippet shows an example model using MNIST with a single Habana Gaudi device and making use of HMP by overriding the default parameters. -This enables advanced users to provide their own BF16 and FP32 operator list instead of using the HMP defaults. - -.. code-block:: python - - import lightning.pytorch as pl - from lightning.pytorch.plugins import HPUPrecisionPlugin - - # Initialize a trainer with HPU accelerator for HPU strategy for single device, - # with mixed precision using overidden HMP settings - trainer = pl.Trainer( - accelerator="hpu", - devices=1, - # Optional Habana mixed precision params to be set - # Checkout `examples/pytorch/hpu/ops_bf16_mnist.txt` for the format - plugins=[ - HPUPrecisionPlugin( - precision=16, - opt_level="O1", - verbose=False, - bf16_file_path="ops_bf16_mnist.txt", - fp32_file_path="ops_fp32_mnist.txt", - ) - ], - ) - - # Init our model - model = LitClassifier() - # Init the data - dm = MNISTDataModule(batch_size=batch_size) - - # Train the model ⚡ - trainer.fit(model, datamodule=dm) - -For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `__. - ----- - -Enabling DeviceStatsMonitor with HPUs ----------------------------------------- - -:class:`~lightning.pytorch.callbacks.device_stats_monitor.DeviceStatsMonitor` is a callback that automatically monitors and logs device stats during the training stage. -This callback can be passed for training with HPUs. It returns a map of the following metrics with their values in bytes of type uint64: - -- **Limit**: amount of total memory on HPU device. -- **InUse**: amount of allocated memory at any instance. -- **MaxInUse**: amount of total active memory allocated. -- **NumAllocs**: number of allocations. -- **NumFrees**: number of freed chunks. -- **ActiveAllocs**: number of active allocations. -- **MaxAllocSize**: maximum allocated size. -- **TotalSystemAllocs**: total number of system allocations. -- **TotalSystemFrees**: total number of system frees. -- **TotalActiveAllocs**: total number of active allocations. - -The below snippet shows how DeviceStatsMonitor can be enabled. - -.. code-block:: python - - from lightning.pytorch import Trainer - from lightning.pytorch.callbacks import DeviceStatsMonitor - - device_stats = DeviceStatsMonitor() - trainer = Trainer(accelerator="hpu", callbacks=[device_stats]) - -For more details, please refer to `Memory Stats APIs `__. diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst index 12ac56f27974d..f01cfca0f4579 100644 --- a/docs/source-pytorch/advanced/model_parallel.rst +++ b/docs/source-pytorch/advanced/model_parallel.rst @@ -58,11 +58,11 @@ Cutting-edge and third-party Strategies Cutting-edge Lightning strategies are being developed by third-parties outside of Lightning. -If you want to try some of the latest and greatest features for model-parallel training, check out the :doc:`Colossal-AI Strategy <./third_party/colossalai>` integration. 
+If you want to try some of the latest and greatest features for model-parallel training, check out the :doc:`Colossal-AI Strategy <../integrations/strategies/colossalai>` integration. -Another integration is :doc:`Bagua Strategy <./third_party/bagua>`, deep learning training acceleration framework for PyTorch, with advanced distributed training algorithms and system optimizations. +Another integration is :doc:`Bagua Strategy <../integrations/strategies/bagua>`, deep learning training acceleration framework for PyTorch, with advanced distributed training algorithms and system optimizations. -For training on unreliable mixed GPUs across the internet check out the :doc:`Hivemind Strategy <./third_party/hivemind>` integration. +For training on unreliable mixed GPUs across the internet check out the :doc:`Hivemind Strategy <../integrations/strategies/hivemind>` integration. ---- diff --git a/docs/source-pytorch/common/index.rst b/docs/source-pytorch/common/index.rst index 0d09ecebe5efd..b5f44f76a37dd 100644 --- a/docs/source-pytorch/common/index.rst +++ b/docs/source-pytorch/common/index.rst @@ -16,7 +16,7 @@ Save memory with half-precision ../advanced/model_parallel Train on single or multiple GPUs <../accelerators/gpu> - Train on single or multiple HPUs <../accelerators/hpu> + Train on single or multiple HPUs <../integrations/hpu/index> Train on single or multiple IPUs <../accelerators/ipu> Train on single or multiple TPUs <../accelerators/tpu> Train on MPS <../accelerators/mps> @@ -148,7 +148,7 @@ How-to Guides .. displayitem:: :header: Train on single or multiple HPUs :description: Train models faster with HPU accelerators - :button_link: ../accelerators/hpu.html + :button_link: ../integrations/hpu/index.html :col_css: col-md-4 :height: 180 diff --git a/docs/source-pytorch/common_usecases.rst b/docs/source-pytorch/common_usecases.rst index 8046c1ee788a8..263fb348116a8 100644 --- a/docs/source-pytorch/common_usecases.rst +++ b/docs/source-pytorch/common_usecases.rst @@ -123,7 +123,7 @@ Customize and extend Lightning for things like custom hardware or distributed st :header: Train on single or multiple HPUs :description: Train models faster with HPUs. :col_css: col-md-12 - :button_link: accelerators/hpu.html + :button_link: integrations/hpu/index.html :height: 100 .. 
displayitem:: diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index efa9286c65bba..754fa680e5ee1 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -17,6 +17,8 @@ import shutil import sys import warnings +from importlib.util import module_from_spec, spec_from_file_location +from types import ModuleType import pt_lightning_sphinx_theme @@ -30,16 +32,24 @@ # ----------------------- # BUILD stuff # ----------------------- -PATH_HERE = os.path.abspath(os.path.dirname(__file__)) -PATH_ROOT = os.path.join(PATH_HERE, "..", "..") -PATH_RAW_NB = os.path.join(PATH_ROOT, "_notebooks") +_PATH_HERE = os.path.abspath(os.path.dirname(__file__)) +_PATH_ROOT = os.path.join(_PATH_HERE, "..", "..") +_PATH_RAW_NB = os.path.join(_PATH_ROOT, "_notebooks") _SHOULD_COPY_NOTEBOOKS = True -sys.path.insert(0, os.path.abspath(PATH_ROOT)) -sys.path.append(os.path.join(PATH_RAW_NB, ".actions")) -try: - from assistant import AssistantCLI -except ImportError: + +def _load_py_module(name: str, location: str) -> ModuleType: + spec = spec_from_file_location(name, location) + py = module_from_spec(spec) + spec.loader.exec_module(py) + return py + + +assist_local = _load_py_module("assistant", os.path.join(_PATH_ROOT, ".actions", "assistant.py")) + +if os.path.isdir(os.path.join(_PATH_RAW_NB, ".actions")): + assist_nb = _load_py_module("assistant", os.path.join(_PATH_RAW_NB, ".actions", "assistant.py")) +else: _SHOULD_COPY_NOTEBOOKS = False warnings.warn("To build the code, please run: `git submodule update --init --recursive`", stacklevel=2) @@ -48,9 +58,9 @@ # -- Project documents ------------------------------------------------------- if _SHOULD_COPY_NOTEBOOKS: - AssistantCLI.copy_notebooks( - PATH_RAW_NB, - PATH_HERE, + assist_nb.AssistantCLI.copy_notebooks( + _PATH_RAW_NB, + _PATH_HERE, "notebooks", patterns=[".", "course_UvA-DL", "lightning_examples"], ) @@ -64,7 +74,7 @@ "lightning_examples/warp-drive", ] for file in ignore: - file = os.path.join(PATH_HERE, "notebooks", file) + file = os.path.join(_PATH_HERE, "notebooks", file) if os.path.exists(file): os.remove(file) @@ -84,16 +94,24 @@ def _transform_changelog(path_in: str, path_out: str) -> None: fp.writelines(chlog_lines) -os.makedirs(os.path.join(PATH_HERE, FOLDER_GENERATED), exist_ok=True) +os.makedirs(os.path.join(_PATH_HERE, FOLDER_GENERATED), exist_ok=True) # copy all documents from GH templates like contribution guide -for md in glob.glob(os.path.join(PATH_ROOT, ".github", "*.md")): - shutil.copy(md, os.path.join(PATH_HERE, FOLDER_GENERATED, os.path.basename(md))) +for md in glob.glob(os.path.join(_PATH_ROOT, ".github", "*.md")): + shutil.copy(md, os.path.join(_PATH_HERE, FOLDER_GENERATED, os.path.basename(md))) # copy also the changelog _transform_changelog( - os.path.join(PATH_ROOT, "src", "lightning", "fabric", "CHANGELOG.md"), - os.path.join(PATH_HERE, FOLDER_GENERATED, "CHANGELOG.md"), + os.path.join(_PATH_ROOT, "src", "lightning", "fabric", "CHANGELOG.md"), + os.path.join(_PATH_HERE, FOLDER_GENERATED, "CHANGELOG.md"), ) + +assist_local.AssistantCLI.pull_docs_files( + gh_user_repo="Lightning-AI/lightning-Habana", + target_dir="docs/source-pytorch/integrations/hpu", + checkout="tags/1.0.0", +) + + # -- Project information ----------------------------------------------------- project = "PyTorch Lightning" @@ -311,6 +329,7 @@ def _transform_changelog(path_in: str, path_out: str) -> None: "PIL": ("https://pillow.readthedocs.io/en/stable/", None), "torchmetrics": 
("https://torchmetrics.readthedocs.io/en/stable/", None), "graphcore": ("https://docs.graphcore.ai/en/latest/", None), + "habana": ("https://lightning-ai.github.io/lightning-Habana/", None), } # -- Options for todo extension ---------------------------------------------- @@ -360,7 +379,7 @@ def package_list_from_file(file): } MOCK_PACKAGES = [] if SPHINX_MOCK_REQUIREMENTS: - _path_require = lambda fname: os.path.join(PATH_ROOT, "requirements", "pytorch", fname) + _path_require = lambda fname: os.path.join(_PATH_ROOT, "requirements", "pytorch", fname) # mock also base packages when we are on RTD since we don't install them there MOCK_PACKAGES += package_list_from_file(_path_require("base.txt")) MOCK_PACKAGES += package_list_from_file(_path_require("extra.txt")) diff --git a/docs/source-pytorch/extensions/accelerator.rst b/docs/source-pytorch/extensions/accelerator.rst index 90fb24fc375ad..45f4b72500c38 100644 --- a/docs/source-pytorch/extensions/accelerator.rst +++ b/docs/source-pytorch/extensions/accelerator.rst @@ -11,7 +11,7 @@ Currently there are accelerators for: - :doc:`GPU <../accelerators/gpu>` - :doc:`TPU <../accelerators/tpu>` - :doc:`IPU <../accelerators/ipu>` -- :doc:`HPU <../accelerators/hpu>` +- :doc:`HPU <../integrations/hpu/index>` - :doc:`MPS <../accelerators/mps>` The Accelerator is part of the Strategy which manages communication across multiple devices (distributed communication). diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst index a5eaeefd536d3..715d2e84d651b 100644 --- a/docs/source-pytorch/extensions/strategy.rst +++ b/docs/source-pytorch/extensions/strategy.rst @@ -83,10 +83,10 @@ The below table lists all relevant strategies available in Lightning with their - Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. :ref:`Learn more. ` * - hpu_parallel - ``HPUParallelStrategy`` - - Strategy for distributed training on multiple HPU devices. :doc:`Learn more. <../accelerators/hpu>` + - Strategy for distributed training on multiple HPU devices. :doc:`Learn more. <../integrations/hpu/index>` * - hpu_single - ``SingleHPUStrategy`` - - Strategy for training on a single HPU device. :doc:`Learn more. <../accelerators/hpu>` + - Strategy for training on a single HPU device. :doc:`Learn more. <../integrations/hpu/index>` * - ipu_strategy - ``IPUStrategy`` - Plugin for training on IPU devices. :doc:`Learn more. <../accelerators/ipu>` diff --git a/docs/source-pytorch/glossary/index.rst b/docs/source-pytorch/glossary/index.rst index 2c91a7c2d0b80..94a042b26a49c 100644 --- a/docs/source-pytorch/glossary/index.rst +++ b/docs/source-pytorch/glossary/index.rst @@ -15,7 +15,7 @@ Finetuning <../advanced/finetuning> GPU <../accelerators/gpu> Half precision <../common/precision> - HPU <../accelerators/hpu> + HPU <../integrations/hpu/index> Inference <../deploy/production_intermediate> IPU <../accelerators/ipu> Lightning CLI <../cli/lightning_cli> @@ -140,7 +140,7 @@ Glossary :header: HPU :description: Habana Gaudi AI Processor Unit for faster training :col_css: col-md-12 - :button_link: ../accelerators/hpu.html + :button_link: ../integrations/hpu/index.html :height: 100 .. 
displayitem:: diff --git a/docs/source-pytorch/accelerators/hpu.rst b/docs/source-pytorch/integrations/hpu/index.rst similarity index 73% rename from docs/source-pytorch/accelerators/hpu.rst rename to docs/source-pytorch/integrations/hpu/index.rst index 13eeab8e9a72e..2f8607dbfe19e 100644 --- a/docs/source-pytorch/accelerators/hpu.rst +++ b/docs/source-pytorch/integrations/hpu/index.rst @@ -10,19 +10,11 @@ Accelerator: HPU training .. Add callout items below this line -.. displayitem:: - :header: Prepare your code (Optional) - :description: Prepare your code to run on any hardware - :col_css: col-md-4 - :button_link: accelerator_prepare.html - :height: 150 - :tag: basic - .. displayitem:: :header: Basic :description: Learn the basics of single and multi-HPU core training. :col_css: col-md-4 - :button_link: hpu_basic.html + :button_link: basic.html :height: 150 :tag: basic @@ -30,10 +22,18 @@ Accelerator: HPU training :header: Intermediate :description: Enable state-of-the-art scaling with advanced mix-precision settings. :col_css: col-md-4 - :button_link: hpu_intermediate.html + :button_link: intermediate.html :height: 150 :tag: intermediate +.. displayitem:: + :header: Advanced + :description: Explore state-of-the-art scaling with additional advanced configurations. + :col_css: col-md-4 + :button_link: advanced.html + :height: 150 + :tag: advanced + .. raw:: html diff --git a/docs/source-pytorch/advanced/third_party/bagua.rst b/docs/source-pytorch/integrations/strategies/bagua.rst similarity index 100% rename from docs/source-pytorch/advanced/third_party/bagua.rst rename to docs/source-pytorch/integrations/strategies/bagua.rst diff --git a/docs/source-pytorch/advanced/third_party/colossalai.rst b/docs/source-pytorch/integrations/strategies/colossalai.rst similarity index 100% rename from docs/source-pytorch/advanced/third_party/colossalai.rst rename to docs/source-pytorch/integrations/strategies/colossalai.rst diff --git a/docs/source-pytorch/advanced/third_party/hivemind.rst b/docs/source-pytorch/integrations/strategies/hivemind.rst similarity index 100% rename from docs/source-pytorch/advanced/third_party/hivemind.rst rename to docs/source-pytorch/integrations/strategies/hivemind.rst diff --git a/docs/source-pytorch/levels/advanced_level_20.rst b/docs/source-pytorch/levels/advanced_level_20.rst index 50153ea582978..f17ebdfd1fd5e 100644 --- a/docs/source-pytorch/levels/advanced_level_20.rst +++ b/docs/source-pytorch/levels/advanced_level_20.rst @@ -19,7 +19,7 @@ Explore Intel Habana Processing Unit (HPU) for model scaling. :header: Train models on HPUs :description: Learn the basics of single and multi-HPU core training. :col_css: col-md-6 - :button_link: ../accelerators/hpu_basic.html + :button_link: ../integrations/hpu/basic.html :height: 150 :tag: basic @@ -27,7 +27,7 @@ Explore Intel Habana Processing Unit (HPU) for model scaling. :header: Optimize models training on HPUs :description: Enable state-of-the-art scaling with advanced mix-precision settings. :col_css: col-md-6 - :button_link: ../accelerators/hpu_intermediate.html + :button_link: ../integrations/hpu/intermediate.html :height: 150 :tag: intermediate diff --git a/docs/source-pytorch/upgrade/sections/1_9_advanced.rst b/docs/source-pytorch/upgrade/sections/1_9_advanced.rst index 782e16348d3b1..f9ef1b811e133 100644 --- a/docs/source-pytorch/upgrade/sections/1_9_advanced.rst +++ b/docs/source-pytorch/upgrade/sections/1_9_advanced.rst @@ -261,7 +261,7 @@ .. _Fabric: https://lightning.ai/docs/fabric/ .. 
_lightning-Horovod: https://github.com/Lightning-AI/lightning-Horovod -.. _lightning-ColossalAI: https://lightning.ai/docs/pytorch/latest/advanced/third_party/colossalai.html +.. _lightning-ColossalAI: https://lightning.ai/docs/pytorch/latest/integrations/strategies/colossalai.html .. _lightning-Fairscale: https://github.com/Lightning-Sandbox/lightning-Fairscale .. _pr15953: https://github.com/Lightning-AI/lightning/pull/15953
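A minimal standalone sketch of the new ``pull_docs_files`` helper, mirroring the call added to ``docs/source-pytorch/conf.py`` above. Assumptions: it is run from the repository root, github.com is reachable, and the docs build requirements are installed; the tag and target directory are the ones pinned in this diff.

    # Load .actions/assistant.py by path, the same way conf.py now does,
    # then pull the tagged lightning-Habana docs into the local tree.
    import os
    from importlib.util import module_from_spec, spec_from_file_location

    spec = spec_from_file_location("assistant", os.path.join(".actions", "assistant.py"))
    assistant = module_from_spec(spec)
    spec.loader.exec_module(assistant)

    assistant.AssistantCLI.pull_docs_files(
        gh_user_repo="Lightning-AI/lightning-Habana",
        target_dir="docs/source-pytorch/integrations/hpu",
        checkout="tags/1.0.0",  # fetched as .../archive/refs/tags/1.0.0.zip
    )

Pages that already exist under the target directory are skipped with a warning, so pages kept in the local tree take precedence, and pinning ``checkout`` to a release tag keeps the pulled pages in sync with a specific lightning-Habana version.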