4 changes: 2 additions & 2 deletions src/lightning/fabric/plugins/precision/bitsandbytes.py
@@ -125,10 +125,10 @@ def init_context(self) -> ContextManager:
                 " `ignore_modules` or remove the `init_module` context manager."
             )
         dtype_ctx = _DtypeContextManager(self.dtype)
-        stack = ExitStack()
-        stack.enter_context(dtype_ctx)
         # TODO: this could also support replacing `Embedding` and `Conv1D`
         context_manager = _ClassReplacementContextManager({"torch.nn.Linear": self._linear_cls})
+        stack = ExitStack()
+        stack.enter_context(dtype_ctx)
         stack.enter_context(context_manager)
         return stack

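The reordering matters because `ExitStack.enter_context` enters a context immediately: in the old code the dtype context was already active when `_ClassReplacementContextManager` was constructed, so an exception in that constructor would leave the already-entered context in place forever, since the stack is never returned to a caller that could close it. A minimal sketch of the failure mode, using only `contextlib` (the names below are illustrative, not Lightning code):

```python
from contextlib import ExitStack, contextmanager


@contextmanager
def dtype_context():
    print("default dtype changed")
    try:
        yield
    finally:
        print("default dtype restored")


def old_order():
    stack = ExitStack()
    stack.enter_context(dtype_context())  # side effect happens here, immediately
    raise RuntimeError("constructing the next manager failed")
    # the stack is never returned or closed, so "restored" is never printed


def new_order():
    dtype_ctx = dtype_context()  # construct everything first; nothing entered yet
    stack = ExitStack()
    stack.enter_context(dtype_ctx)  # only entered once construction can no longer fail
    return stack
```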
13 changes: 7 additions & 6 deletions src/lightning/fabric/plugins/precision/transformer_engine.py
@@ -94,9 +94,8 @@ def convert_module(self, module: torch.nn.Module) -> torch.nn.Module:
         return module

     def init_context(self) -> ContextManager:
+        dtype_ctx = _DtypeContextManager(self.dtype)
         stack = ExitStack()
-        stack.enter_context(_DtypeContextManager(self.dtype))
-
         if self.replace_layers:
             import transformer_engine.pytorch as te

@@ -107,15 +106,17 @@ def init_context(self) -> ContextManager:
                 }
             )
             stack.enter_context(context_manager)
+        stack.enter_context(dtype_ctx)
         return stack

     def forward_context(self) -> ContextManager:
-        stack = ExitStack()
-        stack.enter_context(_DtypeContextManager(self.dtype))
-
+        dtype_ctx = _DtypeContextManager(self.dtype)
         import transformer_engine.pytorch as te

-        stack.enter_context(te.fp8_autocast(enabled=True, fp8_recipe=self.recipe))
+        autocast_ctx = te.fp8_autocast(enabled=True, fp8_recipe=self.recipe)
+        stack = ExitStack()
+        stack.enter_context(dtype_ctx)
+        stack.enter_context(autocast_ctx)
         return stack

     def convert_input(self, data: Any) -> Any:
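For both methods, the behavioral contract comes from the `enter_context` calls, not from where the managers are constructed: `ExitStack` enters contexts in the order they are passed to `enter_context` and exits them in reverse. A small standalone sketch of those semantics (plain `contextlib`, no Lightning or transformer_engine imports):

```python
from contextlib import ExitStack, contextmanager


@contextmanager
def ctx(name):
    print(f"enter {name}")
    try:
        yield
    finally:
        print(f"exit {name}")


dtype_ctx = ctx("dtype")            # constructed up front, like in the patch
autocast_ctx = ctx("fp8_autocast")

with ExitStack() as stack:
    stack.enter_context(dtype_ctx)      # entered first
    stack.enter_context(autocast_ctx)   # entered second
    print("forward pass runs here")
# output: enter dtype, enter fp8_autocast, forward pass runs here,
#         exit fp8_autocast, exit dtype  (LIFO unwinding)
```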
3 changes: 2 additions & 1 deletion src/lightning/fabric/strategies/deepspeed.py
@@ -350,10 +350,11 @@ def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManager:
             raise NotImplementedError(
                 f"`{empty_init=}` is not a valid choice with `DeepSpeedStrategy` when ZeRO stage 3 is enabled."
             )
+        module_sharded_ctx = self.module_sharded_context()
         stack = ExitStack()
         if not self.zero_stage_3:
             stack.enter_context(super().module_init_context(empty_init=empty_init))
Contributor: for completeness, it would probably also be better to apply it to this line, right?

Contributor (author): I don't think this one matters, because in the super() call all the context managers are instantiated before any is entered.
-        stack.enter_context(self.module_sharded_context())
+        stack.enter_context(module_sharded_ctx)
         return stack

     def module_sharded_context(self) -> ContextManager:
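The author's point about the `super()` call can be illustrated with plain `contextlib`: `stack.enter_context(super().module_init_context(...))` receives an already-populated `ExitStack`, and entering it on the outer stack simply ties the inner stack's cleanup to the outer one. A sketch under that assumption (the helper names are made up for illustration):

```python
from contextlib import ExitStack, contextmanager


@contextmanager
def ctx(name):
    print("enter", name)
    try:
        yield
    finally:
        print("exit", name)


def base_module_init_context():
    # stand-in for the base-class method: everything is constructed before anything is entered
    inner = ExitStack()
    inner.enter_context(ctx("empty-init"))  # contexts are live once this returns
    return inner


outer = ExitStack()
outer.enter_context(base_module_init_context())  # registers the inner stack's cleanup
outer.enter_context(ctx("module-sharded"))
outer.close()
# output: enter empty-init, enter module-sharded, exit module-sharded, exit empty-init
```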
6 changes: 4 additions & 2 deletions src/lightning/fabric/strategies/fsdp.py
@@ -333,6 +333,8 @@ def module_to_device(self, module: Module) -> None:
         pass

     def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManager:
+        precision_init_ctx = self.precision.init_context()
+        module_sharded_ctx = self.module_sharded_context()
         stack = ExitStack()
         if _TORCH_GREATER_EQUAL_2_1 and empty_init:
             # Materialization happens in `setup`. When modules get wrapped by FSDP, the sequence of operations is:
@@ -341,8 +343,8 @@ def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManager:
             stack.enter_context(torch.device("meta"))
         elif _TORCH_GREATER_EQUAL_1_13:
             stack.enter_context(_EmptyInit(enabled=bool(empty_init)))
Contributor: why isn't it applied in the other places here?

Contributor (author): torch.device won't raise an exception, but I missed this _EmptyInit.

Contributor (author): Can you include it in #18734?

-        stack.enter_context(self.precision.init_context())
-        stack.enter_context(self.module_sharded_context())
+        stack.enter_context(precision_init_ctx)
+        stack.enter_context(module_sharded_ctx)
         return stack

     def module_sharded_context(self) -> ContextManager:
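On the review question above: constructing a `torch.device` just creates a plain object, so leaving `torch.device("meta")` inline cannot realistically fail or leak anything; only entering it as a context manager (supported since PyTorch 2.0) redirects tensor creation. A quick sketch:

```python
import torch

meta = torch.device("meta")  # construction alone has no side effects
with meta:                   # entering it (PyTorch >= 2.0) changes where tensors are created
    w = torch.empty(4, 4)
print(w.device)              # meta
```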
6 changes: 4 additions & 2 deletions src/lightning/fabric/strategies/strategy.py
@@ -120,10 +120,11 @@ def process_dataloader(self, dataloader: DataLoader) -> DataLoader:

     def tensor_init_context(self) -> ContextManager:
         """Controls how tensors get created (device, dtype)."""
+        precision_init_ctx = self.precision.init_context()
         stack = ExitStack()
         if _TORCH_GREATER_EQUAL_2_0:
             stack.enter_context(self.root_device)
-        stack.enter_context(self.precision.init_context())
+        stack.enter_context(precision_init_ctx)
         return stack

     def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManager:
@@ -137,10 +138,11 @@ def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManager:
                 If ``None``, the strategy will decide. Some strategies may not support all options.

         """
+        tensor_init_ctx = self.tensor_init_context()
         stack = ExitStack()
         if _TORCH_GREATER_EQUAL_1_13:
             stack.enter_context(_EmptyInit(enabled=bool(empty_init)))
-        stack.enter_context(self.tensor_init_context())
+        stack.enter_context(tensor_init_ctx)
         return stack

     def setup_module_and_optimizers(
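What `tensor_init_context` stacks can be approximated with plain PyTorch: a device context plus a default-dtype context, built before either is entered. A self-contained sketch (the `default_dtype` helper is a stand-in for Lightning's private `_DtypeContextManager`, not its real implementation):

```python
import torch
from contextlib import ExitStack, contextmanager


@contextmanager
def default_dtype(dtype):
    # swap the global default dtype and restore it on exit
    previous = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        yield
    finally:
        torch.set_default_dtype(previous)


dtype_ctx = default_dtype(torch.float64)   # instantiate first, mirroring the patched pattern
stack = ExitStack()
stack.enter_context(torch.device("cpu"))   # device as a context manager needs PyTorch >= 2.0
stack.enter_context(dtype_ctx)
with stack:
    t = torch.empty(2, 2)
print(t.dtype, t.device)                   # torch.float64 cpu
```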
6 changes: 4 additions & 2 deletions src/lightning/fabric/strategies/xla_fsdp.py
@@ -194,11 +194,13 @@ def module_to_device(self, module: Module) -> None:
         pass

     def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManager:
+        precision_init_ctx = self.precision.init_context()
+        module_sharded_ctx = self.module_sharded_context()
         stack = ExitStack()
         if _TORCH_GREATER_EQUAL_1_13:
             stack.enter_context(_EmptyInit(enabled=bool(empty_init)))
-        stack.enter_context(self.precision.init_context())
-        stack.enter_context(self.module_sharded_context())
+        stack.enter_context(precision_init_ctx)
+        stack.enter_context(module_sharded_ctx)
         return stack

     def module_sharded_context(self) -> ContextManager: