prod: apply black drift cleanup

DavidBellamy · DavidBellamy · commit db437d22658c · 2026-05-18T15:54:47.000-07:00
Six files on the prod base had formatting that does not comply with black, which pre-commit flagged as failures on PR #25. Applying `black==24.3.0` (matching .pre-commit-config.yaml) brings them in line so CI passes. Also fixes the single line in train_async.py, introduced by this PR, that black wants changed (it requires a blank line after the import). No behavioral changes; pure whitespace and line breaks.
diff --git a/miles/backends/training_utils/log_utils.py b/miles/backends/training_utils/log_utils.py
@@ -22,33 +22,33 @@
 # Maps bare metric names to their W&B top-level section(s).
 # Keys appearing in multiple sections (e.g. pg_loss) are emitted under each.
 _TRAIN_METRIC_GROUPS: dict[str, list[str]] = {
-    "ppo_kl":                         ["policy_shift"],
-    "ois":                            ["policy_shift"],
-    "pg_clipfrac":                    ["policy_shift"],
-    "pg_loss":                        ["policy_shift", "optimization"],
-    "log_probs":                      ["policy_shift"],   # current policy (training forward pass)
-    "old_log_probs":                  ["policy_shift"],   # old policy (rollout or FSDP rollout)
-    "ref_kl":                         ["policy_shift"],
+    "ppo_kl": ["policy_shift"],
+    "ois": ["policy_shift"],
+    "pg_clipfrac": ["policy_shift"],
+    "pg_loss": ["policy_shift", "optimization"],
+    "log_probs": ["policy_shift"],  # current policy (training forward pass)
+    "old_log_probs": ["policy_shift"],  # old policy (rollout or FSDP rollout)
+    "ref_kl": ["policy_shift"],
     "train_rollout_logprob_abs_diff": ["train_inference_mismatch"],
-    "train_rollout_logprob_diff":     ["train_inference_mismatch"],
-    "tis":                            ["train_inference_mismatch"],
-    "tis_abs":                        ["train_inference_mismatch"],
-    "tis_clipfrac":                   ["train_inference_mismatch"],
-    "loss":                           ["optimization"],
-    "entropy_loss":                   ["optimization"],
-    "kl_loss":                        ["optimization"],
-    "grad_norm":                      ["optimization"],
+    "train_rollout_logprob_diff": ["train_inference_mismatch"],
+    "tis": ["train_inference_mismatch"],
+    "tis_abs": ["train_inference_mismatch"],
+    "tis_clipfrac": ["train_inference_mismatch"],
+    "loss": ["optimization"],
+    "entropy_loss": ["optimization"],
+    "kl_loss": ["optimization"],
+    "grad_norm": ["optimization"],
 }
 
 # Maps rollout batch field names to their W&B top-level section.
 _ROLLOUT_DATA_METRIC_GROUPS: dict[str, str] = {
-    "log_probs":         "train_inference_mismatch",  # FSDP log probs at rollout time
+    "log_probs": "train_inference_mismatch",  # FSDP log probs at rollout time
     "rollout_log_probs": "train_inference_mismatch",  # inference engine log probs
-    "ref_log_probs":     "policy_shift",              # reference model log probs
-    "rewards":           "reward",
-    "raw_reward":        "reward",
-    "advantages":        "reward",
-    "returns":           "reward",
+    "ref_log_probs": "policy_shift",  # reference model log probs
+    "rewards": "reward",
+    "raw_reward": "reward",
+    "advantages": "reward",
+    "returns": "reward",
 }
 
 
@@ -533,7 +533,7 @@ def log_train_step(
     for full_key, val in log_dict_out.items():
         if not full_key.startswith(prefix):
             continue
-        bare_key = full_key[len(prefix):]
+        bare_key = full_key[len(prefix) :]
         if bare_key in _TRAIN_METRIC_GROUPS:
             for group in _TRAIN_METRIC_GROUPS[bare_key]:
                 grouped_additions[f"{group}/{bare_key}"] = val
diff --git a/miles/backends/training_utils/loss.py b/miles/backends/training_utils/loss.py
@@ -693,7 +693,9 @@ def policy_loss_function(
     if "rollout_log_probs" in batch and batch["rollout_log_probs"]:
         rollout_log_probs_cat = torch.cat(batch["rollout_log_probs"], dim=0)
         log_probs_batch_cat = torch.cat(batch["log_probs"], dim=0)
-        train_rollout_logprob_abs_diff = sum_of_sample_mean((old_log_probs - rollout_log_probs_cat).abs()).clone().detach()
+        train_rollout_logprob_abs_diff = (
+            sum_of_sample_mean((old_log_probs - rollout_log_probs_cat).abs()).clone().detach()
+        )
         # signed: log π(inf) − log π(fsdp rollout)
         train_rollout_logprob_diff = sum_of_sample_mean(rollout_log_probs_cat - log_probs_batch_cat).clone().detach()
 
diff --git a/miles/ray/rollout.py b/miles/ray/rollout.py
@@ -1400,9 +1400,7 @@ def _compute_grouped_response_metrics(args, group: list[Sample], prefix: str) ->
     }
 
 
-def _compute_group_outcome_metrics(
-    args, all_samples: list[Sample], prefix: str = "reward"
-) -> dict:
+def _compute_group_outcome_metrics(args, all_samples: list[Sample], prefix: str = "reward") -> dict:
     """Fraction of prompt groups that are unanimously correct or incorrect. GRPO only."""
     if args.advantage_estimator == "ppo":
         return {}
diff --git a/miles/rollout/generate_utils/openai_endpoint_utils.py b/miles/rollout/generate_utils/openai_endpoint_utils.py
@@ -247,5 +247,5 @@ def _truncate_sample_output(sample: Sample, keep_tokens: int, tokenizer) -> None
     if sample.loss_mask is not None:
         sample.loss_mask = sample.loss_mask[:keep_tokens]
     if sample.rollout_routed_experts is not None:
-        sample.rollout_routed_experts = sample.rollout_routed_experts[:len(sample.tokens) - 1]
+        sample.rollout_routed_experts = sample.rollout_routed_experts[: len(sample.tokens) - 1]
     sample.status = Sample.Status.TRUNCATED
diff --git a/miles/rollout/session/linear_trajectory.py b/miles/rollout/session/linear_trajectory.py
@@ -340,10 +340,7 @@ def _evict_stale_sessions(self) -> None:
         if not self._session_last_access:
             return
         now = time.monotonic()
-        stale = [
-            sid for sid, ts in self._session_last_access.items()
-            if now - ts > self._SESSION_TTL_SECS
-        ]
+        stale = [sid for sid, ts in self._session_last_access.items() if now - ts > self._SESSION_TTL_SECS]
         for sid in stale:
             self.sessions.pop(sid, None)
             self._session_last_access.pop(sid, None)
diff --git a/miles/utils/replay_base.py b/miles/utils/replay_base.py
@@ -123,9 +123,7 @@ def _get_replay_result(top_indices, scores, topk, *args, **kwargs):
                 _, sorted_free = masked_scores.sort(dim=1, descending=True)
                 # The k-th -1 slot in each row gets sorted_free[row, k].
                 pad_cumsum = torch.cumsum(padding_mask.long(), dim=1) - 1
-                fill_values = torch.gather(sorted_free, 1, pad_cumsum.clamp(min=0)).to(
-                    top_indices.dtype
-                )
+                fill_values = torch.gather(sorted_free, 1, pad_cumsum.clamp(min=0)).to(top_indices.dtype)
                 top_indices = torch.where(padding_mask, fill_values, top_indices)
 
             if return_probs:
diff --git a/train_async.py b/train_async.py
@@ -77,5 +77,6 @@ async def train(args):
     args = parse_args()
     if getattr(args, "enable_r3_correctness_check", False):
         from miles.utils.replay_base import RoutingReplayManager
+
         RoutingReplayManager.enable_check_replay_result = True
     asyncio.run(train(args))

Original file line number	Diff line number	Diff line change
`@@ -1400,9 +1400,7 @@ def _compute_grouped_response_metrics(args, group: list[Sample], prefix: str) ->`
`1400`	`1400`	`}`
`1401`	`1401`
`1402`	`1402`
`1403`		`-def _compute_group_outcome_metrics(`
`1404`		`- args, all_samples: list[Sample], prefix: str = "reward"`
`1405`		`-) -> dict:`
	`1403`	`+def _compute_group_outcome_metrics(args, all_samples: list[Sample], prefix: str = "reward") -> dict:`
`1406`	`1404`	`"""Fraction of prompt groups that are unanimously correct or incorrect. GRPO only."""`
`1407`	`1405`	`if args.advantage_estimator == "ppo":`
`1408`	`1406`	`return {}`