From 8c750f6504bf8e4a0e3fcbcb4b6483b3a947e4dc Mon Sep 17 00:00:00 2001
From: Shri Javadekar
Date: Wed, 24 Jan 2024 21:50:37 -0800
Subject: [PATCH] Get OOM errors to stderr and the UI

There is a race between Metaflow detecting that a pod failed execution
and the reason for the pod failure being set on the pod. As a result, at
times the failure reason doesn't get posted to stderr. This change makes
Metaflow try a little harder to get the reason for such failures.

This change was tested with pods that get OOM killed. Without this
change, an OOM-killed pod would simply die and the user would have no
idea why. With this change, the error on stderr shows:

    Task ran out of memory. Increase the available memory by specifying
    @resource(memory=...) for the step.
---
 metaflow/plugins/kubernetes/kubernetes.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/metaflow/plugins/kubernetes/kubernetes.py b/metaflow/plugins/kubernetes/kubernetes.py
index 0237a225e2..d219bfcd8a 100644
--- a/metaflow/plugins/kubernetes/kubernetes.py
+++ b/metaflow/plugins/kubernetes/kubernetes.py
@@ -341,6 +341,8 @@ def update_delay(secs_since_start):
             sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
             return 0.5 + sigmoid * 30.0
 
+        start_time = time.time()
+
         def wait_for_launch(job):
             status = job.status
             echo(
@@ -349,7 +351,6 @@ def wait_for_launch(job):
                 job_id=job.id,
             )
             t = time.time()
-            start_time = time.time()
             while job.is_waiting:
                 new_status = job.status
                 if status != new_status or (time.time() - t) > 30:
@@ -389,7 +390,7 @@ def wait_for_launch(job):
         # truncated logs if it doesn't.
         # TODO : For hard crashes, we can fetch logs from the pod.
 
-        if self._job.has_failed:
+        def _handle_exit_code():
             exit_code, reason = self._job.reason
             msg = next(
                 msg
@@ -416,7 +417,14 @@ def wait_for_launch(job):
                     "%s. This could be a transient error. Use @retry to retry."
                     % msg
                 )
+        if self._job.has_failed:
+            _handle_exit_code()
+
         exit_code, _ = self._job.reason
+        if exit_code != 0:
+            # One more attempt to get the reason, after a short, bounded delay.
+            time.sleep(update_delay(time.time() - start_time))
+            _handle_exit_code()
         echo(
             "Task finished with exit code %s." % exit_code,
             "stderr",
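
For reviewers, here is a minimal, self-contained sketch of the retry pattern
the last hunk implements: read the exit code right away, and if the failure
reason is not yet available, try once more after a bounded delay. The names
FakeJob, bounded_delay, and fetch_failure_reason are hypothetical and exist
only for this illustration; they are not part of Metaflow or of this patch.

    # Illustrative sketch (not part of the patch): retry reading a late-arriving
    # failure reason after a bounded delay.
    import math
    import time


    class FakeJob:
        """Stand-in for a Kubernetes job whose failure reason appears late."""

        def __init__(self, reason_available_after):
            self._created = time.time()
            self._reason_available_after = reason_available_after

        @property
        def reason(self):
            # The exit code is known immediately; the human-readable reason lags.
            if time.time() - self._created >= self._reason_available_after:
                return 137, "OOMKilled"
            return 137, None


    def bounded_delay(secs_since_start):
        # Same shape as update_delay() in the patch: ~0.5s initially, ~30.5s cap.
        sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
        return 0.5 + sigmoid * 30.0


    def fetch_failure_reason(job, start_time):
        exit_code, reason = job.reason
        if exit_code != 0 and reason is None:
            # One more attempt after a bounded delay, mirroring the patch.
            time.sleep(bounded_delay(time.time() - start_time))
            exit_code, reason = job.reason
        return exit_code, reason or "Task crashed"


    if __name__ == "__main__":
        start = time.time()
        job = FakeJob(reason_available_after=0.2)
        print(fetch_failure_reason(job, start))  # -> (137, 'OOMKilled')

The bounded delay keeps the extra wait short for tasks that fail early and
capped at roughly half a minute for long-running ones, so the second read of
the reason cannot stall task teardown indefinitely.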