diff --git a/src/gc.c b/src/gc.c
index cf04641d1fb69..ee5f6a0a53174 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -2742,13 +2742,16 @@ JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls)
     gc_drain_own_chunkqueue(ptls, &ptls->mark_queue);
 }
 
-void gc_mark_and_steal(jl_ptls_t ptls)
+int gc_mark_and_steal(jl_ptls_t ptls)
 {
     jl_gc_markqueue_t *mq = &ptls->mark_queue;
     jl_gc_markqueue_t *mq_master = NULL;
     int master_tid = jl_atomic_load(&gc_master_tid);
-    if (master_tid != -1)
-        mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    if (master_tid == -1) {
+        return 0;
+    }
+    mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    int marked = 0;
     void *new_obj;
     jl_gc_chunk_t c;
     pop : {
@@ -2764,6 +2767,7 @@ void gc_mark_and_steal(jl_ptls_t ptls)
         goto steal;
     }
     mark : {
+        marked = 1;
         gc_mark_outrefs(ptls, mq, new_obj, 0);
         goto pop;
     }
@@ -2792,12 +2796,10 @@ void gc_mark_and_steal(jl_ptls_t ptls)
             }
         }
         // Try to steal chunk from master thread
-        if (mq_master != NULL) {
-            c = gc_chunkqueue_steal_from(mq_master);
-            if (c.cid != GC_empty_chunk) {
-                gc_mark_chunk(ptls, mq, &c);
-                goto pop;
-            }
+        c = gc_chunkqueue_steal_from(mq_master);
+        if (c.cid != GC_empty_chunk) {
+            gc_mark_chunk(ptls, mq, &c);
+            goto pop;
         }
         // Try to steal pointer from random GC thread
         for (int i = 0; i < 4 * jl_n_markthreads; i++) {
@@ -2814,37 +2816,98 @@ void gc_mark_and_steal(jl_ptls_t ptls)
             if (new_obj != NULL)
                 goto mark;
         }
-        // Try to steal pointer from master thread
-        if (mq_master != NULL) {
-            new_obj = gc_ptr_queue_steal_from(mq_master);
-            if (new_obj != NULL)
-                goto mark;
-        }
+        new_obj = gc_ptr_queue_steal_from(mq_master);
+        if (new_obj != NULL)
+            goto mark;
     }
+    return marked;
 }
 
-void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
+#define GC_BACKOFF_MIN_LG2 3
+#define GC_BACKOFF_MAX_LG2 11
+
+STATIC_INLINE void gc_sched_yield_reset_state(gc_sched_state_t *s) JL_NOTSAFEPOINT
 {
-    int backoff = GC_BACKOFF_MIN;
-    if (master) {
-        jl_atomic_store(&gc_master_tid, ptls->tid);
-        // Wake threads up and try to do some work
-        uv_mutex_lock(&gc_threads_lock);
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        uv_cond_broadcast(&gc_threads_cond);
-        uv_mutex_unlock(&gc_threads_lock);
-        gc_mark_and_steal(ptls);
-        jl_atomic_fetch_add(&gc_n_threads_marking, -1);
+    s->yield_phase = GC_SPINNING;
+    s->backoff_lg2 = GC_BACKOFF_MIN_LG2;
+    s->n_spins_at_max = 0;
+}
+
+STATIC_INLINE void gc_sched_yield(gc_sched_state_t *s) JL_NOTSAFEPOINT
+{
+    if (s->yield_phase == GC_SPINNING) {
+        // spin for 2^backoff_lg2 iterations
+        for (int i = 0; i < (1 << s->backoff_lg2); i++) {
+            jl_cpu_pause();
+        }
+        if (s->backoff_lg2 == GC_BACKOFF_MAX_LG2) {
+            s->n_spins_at_max++;
+            // has been spinning at the maximum backoff for a while; should
+            // just sleep on the next failed steal attempt
+            if (s->n_spins_at_max >= 4) {
+                s->yield_phase = GC_SLEEPING;
+            }
+        }
+        else {
+            s->backoff_lg2++;
+        }
     }
+    else {
+        // sleep for 1ms
+        uv_sleep(1);
+    }
+}
+
+void gc_mark_loop_master_init(jl_ptls_t ptls)
+{
+    jl_atomic_store(&gc_master_tid, ptls->tid);
+    // Wake threads up and try to do some work
+    uv_mutex_lock(&gc_threads_lock);
+    jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+    uv_cond_broadcast(&gc_threads_cond);
+    uv_mutex_unlock(&gc_threads_lock);
+    gc_mark_and_steal(ptls);
+    jl_atomic_fetch_add(&gc_n_threads_marking, -1);
+}
+
+void gc_mark_loop_parallel(jl_ptls_t ptls)
+{
+    gc_sched_state_t s;
+    gc_sched_yield_reset_state(&s);
     while (jl_atomic_load(&gc_n_threads_marking) > 0) {
         // Try to become a thief while other threads are marking
         jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        if (jl_atomic_load(&gc_master_tid) != -1) {
-            gc_mark_and_steal(ptls);
-        }
+        int marked = gc_mark_and_steal(ptls);
         jl_atomic_fetch_add(&gc_n_threads_marking, -1);
-        // Failed to steal
-        gc_backoff(&backoff);
+        if (marked) {
+            gc_sched_yield_reset_state(&s);
+        }
+        else {
+            gc_sched_yield(&s);
+        }
+    }
+}
+
+void gc_mark_loop_master(jl_ptls_t ptls)
+{
+    gc_mark_loop_master_init(ptls);
+    gc_mark_loop_parallel(ptls);
+}
+
+STATIC_INLINE int gc_may_mark(void) JL_NOTSAFEPOINT
+{
+    return jl_atomic_load(&gc_n_threads_marking) > 0;
+}
+
+void gc_mark_loop_worker(jl_ptls_t ptls)
+{
+    while (1) {
+        uv_mutex_lock(&gc_threads_lock);
+        while (!gc_may_mark()) {
+            uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
+        }
+        uv_mutex_unlock(&gc_threads_lock);
+        gc_mark_loop_parallel(ptls);
     }
 }
 
@@ -2854,16 +2917,15 @@ void gc_mark_loop(jl_ptls_t ptls)
         gc_mark_loop_serial(ptls);
     }
     else {
-        gc_mark_loop_parallel(ptls, 1);
+        gc_mark_loop_master(ptls);
     }
 }
 
 void gc_mark_loop_barrier(void)
 {
     jl_atomic_store(&gc_master_tid, -1);
-    while (jl_atomic_load(&gc_n_threads_marking) != 0) {
-        jl_cpu_pause();
-    }
+    while (jl_atomic_load(&gc_n_threads_marking) != 0)
+        ;
 }
 
 void gc_mark_clean_reclaim_sets(void)
diff --git a/src/gc.h b/src/gc.h
index f713ebd9e9737..0a647a1208291 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -114,6 +114,16 @@ typedef struct _jl_gc_chunk_t {
 #define GC_PTR_QUEUE_INIT_SIZE (1 << 18) // initial size of queue of `jl_value_t *`
 #define GC_CHUNK_QUEUE_INIT_SIZE (1 << 14) // initial size of chunk-queue
 
+// State used for GC scheduling
+typedef struct {
+#define GC_SPINNING 0
+#define GC_SLEEPING 1
+    uint8_t yield_phase;   // whether the thread is spinning or sleeping
+                           // between failed steal attempts
+    size_t backoff_lg2;    // exponential backoff log counter
+    size_t n_spins_at_max; // number of times it spun at the maximum backoff
+} gc_sched_state_t;
+
 // layout for big (>2k) objects
 JL_EXTENSION typedef struct _bigval_t {
@@ -190,19 +200,6 @@ extern jl_gc_global_page_pool_t global_page_pool_lazily_freed;
 extern jl_gc_global_page_pool_t global_page_pool_clean;
 extern jl_gc_global_page_pool_t global_page_pool_freed;
 
-#define GC_BACKOFF_MIN 4
-#define GC_BACKOFF_MAX 12
-
-STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
-{
-    if (*i < GC_BACKOFF_MAX) {
-        (*i)++;
-    }
-    for (int j = 0; j < (1 << *i); j++) {
-        jl_cpu_pause();
-    }
-}
-
 // Lock-free stack implementation taken
 // from Herlihy's "The Art of Multiprocessor Programming"
 // XXX: this is not a general-purpose lock-free stack. We can
@@ -460,7 +457,7 @@ void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t *
 void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_NOTSAFEPOINT;
 void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
 void gc_mark_loop_serial(jl_ptls_t ptls);
-void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
+void gc_mark_loop_worker(jl_ptls_t ptls);
 void sweep_stack_pools(void);
 void jl_gc_debug_init(void);
diff --git a/src/partr.c b/src/partr.c
index 75d6d832fe78f..3f349d415df3b 100644
--- a/src/partr.c
+++ b/src/partr.c
@@ -107,12 +107,6 @@ void jl_init_threadinginfra(void)
 
 void JL_NORETURN jl_finish_task(jl_task_t *t);
 
-
-static inline int may_mark(void) JL_NOTSAFEPOINT
-{
-    return (jl_atomic_load(&gc_n_threads_marking) > 0);
-}
-
 // gc thread mark function
 void jl_gc_mark_threadfun(void *arg)
 {
@@ -128,14 +122,7 @@ void jl_gc_mark_threadfun(void *arg)
     // free the thread argument here
     free(targ);
 
-    while (1) {
-        uv_mutex_lock(&gc_threads_lock);
-        while (!may_mark()) {
-            uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
-        }
-        uv_mutex_unlock(&gc_threads_lock);
-        gc_mark_loop_parallel(ptls, 0);
-    }
+    gc_mark_loop_worker(ptls);
 }
 
 // gc thread sweep function
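
For readers reviewing the scheduling change, the following is a minimal, standalone C sketch of the spin-then-sleep backoff policy that gc_sched_yield and gc_sched_yield_reset_state implement above. It is illustrative only: sched_state_t, backoff_yield, try_steal, and cpu_pause are hypothetical stand-ins for the patch's gc_sched_state_t, gc_sched_yield, the mark-queue steal path, and jl_cpu_pause, and POSIX usleep stands in for uv_sleep; none of this code is part of the patch.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define BACKOFF_MIN_LG2 3   /* first spin burst: 2^3 = 8 pause iterations */
#define BACKOFF_MAX_LG2 11  /* longest spin burst: 2^11 = 2048 pause iterations */

typedef struct {
    enum { SPINNING, SLEEPING } phase; /* spin between failed steals, or sleep */
    size_t backoff_lg2;                /* log2 of the next spin-burst length */
    size_t n_spins_at_max;             /* consecutive bursts at the maximum length */
} sched_state_t;

static void sched_reset(sched_state_t *s)
{
    s->phase = SPINNING;
    s->backoff_lg2 = BACKOFF_MIN_LG2;
    s->n_spins_at_max = 0;
}

static void cpu_pause(void)
{
    /* stand-in for jl_cpu_pause(): a CPU relax hint in the real code */
}

static void backoff_yield(sched_state_t *s)
{
    if (s->phase == SPINNING) {
        /* spin for 2^backoff_lg2 iterations */
        for (size_t i = 0; i < ((size_t)1 << s->backoff_lg2); i++)
            cpu_pause();
        if (s->backoff_lg2 == BACKOFF_MAX_LG2) {
            /* several bursts at the maximum length: start sleeping instead */
            if (++s->n_spins_at_max >= 4)
                s->phase = SLEEPING;
        }
        else {
            s->backoff_lg2++;
        }
    }
    else {
        usleep(1000); /* 1 ms, as uv_sleep(1) does in the patch */
    }
}

/* hypothetical work source: a steal that succeeds ~10% of the time */
static int try_steal(void)
{
    return rand() % 10 == 0;
}

int main(void)
{
    sched_state_t s;
    sched_reset(&s);
    for (int attempt = 0; attempt < 100; attempt++) {
        if (try_steal()) {
            printf("attempt %d: stole work, backoff reset\n", attempt);
            sched_reset(&s);   /* success: go back to the shortest spin */
        }
        else {
            backoff_yield(&s); /* failure: spin longer, eventually sleep */
        }
    }
    return 0;
}

The design point mirrored here is that any successful steal resets the state back to the shortest spin, so a thread only falls back to the 1 ms sleep after several consecutive maximum-length spin bursts without finding work.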