diff --git a/src/gc.c b/src/gc.c
index cf04641d1fb69..ee5f6a0a53174 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -2742,13 +2742,16 @@ JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls)
     gc_drain_own_chunkqueue(ptls, &ptls->mark_queue);
 }
 
-void gc_mark_and_steal(jl_ptls_t ptls)
+int gc_mark_and_steal(jl_ptls_t ptls)
 {
     jl_gc_markqueue_t *mq = &ptls->mark_queue;
     jl_gc_markqueue_t *mq_master = NULL;
     int master_tid = jl_atomic_load(&gc_master_tid);
-    if (master_tid != -1)
-        mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    if (master_tid == -1) {
+        return 0;
+    }
+    mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    int marked = 0;
     void *new_obj;
     jl_gc_chunk_t c;
     pop : {
@@ -2764,6 +2767,7 @@ void gc_mark_and_steal(jl_ptls_t ptls)
         goto steal;
     }
     mark : {
+        marked = 1;
         gc_mark_outrefs(ptls, mq, new_obj, 0);
         goto pop;
     }
@@ -2792,12 +2796,10 @@ void gc_mark_and_steal(jl_ptls_t ptls)
             }
         }
         // Try to steal chunk from master thread
-        if (mq_master != NULL) {
-            c = gc_chunkqueue_steal_from(mq_master);
-            if (c.cid != GC_empty_chunk) {
-                gc_mark_chunk(ptls, mq, &c);
-                goto pop;
-            }
+        c = gc_chunkqueue_steal_from(mq_master);
+        if (c.cid != GC_empty_chunk) {
+            gc_mark_chunk(ptls, mq, &c);
+            goto pop;
         }
         // Try to steal pointer from random GC thread
         for (int i = 0; i < 4 * jl_n_markthreads; i++) {
@@ -2814,37 +2816,98 @@ void gc_mark_and_steal(jl_ptls_t ptls)
             if (new_obj != NULL)
                 goto mark;
         }
-        // Try to steal pointer from master thread
-        if (mq_master != NULL) {
-            new_obj = gc_ptr_queue_steal_from(mq_master);
-            if (new_obj != NULL)
-                goto mark;
-        }
+        new_obj = gc_ptr_queue_steal_from(mq_master);
+        if (new_obj != NULL)
+            goto mark;
     }
+    return marked;
 }
 
-void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
+#define GC_BACKOFF_MIN_LG2 3
+#define GC_BACKOFF_MAX_LG2 11
+
+STATIC_INLINE void gc_sched_yield_reset_state(gc_sched_state_t *s) JL_NOTSAFEPOINT
 {
-    int backoff = GC_BACKOFF_MIN;
-    if (master) {
-        jl_atomic_store(&gc_master_tid, ptls->tid);
-        // Wake threads up and try to do some work
-        uv_mutex_lock(&gc_threads_lock);
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        uv_cond_broadcast(&gc_threads_cond);
-        uv_mutex_unlock(&gc_threads_lock);
-        gc_mark_and_steal(ptls);
-        jl_atomic_fetch_add(&gc_n_threads_marking, -1);
+    s->yield_phase = GC_SPINNING;
+    s->backoff_lg2 = GC_BACKOFF_MIN_LG2;
+    s->n_spins_at_max = 0;
+}
+
+STATIC_INLINE void gc_sched_yield(gc_sched_state_t *s) JL_NOTSAFEPOINT
+{
+    if (s->yield_phase == GC_SPINNING) {
+        // spin for 2^backoff_lg2 iterations
+        for (int i = 0; i < (1 << s->backoff_lg2); i++) {
+            jl_cpu_pause();
+        }
+        if (s->backoff_lg2 == GC_BACKOFF_MAX_LG2) {
+            s->n_spins_at_max++;
+            // has been spinning at the maximum backoff for a while; should
+            // just sleep on the next failed steal attempt
+            if (s->n_spins_at_max >= 4) {
+                s->yield_phase = GC_SLEEPING;
+            }
+        }
+        else {
+            s->backoff_lg2++;
+        }
     }
+    else {
+        // sleep for 1ms
+        uv_sleep(1);
+    }
+}
+
+void gc_mark_loop_master_init(jl_ptls_t ptls)
+{
+    jl_atomic_store(&gc_master_tid, ptls->tid);
+    // Wake threads up and try to do some work
+    uv_mutex_lock(&gc_threads_lock);
+    jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+    uv_cond_broadcast(&gc_threads_cond);
+    uv_mutex_unlock(&gc_threads_lock);
+    gc_mark_and_steal(ptls);
+    jl_atomic_fetch_add(&gc_n_threads_marking, -1);
+}
+
+void gc_mark_loop_parallel(jl_ptls_t ptls)
+{
+    gc_sched_state_t s;
+    gc_sched_yield_reset_state(&s);
     while (jl_atomic_load(&gc_n_threads_marking) > 0) {
         // Try to become a thief while other threads are marking
         jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        if (jl_atomic_load(&gc_master_tid) != -1) {
-            gc_mark_and_steal(ptls);
-        }
+        int marked = gc_mark_and_steal(ptls);
         jl_atomic_fetch_add(&gc_n_threads_marking, -1);
-        // Failed to steal
-        gc_backoff(&backoff);
+        if (marked) {
+            gc_sched_yield_reset_state(&s);
+        }
+        else {
+            gc_sched_yield(&s);
+        }
+    }
+}
+
+void gc_mark_loop_master(jl_ptls_t ptls)
+{
+    gc_mark_loop_master_init(ptls);
+    gc_mark_loop_parallel(ptls);
+}
+
+STATIC_INLINE int gc_may_mark(void) JL_NOTSAFEPOINT
+{
+    return jl_atomic_load(&gc_n_threads_marking) > 0;
+}
+
+void gc_mark_loop_worker(jl_ptls_t ptls)
+{
+    while (1) {
+        uv_mutex_lock(&gc_threads_lock);
+        while (!gc_may_mark()) {
+            uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
+        }
+        uv_mutex_unlock(&gc_threads_lock);
+        gc_mark_loop_parallel(ptls);
     }
 }
 
@@ -2854,16 +2917,15 @@ void gc_mark_loop(jl_ptls_t ptls)
         gc_mark_loop_serial(ptls);
     }
     else {
-        gc_mark_loop_parallel(ptls, 1);
+        gc_mark_loop_master(ptls);
     }
 }
 
 void gc_mark_loop_barrier(void)
 {
     jl_atomic_store(&gc_master_tid, -1);
-    while (jl_atomic_load(&gc_n_threads_marking) != 0) {
-        jl_cpu_pause();
-    }
+    while (jl_atomic_load(&gc_n_threads_marking) != 0)
+        ;
 }
 
 void gc_mark_clean_reclaim_sets(void)
diff --git a/src/gc.h b/src/gc.h
index f713ebd9e9737..0a647a1208291 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -114,6 +114,16 @@ typedef struct _jl_gc_chunk_t {
 #define GC_PTR_QUEUE_INIT_SIZE (1 << 18) // initial size of queue of `jl_value_t *`
 #define GC_CHUNK_QUEUE_INIT_SIZE (1 << 14) // initial size of chunk-queue
 
+// State used for GC scheduling
+typedef struct {
+#define GC_SPINNING 0
+#define GC_SLEEPING 1
+    uint8_t yield_phase;   // whether the thread is spinning or sleeping
+                           // between failed steal attempts
+    size_t backoff_lg2;    // exponential backoff log counter
+    size_t n_spins_at_max; // number of times it spun at the maximum backoff
+} gc_sched_state_t;
+
 // layout for big (>2k) objects
 JL_EXTENSION typedef struct _bigval_t {
@@ -190,19 +200,6 @@ extern jl_gc_global_page_pool_t global_page_pool_lazily_freed;
 extern jl_gc_global_page_pool_t global_page_pool_clean;
 extern jl_gc_global_page_pool_t global_page_pool_freed;
 
-#define GC_BACKOFF_MIN 4
-#define GC_BACKOFF_MAX 12
-
-STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
-{
-    if (*i < GC_BACKOFF_MAX) {
-        (*i)++;
-    }
-    for (int j = 0; j < (1 << *i); j++) {
-        jl_cpu_pause();
-    }
-}
-
 // Lock-free stack implementation taken
 // from Herlihy's "The Art of Multiprocessor Programming"
 // XXX: this is not a general-purpose lock-free stack. We can
@@ -460,7 +457,7 @@ void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t **fl_begin, jl_value_t *
 void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_NOTSAFEPOINT;
 void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
 void gc_mark_loop_serial(jl_ptls_t ptls);
-void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
+void gc_mark_loop_worker(jl_ptls_t ptls);
 void sweep_stack_pools(void);
 void jl_gc_debug_init(void);
diff --git a/src/partr.c b/src/partr.c
index 75d6d832fe78f..3f349d415df3b 100644
--- a/src/partr.c
+++ b/src/partr.c
@@ -107,12 +107,6 @@ void jl_init_threadinginfra(void)
 
 void JL_NORETURN jl_finish_task(jl_task_t *t);
 
-
-static inline int may_mark(void) JL_NOTSAFEPOINT
-{
-    return (jl_atomic_load(&gc_n_threads_marking) > 0);
-}
-
 // gc thread mark function
 void jl_gc_mark_threadfun(void *arg)
 {
@@ -128,14 +122,7 @@ void jl_gc_mark_threadfun(void *arg)
     // free the thread argument here
     free(targ);
 
-    while (1) {
-        uv_mutex_lock(&gc_threads_lock);
-        while (!may_mark()) {
-            uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
-        }
-        uv_mutex_unlock(&gc_threads_lock);
-        gc_mark_loop_parallel(ptls, 0);
-    }
+    gc_mark_loop_worker(ptls);
 }
 
 // gc thread sweep function
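
For readers reviewing the scheduling change, the following is a minimal, standalone C sketch of the spin-then-sleep backoff policy that gc_sched_yield and gc_sched_yield_reset_state implement above. It is illustrative only: sched_state_t, backoff_yield, try_steal, and cpu_pause are hypothetical stand-ins for the patch's gc_sched_state_t, gc_sched_yield, the mark-queue steal path, and jl_cpu_pause, and POSIX usleep stands in for uv_sleep; none of this code is part of the patch.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define BACKOFF_MIN_LG2 3   /* first spin burst: 2^3 = 8 pause iterations */
#define BACKOFF_MAX_LG2 11  /* longest spin burst: 2^11 = 2048 pause iterations */

typedef struct {
    enum { SPINNING, SLEEPING } phase; /* spin between failed steals, or sleep */
    size_t backoff_lg2;                /* log2 of the next spin-burst length */
    size_t n_spins_at_max;             /* consecutive bursts at the maximum length */
} sched_state_t;

static void sched_reset(sched_state_t *s)
{
    s->phase = SPINNING;
    s->backoff_lg2 = BACKOFF_MIN_LG2;
    s->n_spins_at_max = 0;
}

static void cpu_pause(void)
{
    /* stand-in for jl_cpu_pause(): a CPU relax hint in the real code */
}

static void backoff_yield(sched_state_t *s)
{
    if (s->phase == SPINNING) {
        /* spin for 2^backoff_lg2 iterations */
        for (size_t i = 0; i < ((size_t)1 << s->backoff_lg2); i++)
            cpu_pause();
        if (s->backoff_lg2 == BACKOFF_MAX_LG2) {
            /* several bursts at the maximum length: start sleeping instead */
            if (++s->n_spins_at_max >= 4)
                s->phase = SLEEPING;
        }
        else {
            s->backoff_lg2++;
        }
    }
    else {
        usleep(1000); /* 1 ms, as uv_sleep(1) does in the patch */
    }
}

/* hypothetical work source: a steal that succeeds ~10% of the time */
static int try_steal(void)
{
    return rand() % 10 == 0;
}

int main(void)
{
    sched_state_t s;
    sched_reset(&s);
    for (int attempt = 0; attempt < 100; attempt++) {
        if (try_steal()) {
            printf("attempt %d: stole work, backoff reset\n", attempt);
            sched_reset(&s);   /* success: go back to the shortest spin */
        }
        else {
            backoff_yield(&s); /* failure: spin longer, eventually sleep */
        }
    }
    return 0;
}

The design point mirrored here is that any successful steal resets the state back to the shortest spin, so a thread only falls back to the 1 ms sleep after several consecutive maximum-length spin bursts without finding work.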