worker.go

package work

import (
	"fmt"
	"math/rand"
	"reflect"
	"time"

	"github.com/gomodule/redigo/redis"
)

const fetchKeysPerJobType = 6

type worker struct {
	workerID      string
	poolID        string
	namespace     string
	pool          *redis.Pool
	jobTypes      map[string]*jobType
	sleepBackoffs []int64
	middleware    []*middlewareHandler
	contextType   reflect.Type

	redisFetchScript *redis.Script
	sampler          prioritySampler
	*observer

	stopChan         chan struct{}
	doneStoppingChan chan struct{}

	drainChan        chan struct{}
	doneDrainingChan chan struct{}
}

func newWorker(namespace string, poolID string, pool *redis.Pool, contextType reflect.Type, middleware []*middlewareHandler, jobTypes map[string]*jobType, sleepBackoffs []int64) *worker {
	workerID := makeIdentifier()
	ob := newObserver(namespace, pool, workerID)

	if len(sleepBackoffs) == 0 {
		sleepBackoffs = sleepBackoffsInMilliseconds
	}

	w := &worker{
		workerID:      workerID,
		poolID:        poolID,
		namespace:     namespace,
		pool:          pool,
		contextType:   contextType,
		sleepBackoffs: sleepBackoffs,

		observer: ob,

		stopChan:         make(chan struct{}),
		doneStoppingChan: make(chan struct{}),

		drainChan:        make(chan struct{}),
		doneDrainingChan: make(chan struct{}),
	}

	w.updateMiddlewareAndJobTypes(middleware, jobTypes)

	return w
}

// note: can't be called while the thing is started
func (w *worker) updateMiddlewareAndJobTypes(middleware []*middlewareHandler, jobTypes map[string]*jobType) {
	w.middleware = middleware
	sampler := prioritySampler{}
	for _, jt := range jobTypes {
		sampler.add(jt.Priority,
			redisKeyJobs(w.namespace, jt.Name),
			redisKeyJobsInProgress(w.namespace, w.poolID, jt.Name),
			redisKeyJobsPaused(w.namespace, jt.Name),
			redisKeyJobsLock(w.namespace, jt.Name),
			redisKeyJobsLockInfo(w.namespace, jt.Name),
			redisKeyJobsConcurrency(w.namespace, jt.Name))
	}
	w.sampler = sampler
	w.jobTypes = jobTypes
	w.redisFetchScript = redis.NewScript(len(jobTypes)*fetchKeysPerJobType, redisLuaFetchJob)
}

func (w *worker) start() {
	go w.loop()
	go w.observer.start()
}

func (w *worker) stop() {
	w.stopChan <- struct{}{}
	<-w.doneStoppingChan
	w.observer.drain()
	w.observer.stop()
}

func (w *worker) drain() {
	w.drainChan <- struct{}{}
	<-w.doneDrainingChan
	w.observer.drain()
}

var sleepBackoffsInMilliseconds = []int64{0, 10, 100, 1000, 5000}

func (w *worker) loop() {
	var drained bool
	var consequtiveNoJobs int64

	// Begin immediately. We'll change the duration on each tick with a timer.Reset()
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-w.stopChan:
			w.doneStoppingChan <- struct{}{}
			return
		case <-w.drainChan:
			drained = true
			timer.Reset(0)
		case <-timer.C:
			job, err := w.fetchJob()
			if err != nil {
				logError("worker.fetch", err)
				timer.Reset(10 * time.Millisecond)
			} else if job != nil {
				w.processJob(job)
				consequtiveNoJobs = 0
				timer.Reset(0)
			} else {
				if drained {
					w.doneDrainingChan <- struct{}{}
					drained = false
				}
				consequtiveNoJobs++
				idx := consequtiveNoJobs
				if idx >= int64(len(w.sleepBackoffs)) {
					idx = int64(len(w.sleepBackoffs)) - 1
				}
				timer.Reset(time.Duration(w.sleepBackoffs[idx]) * time.Millisecond)
			}
		}
	}
}

// fetchJob returns a job, or nil if there are no jobs.
// It looks for any of the registered jobs. As soon it finds one, it
// extracts the job from the queue <namespace>:jobs:<jobName> and puts it to the queue
// <namespace>:jobs:<jobName>:<poolID>:inprogress. For more details see redisLuaFetchJob lua script.
// The found job is returned as a Job struct.
func (w *worker) fetchJob() (*Job, error) {
	// resort queues
	// NOTE: we could optimize this to only resort every second, or something.
	w.sampler.sample()
	numKeys := len(w.sampler.samples) * fetchKeysPerJobType
	var scriptArgs = make([]interface{}, 0, numKeys+1)

	for _, s := range w.sampler.samples {
		scriptArgs = append(scriptArgs, s.redisJobs, s.redisJobsInProg, s.redisJobsPaused, s.redisJobsLock, s.redisJobsLockInfo, s.redisJobsMaxConcurrency) // KEYS[1-6 * N]
	}
	scriptArgs = append(scriptArgs, w.poolID) // ARGV[1]
	conn := w.pool.Get()
	defer conn.Close()

	values, err := redis.Values(w.redisFetchScript.Do(conn, scriptArgs...))
	if err == redis.ErrNil {
		return nil, nil
	} else if err != nil {
		return nil, err
	}

	if len(values) != 3 {
		return nil, fmt.Errorf("need 3 elements back")
	}

	rawJSON, ok := values[0].([]byte)
	if !ok {
		return nil, fmt.Errorf("response msg not bytes")
	}

	dequeuedFrom, ok := values[1].([]byte)
	if !ok {
		return nil, fmt.Errorf("response queue not bytes")
	}

	inProgQueue, ok := values[2].([]byte)
	if !ok {
		return nil, fmt.Errorf("response in prog not bytes")
	}

	job, err := newJob(rawJSON, dequeuedFrom, inProgQueue)
	if err != nil {
		return nil, err
	}

	return job, nil
}

func (w *worker) processJob(job *Job) {
	if job.Unique {
		updatedJob := w.getUniqueJob(job)
		// This is to support the old way of doing it, where we used the job off the queue and just deleted the unique key
		// Going forward the job on the queue will always be just a placeholder, and we will be replacing it with the
		// updated job extracted here
		if updatedJob != nil {
			job = updatedJob
		}
	}
	var runErr error
	jt := w.jobTypes[job.Name]
	if jt == nil {
		runErr = fmt.Errorf("stray job: no handler")
		logError("process_job.stray", runErr)
	} else {
		w.observeStarted(job.Name, job.ID, job.Args)
		job.observer = w.observer // for Checkin
		_, runErr = runJob(job, w.contextType, w.middleware, jt)
		w.observeDone(job.Name, job.ID, runErr)
	}

	fate := terminateOnly
	op := opTerminate
	if runErr != nil {
		job.failed(runErr)
		fate, op = w.jobFate(jt, job)
	}
	w.removeJobFromInProgress(job, fate)

	// Remove unique job after it has finished or has been put in dead queue.
	if job.Unique && op != opRetry {
		w.deleteUniqueJob(job)
	}
}

func (w *worker) getUniqueJob(job *Job) *Job {
	var uniqueKey string
	var err error

	if job.UniqueKey != "" {
		uniqueKey = job.UniqueKey
	} else { // For jobs put in queue prior to this change. In the future this can be deleted as there will always be a UniqueKey
		uniqueKey, err = redisKeyUniqueJob(w.namespace, job.Name, job.Args)
		if err != nil {
			logError("worker.get_unique_job.key", err)
			return nil
		}
	}

	conn := w.pool.Get()
	defer conn.Close()

	rawJSON, err := redis.Bytes(conn.Do("GET", uniqueKey))
	if err != nil {
		logError("worker.get_unique_job.get", err)
		return nil
	}

	// Previous versions did not support updated arguments and just set key to 1, so in these cases we should do nothing.
	// In the future this can be deleted, as we will always be getting arguments from here.
	// If job.Fails != 0, that means the job comes from retry queue and we already have all args.
	if string(rawJSON) == "1" || job.Fails != 0 {
		return nil
	}

	// The job pulled off the queue was just a placeholder with no args, so replace it
	jobWithArgs, err := newJob(rawJSON, job.dequeuedFrom, job.inProgQueue)
	if err != nil {
		logError("worker.get_unique_job.updated_job", err)
		return nil
	}

	// This is a hack to fix the following problem.
	// If a job is scheduled with a unique key (EnqueueUniqueInByKey), it's added in 2 places in redis:
	// scheduled queue and under unique key.
	// A requeuer loop calls a lua script, which extracts the job from the scheduled queue and
	// puts it to the jobs queue. Also the script adds a new field to the json body of the job using cjson library.
	// It encodes json with a different field order than the golang encoding/json.
	// Later on, a worker loop moves the job from the jobs queue to the inprocess queue.
	// The worker after processing the job, deletes the job from the inprocess queue and the unique key,
	// but
	// for deletion it uses rawJson from unique job received from unique key, which doesn't match the json body of the job
	// in the inprocess queue. Without this hack we'd get memory leak in redis, because the job would never be deleted
	// from the inprocess queue.
	//
	// EnqueueUniqueInByKey -> scheduled queue -> (json body is modified) -> jobs queue -> inprocess queue -> (handle job) -> delete from inprocess queue
	//                      -> unique key                                                                                     using rawJson from unique key
	//
	// NOTE: this field is used only to delete the job from the inprocess queue.
	// job.rawJSON is the original json body of the job coming from jobs queue.
	jobWithArgs.rawJSON = job.rawJSON

	return jobWithArgs
}

func (w *worker) deleteUniqueJob(job *Job) {
	var uniqueKey string
	var err error

	if job.UniqueKey != "" {
		uniqueKey = job.UniqueKey
	} else { // For jobs put in queue prior to this change. In the future this can be deleted as there will always be a UniqueKey
		uniqueKey, err = redisKeyUniqueJob(w.namespace, job.Name, job.Args)
		if err != nil {
			logError("worker.delete_unique_job.key", err)
			return
		}
	}

	conn := w.pool.Get()
	defer conn.Close()

	_, err = conn.Do("DEL", uniqueKey)
	if err != nil {
		logError("worker.delete_unique_job.del", err)
	}
}

func (w *worker) removeJobFromInProgress(job *Job, fate terminateOp) {
	conn := w.pool.Get()
	defer conn.Close()

	conn.Send("MULTI")
	conn.Send("LREM", job.inProgQueue, 1, job.rawJSON)
	conn.Send("DECR", redisKeyJobsLock(w.namespace, job.Name))
	conn.Send("HINCRBY", redisKeyJobsLockInfo(w.namespace, job.Name), w.poolID, -1)
	fate(conn)
	if _, err := conn.Do("EXEC"); err != nil {
		logError("worker.remove_job_from_in_progress.lrem", err)
	}
}

type terminateOp func(conn redis.Conn)

// opType describes the type of terminateOp.
// It's used to distinguish between terminateOp functions.
type opType int

const (
	opTerminate = opType(iota)
	opRetry
	opDead
)

func terminateOnly(_ redis.Conn) { return }
func terminateAndRetry(w *worker, jt *jobType, job *Job) (terminateOp, opType) {
	rawJSON, err := job.serialize()
	if err != nil {
		logError("worker.terminate_and_retry.serialize", err)
		return terminateOnly, opTerminate
	}
	return func(conn redis.Conn) {
		conn.Send("ZADD", redisKeyRetry(w.namespace), nowEpochSeconds()+jt.calcBackoff(job), rawJSON)
	}, opRetry
}
func terminateAndDead(w *worker, job *Job) (terminateOp, opType) {
	rawJSON, err := job.serialize()
	if err != nil {
		logError("worker.terminate_and_dead.serialize", err)
		return terminateOnly, opTerminate
	}
	return func(conn redis.Conn) {
		// NOTE: sidekiq limits the # of jobs: only keep jobs for 6 months, and only keep a max # of jobs
		// The max # of jobs seems really horrible. Seems like operations should be on top of it.
		// conn.Send("ZREMRANGEBYSCORE", redisKeyDead(w.namespace), "-inf", now - keepInterval)
		// conn.Send("ZREMRANGEBYRANK", redisKeyDead(w.namespace), 0, -maxJobs)

		conn.Send("ZADD", redisKeyDead(w.namespace), nowEpochSeconds(), rawJSON)
	}, opDead
}

func (w *worker) jobFate(jt *jobType, job *Job) (terminateOp, opType) {
	if jt != nil {
		failsRemaining := int64(jt.MaxFails) - job.Fails
		if failsRemaining > 0 {
			return terminateAndRetry(w, jt, job)
		}
		if jt.SkipDead {
			return terminateOnly, opTerminate
		}
	}
	return terminateAndDead(w, job)
}

// Default algorithm returns an fastly increasing backoff counter which grows in an unbounded fashion
func defaultBackoffCalculator(job *Job) int64 {
	fails := job.Fails
	return (fails * fails * fails * fails) + 15 + (rand.Int63n(30) * (fails + 1))
}