Skip to content
This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Commit

Permalink
fix(balancer) recover after dns updates/auto-update
Browse files Browse the repository at this point in the history
Since the health was introduced on the balancer, there is an early
exit if the balancer is unhealthy. This introduced a problem where
DNS record would remain untouched, and hence would not be refreshed.
Leaving the balancer in an unhealthy state forever.

A background timer is added to check and update records every second.

issue: Kong/kong#4781
  • Loading branch information
Tieske committed Aug 7, 2019
1 parent cd780d5 commit fb1e92a
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 40 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ History

Versioning is strictly based on [Semantic Versioning](https://semver.org/)

### 4.1.0 (unreleased)

- Fix: unhelthy balancers would not recover because they would not refresh the
DNS records used. See [PR 73](https://github.com/Kong/lua-resty-dns-client/pull/73).
- Added: automatic background resolving of hostnames, expiry will be checked
every second, and if needed DNS (and balancer) will be updated. See [PR 73](https://github.com/Kong/lua-resty-dns-client/pull/73).

### 4.0.0 (26-Jun-2019)

- BREAKING: the balancer callback is called with a new event; "health" whenever
Expand Down
38 changes: 38 additions & 0 deletions spec/balancer/generic_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -1559,6 +1559,44 @@ for algorithm, balancer_module in helpers.balancer_types() do
assert.not_nil(b:getPeer())
end)


it("recovers when dns entries are replaced by healthy ones", function()
local record = dnsA({
{ name = "getkong.org", address = "1.2.3.4" },
})
b:addHost("getkong.org", 8000, 50)
assert.not_nil(b:getPeer())

-- mark it as unhealthy
assert(b:setAddressStatus(false, "1.2.3.4", 8000, "getkong.org"))
assert.same({
nil, "Balancer is unhealthy", nil, nil,
}, {
b:getPeer()
}
)

-- expire DNS and add a new backend IP
-- balancer should now recover since a new healthy backend is available
record.expire = 0
dnsA({
{ name = "getkong.org", address = "5.6.7.8" },
})

local timeout = ngx.now() + 5 -- we'll try for 5 seconds
while true do
assert(ngx.now() < timeout, "timeout")

local ip = b:getPeer()
if ip == "5.6.7.8" then
break -- expected result, success!
end

ngx.sleep(0.1) -- wait a bit before retrying
end

end)

end)

end)
Expand Down
68 changes: 28 additions & 40 deletions src/resty/dns/balancer/base.lua
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,9 @@
--
-- `balancer <1 --- many> hosts <1 --- many> addresses`
--
-- Updating the dns records is passive, meaning that when `getPeer` accesses a
-- target, only then the check is done whether the record is stale, and if so
-- it is updated. The one exception is the failed dns queries (because they
-- have no indices assigned, and hence will never be hit by `getPeer`), which will
-- be actively refreshed using a timer.
-- Updating the DNS records is active, meaning that a background task is running
-- to periodically update DNS records when they expire. This only applies to the
-- hostnames added to the balancer, not to any nested DNS records.
--
-- __Weights__
--
Expand Down Expand Up @@ -565,12 +563,12 @@ function objHost:queryDns(cacheOnly)
ngx_log(ngx_WARN, self.log_prefix, "querying dns for ", self.hostname,
" failed: ", err , ". Tried ", tostring(try_list))

-- query failed, create a fake recorded, flagged as failed.
-- query failed, create a fake record
-- the empty record will cause all existing addresses to be removed
newQuery = {
__errorQueryFlag = true --flag to mark the record as a failed lookup
expire = time() + self.balancer.requeryInterval,
touched = time()
}
self.balancer:startRequery()
end

assert_atomicity(update_dns_result, self, newQuery, dns)
Expand Down Expand Up @@ -1196,50 +1194,26 @@ function objBalancer:setHostStatus(available, hostname, port)
end
-- Timer invoked to check for failed queries
function objBalancer:requeryTimerCallback()
-- Timer invoked to update DNS records
function objBalancer:resolveTimerCallback()
--check all hosts for expired records,
--erroreed ones included
--we update, so changes on the list while traversing can happen, keep track of that
ngx_log(ngx_DEBUG, self.log_prefix, "executing requery timer")
local all_ok = true
for _, host in ipairs(self.hosts) do
-- only retry the errorred ones
if host.lastQuery.__errorQueryFlag then
if (host.lastQuery.expire or 0) < time() then
all_ok = false -- note: only if NO requery at all is done, we are 'all_ok'
-- if even a single dns query is performed, we yield on the dns socket
-- operation and our universe might have changed. Could lead to nasty
-- race-conditions otherwise.
host:queryDns(false) -- timer-context; cacheOnly always false
end
end
if all_ok then
-- shutdown recurring timer
ngx_log(ngx_DEBUG, self.log_prefix, "requery success, stopping timer")
self.requeryTimer:cancel()
self.requeryTimer = nil
else
-- not done yet
ngx_log(ngx_DEBUG, self.log_prefix, "requery failure")
end
ngx_log(ngx_DEBUG, self.log_prefix, "requery completed")
end
-- Starts the requery timer.
function objBalancer:startRequery()
if self.requeryTimer then return end -- already running, nothing to do here
local err
self.requeryTimer, err = resty_timer({
recurring = true,
interval = self.requeryInterval,
detached = false,
expire = self.requeryTimerCallback,
}, self)
if not self.requeryTimer then
ngx_log(ngx_ERR, self.log_prefix, "failed to create the timer: ", err)
end
end
--- Sets an event callback for user code. The callback is invoked for
-- every address added to/removed from the balancer, and on health changes.
Expand Down Expand Up @@ -1385,7 +1359,7 @@ _M.new = function(opts)
weight = 0, -- total weight of all hosts
unavailableWeight = 0, -- the unavailable weight (range: 0 - weight)
dns = opts.dns, -- the configured dns client to use for resolving
requeryTimer = nil, -- requery timer is not running, see `startRequery`
resolveTimer = nil,
requeryInterval = opts.requery or REQUERY_INTERVAL, -- how often to requery failed dns lookups (seconds)
ttl0Interval = opts.ttl0 or TTL_0_RETRY, -- refreshing ttl=0 records
healthy = false, -- initial healthstatus of the balancer
Expand All @@ -1396,6 +1370,20 @@ _M.new = function(opts)
self:setCallback(opts.callback or function() end) -- callback for address mutations
do
local err
self.resolveTimer, err = resty_timer({
recurring = true,
interval = 1, -- check for expired records every 1 second
detached = false,
expire = self.resolveTimerCallback,
}, self)
if not self.resolveTimer then
ngx_log(ngx_ERR, self.log_prefix, "failed to create the timer: ", err)
end
end
ngx_log(ngx_DEBUG, self.log_prefix, "balancer_base created")
return self
end
Expand Down

0 comments on commit fb1e92a

Please sign in to comment.