Skip to content
This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Commit

Permalink
fix(balancer) recover after dns updates (#73)
Browse files Browse the repository at this point in the history
* fix(balancer) recover after dns updates/auto-update

Since the health was introduced on the balancer, there is an early
exit if the balancer is unhealthy. This introduced a problem where
DNS record would remain untouched, and hence would not be refreshed.
Leaving the balancer in an unhealthy state forever.

A background timer is added to check and update records every second.

issue: Kong/kong#4781
  • Loading branch information
Tieske committed Aug 7, 2019
1 parent cd780d5 commit 4ee0d70
Show file tree
Hide file tree
Showing 11 changed files with 76 additions and 54 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ History

Versioning is strictly based on [Semantic Versioning](https://semver.org/)

### 4.1.0 (unreleased)

- Fix: unhealthy balancers would not recover because they would not refresh the
DNS records used. See [PR 73](https://github.com/Kong/lua-resty-dns-client/pull/73).
- Added: automatic background resolving of hostnames, expiry will be checked
every second, and if needed DNS (and balancer) will be updated. See [PR 73](https://github.com/Kong/lua-resty-dns-client/pull/73).

### 4.0.0 (26-Jun-2019)

- BREAKING: the balancer callback is called with a new event; "health" whenever
Expand Down
1 change: 0 additions & 1 deletion spec/balancer/base_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ describe("[balancer_base]", function()

setup(function()
_G.package.loaded["resty.dns.client"] = nil -- make sure module is reloaded
_G._TEST = true -- expose internals for test purposes
balancer_base = require "resty.dns.balancer.base"
client = require "resty.dns.client"
end)
Expand Down
39 changes: 38 additions & 1 deletion spec/balancer/generic_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ for algorithm, balancer_module in helpers.balancer_types() do

setup(function()
_G.package.loaded["resty.dns.client"] = nil -- make sure module is reloaded
_G._TEST = true -- expose internals for test purposes
client = require "resty.dns.client"
end)

Expand Down Expand Up @@ -1559,6 +1558,44 @@ for algorithm, balancer_module in helpers.balancer_types() do
assert.not_nil(b:getPeer())
end)


it("recovers when dns entries are replaced by healthy ones", function()
local record = dnsA({
{ name = "getkong.org", address = "1.2.3.4" },
})
b:addHost("getkong.org", 8000, 50)
assert.not_nil(b:getPeer())

-- mark it as unhealthy
assert(b:setAddressStatus(false, "1.2.3.4", 8000, "getkong.org"))
assert.same({
nil, "Balancer is unhealthy", nil, nil,
}, {
b:getPeer()
}
)

-- expire DNS and add a new backend IP
-- balancer should now recover since a new healthy backend is available
record.expire = 0
dnsA({
{ name = "getkong.org", address = "5.6.7.8" },
})

local timeout = ngx.now() + 5 -- we'll try for 5 seconds
while true do
assert(ngx.now() < timeout, "timeout")

local ip = b:getPeer()
if ip == "5.6.7.8" then
break -- expected result, success!
end

ngx.sleep(0.1) -- wait a bit before retrying
end

end)

end)

end)
Expand Down
1 change: 0 additions & 1 deletion spec/balancer/least_connections_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ describe("[least-connections]", function()

setup(function()
_G.package.loaded["resty.dns.client"] = nil -- make sure module is reloaded
_G._TEST = true -- expose internals for test purposes
lcb = require "resty.dns.balancer.least_connections"
client = require "resty.dns.client"
end)
Expand Down
1 change: 0 additions & 1 deletion spec/balancer/ring_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ describe("[ringbalancer]", function()

setup(function()
_G.package.loaded["resty.dns.client"] = nil -- make sure module is reloaded
_G._TEST = true -- expose internals for test purposes
balancer = require "resty.dns.balancer.ring"
client = require "resty.dns.client"
end)
Expand Down
2 changes: 0 additions & 2 deletions spec/client_cache_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ describe("[DNS client cache]", function()
local client, resolver, query_func

before_each(function()
_G._TEST = true
client = require("resty.dns.client")
resolver = require("resty.dns.resolver")

Expand Down Expand Up @@ -54,7 +53,6 @@ describe("[DNS client cache]", function()
client = nil
resolver = nil
query_func = nil
_G._TEST = nil
end)


Expand Down
2 changes: 0 additions & 2 deletions spec/client_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ describe("[DNS client]", function()
local client, resolver, query_func

before_each(function()
_G._TEST = true
client = require("resty.dns.client")
resolver = require("resty.dns.resolver")

Expand Down Expand Up @@ -63,7 +62,6 @@ describe("[DNS client]", function()
client = nil
resolver = nil
query_func = nil
_G._TEST = nil
end)

describe("initialization", function()
Expand Down
71 changes: 28 additions & 43 deletions src/resty/dns/balancer/base.lua
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,9 @@
--
-- `balancer <1 --- many> hosts <1 --- many> addresses`
--
-- Updating the dns records is passive, meaning that when `getPeer` accesses a
-- target, only then the check is done whether the record is stale, and if so
-- it is updated. The one exception is the failed dns queries (because they
-- have no indices assigned, and hence will never be hit by `getPeer`), which will
-- be actively refreshed using a timer.
-- Updating the DNS records is active, meaning that a background task is running
-- to periodically update DNS records when they expire. This only applies to the
-- hostnames added to the balancer, not to any nested DNS records.
--
-- __Weights__
--
Expand Down Expand Up @@ -143,7 +141,6 @@ local table_concat = table.concat
local math_floor = math.floor
local string_format = string.format
local ngx_log = ngx.log
local ngx_ERR = ngx.ERR
local ngx_DEBUG = ngx.DEBUG
local ngx_WARN = ngx.WARN
local balancer_id_counter = 0
Expand Down Expand Up @@ -565,12 +562,12 @@ function objHost:queryDns(cacheOnly)
ngx_log(ngx_WARN, self.log_prefix, "querying dns for ", self.hostname,
" failed: ", err , ". Tried ", tostring(try_list))

-- query failed, create a fake recorded, flagged as failed.
-- query failed, create a fake record
-- the empty record will cause all existing addresses to be removed
newQuery = {
__errorQueryFlag = true --flag to mark the record as a failed lookup
expire = time() + self.balancer.requeryInterval,
touched = time()
}
self.balancer:startRequery()
end

assert_atomicity(update_dns_result, self, newQuery, dns)
Expand Down Expand Up @@ -1196,50 +1193,24 @@ function objBalancer:setHostStatus(available, hostname, port)
end
-- Timer invoked to check for failed queries
function objBalancer:requeryTimerCallback()
-- Timer invoked to update DNS records
function objBalancer:resolveTimerCallback()
--check all hosts for expired records,
--including those with errors
--we update, so changes on the list while traversing can happen, keep track of that
ngx_log(ngx_DEBUG, self.log_prefix, "executing requery timer")
local all_ok = true
for _, host in ipairs(self.hosts) do
-- only retry the errorred ones
if host.lastQuery.__errorQueryFlag then
all_ok = false -- note: only if NO requery at all is done, we are 'all_ok'
-- if even a single dns query is performed, we yield on the dns socket
-- operation and our universe might have changed. Could lead to nasty
-- race-conditions otherwise.
if (host.lastQuery.expire or 0) < time() then
host:queryDns(false) -- timer-context; cacheOnly always false
end
end
if all_ok then
-- shutdown recurring timer
ngx_log(ngx_DEBUG, self.log_prefix, "requery success, stopping timer")
self.requeryTimer:cancel()
self.requeryTimer = nil
else
-- not done yet
ngx_log(ngx_DEBUG, self.log_prefix, "requery failure")
end
ngx_log(ngx_DEBUG, self.log_prefix, "requery completed")
end
-- Starts the requery timer.
function objBalancer:startRequery()
if self.requeryTimer then return end -- already running, nothing to do here
local err
self.requeryTimer, err = resty_timer({
recurring = true,
interval = self.requeryInterval,
detached = false,
expire = self.requeryTimerCallback,
}, self)
if not self.requeryTimer then
ngx_log(ngx_ERR, self.log_prefix, "failed to create the timer: ", err)
end
end
--- Sets an event callback for user code. The callback is invoked for
-- every address added to/removed from the balancer, and on health changes.
Expand Down Expand Up @@ -1385,7 +1356,7 @@ _M.new = function(opts)
weight = 0, -- total weight of all hosts
unavailableWeight = 0, -- the unavailable weight (range: 0 - weight)
dns = opts.dns, -- the configured dns client to use for resolving
requeryTimer = nil, -- requery timer is not running, see `startRequery`
resolveTimer = nil,
requeryInterval = opts.requery or REQUERY_INTERVAL, -- how often to requery failed dns lookups (seconds)
ttl0Interval = opts.ttl0 or TTL_0_RETRY, -- refreshing ttl=0 records
healthy = false, -- initial healthstatus of the balancer
Expand All @@ -1396,6 +1367,20 @@ _M.new = function(opts)
self:setCallback(opts.callback or function() end) -- callback for address mutations
do
local err
self.resolveTimer, err = resty_timer({
recurring = true,
interval = 1, -- check for expired records every 1 second
detached = false,
expire = self.resolveTimerCallback,
}, self)
if not self.resolveTimer then
return nil, "failed to create timer for background DNS resolution: " .. err
end
end
ngx_log(ngx_DEBUG, self.log_prefix, "balancer_base created")
return self
end
Expand Down
2 changes: 1 addition & 1 deletion src/resty/dns/balancer/least_connections.lua
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ function _M.new(opts)
opts.log_prefix = "least-connections"
end

local self = balancer_base.new(opts)
local self = assert(balancer_base.new(opts))

-- inject overridden methods
for name, method in pairs(lc) do
Expand Down
2 changes: 1 addition & 1 deletion src/resty/dns/balancer/ring.lua
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ function _M.new(opts)
opts.log_prefix = "ringbalancer"
end

local self = balancer_base.new(opts)
local self = assert(balancer_base.new(opts))

if (not opts.wheelSize) and opts.order then
opts.wheelSize = #opts.order
Expand Down
2 changes: 1 addition & 1 deletion src/resty/dns/client.lua
Original file line number Diff line number Diff line change
Expand Up @@ -1507,7 +1507,7 @@ _M.connect = connect
_M.setpeername = setpeername
-- export the locals in case we're testing
if _TEST then -- luacheck: ignore
if package.loaded.busted then
_M.getcache = function() return dnscache end
_M._search_iter = search_iter -- export as different name!
end
Expand Down

0 comments on commit 4ee0d70

Please sign in to comment.