In [1]:
import Pkg
Pkg.activate(".")

[32m[1m  Activating[22m[39m project at `~/Developer/DistributedStreams.jl`


In [2]:
using Distributed

In [3]:
# Add two local worker processes. Per the recorded output they receive pids 2
# and 3; later cells hard-code those pids (sentinel on 2, consumer on 3).
addprocs(2)

2-element Vector{Int64}:
 2
 3

In [4]:
@everywhere using Base: @kwdef
@everywhere using Distributed, DistributedStreams, DistributedArrays, Serialization

In [5]:
"""
    sendfunc(f::Function, dest::Integer, mod::Module=Main)

Ship the methods of the function bound to `f`'s name in module `mod` to process
`dest`.

The method list is serialized locally into an `IOBuffer`. On `dest`, the
function name is declared in `mod` and the buffer is deserialized there.
NOTE(review): this presumably relies on method deserialization registering the
methods on the remote process, and on `Serialization` being bound in `mod` on
`dest` -- confirm both.

# Arguments
- `f`: function whose name is looked up in `mod`.
- `dest`: id of the destination process.
- `mod`: module in which the function is looked up locally and declared remotely.

# Returns
Whatever `Distributed.remotecall_eval` returns for a list of process ids.

# Throws
`UndefVarError` if `mod` has no binding with the same name as `f`.
"""
function sendfunc(f::Function, dest::Integer, mod::Module=Main)
    # `nameof` yields the bare function name. `Symbol(f)` goes through `string`,
    # which can produce a module-qualified name that is not a valid binding.
    fname = nameof(f)
    # Look the binding up directly instead of `eval`-ing `mod.fname`; this also
    # works when `mod` is a submodule whose printed name contains dots.
    target = getfield(mod, fname)
    buf = IOBuffer()
    Serialization.serialize(buf, methods(target))
    return Distributed.remotecall_eval(
        mod, [dest], quote
            # make sure the generic function exists before its methods arrive
            function $fname end
            # the buffer is positioned at its end after `serialize`; rewind it
            Serialization.deserialize(seekstart($buf))
        end
    )
end

sendfunc (generic function with 2 methods)

In [6]:
# Kinds of control message exchanged with a sentinel (see `launch_sentinel`):
#   start, stop              -- instructions the sentinel acts on
#   started, stopped, failed -- replies the sentinel puts on its response channel
# Defined on every process so messages can be passed between them.
@everywhere @enum MessageType start stop started stopped failed

# A single instruction to, or reply from, a sentinel.
#
# Fields
#   message_type -- what is requested or reported (see `MessageType`)
#   target       -- pid of the worker the message is about
#   func_f       -- function handed to the consumer for a `start` request
#   func_in      -- channel handed to the consumer as its input
#   func_out     -- channel handed to the consumer as its output
#
# The three `func_*` fields default to `nothing`, so messages that carry no
# payload (`stop`, `started`, `stopped`, `failed`) only need `message_type` and
# `target`. Passing them explicitly, by keyword or position, still works.
@everywhere @kwdef struct ControlMessage
    message_type::MessageType
    target::Int64
    func_f::Union{Function, Nothing} = nothing
    func_in::Union{RemoteChannel, Nothing} = nothing
    func_out::Union{RemoteChannel, Nothing} = nothing
end

In [26]:
# launch_sentinel(; workers=[2], verbose=false, buffer_size=32, timeout=1)
#
# Start a sentinel on every process in `workers`. A sentinel takes
# `ControlMessage`s from a shared request channel and
#   * on `start`: launches a consumer on `message.target` via `launch_consumer`
#     and replies `started`;
#   * on `stop`: stops that consumer (if this sentinel started it) and replies
#     `stopped`;
#   * forwards any other message type unchanged to the response channel.
# In the background it polls `Distributed.workers()` every `timeout` seconds and
# replies `failed` for every consumer whose process has disappeared.
#
# Keywords
#   workers     -- pids on which a sentinel is started
#   verbose     -- print progress messages on the sentinel process
#   buffer_size -- capacity of the request and response channels
#   timeout     -- polling interval in seconds
#
# Returns `(control_messages, control_responses, distributed_control)`: the
# request channel, the response channel (both owned by pid 1) and a DArray with
# one `(worker, flag)` entry per sentinel. Once a sentinel's `flag[]` is true,
# both its failure monitor and its message loop exit.
# NOTE(review): how the flag gets raised is not visible in this file.
@everywhere function launch_sentinel(;workers=[2], verbose=false, buffer_size=32, timeout=1)

    # one entry per sentinel, stored on the sentinel's own process
    distributed_control = DArray([
        @spawnat p [(worker = p, flag = Ref(false))]
        for p in workers
    ])

    function remote_worker(entries, results, control)
        # consumers started by this sentinel: target pid => consumer control
        active_workers = Dict{Int64, DArray}()

        # this sentinel's own (worker, flag) entry
        local_control = only(localpart(control))

        # Queue a payload-free reply without blocking the sentinel if the
        # response channel happens to be full.
        reply(kind, target) = @async put!(results, ControlMessage(
            message_type=kind,
            target=target,
            func_f=nothing,
            func_in=nothing,
            func_out=nothing
        ))

        # Failure monitor: report consumers whose process is gone as `failed`
        # and forget them.
        @async while true
            if local_control.flag[]  # Shutdown flag raised
                if verbose
                    println("Sentinel on $(local_control.worker) is shutting down")
                end
                break
            end
            # `workers` is shadowed by the keyword argument, hence the qualifier
            current_worker_list = Distributed.workers()
            # iterate over a snapshot: entries are deleted inside the loop
            for w in collect(keys(active_workers))
                w in current_worker_list && continue
                reply(failed, w)
                delete!(active_workers, w)
                verbose && println("Worker $(w) died")
            end
            # we don't need to check all the time
            sleep(timeout)
        end

        # `take!` runs in its own task so the loop below can poll it instead of
        # blocking on the channel.
        message_task = @async take!(entries)

        while true
            if !istaskdone(message_task)
                # Idle: the only place where shutdown is honored. A pending
                # `take!` cannot be cancelled, so one message arriving after
                # shutdown may still be consumed by this (now dead) sentinel.
                if local_control.flag[]
                    if verbose
                        println("Sentinel message loop on $(local_control.worker) is shutting down")
                    end
                    return
                end
                sleep(timeout)
                continue
            end

            message = fetch(message_task)

            if message.message_type == start
                verbose && println("Start instruction for $(message.target)")
                # NOTE(review): consumer options are hard-coded and do not
                # follow this function's keywords -- confirm this is intended.
                consumer_control = launch_consumer(
                    message.func_f, message.func_in, message.func_out;
                    workers=[message.target], verbose=true, buffer_size=32, timeout=1,
                    start_safe=false
                )
                verbose && println("Started")
                active_workers[message.target] = consumer_control
                reply(started, message.target)
            elseif message.message_type == stop
                verbose && println("Stop instruction for $(message.target)")
                # a `stop` for a target this sentinel did not start is ignored
                # and gets no reply
                if haskey(active_workers, message.target)
                    consumer_control = active_workers[message.target]
                    make_unsafe!(consumer_control; workers=[message.target])
                    stop_workers!(consumer_control; workers=[message.target])
                    reply(stopped, message.target)
                    delete!(active_workers, message.target)
                    verbose && println("Stopped")
                end
            else
                # all other message types are passed straight through to the
                # output channel
                @async put!(results, message)
            end
            # All done with this one => take the next message, rinse, repeat
            message_task = @async take!(entries)
        end
    end

    # these remote channels are owned by PID=1
    control_messages  = RemoteChannel(()->Channel{ControlMessage}(buffer_size), 1)
    control_responses = RemoteChannel(()->Channel{ControlMessage}(buffer_size), 1)

    # fire-and-forget: the sentinel loop runs until its shutdown flag is raised
    for p in workers
        remote_do(
            remote_worker, p,
            control_messages, control_responses, distributed_control
        )
    end

    return control_messages, control_responses, distributed_control
end

In [8]:
# Start one sentinel on worker 2. The call returns the request channel, the
# response channel and the DArray holding the sentinel's shutdown flag.
control_messages, control_responses, distributed_control = launch_sentinel(;
    workers=[2], verbose=true, buffer_size=1024, timeout=1
)

(RemoteChannel{Channel{ControlMessage}}(1, 1, 31), RemoteChannel{Channel{ControlMessage}}(1, 1, 32), @NamedTuple{worker::Int64, flag::Base.RefValue{Bool}}[(worker = 2, flag = Base.RefValue{Bool}(false))])

In [9]:
# Inspect the control array: one (worker, flag) entry per sentinel.
distributed_control

1-element DArray{@NamedTuple{worker::Int64, flag::Base.RefValue{Bool}}, 1, Vector{@NamedTuple{worker::Int64, flag::Base.RefValue{Bool}}}}:
 (worker = 2, flag = Base.RefValue{Bool}(false))

In [10]:
# Input and output channels for the consumer; each buffers up to 32 Int64
# values and is created on process 1.
ch_in = RemoteChannel(()->Channel{Int64}(32), 1)
ch_out = RemoteChannel(()->Channel{Int64}(32), 1)

# Demo workload: announce the process it runs on, wait two seconds to mimic
# real work, then hand back `i + 1`.
@everywhere function test_fn(i)
    println("hi there, I'm running on pid=$(myid())")
    sleep(2)
    incremented = i + 1
    return incremented
end

In [11]:
# `start` request: ask the sentinel to launch a consumer on worker 3 with
# `test_fn` as its function, `ch_in` as input and `ch_out` as output.
m_f = ControlMessage(
    message_type=start,
    target=3,
    func_f=test_fn,
    func_in=ch_in,
    func_out=ch_out
)

ControlMessage(start, 3, test_fn, RemoteChannel{Channel{Int64}}(1, 1, 39), RemoteChannel{Channel{Int64}}(1, 1, 40))

In [12]:
# Send the start request; the sentinel on worker 2 picks it up (see the
# "From worker 2" lines in the recorded output).
put!(control_messages, m_f)

RemoteChannel{Channel{ControlMessage}}(1, 1, 31)

      From worker 2:	Start instruction for 3
      From worker 2:	Started


In [13]:
# Fetch the sentinel's replies; a `started` message for worker 3 is expected.
# NOTE(review): `collect!` is not defined in this file -- judging by the
# recorded outputs it returns only the messages queued since the last call.
collect!(control_responses)

1-element Vector{ControlMessage}:
 ControlMessage(started, 3, nothing, nothing, nothing)

In [14]:
# Feed one value to the consumer; the recorded output shows `test_fn` running
# on worker 3.
put!(ch_in, 10)

RemoteChannel{Channel{Int64}}(1, 1, 39)

      From worker 3:	hi there, I'm running on pid=3


In [15]:
# Read the consumer's results: `test_fn(10)` gives 11.
collect!(ch_out)

1-element Vector{Int64}:
 11

In [16]:
# Remove worker 3 while its consumer is still registered with the sentinel, to
# exercise the sentinel's failure detection.
rmprocs(3)

Task (done) @0x00000001165f9ba0

      From worker 2:	Worker 3 died


In [17]:
# The sentinel should have reported the lost worker with a `failed` message.
collect!(control_responses)

1-element Vector{ControlMessage}:
 ControlMessage(failed, 3, nothing, nothing, nothing)

In [18]:
# Add a replacement worker (addressed as pid 4 in the following cells) and load
# the packages on it.
# NOTE(review): `@everywhere` only reaches processes that exist when it runs, so
# the definitions from earlier cells (`MessageType`, `ControlMessage`,
# `launch_sentinel`, `test_fn`) are not present on the new worker unless those
# cells are re-run or the code is shipped some other way.
addprocs(1)
@everywhere using Distributed, DistributedStreams, DistributedArrays, Serialization

In [19]:
# Fresh input/output channels for the replacement worker.
ch_in = RemoteChannel(()->Channel{Int64}(32), 1)
ch_out = RemoteChannel(()->Channel{Int64}(32), 1)

# `start` request for worker 4.
# NOTE(review): the message reuses the channels stored in `m_f` (the ones
# created for worker 3), so the `ch_in`/`ch_out` created just above are not
# used -- confirm which pair is intended.
m = ControlMessage(
    message_type=start,
    target=4,
    func_f=m_f.func_f,
    func_in=m_f.func_in,
    func_out=m_f.func_out
)

ControlMessage(start, 4, test_fn, RemoteChannel{Channel{Int64}}(1, 1, 39), RemoteChannel{Channel{Int64}}(1, 1, 40))

In [20]:
# Send the start request for worker 4 to the sentinel.
put!(control_messages, m)

RemoteChannel{Channel{ControlMessage}}(1, 1, 31)

      From worker 2:	Start instruction for 4
      From worker 2:	Started


In [21]:
# A `started` reply for worker 4 is expected.
collect!(control_responses)

1-element Vector{ControlMessage}:
 ControlMessage(started, 4, nothing, nothing, nothing)

In [22]:
# `stop` request: ask the sentinel to shut down the consumer on worker 4. A
# stop message carries no function or channels.
m = ControlMessage(
    message_type=stop,
    target=4,
    func_f=nothing,
    func_in=nothing,
    func_out=nothing
)

put!(control_messages, m)

RemoteChannel{Channel{ControlMessage}}(1, 1, 31)

      From worker 2:	Stop instruction for 4
      From worker 2:	Stopped


In [23]:
# A `stopped` reply for worker 4 is expected.
collect!(control_responses)

1-element Vector{ControlMessage}:
 ControlMessage(stopped, 4, nothing, nothing, nothing)

In [24]:
# Remove worker 4. Its consumer was already stopped and dropped from the
# sentinel's active list, so this must not produce a `failed` reply.
rmprocs(4)

Task (done) @0x00000001192860e0

In [25]:
# Confirm that no further replies were generated by removing worker 4.
collect!(control_responses)

ControlMessage[]