## Parallel programming constructs in Base Julia

- Example used
    - calculate pi using random numbers (circle circumscribed by a square)
    - Area of circle / Area of square = pi / 4

- Processes vs tasks
    - Process
        - Single thread of execution scheduled by the OS
        - Master process
            - Driver, orchestrates work
            - Hosts the REPL in interactive mode, or the main script in non-interactive mode  
        - Workers
            - Different OS processes, typically one per core
        - Identified by a numeric process id, not related to the OS pid
    - Tasks are lightweight co-routines

In [None]:
# First the serial version
# Estimate π by drawing `n` uniform random points in the unit square and
# counting how many land inside the quarter circle of radius 1.
function serial_π(n)
    hits = 0
    for _ in 1:n
        px = rand()
        py = rand()
        if px^2 + py^2 < 1.0
            hits += 1
        end
    end
    # hits/n approximates (area of circle)/(area of square) = π/4
    return 4.0 * (hits / n)
end

In [None]:
# Time one serial run over 10^8 samples and print the estimate
# (the first call's timing also includes compilation)
println("π = ", @time serial_π(10^8) )

#### Compute in parallel
- First set up a local cluster
- Master process + Worker processes 
- Workers may be on the same host, or different hosts
- Workers can be launched on cluster managers like SGE, SLURM, etc


In [None]:
# Launch 4 local workers unless workers already exist, then show the process count
nprocs() == 1 && addprocs(4)
nprocs()

In [None]:
# ids of every process in the cluster (master and workers)
procs()

In [None]:
# ids of the worker processes only
workers()

#### Change the serial version to execute the for loop in parallel
    - Use @parallel for
    - Partitions a "for" loop
    - Equally partitioned among available workers
    - Can specify a reduction operator 

In [None]:
#Parallel version
# Same estimate as the serial version, but the loop is partitioned among the
# workers and the per-iteration 0/1 results are combined with (+).
function parallel_π(n)
    hits = @parallel (+) for k in 1:n       # <----- partition work
        Int(rand()^2 + rand()^2 < 1.0)
    end
    # hits/n approximates π/4
    return 4.0 * (hits / n)
end

In [None]:
# Time the same 10^8 samples with the loop partitioned among the workers
# (the first call's timing also includes compilation)
println("π = ", @time parallel_π(10^8) )

### Julia Tasks

- What is a Julia Task?
    - very lightweight coroutines
    - Not threads!
    - Internal to and scheduled by a Julia Process
    - Runs till it performs an I/O operation or explicitly yields (calls sleep() or yield() )
    - A non-yielding task in a process prevents any other code from execution (including I/O operations)
    - Julia process driving external services in parallel
    - Julia master driving worker processes in a Julia cluster 


Simple example of a single Julia process driving a few external resources
    - Calculate pi using all the machines available at JuliaCon 


#### pseudo-code (driver)

```
schedule a background task to
    listen on a known port
    while true
       accept and store incoming connections from machines at JuliaCon
    end
       
    
function calculate_pi_in_parallel
    send out computation requests to all connected machines
    add each response to a queue as it arrives
    process responses as they arrive till all responses have been received or a timeout occurs
    
```    

#### pseudo-code (calculation service)

```
connect to orchestrator
while true
  wait for a request
  compute request in parallel locally
  send back the response
end
```

#### Driver code for reference

```
# Calculate pi using all instances of the users at JuliaCon

# Sockets of every machine that has connected so far
const clients=Set()

# Background task: accept connections on port 8000 forever and remember each one
@schedule begin
    server = listen(8000)
    while true
        push!(clients, accept(server))
    end
end

# Ask every connected client to sample `n_each` points and combine the replies
# into a single estimate of π.
#
# Returns 4*incircle/(nclients*n_each) computed over the clients that answered
# before the timeout (NaN when none did). Clients whose socket is closed, or
# whose request fails, are removed from `clients`.
function calc_π(n_each)
    println("Processing remotely on possible $(length(clients)) clients")

    # This function will wait for a maximum of 10.0 seconds for clients to return
    tc = Condition()
    timedout = Ref(false)
    @schedule begin                         # <---- exit the wait in time
        sleep(10.0)
        timedout[] = true                   # remembered even if nobody is waiting on tc right now
        notify(tc)
    end

    response_channel = Channel()
    # Iterate over a snapshot: closed sockets are deleted from `clients` inside the loop,
    # and deleting from a Set while iterating over it is not safe.
    for c in collect(clients)
        if isopen(c)
            @async try                      # <---- execute remotely in parallel
                serialize(c, n_each)
                put!(response_channel, deserialize(c))
                notify(tc)
            catch
                delete!(clients, c)         # best effort: drop clients whose request failed
            end
        else
            delete!(clients, c)
        end
    end

    incircle = 0
    nclients = 0

    # wait for all responses or the timeout
    while true
        nclients == length(clients) && break     # Have processed all clients

        if !isready(response_channel)
            # If the timer fired while this task was busy (not inside wait), its
            # notify was lost; waiting again could block forever.
            timedout[] && break
            wait(tc)                             # Block wait for a pending response or a timeout
        end

        !isready(response_channel) && break      # Still not ready, indicates a timeout

        incircle += take!(response_channel)
        nclients += 1

        println("pi calculated from $nclients workers = ", 4*incircle/(nclients*n_each))
    end
    return 4*incircle/(nclients*n_each)
end

# Request 10^6 samples from each connected client
calc_π(10^6)

```

In [None]:

##################################################
# Make available your local computation resources
##################################################
# Background task: connect to the orchestrator, then answer each request for
# `num_points` samples with the number of points that fell inside the circle.
# The loop ends when reading or writing the connection throws.
@schedule begin
    c = connect("107.23.255.102", 8000)
    try
        while true
            num_points = deserialize(c)    # <---- wait for a request

            in_circle = @parallel (+) for i in 1:num_points
                Int(rand()^2 + rand()^2 < 1)
            end

            println("Received request for $num_points points. Response $in_circle")
            serialize(c, in_circle)        # <---- send back response
        end
    finally
        close(c)    # do not leak the socket once the connection or a request fails
    end
end


#### Using Julia Tasks and Workers together

Let us build a simple distributed vector
- architecturally similar to DistributedArrays.jl
- create a distributed vector of random floats and implement a map function

In [None]:
# Make sure worker processes exist before building the distributed vector
nprocs() == 1 && addprocs(4)

# A minimal distributed vector of random floats: each participating worker
# holds one contiguous localpart, and this process only keeps references.
type DVector
    refs::Array{RemoteRef}         # references to localparts
    cuts::Array{UnitRange{Int}}    # cut of vector on ith worker
    pids::Array{Int}               # participating workers, refs[i] is on pids[i]

    # Create a distributed vector of `N` random floats. Every worker gets
    # div(N, nworkers) elements; the last worker also takes the remainder.
    function DVector(N)
        pids = workers()
        nparts = length(pids)
        refs = RemoteRef[]
        cuts = UnitRange{Int}[]
        base_len = div(N, nparts)
        ncut_start = 1
        for (k, p) in enumerate(pids)
            localpart_len = k == nparts ? base_len + rem(N, nparts) : base_len

            push!(refs, remotecall(p, rand, localpart_len))     # create the localpart on each worker
                                                                # and hold a reference to it

            push!(cuts, ncut_start:ncut_start+localpart_len-1)  # Which worker has which part
            ncut_start += localpart_len
        end
        # store the same pids the localparts were created on, not a fresh workers() call
        return new(refs, cuts, pids)
    end
end

# Gather every localpart of `d` into a single local Array of Float64.
function Base.convert(::Type{Array}, d::DVector)
    total_len = last(d.cuts[end])
    gathered = Array(Float64, total_len)
    # @sync returns only after every @async fetch started in the loop has finished,
    # so the parts are fetched in parallel rather than one after another
    @sync for k in eachindex(d.refs)
        @async gathered[d.cuts[k]] = fetch(d.refs[k])
    end
    return gathered
end

# Return the element at global index `i` without gathering the whole vector.
# Throws a BoundsError when `i` falls outside every cut.
function Base.getindex(d::DVector, i)
    idx = findfirst(x -> i in x, d.cuts)  # Locate which ref has the index we need
    # findfirst returns 0 when no cut contains i; report the caller's index
    # instead of failing with an out-of-bounds access on d.pids
    idx == 0 && throw(BoundsError(d, i))

    # fetch only the single element. fetch localpart on correct worker and index locally.
    local_index = i - first(d.cuts[idx]) + 1
    return remotecall_fetch(d.pids[idx], (li, r) -> fetch(r)[li], local_index, d.refs[idx])
end


In [None]:
d=DVector(12)   # As you can see the local structure only has references to distributed parts

In [None]:
# Gather the distributed parts into one local Array
Array(d)

In [None]:
# Index a single element; only that value is fetched from the worker holding it
d[9]

In [None]:
# Implement a distributed map
# Applies `f` in place to every localpart of `d`, one remote call per worker,
# and returns `d` once all of them have completed.
function Base.map!(f, d::DVector)
    # Executed on the worker holding the part: update the localpart in place
    # and return nothing rather than the mapped array
    apply_local = (g, part) -> (map!(g, fetch(part)); nothing)
    @sync for k in eachindex(d.pids)
        @async remotecall_wait(d.pids[k], apply_local, f, d.refs[k])
    end
    return d
end

In [None]:
# Let's try it out: overwrite every element with 1.0, in place on the workers
map!(x->1.0, d)

In [None]:
# Gather the parts again and display the result of the distributed map
Array(d)

Package DistributedArrays.jl has the complete implementation for distributed arrays.