This workshop uses a few simple examples to demonstrate the parallel processing facilities available in Julia.

## Approximate the value of pi - Monte Carlo method

![Calculating PI](https://upload.wikimedia.org/wikipedia/commons/8/84/Pi_30K.gif)

```
Attribution : By CaitlinJo - Own work. This mathematical image was created with Mathematica, CC BY 3.0, https://commons.wikimedia.org/w/index.php?curid=14609430
```

In [None]:
#Serial version

"""
    calc_pi(n)

Estimate pi with the Monte Carlo method: draw `n` random points in the unit
square, count how many fall strictly inside the unit quarter circle, and
return four times the fraction of hits.
"""
function calc_pi(n)
    hits = 0
    for trial in 1:n
        px = rand()
        py = rand()
        # A point inside the quarter circle adds one hit, anything else adds zero.
        hits += ((px^2 + py^2) < 1.0 ? 1 : 0)
    end
    return 4.0 * (hits / n)
end

# Time a run with 10^8 samples; @time reports the timing and hands the estimate to println.
println("pi = ", @time calc_pi(10^8) )


In [None]:
#Parallel version

# Start 4 worker processes, but only when the master is the sole process.
nprocs() == 1 && addprocs(4)            # <---- Add julia workers

"""
    calc_pi(n)

Monte Carlo estimate of pi using a @parallel for loop. Each iteration draws
one random point in the unit square and yields 1 when it falls strictly inside
the unit quarter circle, 0 otherwise; the (+) reducer sums those values.
Returns four times the fraction of hits.
"""
function calc_pi(n)
    # <----- @parallel for: the loop range is partitioned among the workers
    hits = @parallel (+) for trial in 1:n
        px = rand()
        py = rand()
        if (px^2 + py^2) < 1.0
            1
        else
            0
        end
    end
    return 4.0 * (hits / n)
end

# Time the @parallel implementation on 10^8 samples and print the estimate it returns.
println("pi = ", @time calc_pi(10^8) )


- Julia process
    - What is a Julia process?
        - Single thread of execution scheduled by the OS
        - Master process
            - Runs the REPL in interactive mode
            - Driver process, orchestrates work
        - Workers
            - Do the actual work
            - Typically one per core for computationally intensive work
        - Identified by a numeric process id, not related to the OS pid.
- Julia cluster
    - Master process + Worker processes 
    - nprocs(), nworkers(), workers(), procs() - Try them out.
    - addprocs(N), rmprocs() - Adds / removes workers 
- @parallel for
    - Partitions a "for" loop
    - Equally partitioned among available workers
    - Can specify a reduction operator 

# Julia Tasks

- What is a Julia Task?
    - very lightweight coroutines
    - Not threads!
    - Internal to and scheduled by a Julia Process
    - Tasks switch on yield() or on I/O
- Let's try a simple example

In [None]:
# Simple JSON validation at an external web service
# Service hosted at http://validate.jsontest.com/

# Serial example
# Candidate strings sent to the validation service; the trailing comment on
# each entry states whether the string is well-formed JSON.
validate_list = [
    "[1,2,3]",                  # Valid
    "\"Malformed JSON string",  # Invalid
    "{\"foo\":\"bar\"}"         # Valid
    ] 

using Requests                # HTTP queries to a web service
using JSON                    # parse responses


"""
    validate(list)

POST each string in `list`, one after another, to
http://validate.jsontest.com/ and parse the JSON reply.

Returns an array of `(string, verdict)` tuples in the order of `list`, where
`verdict` is the "validate" field of the service's response.
"""
function validate(list)
    verdicts = []
    for candidate in list
        form = Dict("json" => candidate)
        reply = Requests.post("http://validate.jsontest.com/", data = form)
        parsed = JSON.parse(bytestring(reply.data))
        push!(verdicts, (candidate, parsed["validate"]))
    end
    return verdicts
end

# Run the serial validation under @time, then print one verdict line per string.
results = @time validate(validate_list)
for (text, ok) in results
    println("JSON string $(text) is ", ok ? "valid " : " not valid")
end


In [None]:
# Let us perform the validation in parallel

"""
    validate_in_parallel(list)

Validate every string in `list` against http://validate.jsontest.com/ with the
requests issued concurrently: one task per string is started with @async, and
the enclosing @sync block waits for all of them to finish.

Returns an array of `(string, verdict)` tuples, where `verdict` is the
"validate" field of the service's response. Each tuple is pushed when its
response has been parsed, so the order of the result may differ from the order
of `list`.
"""
function validate_in_parallel(list)
    results = []
    @sync begin                  # <---- Wait for all enclosed to finish
        # Iterate the argument itself; looping over the global validate_list
        # would silently ignore whatever the caller passed in.
        for s in list
            @async begin         # <----- Execute block in a new task
                response = Requests.post("http://validate.jsontest.com/", data = Dict("json" => s))
                json_resp = JSON.parse(bytestring(response.data))
                push!(results, (s, json_resp["validate"]))
            end
        end
    end
    return results
end

# Run the task-based validation under @time, then print one verdict line per string.
results = @time validate_in_parallel(validate_list)
for (text, ok) in results
    println("JSON string $(text) is ", ok ? "valid " : " not valid")
end


To reiterate:
- A task runs until it performs an I/O operation or explicitly yields (calls sleep() or yield())
- Upon yielding other runnable tasks are executed
- A non-yielding task in a process prevents any other code from execution (including I/O operations) 

In [None]:
# Now consider an example that uses both Tasks and Worker processes for parallelism
# Let's build a simple random distributed vector

# Start 4 worker processes, but only when the master is the sole process.
if nprocs() == 1
    addprocs(4)
end

# A minimal distributed vector of random values. The N elements are split into
# one contiguous piece per worker; each piece is created on its worker with
# rand and is held here only through a remote reference.
type DVector
    length                # total number of elements
    refs                  # references to localparts
    cuts                  # cut of vector on ith worker
    pids                  # participating workers, refs[i] is on pids[i]

    function DVector(N)
        pids = workers()
        nparts = nworkers()
        base_len = div(N, nparts)
        extra = rem(N, nparts)

        refs = []
        cuts = []
        offset = 1
        for (k, p) in enumerate(pids)
            # Every worker gets base_len elements; the last one also takes the remainder.
            part_len = (k == nparts) ? base_len + extra : base_len
            push!(refs, remotecall(p, rand, part_len))        # <--- execute on process p
            push!(cuts, offset:offset+part_len-1)
            offset += part_len
        end
        return new(N, refs, cuts, pids)
    end
end

# Gather a DVector into a local Array: allocate the full-length result, then
# copy each worker's piece into the index range recorded in d.cuts.
function Base.convert(::Type{Array}, d::DVector)
    gathered = Array(Float64, d.length)
    @sync for k in 1:length(d.refs)
        # <--- one task per piece, so the "fetching" happens in parallel
        @async gathered[d.cuts[k]] = fetch(d.refs[k])
    end
    return gathered
end


# Return element `i` of the distributed vector.
#
# The piece holding `i` is the one whose range in d.cuts contains it, and the
# position inside that piece is the offset of `i` from the start of that range.
# Only the requested element is sent back: the lookup runs on the worker that
# owns the piece (d.pids[k]), where fetching the reference is a local operation.
#
# Throws a BoundsError when `i` is not covered by any cut.
function Base.getindex(d::DVector, i)
    ref_index = 0
    for (k, cut) in enumerate(d.cuts)
        if i in cut
            ref_index = k
            break
        end
    end
    ref_index == 0 && throw(BoundsError(d, i))

    local_index = i - first(d.cuts[ref_index]) + 1

    # return the appropriate single element after fetching locally from the remote worker
    # (arguments are passed in the same order as the lambda's parameters: index, then ref)
    remotecall_fetch(d.pids[ref_index], (li, r) -> fetch(r)[li], local_index, d.refs[ref_index])
end


In [None]:
# Build a 13-element distributed vector; its random pieces are created on the workers.
d=DVector(13)

In [None]:
# Gather every worker's piece into one local Array.
A=convert(Array, d)

In [None]:
# Look up element 5 of the gathered local Array A (a plain Array lookup, not DVector's getindex).
A[5]

In [None]:
# Apply `f` in place to every piece of the distributed vector. Each worker
# updates its own piece; the remote lambda returns nothing so that no data is
# shipped back to the caller. Returns `d`.
function Base.map!(f, d::DVector)
    @sync begin
        for k in 1:length(d.pids)
            @async remotecall_wait(d.pids[k], (g, ref) -> (map!(g, fetch(ref)); nothing), f, d.refs[k])
        end
    end
    return d
end

In [None]:
# Replace every element of d with 1, in place on the workers; the trailing ; suppresses the cell output.
map!(x->1, d);

Package DistributedArrays.jl has the complete implementation for global arrays.