# Using DArrays for reading and manipulating data
## Parallel Workshop JuliaCon 2016
### DArrays basics

In [3]:
# Add processes
addprocs(8)
# Use package for distributed arrays
using DistributedArrays

In [4]:
C = Array(1:nworkers())

8-element Array{Int64,1}:
 1
 2
 3
 4
 5
 6
 7
 8

In [5]:
map(t -> t*t, C)

8-element Array{Int64,1}:
  1
  4
  9
 16
 25
 36
 49
 64

In [6]:
D = distribute(C)

8-element DistributedArrays.DArray{Int64,1,Array{Int64,1}}:
 1
 2
 3
 4
 5
 6
 7
 8

In [7]:
map(t -> t*t, D)

8-element DistributedArrays.DArray{Int64,1,Array{Int64,1}}:
  1
  4
  9
 16
 25
 36
 49
 64

In [None]:
map(t -> Dates.monthname((t - 1) % 12 + 1), D)

In [8]:
map(t -> Dates.monthname((t - 1) % 12 + 1) |> s -> s*" is my favorite month.", D) |> t -> reduce(*, Array(t))

"January is my favorite month.February is my favorite month.March is my favorite month.April is my favorite month.May is my favorite month.June is my favorite month.July is my favorite month.August is my favorite month."

In [9]:
D55 = @DArray [randn(5,5) for i = 1:32]

32-element DistributedArrays.DArray{Array{Float64,2},1,Array{Array{Float64,2},1}}:
 5x5 Array{Float64,2}:
  0.749681    0.417828   1.32431   0.01106    0.246542
  0.0793146  -0.442598   0.505572  0.224562  -0.556564
  0.964229    0.453331   2.24267   1.4155     0.221317
 -0.954585    0.653496  -0.317464  0.514433   2.00375 
  0.0449563   0.460558   0.3587    2.3126     1.97998                          
 5x5 Array{Float64,2}:
 -0.0673469   0.308336    0.0866737   1.12114    -0.337453 
  0.590937   -0.53924     0.604027   -0.0345528   0.589651 
  1.24741    -1.37506    -0.780778   -1.03455     1.23955  
 -0.318958    0.666986   -1.06661    -1.31499     0.272334 
 -0.148643    0.0414206  -0.310748   -1.47092    -0.0201022
 5x5 Array{Float64,2}:
 -0.19296    0.0781201  -0.801036   0.554971  -0.907947
 -2.52531   -1.34596    -1.20128   -0.236329  -0.91827 
  2.06021   -0.343111    0.904802  -0.204173   0.959878
  0.246964   1.31196     0.370966  -0.776546   1.11704 
 -1.92541   -0.379076   

In [10]:
Dsvd = map(svdvals, D55)

32-element DistributedArrays.DArray{Array{Float64,1},1,Array{Array{Float64,1},1}}:
 [4.012559085786708,3.021723174422703,1.17699429497302,0.5091840394187737,0.2016641603121387]       
 [3.050249950442779,2.2208425311369506,0.9129139369165008,0.4294787517155618,0.11358904824794219]   
 [4.378209808878133,2.083932285645457,1.511758606338584,0.7888951609767596,0.05788026504664591]     
 [2.298630165274538,2.0695245755093308,1.0909032217969403,0.7784116485315605,0.06551854058370984]   
 [3.874771908532562,2.478814183483485,1.123684832179373,0.5806970108064995,0.0848879992731654]      
 [3.986760632568916,3.4758108849221077,2.162607505507518,1.3895150851675877,0.2649968877775879]     
 [5.046341290611348,2.57492114447396,2.0705241538743215,1.0413614880811795,0.2361223871551074]      
 [4.449678187590677,3.502650431662088,3.162266567240395,1.0063730271192177,0.5179716888126966]      
 [4.863367464733277,2.748842772614798,1.2627533382659457,0.9636346920396703,0.04788508827342487]    
 [4.5088

### Reading data in parallel

In [13]:
# Save the path to the data directory and load a list of subdirectories with the data
pth = "/data/MIMICII"
dirs = filter(isdir, map(t -> joinpath(pth, t), readdir(pth)))

LoadError: LoadError: SystemError: unable to read directory /data/MIMICII: No such file or directory
while loading In[13], in expression starting on line 3

In [14]:
# Extract the data files
fls = mapreduce(t -> map(s -> joinpath(t, s), readdir(t)), vcat, dirs)

LoadError: LoadError: UndefVarError: dirs not defined
while loading In[14], in expression starting on line 2

In [12]:
# Size in GB
@time sum(map(filesize, fls))/1024^3

LoadError: LoadError: UndefVarError: fls not defined
while loading In[12], in expression starting on line 155

In [8]:
flsSmall = fls[1:div(length(fls), 10)]
@time sum(map(filesize, flsSmall))/1024^3

  0.000797 seconds (382 allocations: 25.172 KB)


1.6282551661133766

In [9]:
# Use the MAT package (written in Julia) for reading binary .mat files
using MAT

In [11]:
@time dt = map(matread, distribute(flsSmall));

  0.409801 seconds (29.13 k allocations: 1.614 MB)


In [12]:
@show typeof(dt)
dt[1]

typeof(dt) = DistributedArrays.DArray{Dict{ASCIIString,Any},1,Array{Dict{ASCIIString,Any},1}}


Dict{ASCIIString,Any} with 3 entries:
  "A"      => Dict{ASCIIString,Any}("A"=>1x17 sparse matrix with 17 Float64 ent…
  "tm"     => 345000x1 Array{Float64,2}:…
  "signal" => 345000x1 Array{Float64,2}:…

In [None]:
# Plot a signal
using PlotlyJS
plot(PlotlyJS.scattergl(;y = dt[1]["signal"][:]))

In [13]:
# custom cleaner functions are fast in Julia
# in this example, replace NaNs with zeros
# zero(t) has the same type as t, so the cleaned vector keeps the element type
# of the signal instead of mixing an Int 0 into floating point data
x = map(t -> ifelse(isnan(t), zero(t), t), dt[1]["signal"][:]);

# fft
xfft = fft(x)

345000-element Array{Complex{Float64},1}:
  13341.5+0.0im    
  4149.56-13970.2im
  -2368.8-1152.79im
  5407.77+874.87im 
  7131.39-5189.44im
 -530.063-9487.38im
 -6983.82+605.086im
  5317.84+6670.13im
   9060.2-6496.53im
 -3092.74-7535.38im
 -2208.69+2028.72im
  5152.03+142.519im
  4084.85-6764.49im
         ⋮         
  4084.85+6764.49im
  5152.03-142.519im
 -2208.69-2028.72im
 -3092.74+7535.38im
   9060.2+6496.53im
  5317.84-6670.13im
 -6983.82-605.086im
 -530.063+9487.38im
  7131.39+5189.44im
  5407.77-874.87im 
  -2368.8+1152.79im
  4149.56+13970.2im

In [15]:
# size in GB
@time mapreduce(Base.summarysize, +, dt)/1024^3

  0.006456 seconds (2.94 k allocations: 235.547 KB)


1.6282322583720088

In [16]:
@time lngs = map(t -> length(t["signal"]), dt)

  0.051641 seconds (7.61 k allocations: 540.323 KB)


187-element DistributedArrays.DArray{Int64,1,Array{Int64,1}}:
   345000
     5978
  2679022
  8100000
   367500
   952500
  1807500
     4602
   145136
      768
   163584
   328464
      512
        ⋮
 11055000
   418250
      246
     1272
        8
     7474
     1478
        8
     1280
      768
    11000
        8

In [None]:
plot(histogram(x = lngs), Layout(yaxis=Dict("type" => "log")))

In [22]:
# My own small package for iterative SVD and a few convenience methods

# Pkg.clone("https://github.com/andreasnoack/TSVD.jl")
using TSVD

# Define method for converting distributed vector of vectors to distributed matrix.
#
# The result has size (length(A[1]), length(A)); column j is filled from A[j].
# It is created on the processes A.pids with the distribution [1, length(A.pids)],
# i.e. one chunk along the rows and length(A.pids) chunks along the columns.
# Every A[j] is indexed with the row range of the chunk, so all vectors are
# expected to be at least as long as A[1]; this is not checked here.
# Throws an ArgumentError when A is empty, because the number of rows is taken
# from the first vector.
function Base.hcat{T}(A::DistributedArrays.DArray{Vector{T}})
    isempty(A) && throw(ArgumentError("cannot concatenate an empty distributed vector"))
    n = length(A[1])
    D = DArray((n, length(A)), A.pids, [1, length(A.pids)]) do I
        # I is the tuple (row range, column range) of the chunk being built
        mB, nB = map(length, I)
        # Create new DArray from the distributed vector of vectors.
        # For now, we assume that each vector is only located on a single
        # worker. Eventually, we'd like to find a more flexible solution.
        # The element type is the method parameter T, so there is no need to
        # index into A just to look it up.
        B = Array(T, mB, nB)
        for i = 1:nB
            B[:,i] = A[I[2][i]][I[1]]
        end
        B
    end
    return D
end

# convenience function for concatenating distributed vectors collected in a vector
# @everywhere function Base.hcat{T<:DistributedArrays.DVector}(x::Vector{T})
#     l    = length(x)
#     if l == 0
#         throw(ArgumentError("cannot flatten empty vector"))
#     else
#         x1   = x[1]
#         m, n = size(x1, 1), size(x1, 2)
#         B    = DArray((m, l*n)) do I
#             B_local = Array(eltype(x1), map(length, I))
#             for j = 1:length(I[2])
#                 B_local[:, j] = x[I[2][j]][I[1]]
#             end
#             return B_local
#         end
#         return B
#     end
# end

# Define, on every process, `procs` for a plain Array: the result is a 1x1 matrix
# whose only entry is the id of the process on which the method is called.
@everywhere function Base.procs(A::Array)
    return fill(myid(), 1, 1)
end

# Convert a view (SubArray) whose parent is a DArray into a local Array{S,N}.
#
# Fast path: if the index ranges of the view are equal to the index ranges of
# one chunk of the parent, that chunk is returned directly.
# Otherwise the elements of the view are copied into a new Array{S,N}, so the
# method always returns an array instead of falling off the end with `nothing`.
function Base.convert{S,T,N,D<:DArray}(::Type{Array{S,N}}, s::SubArray{T,N,D})
    I = s.indexes
    d = s.parent
    # presumably the position (in the chunk grid) of the chunk that holds the
    # first element of the view — confirm against DistributedArrays.locate
    l = DistributedArrays.locate(d, map(first, I)...)
    if isequal(d.indexes[l...], I)
        # SubDArray corresponds to a chunk
        # NOTE(review): the chunk is returned as stored, without conversion to
        # element type S; presumably S == T at the call sites — confirm.
        return DistributedArrays.chunk(d, l...)
    end
    # General case: allocate the result and fill it element by element
    a = Array(S, size(s))
    a[[1:size(a,i) for i=1:N]...] = s
    return a
end

convert (generic function with 547 methods)

In [17]:
# Necessary to define our own rep function because Julia's repeat is a bit slow.
#
# Return a vector of length `l` (created with `similar(x, l)`) that contains the
# elements of `x` repeated cyclically, with every NaN replaced by zero.
# Throws an ArgumentError when `x` is empty and `l` is positive, since there is
# nothing to repeat.
@everywhere function rep(x::Vector, l)
    y = similar(x, l)
    n = length(x)
    if n == 0
        l == 0 || throw(ArgumentError("cannot repeat an empty vector to length $l"))
        return y
    end
    # j walks through x and wraps back to 1 after the last element; it always
    # stays within 1:n, and i comes from eachindex(y), so @inbounds is safe
    j = 1
    @inbounds for i in eachindex(y)
        xj = x[j]
        y[i] = ifelse(isnan(xj), zero(xj), xj)
        j = ifelse(j == n, 1, j + 1)
    end
    return y
end

In [18]:
# Build a distributed vector of vectors that all have the same length n:
# each record's "signal" is flattened with vec and passed to rep
@time dt1 = let n = 50000
    map(dt) do record
        rep(vec(record["signal"]), n)
    end
end;

  0.259672 seconds (32.09 k allocations: 1.600 MB, 4.17% gc time)


In [21]:
# Convert to a distributed matrix
@time A = hcat(dt1);

  1.257840 seconds (431.32 k allocations: 19.012 MB, 0.40% gc time)


In [15]:
# Apply the fft along the first dimension of the matrix
@time B = mapslices(fft, A, 1)

LoadError: LoadError: UndefVarError: A not defined
while loading In[15], in expression starting on line 155

In [None]:
# Similar to 
map(fft, dt1)

In [None]:
# Create the initial vector. Has to be distributed: it is built as a DArray
# with one random Complex64 entry per row of A on the processes A.pids[:,1].
v0 = DArray((size(A, 1),), A.pids[:,1]) do I
    rand(Complex64, length(I[1]))
end

# Compute the SVD
@time U, s, V = TSVD.tsvd(A, 5,
                          initVec = v0,
                          stepSize = 5,
                          debug = true);

τ = 0.007648707318601342
Reorth v
τ = 0.007648707318601342
Reorth u
τ = 0.007648707318601342
Reorth v
τ = 0.007648707318601342
Reorth u
