In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr

from scipy.sparse import coo_matrix
import dask.array as da
from dask.diagnostics import ProgressBar

**Update: This kind of works now. But the performance is worse than Numba.**

# Preparation

Just copy the code from [sparse_dot_benchmark.ipynb](./sparse_dot_benchmark.ipynb).

In [2]:
# Load the sparse weights and assemble them into a SciPy COO matrix.
# The context manager closes the NetCDF file as soon as the arrays are in
# memory; `.values` reads eagerly, so nothing lazy outlives the `with` block.
with xr.open_dataset("weights.nc") as ds:
    n_s = ds.dims['n_s']  # length of the 'n_s' dimension (presumably the number of stored weights)
    # Indices are shifted down by one (presumably 1-based in the file -- confirm).
    col = ds['col'].values - 1
    row = ds['row'].values - 1
    S = ds['S'].values

# No explicit `shape` is passed, so SciPy infers it from the largest row/column index.
A = coo_matrix((S, (row, col)))

In [3]:
# Random test input: 500 rows, one value per column of A.
# A fixed seed makes the array (and every result derived from it) reproducible
# across kernel restarts; a private RandomState leaves NumPy's global RNG untouched.
rng = np.random.RandomState(0)
data = rng.rand(500, A.shape[1])
data.shape

(500, 240000)

In [4]:
# Reference result and baseline performance: multiply the sparse matrix A
# directly with the dense NumPy array. `data` is transposed so that its
# A.shape[1]-long axis lines up with the columns of A, and the product is
# transposed back so the 500-long axis is leading again.
%time out_scipy = A.dot(data.T).T
out_scipy.shape

CPU times: user 475 ms, sys: 167 ms, total: 642 ms
Wall time: 650 ms


(500, 120000)

# apply_ufunc on numpy array

In [5]:
def apply_A(data):
    """Multiply every row of ``data`` by the sparse matrix ``A``.

    ``A`` is deliberately read from the notebook's global namespace rather
    than passed in, so the function takes a single array argument.

    Parameters
    ----------
    data : 2-D array, shape (n_rows, A.shape[1])

    Returns
    -------
    2-D array, shape (n_rows, A.shape[0])
    """
    data_t = data.T          # put the A.shape[1]-long axis first
    product = A.dot(data_t)  # shape (A.shape[0], n_rows)
    return product.T

Has the same performance as before.

In [6]:
# Wrap apply_A with xr.apply_ufunc and call it on the plain NumPy array.
%time xr.apply_ufunc(apply_A, data)

CPU times: user 466 ms, sys: 165 ms, total: 631 ms
Wall time: 643 ms


array([[ 0.11250547,  0.62152447,  0.32295978, ...,  0.27827256,
         0.34809867,  0.36379499],
       [ 0.64895496,  0.26802886,  0.41272402, ...,  0.46974352,
         0.64499186,  0.64985559],
       [ 0.82787185,  0.33176759,  0.56978941, ...,  0.71662387,
         0.43066475,  0.18261389],
       ..., 
       [ 0.35410965,  0.36236262,  0.7849376 , ...,  0.50895257,
         0.3384691 ,  0.73448701],
       [ 0.87502444,  0.58880469,  0.35482145, ...,  0.69282072,
         0.40735966,  0.61793078],
       [ 0.24804293,  0.48838714,  0.46165374, ...,  0.6340203 ,
         0.87557678,  0.82797878]])

# apply_ufunc on xarray DataArray

In [7]:
# Wrap the NumPy array in a DataArray with named dimensions.
# Coordinate lengths are taken from `data.shape` rather than hard-coded, so the
# cell keeps working if the input size changes.
dr = xr.DataArray(
    data,
    dims=['extra_dims', 'grid_dims'],
    coords=[np.arange(data.shape[0]), np.arange(data.shape[1])],
    name='data',
)
dr

<xarray.DataArray 'data' (extra_dims: 500, grid_dims: 240000)>
array([[ 0.021726,  0.311896,  0.695838, ...,  0.178822,  0.947601,  0.153578],
       [ 0.906841,  0.161121,  0.273547, ...,  0.657841,  0.965106,  0.675433],
       [ 0.920618,  0.71062 ,  0.13484 , ...,  0.397721,  0.204345,  0.13308 ],
       ..., 
       [ 0.513338,  0.047198,  0.542131, ...,  0.11914 ,  0.883951,  0.778235],
       [ 0.996765,  0.825025,  0.589761, ...,  0.281366,  0.581218,  0.696507],
       [ 0.034942,  0.963049,  0.411179, ...,  0.926111,  0.872794,  0.905661]])
Coordinates:
  * extra_dims  (extra_dims) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
  * grid_dims   (grid_dims) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...

Has the same performance as before.

In [8]:
# Same call as on the NumPy array, now on the NumPy-backed DataArray.
%time dr_out = xr.apply_ufunc(apply_A, dr)

CPU times: user 516 ms, sys: 193 ms, total: 709 ms
Wall time: 730 ms


In [9]:
# result is correct: exact element-wise match with the SciPy reference
np.array_equal(dr_out.data, out_scipy)

True

# Use dask array

In [10]:
# Split the array into dask chunks of 50 rows each. Every chunk spans the full
# second axis (width taken from `data.shape`, not hard-coded), so each chunk
# can be multiplied by A on its own.
data_dask = da.from_array(data, chunks=(50, data.shape[1]))
data_dask

dask.array<array, shape=(500, 240000), dtype=float64, chunksize=(50, 240000)>

**Horribly slow...**

In [11]:
# Call apply_A directly on the dask array (no xr.apply_ufunc involved).
%time out_dask = apply_A(data_dask)
out_dask.shape

CPU times: user 3.93 s, sys: 1.05 s, total: 4.98 s
Wall time: 5.09 s


(500, 120000)

In [12]:
# result is correct: exact element-wise match with the SciPy reference
np.array_equal(out_dask, out_scipy)

True

# apply_ufunc on dask array, serial

In [13]:
# Dask-backed DataArray; no coordinates are attached this time.
dr_dask = xr.DataArray(
    data_dask,
    name='data',
    dims=['extra_dims', 'grid_dims'],
)
dr_dask

<xarray.DataArray 'data' (extra_dims: 500, grid_dims: 240000)>
dask.array<shape=(500, 240000), dtype=float64, chunksize=(50, 240000)>
Dimensions without coordinates: extra_dims, grid_dims

**As slow as the previous case...**

In [14]:
# dask='allowed': xarray passes the dask array on to apply_A as-is
# (per the xarray apply_ufunc documentation).
%time dr_out_dask = xr.apply_ufunc(apply_A, dr_dask, dask='allowed')

CPU times: user 3.98 s, sys: 1.05 s, total: 5.03 s
Wall time: 5.13 s


In [15]:
# result is correct: exact element-wise match with the SciPy reference
np.array_equal(dr_out_dask, out_scipy)

True

# apply_ufunc on dask array, parallelized

In [16]:
%%time 
dr_out_pa = xr.apply_ufunc(apply_A, dr_dask, 
                           input_core_dims=[['grid_dims']],
                           output_core_dims=[['out_grid']],
                           output_sizes={'out_grid': 120000},
                           dask='parallelized', 
                           output_dtypes=[float])

CPU times: user 140 ms, sys: 39.4 ms, total: 179 ms
Wall time: 178 ms


In [17]:
dr_out_pa # not computed yet: still a lazy, dask-backed DataArray

<xarray.DataArray 'data' (extra_dims: 500, out_grid: 120000)>
dask.array<shape=(500, 120000), dtype=float64, chunksize=(50, 120000)>
Dimensions without coordinates: extra_dims, out_grid

**Much faster than the previous case, but still slower than the pure numpy case.**

In [29]:
# Time the actual computation of the lazy result.
%timeit dr_out_pa.compute()

731 ms ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [30]:
# result is correct: exact element-wise match with the SciPy reference
np.array_equal(dr_out_pa.data, out_scipy)

True

# Low-level implementation dask delayed

In [31]:
from dask import delayed

Manually break the data into 10 chunks of 50 rows each

In [32]:
# Pick the first 50-row block as an example chunk.
i = 0 
data[50*i:50*(i+1), :].shape # input chunk shape

(50, 240000)

In [33]:
# Apply A eagerly to that single 50-row block to see the per-chunk output shape.
apply_A(data[50*i:50*(i+1), :]).shape # output chunk shape

(50, 120000)

A list of delayed results, one per chunk (10 chunks in total)

In [34]:
# One delayed apply_A call per block of 50 consecutive rows. The block starts
# are derived from data.shape[0] instead of a hard-coded chunk count, so every
# row is covered whatever the number of rows is (a final block would be shorter
# if the row count were not a multiple of 50).
out_delayed = [
    delayed(apply_A)(data[start:start + 50, :])
    for start in range(0, data.shape[0], 50)
]
out_delayed

[Delayed('apply_A-8dbdf735-6195-4d95-b157-50000f70eb0b'),
 Delayed('apply_A-e1adddc9-9ae7-4956-9c31-a439e9ab8b72'),
 Delayed('apply_A-a330b035-3b1b-41a9-a020-f64f69eba30a'),
 Delayed('apply_A-0ba32025-8e20-4ba9-81ec-806883ae0a11'),
 Delayed('apply_A-e2163887-b057-494c-b27d-a6864fd1e786'),
 Delayed('apply_A-03707b33-db0b-4679-8c23-3c1a4d609131'),
 Delayed('apply_A-99771b3c-6b08-4688-abba-a5a66d5468e8'),
 Delayed('apply_A-2e96076a-10d1-43bf-a824-590f8e1d339c'),
 Delayed('apply_A-2d3ed76b-4d70-4ea9-88d7-9da3b77c5d9d'),
 Delayed('apply_A-268087fd-d16e-4d5a-90b1-46bdfd400940')]

Convert to dask arrays:

In [35]:
# Wrap each delayed result as a single-chunk dask array. The shape and dtype
# of a delayed value are not known up front, so both are declared here:
# 50 rows per chunk and one column per row of A (read from A.shape rather
# than hard-coded).
dask_subarrays = [
    da.from_delayed(d, shape=(50, A.shape[0]), dtype=float)
    for d in out_delayed
]
dask_subarrays

[dask.array<from-value, shape=(50, 120000), dtype=float64, chunksize=(50, 120000)>,
 dask.array<from-value, shape=(50, 120000), dtype=float64, chunksize=(50, 120000)>,
 dask.array<from-value, shape=(50, 120000), dtype=float64, chunksize=(50, 120000)>,
 dask.array<from-value, shape=(50, 120000), dtype=float64, chunksize=(50, 120000)>,
 dask.array<from-value, shape=(50, 120000), dtype=float64, chunksize=(50, 120000)>,
 dask.array<from-value, shape=(50, 120000), dtype=float64, chunksize=(50, 120000)>,
 dask.array<from-value, shape=(50, 120000), dtype=float64, chunksize=(50, 120000)>,
 dask.array<from-value, shape=(50, 120000), dtype=float64, chunksize=(50, 120000)>,
 dask.array<from-value, shape=(50, 120000), dtype=float64, chunksize=(50, 120000)>,
 dask.array<from-value, shape=(50, 120000), dtype=float64, chunksize=(50, 120000)>]

Merge into a single dask array:

In [36]:
# Stack the per-chunk arrays along axis 0 to rebuild the full result lazily.
dask_merge = da.concatenate(dask_subarrays, axis=0)
dask_merge

dask.array<concatenate, shape=(500, 120000), dtype=float64, chunksize=(50, 120000)>

In [37]:
np.array_equal(dask_merge, out_scipy) # result is correct: exact match with the SciPy reference

True

Parallelization does give some speed-up, but the serial (1-worker) run is slower than the pure NumPy/SciPy version, and the parallel efficiency is worse than with Numba.

In [38]:
# Time the same task graph with 1, 2 and 4 workers to gauge parallel scaling.
# NOTE(review): num_workers presumably sizes the pool of dask's default
# scheduler for arrays -- confirm for the installed dask version.
%timeit dask_merge.compute(num_workers=1)
%timeit dask_merge.compute(num_workers=2)
%timeit dask_merge.compute(num_workers=4)

725 ms ± 16.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
529 ms ± 5.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
505 ms ± 6.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
