In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr

from scipy.sparse import coo_matrix
import dask.array as da
from dask.diagnostics import ProgressBar

**Warning: This is not working yet! I am looking for advice.**

# Preparation

Just copy the code from [sparse_dot_benchmark.ipynb](./sparse_dot_benchmark.ipynb).

In [2]:
# Read the sparse-matrix triplets (values S at positions row/col) from disk.
ds = xr.open_dataset("weights.nc")
n_s = ds.dims['n_s']

# The stored indices are shifted down by one before use
# (presumably they are 1-based in the file -- TODO confirm).
row = ds['row'].values - 1
col = ds['col'].values - 1
S = ds['S'].values

# No explicit shape is passed, so scipy infers it from the largest row/col index.
A = coo_matrix((S, (row, col)))

In [3]:
# Random test field with 500 "extra" rows and one column per column of A.
# Drawn from a private, seeded RandomState so that Restart & Run All reproduces
# the same numbers without altering NumPy's global random state.
data = np.random.RandomState(0).rand(500, A.shape[1])
data.shape

(500, 240000)

In [4]:
# Reference result and baseline performance: plain scipy sparse-times-dense product.
# `data` has the grid axis last, so it is transposed before the product and the
# result is transposed back; every later variant is compared against `out_scipy`.
%time out_scipy = A.dot(data.T).T
out_scipy.shape

CPU times: user 393 ms, sys: 148 ms, total: 541 ms
Wall time: 542 ms


(500, 120000)

# apply_ufunc on numpy array

In [5]:
def apply_A(data):
    """Multiply ``data`` by the notebook-level sparse matrix ``A``.

    ``A`` is read from the global namespace on purpose, so the function takes
    a single array argument and can be handed directly to ``xr.apply_ufunc``.

    Parameters
    ----------
    data : array-like
        2-D array whose second axis has length ``A.shape[1]``.

    Returns
    -------
    The product with the first axis of ``data`` preserved and the second
    axis replaced by one of length ``A.shape[0]``.
    """
    grid_first = data.T            # put the grid axis first for the product
    product = A.dot(grid_first)    # uses the global A
    return product.T               # restore the original axis order

Wrapping the call in `apply_ufunc` gives the same performance as the direct scipy call above.

In [6]:
# Time apply_A when it is routed through xr.apply_ufunc on a plain NumPy array.
%time xr.apply_ufunc(apply_A, data)

CPU times: user 396 ms, sys: 145 ms, total: 541 ms
Wall time: 543 ms


array([[ 0.47109029,  0.64809065,  0.3069885 , ...,  0.19186461,
         0.8539589 ,  0.77113161],
       [ 0.33146885,  0.84513615,  0.81045171, ...,  0.61121437,
         0.93084694,  0.66936179],
       [ 0.39108876,  0.39357581,  0.58325874, ...,  0.24378918,
         0.39069883,  0.52605166],
       ..., 
       [ 0.23833508,  0.34872905,  0.79741866, ...,  0.48201491,
         0.51187027,  0.59440683],
       [ 0.11234222,  0.520668  ,  0.13647575, ...,  0.62428014,
         0.51955247,  0.58643299],
       [ 0.51348345,  0.38552869,  0.68036826, ...,  0.35411955,
         0.59422613,  0.61482157]])

# apply_ufunc on xarray DataArray

In [7]:
# Wrap the NumPy test field in a labelled DataArray.
# The index coordinates are derived from `data.shape` instead of repeating the
# literal sizes, so this cell stays valid if the test field or the weight
# matrix changes size.
dr = xr.DataArray(
    data,
    dims=['extra_dims', 'grid_dims'],
    coords=[np.arange(size) for size in data.shape],
    name='data',
)
dr

<xarray.DataArray 'data' (extra_dims: 500, grid_dims: 240000)>
array([[ 0.642508,  0.201513,  0.857496, ...,  0.951681,  0.753453,  0.918166],
       [ 0.099309,  0.815894,  0.937141, ...,  0.994728,  0.90566 ,  0.585785],
       [ 0.412488,  0.083408,  0.5964  , ...,  0.316914,  0.856413,  0.474276],
       ..., 
       [ 0.06865 ,  0.763222,  0.266282, ...,  0.39641 ,  0.848988,  0.472966],
       [ 0.025749,  0.186548,  0.546285, ...,  0.576469,  0.175199,  0.712451],
       [ 0.465684,  0.591909,  0.266254, ...,  0.86134 ,  0.090067,  0.847173]])
Coordinates:
  * extra_dims  (extra_dims) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
  * grid_dims   (grid_dims) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...

Applying the function to a `DataArray` has the same performance as before.

In [8]:
# Time the same call on the NumPy-backed DataArray, then display the result.
%time dr_out = xr.apply_ufunc(apply_A, dr)
dr_out

CPU times: user 425 ms, sys: 156 ms, total: 581 ms
Wall time: 583 ms


<xarray.DataArray 'data' (extra_dims: 500, grid_dims: 120000)>
array([[ 0.47109 ,  0.648091,  0.306989, ...,  0.191865,  0.853959,  0.771132],
       [ 0.331469,  0.845136,  0.810452, ...,  0.611214,  0.930847,  0.669362],
       [ 0.391089,  0.393576,  0.583259, ...,  0.243789,  0.390699,  0.526052],
       ..., 
       [ 0.238335,  0.348729,  0.797419, ...,  0.482015,  0.51187 ,  0.594407],
       [ 0.112342,  0.520668,  0.136476, ...,  0.62428 ,  0.519552,  0.586433],
       [ 0.513483,  0.385529,  0.680368, ...,  0.35412 ,  0.594226,  0.614822]])
Coordinates:
  * extra_dims  (extra_dims) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
  * grid_dims   (grid_dims) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...

In [9]:
# Sanity check: the DataArray result must equal the scipy reference element for element.
np.array_equal(out_scipy, dr_out.data)

True

# Use dask array

In [10]:
# Split the test field into blocks of 100 rows. Each block spans the full grid
# axis (taken from `data.shape` rather than a hard-coded width), because
# the sparse product needs every grid point of a row at once.
data_dask = da.from_array(data, chunks=(100, data.shape[1]))
data_dask

dask.array<array, shape=(500, 240000), dtype=float64, chunksize=(100, 240000)>

<font color=red> Horribly slow... </font>

In [11]:
# Time apply_A called directly on the dask array (no xarray involved).
# NOTE(review): scipy presumably has to materialise the dask array as NumPy
# before multiplying, which would explain the slowdown -- confirm.
%time out_dask = apply_A(data_dask)
out_dask.shape

CPU times: user 3.73 s, sys: 857 ms, total: 4.58 s
Wall time: 4.59 s


(500, 120000)

In [12]:
# Sanity check: the dask-input result must equal the scipy reference element for element.
np.array_equal(out_scipy, out_dask)

True

# apply_ufunc on dask array

In [13]:
# Same test field as a dask-backed DataArray; no index coordinates are attached.
dr_dask = xr.DataArray(
    data_dask,
    name='data',
    dims=['extra_dims', 'grid_dims'],
)
dr_dask

<xarray.DataArray 'data' (extra_dims: 500, grid_dims: 240000)>
dask.array<shape=(500, 240000), dtype=float64, chunksize=(100, 240000)>
Dimensions without coordinates: extra_dims, grid_dims

<font color=red> As slow as the previous case... </font>

In [14]:
# dask='allowed' hands the underlying dask array to apply_A unchanged, so this
# times the same code path as calling apply_A on the dask array directly.
%time dr_out_dask = xr.apply_ufunc(apply_A, dr_dask, dask='allowed')

CPU times: user 3.82 s, sys: 819 ms, total: 4.64 s
Wall time: 4.65 s


In [15]:
# Sanity check: the dask='allowed' result must equal the scipy reference element for element.
np.array_equal(out_scipy, dr_out_dask)

True

# apply_ufunc on dask array, parallelized

In [16]:
%%time 
# dask='parallelized' wraps apply_A so that it is applied to each dask block.
# No core dimensions are declared here, so xarray treats apply_A as an
# element-wise function and expects the output to keep the input's dims and sizes.
# apply_A changes the length of the grid axis, which breaks that assumption.
dr_out_1 = xr.apply_ufunc(apply_A, dr_dask,
                          dask='parallelized', output_dtypes=[float])

CPU times: user 147 ms, sys: 38.5 ms, total: 186 ms
Wall time: 184 ms


<font color=red> The shape is wrong! </font> `grid_dims` should be 120000 (output), not 240000 (input).

In [17]:
# Display the still-lazy result: the dims and shape shown are the ones xarray
# declared for the output, not ones obtained by running apply_A.
dr_out_1

<xarray.DataArray 'data' (extra_dims: 500, grid_dims: 240000)>
dask.array<shape=(500, 240000), dtype=float64, chunksize=(100, 240000)>
Dimensions without coordinates: extra_dims, grid_dims

In [18]:
# got a bug when actually computing...
# The failure is the point of this cell, so the expected ValueError is caught
# and printed: an uncaught exception would stop "Restart & Run All" here and
# the cells below would never execute.
try:
    dr_out_1.compute()
except ValueError as err:
    print("ValueError: {}".format(err))

ValueError: replacement data must match the Variable's shape

In [19]:
%%time 
# Second attempt: declare 'grid_dims' as a core dimension of the input.
# No output_core_dims is given, so xarray assumes apply_A consumes that
# dimension and returns an array with only the remaining (loop) dimensions.
# apply_A actually returns a grid axis of a different length, so the declared
# output again does not match what the function produces.
dr_out_2 = xr.apply_ufunc(apply_A, dr_dask, 
                          input_core_dims=[['grid_dims']],
                          dask='parallelized', output_dtypes=[float])

CPU times: user 109 ms, sys: 20 ms, total: 129 ms
Wall time: 128 ms


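Specifying `input_core_dims` doesn't fix the problem... Now the grid dimension is totally missing. Core dimensions listed in `input_core_dims` are treated as consumed by the function; they only appear in the result if they are also named in `output_core_dims`, which is not passed here.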

In [20]:
# Display the still-lazy result to see which dimensions xarray declared for the output.
dr_out_2

<xarray.DataArray 'data' (extra_dims: 500)>
dask.array<shape=(500,), dtype=float64, chunksize=(100,)>
Dimensions without coordinates: extra_dims

In [21]:
# Another bug when actually computing...
# The failure is the point of this cell, so the expected ValueError is caught
# and printed instead of aborting a "Restart & Run All" of the notebook.
try:
    dr_out_2.compute()
except ValueError as err:
    print("ValueError: {}".format(err))

ValueError: could not broadcast input array from shape (100,120000) into shape (100)