In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr

from scipy.sparse import coo_matrix
import dask.array as da
from dask.diagnostics import ProgressBar

**Warning: This is not working yet! I am looking for advice.**

# Preparation

The setup code below is copied from [sparse_dot_benchmark.ipynb](./sparse_dot_benchmark.ipynb).

In [2]:
# Load the sparse weights from disk. The context manager closes the netCDF
# file as soon as the arrays have been read into memory via `.values`.
with xr.open_dataset("weights.nc") as ds:
    # `sizes` is the dimension-name -> length mapping (the mapping form of
    # `Dataset.dims` is deprecated in recent xarray releases).
    n_s = ds.sizes['n_s']
    # Shift the stored indices down by one for scipy's 0-based indexing
    # (assumes the file stores 1-based indices -- TODO confirm).
    col = ds['col'].values - 1
    row = ds['row'].values - 1
    S = ds['S'].values

# NOTE(review): no explicit shape is passed, so scipy infers it from the
# largest row/col index present; trailing all-zero rows or columns would be
# dropped. Pass shape=... if the grid sizes are available in the file.
A = coo_matrix((S, (row, col)))

In [3]:
# Random test field: 500 rows, one column per column of A.
# A private, seeded RandomState makes the data identical on every
# "Restart & Run All" without touching NumPy's global random state.
rng = np.random.RandomState(0)
data = rng.rand(500, A.shape[1])
data.shape

(500, 240000)

In [4]:
# Reference result and baseline performance: apply the sparse matrix A with
# scipy directly, without xarray or dask.
# `data` has A.shape[1] columns, so it is transposed for A.dot(...) and the
# product is transposed back to keep the first axis of `data` first.
%time out_scipy = A.dot(data.T).T
out_scipy.shape

CPU times: user 390 ms, sys: 137 ms, total: 527 ms
Wall time: 531 ms


(500, 120000)

# apply_ufunc on numpy array

In [5]:
def apply_A(data, weights=None):
    """Apply a sparse weight matrix along the last axis of ``data``.

    Parameters
    ----------
    data : array-like with ``shape`` and ``reshape``, shape (..., n_in)
        Dense input whose last axis has length ``weights.shape[1]``.
        Any number of leading axes (including none) is accepted.
    weights : scipy sparse matrix, shape (n_out, n_in), optional
        Matrix to apply. Defaults to the notebook-level global ``A`` so the
        one-argument call used with ``xr.apply_ufunc`` keeps working.

    Returns
    -------
    Array of shape ``data.shape[:-1] + (n_out,)``.
    """
    if weights is None:
        # Fall back to the global A, looked up at call time.
        weights = A

    leading_shape = data.shape[:-1]
    # Collapse all leading axes so the sparse product always sees a 2-D
    # operand; for 2-D input this reshape leaves the shape unchanged.
    flat = data.reshape(-1, data.shape[-1])
    # The matrix acts on columns, hence transpose in and transpose back out.
    product = weights.dot(flat.T).T
    return product.reshape(leading_shape + (weights.shape[0],))

Calling `apply_A` through `xr.apply_ufunc` on a plain NumPy array has the same performance as the direct scipy call above.

In [6]:
# Time apply_A when it is called through xr.apply_ufunc on the plain NumPy
# array; the returned array is not stored, only shown as the cell output.
%time xr.apply_ufunc(apply_A, data)

CPU times: user 392 ms, sys: 137 ms, total: 529 ms
Wall time: 528 ms


array([[ 0.64035934,  0.25633261,  0.22932984, ...,  0.58535814,
         0.57098596,  0.45871738],
       [ 0.65111025,  0.43255861,  0.48275877, ...,  0.22928424,
         0.22654979,  0.72473355],
       [ 0.88125002,  0.66011093,  0.67546398, ...,  0.43306806,
         0.33982799,  0.36408538],
       ..., 
       [ 0.44609135,  0.2875747 ,  0.55209924, ...,  0.38536201,
         0.74627413,  0.49513517],
       [ 0.58174224,  0.40249452,  0.83091449, ...,  0.62205669,
         0.4060233 ,  0.33318684],
       [ 0.3397085 ,  0.70883769,  0.4481516 , ...,  0.7491757 ,
         0.36850702,  0.55148137]])

# apply_ufunc on xarray DataArray

In [7]:
# Wrap the NumPy data in a DataArray with integer index coordinates.
# Coordinate lengths come from data.shape instead of being hard-coded, so the
# cell keeps working if the number of rows or A.shape[1] changes.
dr = xr.DataArray(data,
                  dims=['extra_dims', 'grid_dims'],
                  coords=[np.arange(size) for size in data.shape],
                  name='data')
dr

<xarray.DataArray 'data' (extra_dims: 500, grid_dims: 240000)>
array([[ 0.503106,  0.86587 ,  0.064703, ...,  0.759291,  0.098374,  0.541065],
       [ 0.78399 ,  0.027373,  0.429608, ...,  0.158265,  0.470795,  0.826959],
       [ 0.990341,  0.659289,  0.714679, ...,  0.270863,  0.124033,  0.323208],
       ..., 
       [ 0.473267,  0.615428,  0.113431, ...,  0.818652,  0.887854,  0.380646],
       [ 0.56634 ,  0.670913,  0.315629, ...,  0.301163,  0.442713,  0.190211],
       [ 0.447628,  0.132499,  0.952663, ...,  0.344088,  0.286346,  0.619671]])
Coordinates:
  * extra_dims  (extra_dims) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...
  * grid_dims   (grid_dims) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ...

Applying `apply_A` to a NumPy-backed `DataArray` also has the same performance as the direct scipy call.

In [8]:
%time dr_out = xr.apply_ufunc(apply_A, dr)

CPU times: user 403 ms, sys: 140 ms, total: 542 ms
Wall time: 542 ms


In [9]:
# Sanity check: the DataArray result must equal the scipy reference exactly.
np.array_equal(out_scipy, dr_out.data)

True

# Use dask array

In [10]:
# Chunk only along the first axis (100 rows per chunk) so every chunk holds
# complete rows. The second chunk size is taken from data.shape rather than
# hard-coded, so it always spans the whole axis.
data_dask = da.from_array(data, chunks=(100, data.shape[1]))
data_dask

dask.array<array, shape=(500, 240000), dtype=float64, chunksize=(100, 240000)>

**Horribly slow: roughly ten times slower than the NumPy case (about 5 s versus about 0.5 s).**

In [11]:
# Time apply_A called directly on the dask array, without xarray.
# NOTE(review): presumably scipy converts the dask array to NumPy internally,
# which would explain the extra cost -- confirm by checking type(out_dask).
%time out_dask = apply_A(data_dask)
out_dask.shape

CPU times: user 4.15 s, sys: 1 s, total: 5.15 s
Wall time: 5.25 s


(500, 120000)

In [12]:
# Sanity check: the direct dask call must equal the scipy reference exactly.
np.array_equal(out_scipy, out_dask)

True

# apply_ufunc on dask array, serial

In [13]:
# Same layout as the NumPy-backed DataArray, but backed by the chunked dask
# array and without index coordinates.
dr_dask = xr.DataArray(
    data=data_dask,
    name='data',
    dims=['extra_dims', 'grid_dims'],
)
dr_dask

<xarray.DataArray 'data' (extra_dims: 500, grid_dims: 240000)>
dask.array<shape=(500, 240000), dtype=float64, chunksize=(100, 240000)>
Dimensions without coordinates: extra_dims, grid_dims

**As slow as the previous case...**

In [14]:
%time dr_out_dask = xr.apply_ufunc(apply_A, dr_dask, dask='allowed')

CPU times: user 3.93 s, sys: 915 ms, total: 4.84 s
Wall time: 4.88 s


In [15]:
# Sanity check: the dask='allowed' result must equal the scipy reference
# exactly (np.array_equal converts the DataArray to a NumPy array itself).
np.array_equal(out_scipy, dr_out_dask)

True

# apply_ufunc on dask array, parallelized

In [16]:
%%time 
dr_out_pa = xr.apply_ufunc(apply_A, dr_dask, 
                           input_core_dims=[['grid_dims']],
                           output_core_dims=[['out_grid']],
                           output_sizes={'out_grid': 120000},
                           dask='parallelized', 
                           output_dtypes=[float])

CPU times: user 140 ms, sys: 39.5 ms, total: 179 ms
Wall time: 180 ms


In [17]:
# Display the result of the parallelized call; the repr shows a dask array,
# i.e. the values have not been computed yet.
dr_out_pa # not computed yet

<xarray.DataArray 'data' (extra_dims: 500, out_grid: 120000)>
dask.array<shape=(500, 120000), dtype=float64, chunksize=(100, 120000)>
Dimensions without coordinates: extra_dims, out_grid

**Much faster than the previous case, but still slower than the pure numpy case.**

In [18]:
# Trigger the computation of the lazy result and time it. The computed
# DataArray is shown as the cell output but not stored; dr_out_pa itself is
# not reassigned here.
%time dr_out_pa.compute()

CPU times: user 1.07 s, sys: 953 ms, total: 2.03 s
Wall time: 1.03 s


<xarray.DataArray 'data' (extra_dims: 500, out_grid: 120000)>
array([[ 0.640359,  0.256333,  0.22933 , ...,  0.585358,  0.570986,  0.458717],
       [ 0.65111 ,  0.432559,  0.482759, ...,  0.229284,  0.22655 ,  0.724734],
       [ 0.88125 ,  0.660111,  0.675464, ...,  0.433068,  0.339828,  0.364085],
       ..., 
       [ 0.446091,  0.287575,  0.552099, ...,  0.385362,  0.746274,  0.495135],
       [ 0.581742,  0.402495,  0.830914, ...,  0.622057,  0.406023,  0.333187],
       [ 0.339708,  0.708838,  0.448152, ...,  0.749176,  0.368507,  0.551481]])
Dimensions without coordinates: extra_dims, out_grid

In [19]:
# Sanity check: the parallelized result must equal the scipy reference
# exactly (np.array_equal converts the underlying array to NumPy itself).
np.array_equal(out_scipy, dr_out_pa.data)

True