In [1]:
import numpy as np
import xarray as xr

from scipy.sparse import coo_matrix

import dask
import dask.array as da
from dask.distributed import Client, progress
from dask_kubernetes import KubeCluster

In [2]:
# Read the regridding weights from disk and assemble them into a sparse matrix.
# The file provides the non-zero entries as three variables: `row` and `col`
# (indices) and `S` (the values stored at those indices).
# The context manager closes the file as soon as the arrays are in memory.
with xr.open_dataset("weights.nc") as ds:
    # `sizes` is the dimension-name -> length mapping (`Dataset.dims[...]` is deprecated)
    n_s = ds.sizes['n_s']
    # shift indices down by one for scipy (presumably stored 1-based in the file)
    col = ds['col'].values - 1
    row = ds['row'].values - 1
    S = ds['S'].values

# NOTE(review): without shape=, coo_matrix infers the shape from the largest
# row/col index, so trailing rows/columns without any weight would be dropped —
# confirm the inferred shape matches the intended grid sizes.
A = coo_matrix((S, (row, col)))
A.shape

(120000, 240000)

# Single machine

In [3]:
# Stand-in input data: 1000 rows of uniform random samples, one column per
# column of A (real cases will read this from cloud object storage instead)
x = np.random.random_sample((1000, A.shape[1]))
x.nbytes / 1e9  # in-memory size in GB

1.92

In [4]:
# Dask version of the input, generated lazily on the fly: blocks of 100 rows,
# with the second axis kept in a single chunk (-1).
# Alternative that wraps the in-memory array instead:
#   x_dask = da.from_array(x, chunks=[100, -1])
x_dask = da.random.random((1000, A.shape[1]), chunks=(100, -1))
x_dask

dask.array<random_sample, shape=(1000, 240000), dtype=float64, chunksize=(100, 240000)>

In [5]:
def apply_A(data, weights=None):
    """Apply a sparse weight matrix to every row of a 2-D array.

    Parameters
    ----------
    data : numpy.ndarray
        Array of shape ``(n_rows, n_in)``; each row is multiplied by the
        weight matrix.
    weights : scipy.sparse matrix, optional
        Matrix of shape ``(n_out, n_in)``. When omitted, the notebook-global
        ``A`` is used, which keeps existing ``apply_A(data)`` calls (including
        ``da.map_blocks(apply_A, ...)``) working unchanged.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, n_out)``.
    """
    if weights is None:
        # backward-compatible default: fall back to the global weight matrix
        weights = A
    # the sparse product contracts over the matrix columns, so put that axis
    # first with a transpose and restore the row-major layout afterwards
    return weights.dot(data.T).T

In [6]:
# Serial baseline: time one in-memory application of the weights to x.
# x was already created in an earlier cell, so generating the input is not
# part of this timing.
%time out = apply_A(x)
out.shape

CPU times: user 960 ms, sys: 897 ms, total: 1.86 s
Wall time: 1.86 s


(1000, 120000)

In [7]:
# Apply the weights block by block. apply_A changes the second dimension from
# A.shape[1] to A.shape[0], so the output chunks must be stated explicitly.
# The row chunking is taken from x_dask itself rather than repeated as a
# literal, so it cannot drift out of sync with the input (and an uneven last
# block is described correctly). Each input block must hold complete rows,
# i.e. the second axis of x_dask has to stay in a single chunk.
out_dask = da.map_blocks(
    apply_A,
    x_dask,
    dtype=np.float64,
    chunks=(x_dask.chunks[0], (A.shape[0],)),
)
out_dask

dask.array<apply_A, shape=(1000, 120000), dtype=float64, chunksize=(100, 120000)>

In [11]:
%%time 
# not faster...
with dask.config.set(scheduler='threads'):
    out_dask.compute()

CPU times: user 5.9 s, sys: 4.7 s, total: 10.6 s
Wall time: 3.71 s


# Distributed

In [12]:
# Start a Dask cluster through dask_kubernetes, requesting 30 workers.
# The bare `cluster` expression renders the cluster widget in the notebook.
cluster = KubeCluster(n_workers=30)
cluster

VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [14]:
# Connect this notebook session to the cluster's scheduler and show a summary
# of the connection (scheduler address, dashboard link, workers/cores/memory).
# NOTE(review): the recorded summary lists 21 workers although 30 were
# requested, so the timings below may have been taken before the cluster was
# fully scaled up — confirm before comparing numbers.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://10.21.10.12:33848  Dashboard: /user/jiaweizhuang/proxy/8787/status,Cluster  Workers: 21  Cores: 42  Memory: 126.00 GB


In [15]:
%%time 
# Same computation as the threaded run, now with a distributed client connected.
# compute() returns the complete (1000, 120000) float64 result (~0.96 GB) to
# this notebook process, so the wall time also covers collecting the result.
out_dask.compute()

CPU times: user 1.9 s, sys: 4.08 s, total: 5.98 s
Wall time: 6.4 s


array([[0.31934864, 0.36393641, 0.26946019, ..., 0.72221182, 0.36967245,
        0.59428341],
       [0.48296722, 0.61470972, 0.32055858, ..., 0.26754128, 0.54020582,
        0.29124052],
       [0.91714009, 0.30431861, 0.77103484, ..., 0.33529832, 0.72908959,
        0.74036565],
       ...,
       [0.50992037, 0.50666989, 0.82455594, ..., 0.46589756, 0.20549722,
        0.30079077],
       [0.44720058, 0.14755496, 0.41156093, ..., 0.23921956, 0.71229014,
        0.54658992],
       [0.69205954, 0.59066495, 0.47862871, ..., 0.59993105, 0.873219  ,
        0.78321223]])

# Use larger data and larger chunks

In [18]:
# Larger lazily generated input: 20000 rows in blocks of 1000, second axis in
# one chunk. Materialising it all at once would blow up single-machine memory.
x_big_dask = da.random.random((20000, A.shape[1]), chunks=(1000, -1))
x_big_dask.nbytes / 1e9  # total size in GB

38.4

In [19]:
# Block-wise application of the weights to the large input. apply_A changes
# the second dimension from A.shape[1] to A.shape[0], so the output chunks are
# given explicitly; the row chunking is read from x_big_dask instead of being
# repeated as a literal, so it always matches the input's block structure.
# Each input block must hold complete rows (second axis in a single chunk).
out_big_dask = da.map_blocks(
    apply_A,
    x_big_dask,
    dtype=np.float64,
    chunks=(x_big_dask.chunks[0], (A.shape[0],)),
)
out_big_dask

dask.array<apply_A, shape=(20000, 120000), dtype=float64, chunksize=(1000, 120000)>

In [21]:
%%time
# Reduce to a scalar mean instead of collecting the result: the full
# (20000, 120000) float64 output would be ~19.2 GB, and only one number has to
# travel back to the notebook. The reduction also makes the task graph easier
# to follow.
out_big_dask.mean().compute()

CPU times: user 1.2 s, sys: 295 ms, total: 1.5 s
Wall time: 8.9 s


0.5000104995929522

In [22]:
# Ask the cluster to start computing out_big_dask and keep the result on the
# cluster; the name is rebound to the persisted collection returned by
# client.persist(). The full output is ~19.2 GB (20000 x 120000 float64).
# progress() then displays a progress widget for that computation.
out_big_dask = client.persist(out_big_dask)
progress(out_big_dask)

VBox()