In [1]:
import numpy as np
import xarray as xr

from scipy.sparse import coo_matrix

import dask
import dask.array as da
from dask.distributed import Client, progress
from dask_kubernetes import KubeCluster

In [2]:
# Read the regridding weights from disk and assemble them into a sparse matrix.
ds = xr.open_dataset("weights.nc")
# `Dataset.sizes` is the supported mapping from dimension name to length;
# dict-style access on `Dataset.dims` is deprecated in recent xarray releases.
n_s = ds.sizes['n_s']
# Shift the stored indices down by one (presumably the file uses 1-based
# indexing -- TODO confirm against the tool that wrote weights.nc).
col = ds['col'].values - 1
row = ds['row'].values - 1
S = ds['S'].values
# NOTE(review): no explicit `shape` is given, so scipy infers it from the
# largest row/col index. If trailing rows or columns carry no weights, the
# matrix comes out smaller than the grids; pass `shape=` if the sizes are known.
A = coo_matrix((S, (row, col)))
A.shape

(120000, 240000)

# Single machine

In [3]:
# Stand-in input: 1000 rows of ones, one column per column of A
# (real cases will read this from cloud object storage).
x = np.ones(shape=(1000, A.shape[1]))
x.nbytes / 1e9  # size in GB

1.92

In [4]:
# Wrap the in-memory array as a dask array: 100 rows per chunk, while -1
# keeps the second axis in a single chunk.
x_dask = da.from_array(x, chunks=(100, -1))
x_dask

dask.array<array, shape=(1000, 240000), dtype=float64, chunksize=(100, 240000)>

In [5]:
def apply_A(data, weights=None):
    """Apply the sparse weight matrix to every row of ``data``.

    Parameters
    ----------
    data : array of shape (n_rows, n_in)
        Each row is multiplied by the weight matrix, so ``n_in`` must equal
        the number of columns of that matrix.
    weights : sparse matrix of shape (n_out, n_in), optional
        Matrix to apply. When omitted, the notebook-level global ``A`` is
        used, which is the original behaviour of this function.

    Returns
    -------
    array of shape (n_rows, n_out)
    """
    if weights is None:
        # fall back to the global A defined earlier in the notebook
        weights = A
    # the matrix acts on columns, so transpose on the way in and back out
    return weights.dot(data.T).T

In [6]:
# Serial baseline: time one direct call of apply_A on the full NumPy array
# (`%time` is an IPython line magic that reports CPU and wall time).
%time out = apply_A(x)
out.shape

CPU times: user 1.71 s, sys: 994 ms, total: 2.71 s
Wall time: 2.72 s


(1000, 120000)

In [7]:
# Build the lazy result: `apply_A` runs on every block of `x_dask`, and each
# output block keeps its 100 rows while getting A.shape[0] columns.
out_dask = da.map_blocks(
    apply_A,
    x_dask,
    dtype=np.float64,
    chunks=[100, A.shape[0]],
)
out_dask

dask.array<apply_A, shape=(1000, 120000), dtype=float64, chunksize=(100, 120000)>

In [8]:
%%time 
# Run the graph with dask's 'threads' scheduler; the result is not kept,
# since only the timing is of interest here.
# On my dual-core Mac this gives a ~1.8x speed-up;
# here it is limited by the master node performance on pangeo.pydata.org.
with dask.config.set(scheduler='threads'):
    out_dask.compute()

CPU times: user 2.65 s, sys: 4.22 s, total: 6.87 s
Wall time: 2.77 s


# Distributed running on pangeo.pydata.org

In [9]:
# Ask for a Kubernetes-backed dask cluster with 20 workers; the bare
# `cluster` expression displays its widget.
cluster = KubeCluster(n_workers=20)
cluster

VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [10]:
# Connect a distributed client to the cluster; displaying `client` shows the
# scheduler address and the current worker/core/memory totals.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://10.22.57.14:39585  Dashboard: /user/jiaweizhuang/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [11]:
# Larger lazy test array: 20000 rows in chunks of 500, second axis unchunked
# (real cases will read this from cloud object storage).
x_dask_dist = da.ones((20000, A.shape[1]), chunks=(500, -1))
x_dask_dist.nbytes / 1e9  # size in GB

38.4

In [12]:
# Initialize this large array in distributed memory.
# Never call compute() on it -- that would blow up the master's memory!
# Note: this rebinds `x_dask_dist` to the persisted version of itself.
x_dask_dist = client.persist(x_dask_dist)
# display a progress bar for the persisted chunks
progress(x_dask_dist)

VBox()

In [13]:
# now `x_dask_dist` should point to futures 
# further build dask graph upon it
# Same block-wise regridding as in the single-machine case, now on the
# distributed input: 500 rows per block, A.shape[0] columns in the output.
out_dask_dist = da.map_blocks(
    apply_A,
    x_dask_dist,
    dtype=np.float64,
    chunks=[500, A.shape[0]],
)
out_dask_dist

dask.array<apply_A, shape=(20000, 120000), dtype=float64, chunksize=(500, 120000)>

In [14]:
# Real cases will write the result back to cloud object storage;
# here the results are just held in memory via persist().
# Note: this rebinds `out_dask_dist` to the persisted version of itself.
out_dask_dist = client.persist(out_dask_dist)
# display a progress bar while the regridding tasks run
progress(out_dask_dist)

VBox()

- Regridded 40 GB of data in 5 seconds (~2 s on each individual worker, plus some desynchronization between workers).
- Effective throughput is 10 GB/s (it would be ~20 GB/s if the data were even bigger and took long enough to process).

Not bad!

Question: When does the function `apply_A` (which contains the weights `A`) get broadcast to the workers? This time does not show up in the "Status" or "Profile" graph.

In [15]:
# clean-up memory and test again
# client.restart()