In [1]:
import numpy as np
import dask.array as da

In [2]:
# Seed the generator so the arrays, and every result derived from them,
# are identical after Restart Kernel -> Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Two square arrays of integers drawn uniformly from 0..9.
size_tuple = (18000, 18000)
np_arr = rng.integers(10, size=size_tuple)
np_arr2 = rng.integers(10, size=size_tuple)

## Let's run an arbitrary multi-step computation on these arrays

In [3]:
%time (((np_arr * 2).T)**2 + np_arr2 + 100).sum(axis=1).mean()

CPU times: user 5.2 s, sys: 2.57 s, total: 7.77 s
Wall time: 7.87 s


3933049.0675555556

## Let's perform the same operation with Dask Arrays

In [4]:
# Wrap the existing NumPy arrays as Dask arrays split into 500 x 500 blocks.
chunks_tuple = (500, 500)
da_arr, da_arr2 = (
    da.from_array(source, chunks=chunks_tuple) for source in (np_arr, np_arr2)
)

In [5]:
%time (((da_arr * 2).T)**2 + da_arr2 + 100).sum(axis=1).mean().compute()

CPU times: user 7.53 s, sys: 3.91 s, total: 11.4 s
Wall time: 7.8 s


3933049.0675555556

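## NumPy cannot even hold arrays this large in memory on a typical workstation (50,000 × 50,000 integers is roughly 20 GB per array at 8 bytes per element)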

In [None]:
# Deliberately request arrays that are too big for RAM. The failure is the
# point of this cell, so catch it and show the message instead of letting the
# exception abort a Restart & Run All.
size_tuple = (50000, 50000)
try:
    np_arr = np.random.randint(10, size=size_tuple)
    np_arr2 = np.random.randint(10, size=size_tuple)
except MemoryError as err:
    # NOTE(review): some systems kill the kernel instead of raising
    # MemoryError; this handler cannot help in that case.
    print(f"NumPy could not allocate the arrays: {err}")

## Dask can build and compute on this data because it generates and processes the arrays chunk by chunk

In [3]:
import dask.array as da

# Same shape as the NumPy attempt, declared as Dask arrays made of
# 5000 x 5000 blocks of random integers in 0..9.
size_tuple = (50000, 50000)
chunks_tuple = (5000, 5000)
da_arr, da_arr2 = (
    da.random.randint(10, size=size_tuple, chunks=chunks_tuple)
    for _ in range(2)
)

In [4]:
%time (((da_arr * 2).T)**2 + da_arr2 + 100).sum(axis=1).mean().compute()

CPU times: user 2min 39s, sys: 1.03 s, total: 2min 40s
Wall time: 1min 26s


10925091.58726

In [5]:
# Size of the first Dask array in gigabytes (10**9 bytes).
da_arr.nbytes / 1e9

20.0

In [6]:
# Size of the second Dask array in gigabytes (10**9 bytes).
da_arr2.nbytes / 1e9

20.0