## Dask - Parallel Computing In Python

- Dynamic Task Scheduling
- Big Data Processing

In [None]:
# Optional one-time setup: uncomment the line below to install Dask with all optional extras.
# (Inside a notebook, `%pip install "dask[complete]"` is the variant that targets the running kernel's environment.)
#!python -m pip install "dask[complete]"    # Install everything

In [14]:
import numpy as np
import pandas as pd

import dask.dataframe as dd

In [15]:
# Build a small synthetic hourly time series and wrap it in a Dask DataFrame.
# The row count lives in one place so the index, column "a" and column "b"
# can never drift out of sync (previously 2400 was repeated and had to equal 8 * 300).
N_ROWS = 2400                # hourly samples starting 2021-09-01
LABEL_PATTERN = "abcaddbe"   # repeating category labels for column "b"

index = pd.date_range("2021-09-01", periods=N_ROWS, freq="1h")
df = pd.DataFrame(
    {
        "a": np.arange(N_ROWS),
        # Cycle through the pattern so the column length always matches N_ROWS.
        "b": [LABEL_PATTERN[i % len(LABEL_PATTERN)] for i in range(N_ROWS)],
    },
    index=index,
)
# Split the pandas frame into 10 partitions along the index.
ddf = dd.from_pandas(df, npartitions=10)

In [16]:
# Displaying the Dask DataFrame shows only its layout: column dtypes and the
# partition boundary labels, with the data values elided as `...`.
ddf

Unnamed: 0_level_0,a,b
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-09-01 00:00:00,int64,object
2021-09-11 00:00:00,...,...
...,...,...
2021-11-30 00:00:00,...,...
2021-12-09 23:00:00,...,...


In [17]:
# `divisions` lists the index values that bound the partitions:
# 11 boundary timestamps for the 10 partitions requested in from_pandas.
ddf.divisions

(Timestamp('2021-09-01 00:00:00', freq='H'),
 Timestamp('2021-09-11 00:00:00', freq='H'),
 Timestamp('2021-09-21 00:00:00', freq='H'),
 Timestamp('2021-10-01 00:00:00', freq='H'),
 Timestamp('2021-10-11 00:00:00', freq='H'),
 Timestamp('2021-10-21 00:00:00', freq='H'),
 Timestamp('2021-10-31 00:00:00', freq='H'),
 Timestamp('2021-11-10 00:00:00', freq='H'),
 Timestamp('2021-11-20 00:00:00', freq='H'),
 Timestamp('2021-11-30 00:00:00', freq='H'),
 Timestamp('2021-12-09 23:00:00', freq='H'))

In [18]:
# Pick out a single partition by position (index 3); the result is itself a
# Dask DataFrame with one partition, not the underlying data.
ddf.partitions[3]

Unnamed: 0_level_0,a,b
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-10-01,int64,object
2021-10-11,...,...


In [19]:
# Label-based row selection on the datetime index; the result is still a lazy Dask DataFrame.
ddf.loc["2021-10-01":"2021-10-09 5:00"]

Unnamed: 0_level_0,a,b
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-10-01 00:00:00.000000000,int64,object
2021-10-09 05:00:59.999999999,...,...


In [20]:
# Materialise the label slice: compute() runs the graph and returns a pandas DataFrame.
ddf.loc["2021-10-01":"2021-10-09 5:00"].compute()

Unnamed: 0,a,b
2021-10-01 00:00:00,720,a
2021-10-01 01:00:00,721,b
2021-10-01 02:00:00,722,c
2021-10-01 03:00:00,723,a
2021-10-01 04:00:00,724,d
...,...,...
2021-10-09 01:00:00,913,b
2021-10-09 02:00:00,914,c
2021-10-09 03:00:00,915,a
2021-10-09 04:00:00,916,d


In [21]:
# Reductions are lazy too: this yields a Dask scalar placeholder, not a number.
ddf["a"].mean()

dd.Scalar<series-..., dtype=float64>

In [22]:
%time
ddf.a.mean().compute()

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 3.58 µs


1199.5

In [23]:
%time
df['a'].mean()

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.25 µs


1199.5

In [24]:
# Largest value in column "a", evaluated eagerly via compute().
ddf["a"].max().compute()

2399

In [25]:
# Per-column memory footprint in bytes (index included), gathered with compute().
# NOTE(review): column "b" reports 19200 bytes for 2400 rows (8 per row), which looks like
# pointer size only, so this is presumably a shallow measurement; confirm whether deep=True is wanted.
ddf.memory_usage().compute()

Index    27496
a        19200
b        19200
dtype: int64

In [26]:
# Chain several operations without computing anything yet:
# slice rows by label -> take column "a" -> running total -> subtract 100.
result = (
    ddf.loc["2021-10-01":"2021-10-09 5:00"]["a"].cumsum()
    - 100
)

In [27]:
# Inspect the task graph behind `result`; the notebook renders it as one entry per
# layer (layer type, whether it is materialized, and frame metadata where available).
result.dask

0,1
"layer_type  MaterializedLayer  is_materialized  True  npartitions  10  columns  ['a', 'b']  type  dask.dataframe.core.DataFrame  dataframe_type  pandas.core.frame.DataFrame  series_dtypes  {'a': dtype('int64'), 'b': dtype('O')}",

0,1
layer_type,MaterializedLayer
is_materialized,True
npartitions,10
columns,"['a', 'b']"
type,dask.dataframe.core.DataFrame
dataframe_type,pandas.core.frame.DataFrame
series_dtypes,"{'a': dtype('int64'), 'b': dtype('O')}"

0,1
"layer_type  MaterializedLayer  is_materialized  True  npartitions  1  columns  ['a', 'b']  type  dask.dataframe.core.DataFrame  dataframe_type  pandas.core.frame.DataFrame  series_dtypes  {'a': dtype('int64'), 'b': dtype('O')}",

0,1
layer_type,MaterializedLayer
is_materialized,True
npartitions,1
columns,"['a', 'b']"
type,dask.dataframe.core.DataFrame
dataframe_type,pandas.core.frame.DataFrame
series_dtypes,"{'a': dtype('int64'), 'b': dtype('O')}"

0,1
layer_type  Blockwise  is_materialized  False,

0,1
layer_type,Blockwise
is_materialized,False

0,1
layer_type  Blockwise  is_materialized  False,

0,1
layer_type,Blockwise
is_materialized,False

0,1
layer_type  Blockwise  is_materialized  False,

0,1
layer_type,Blockwise
is_materialized,False

0,1
layer_type  MaterializedLayer  is_materialized  True,

0,1
layer_type,MaterializedLayer
is_materialized,True

0,1
layer_type  Blockwise  is_materialized  False,

0,1
layer_type,Blockwise
is_materialized,False
