In [None]:
import dask.dataframe as dd

# Dask Schedulers

## TL;DR: Use `distributed`

The Dask developers recommend the `distributed` scheduler even for use on a single machine.


* Diagnostic UI
* Takes account of data locality
* Futures only work with `distributed`

## Firing up distributed

In [None]:
# With no arguments, Client() starts a local scheduler and workers on this
# machine and connects to them.
from dask.distributed import Client
client = Client()

In [None]:
# Display the client's summary as the cell output.
client

In [None]:
# Close this client; a new one is created against an explicitly configured
# cluster further down.
client.close()

### Adjust the number of workers

In [None]:
from dask.distributed import LocalCluster

In [None]:
# Explicitly sized local cluster: two workers, one thread each, with a 4GB
# memory limit.
cluster = LocalCluster(
    n_workers=2,
    threads_per_worker=1,
    memory_limit='4GB',
)

In [None]:
# Connect a new client to the cluster configured above, then display it.
client = Client(cluster)
client

## Managing distributed

TODO: Show a plot of the cluster, scheduler, and driver.

## Creating `dask` `DataFrames`

* read_table/read_csv
* read_parquet
* read_json
* from_pandas
* from_delayed

If you're using Dask, it's probably because your data is not in memory.

## Lazily read the taxi data

In [None]:
# Lazy read: this builds a Dask DataFrame over the parquet dataset without
# loading the data into memory.
trips = dd.read_parquet("data/taxi-small/trips.parq/")

Due to ongoing `pyarrow` improvements, parquet functionality is changing rapidly.

In [None]:
# Number of partitions the Dask DataFrame is split into.
trips.npartitions

In [None]:
# compute=False keeps head() lazy: the result is not evaluated yet.
trips.head(compute=False)

We now have 32 tasks (31 partitions + 1 `head` task) that have not yet been evaluated.

## Selecting data

### Selection on the index is fast

In [None]:
# Label-based selection on the index for 2013-12-25 (still lazy).
# NOTE(review): assumes the index is a datetime index -- confirm.
xmas = trips.loc["2013-12-25", :]

In [None]:
# Time how long it takes to evaluate the selection and bring the result back
# to the notebook (%time is an IPython magic).
%time xmas_df = xmas.compute()

In [None]:
# Mean trip distance for the Christmas Day selection; .compute() triggers the
# actual calculation.
xmas['trip_distance'].mean().compute()

In [None]:
# Label slice on the index covering 2013-12-01 through 2013-12-15, followed by
# the mean trip distance over that selection.
first_half = trips.loc[slice("2013-12-01", "2013-12-15"), :]
first_half.trip_distance.mean().compute()

TODO: Show selecting dates 12/1 - 12/15 and calculating a summary statistic (the cell above is a first pass that uses the mean `trip_distance`).

### Filter conditions parallelized across partitions

In [None]:
# Count the trips with more than four passengers: filter with a boolean mask,
# take the (lazy) row count, then evaluate it.
(
    trips[trips["passenger_count"] > 4]
    .shape[0]
    .compute()
)

In [None]:
# TODO: Show how the result, once brought back to the driver, can be plotted

### `iloc` doesn't work

Dask does not keep track of partition lengths, so positional row indexing is not available [(link)](https://docs.dask.org/en/latest/dataframe-indexing.html#positional-indexing).

In [None]:
# Demonstrate the limitation: positional row selection is attempted and the
# resulting error message is printed instead of aborting the notebook.
try:
    trips.iloc[:5, :]
except NotImplementedError as err:
    print(err)

## Groupby/Apply

### Fast aggregation

In [None]:
# Built-in aggregation: maximum of `y` within each group of `x` (lazy).
df.groupby('x')['y'].max()

In [None]:
# groupby(...).apply() needs the function to run on each group; calling it
# with no arguments raises TypeError. The function and `meta` below are
# illustrative -- TODO confirm the intended per-group computation and the
# dtype of `y`.
df.groupby(['idx', 'x']).apply(
    lambda group: group['y'].max(),
    meta=('y', 'f8'),
)

## Using the `index`

In [None]:
# fast: both frames are joined on their index
dd.merge(df1, df2, left_index=True, right_index=True)

In [None]:
# fast: the join keys include `idx`, which is the index of both frames
dd.merge(df1, df2, on=['idx', 'x']) # idx is index for both

## Querying the data

## Merge/Join

In [None]:
# fast: both frames are joined on their index
dd.merge(df1, df2, left_index=True, right_index=True)

In [None]:
# fast: the join keys include `idx`, which is the index of both frames
dd.merge(df1, df2, on=['idx', 'x']) # idx is index for both

In [None]:
# join against another DataFrame on the shared `id` column
# NOTE(review): presumably `id` is not the index of either frame -- confirm.
dd.merge(df1, df2, on='id')

## General computations

In [None]:
# Element-wise addition of a column from each frame.
df1.x + df2.y # fast

In [None]:
# rolling() requires a window; calling it with no arguments raises TypeError.
# The window size and the aggregation are illustrative -- TODO confirm.
df.rolling(window=3).mean()

In [None]:
# Imported here so the cell runs on its own; no numpy import is visible
# elsewhere in the notebook.
import numpy as np

# where() keeps values on rows where the condition holds and replaces the
# rest with the second argument.
df.where(df.x > 5, np.nan)  # keep rows with x > 5; others become NaN
# mask() is the inverse: values on rows where the condition holds are replaced.
# The original `df.5` was a syntax error; `df.x` matches the line above.
df.mask(df.x < 5)  # rows with x < 5 become NaN

## Optimizing

Use categorical dtypes where possible.

## Don't do the shuffle

In [None]:
# Example of an operation to avoid: setting a new index triggers the shuffle
# this section warns about.
df.set_index(df.x)

In [None]:
# Example of an operation to avoid: merging on a column that is not the index
# also triggers the shuffle this section warns about.
dd.merge(df1, df2, on='not_index')

## Reshape/Pivot

* get_dummies
* pivot_table
* melt

## Dask Specific

`df.map_partitions(func)`: apply `func` to each partition (each partition is a pandas DataFrame).

In [None]:
# repartition() is a DataFrame method, and exactly one of `divisions`,
# `npartitions`, `freq` or `partition_size` should be given per call.
# The value below is illustrative.
df.repartition(npartitions=4)

In [None]:
# Pseudo-randomly split the rows into two lazy DataFrames holding roughly
# 80% and 20% of the data.
df.random_split([0.8, 0.2])

In [None]:
# map_overlap shares `before`/`after` rows between neighbouring partitions so
# that a windowed function sees the rows it needs across partition boundaries.
# `df.rolling` is a method, so `df.rolling.map_overlap` raises AttributeError;
# the DataFrame method is used instead. Illustrative example: a trailing
# 3-row rolling sum needs the 2 rows that precede each partition.
df.map_overlap(lambda part: part.rolling(3).sum(), before=2, after=0)

# https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.rolling.map_overlap

### Series

In [None]:
# A bare `map_overlap` is an undefined name; call it on a Series instead.
# Illustrative example: trailing 3-row rolling sum of column `x`, which needs
# the 2 rows that precede each partition.
df.x.map_overlap(lambda part: part.rolling(3).sum(), before=2, after=0)

In [None]:
# A bare `nunique_approx` is an undefined name; call it on a Series to get an
# approximate count of its distinct values (lazy until computed).
df.x.nunique_approx()

`split_every`: Group partitions into groups of this size while performing a tree-reduction. If set to False, no tree-reduction will be used. Default is 8.

## Save Dataframes

* to_csv
* to_parquet
* to_hdf
* to_json