# Test Dask with Parquet Files

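This notebook times the same Dask computation (the mean of `mag_g - mag_r` read from Parquet object catalogs) in two configurations:

1. Serial
2. `dask.distributed`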

In [1]:
import os

import numpy as np

from bokeh.io import output_notebook

import dask as da
import pandas as pd
import dask.dataframe as dd

In [2]:
# Tract used for the quick single-tract runs.
tract = 4850

# Directory holding the Run1.1 summary Parquet catalogs.
data_dir = '/global/projecta/projectdirs/lsst/global/in2p3/Run1.1/summary'

# Catalog covering every tract.
datafile_all = os.path.join(data_dir, 'dpdd_object.parquet')

# Catalog restricted to `tract`; the file name embeds the tract number.
tract_filename = 'dpdd_object_tract_%d.parquet' % tract
datafile_tract = os.path.join(data_dir, tract_filename)

In [3]:
# Start a dask.distributed client for this notebook session.
# processes=False asks for thread-based workers inside the kernel process
# instead of separate worker processes.
# NOTE(review): a Client normally registers itself as the default scheduler, so
# later .compute() calls probably run on it even without an explicit
# scheduler setting -- confirm before treating any timing below as "serial".
from dask.distributed import Client
client = Client(processes=False)

In [13]:
# Only these columns are loaded. Parquet is a column-based storage format, so
# naming the columns we need gives a significant performance advantage.
columns_to_read = ['mag_g', 'mag_r']

# Choose the input behind `da_df`:
#   datafile_tract -> a single tract, renders in seconds
#   datafile_all   -> the entire set of 20 tracts, renders in minutes
datafile = datafile_tract
# datafile = datafile_all

# Dask dataframes for the full catalog and for the file selected above.
da_df_all = dd.read_parquet(datafile_all, columns=columns_to_read)
da_df = dd.read_parquet(datafile, columns=columns_to_read)

In [14]:
def mean_g_minus_r(frame):
    """Return the mean of ``mag_g - mag_r`` over the rows of ``frame``.

    Parameters
    ----------
    frame : dataframe-like
        Must expose ``mag_g`` and ``mag_r`` columns that support subtraction
        and a ``.mean()`` reduction (here: the Dask dataframes read above).

    Returns
    -------
    The result of ``.mean()`` on the column difference. For a Dask dataframe
    nothing is evaluated here; the value is produced by a later ``.compute()``.
    """
    # Call the collection's own reduction instead of np.mean(): np.mean only
    # reaches a Dask Series through NumPy's duck-typed fallback, which forwards
    # axis=None/dtype=None/out=None to Series.mean.
    return (frame['mag_g'] - frame['mag_r']).mean()


# Names are kept as-is because the timing and profiling cells below use them.
df2 = mean_g_minus_r(da_df)
df2_all = mean_g_minus_r(da_df_all)

In [15]:
%%timeit
# Time evaluating the mean built from `da_df` (the file chosen via `datafile`).
df2.compute()

1.02 s ± 19.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit
# Time evaluating the mean built from `da_df_all` (the full `datafile_all` catalog).
df2_all.compute()

13.8 s ± 2.97 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Load Bokeh into the notebook so that Bokeh figures are displayed inline in
# cell outputs. This must run before any cell that renders a Bokeh plot.
output_notebook()

In [18]:
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize

In [19]:
# Profile one evaluation of the full-catalog mean.
#
# The dask.diagnostics profilers are callbacks for Dask's local (single-machine)
# schedulers. The distributed scheduler does not invoke them, so running this
# block with scheduler='dask.distributed' leaves the task and cache profiles
# empty. The profiled pass therefore runs on the local threaded scheduler; use
# the distributed dashboard to inspect runs on the distributed scheduler.
with da.config.set(scheduler='threads'), \
        Profiler() as prof, \
        ResourceProfiler(dt=0.25) as rprof, \
        CacheProfiler() as cprof:
    # dt=0.25 above: resource usage is sampled every quarter of a second.
    result = df2_all.compute()

In [20]:
# Plot the task, resource and cache profiles recorded by the profiled run together.
visualize([prof, rprof, cprof])

In [21]:
%%timeit
# Time the full-catalog mean with the distributed scheduler selected explicitly.
# NOTE(review): a Client was already created earlier in the notebook, so the
# un-configured %%timeit cells above may have used this same scheduler --
# confirm before comparing the two timings.
with da.config.set(scheduler='dask.distributed'):
    result = df2_all.compute()

The slowest run took 6.01 times longer than the fastest. This could mean that an intermediate result is being cached.
10.3 s ± 3.39 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
