# h2oai groupby benchmarks

In [1]:
import dask.dataframe as dd

import pandas as pd

In [2]:
# Input locations for the 1e7 groupby dataset, relative to the notebook.
# Alternative single-file CSV path (disabled):
#   "../data/h2o/groupby-datagen_1e7_1e2_0_0/csv/G1_1e7_1e2_0_0.csv"
# Glob that matches every CSV file of the dataset.
group_by_1e7_csv = "../data/mrpowers-h2o/groupby-1e7/*.csv"
# Parquet copy read by the `ddf` cells.
# NOTE(review): the CSV and Parquet paths sit under different directory trees;
# confirm both hold the same data before comparing results.
group_by_1e7_parquet = "../data/h2o/dask/G1_1e7_1e2_0_0"

In [3]:
# CSV-backed frame used by every `x...` benchmark cell.
# Dtypes are given explicitly: string ids as categoricals, integer columns as
# nullable Int32, and v3 as float64.
x = dd.read_csv(
    group_by_1e7_csv,
    dtype={
        **dict.fromkeys(("id1", "id2", "id3"), "category"),
        **dict.fromkeys(("id4", "id5", "id6", "v1", "v2"), "Int32"),
        "v3": "float64",
    },
)

In [4]:
# Preview the first rows of the CSV-backed frame. A bare last expression uses
# the notebook's rich table display instead of print()'s plain-text repr.
x.head()

     id1    id2           id3  id4  id5   id6  v1  v2         v3
0  id001  id001  id0000002442   33   80  7196   3  10  54.276247
1  id001  id001  id0000024189   79   24  8966   1   8  39.844113
2  id001  id001  id0000053796   82   90  6312   3  13   6.186058
3  id001  id001  id0000051207   93   21  5139   4  11  91.188758
4  id001  id001  id0000073710   36    9  7977   2   1  44.550209


In [5]:
# Total number of rows in the CSV-backed frame (sanity check on the load).
len(x)

10000000

In [14]:
# Number of partitions the CSV-backed frame was split into.
x.npartitions

8

In [15]:
# Per-column memory footprint of the CSV-backed frame, materialised with
# .compute(). deep=True presumably asks for the real payload size of the
# categorical/object data rather than an estimate — see pandas memory_usage docs.
x.memory_usage(deep=True).compute()

Index         1024
id1       10082944
id2       10082944
id3      112108332
id4       50000000
id5       50000000
id6       50000000
v1        50000000
v2        50000000
v3        80000000
dtype: int64

## q1: sum v1 by id1

In [6]:
%%time
x.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()

CPU times: user 35.1 s, sys: 1.71 s, total: 36.8 s
Wall time: 30.6 s


Unnamed: 0_level_0,v1
id1,Unnamed: 1_level_1
id001,299643
id002,300028
id003,299721
id004,300195
id005,299787
...,...
id096,299806
id097,299654
id098,299342
id099,299492


In [7]:
# Parquet-backed frame for q1; only the key and value columns q1 needs are read.
ddf = dd.read_parquet(group_by_1e7_parquet, columns=["id1", "v1"])

In [8]:
%%time
ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()

CPU times: user 1.05 s, sys: 152 ms, total: 1.2 s
Wall time: 862 ms


Unnamed: 0_level_0,v1
id1,Unnamed: 1_level_1
id016,298268
id039,299711
id047,298115
id043,299557
id054,299408
...,...
id008,300392
id062,299313
id011,300188
id059,301107


## q2: sum v1 by id1:id2

In [19]:
# Parquet-backed frame for q2; reads the two grouping keys and the summed value.
ddf = dd.read_parquet(group_by_1e7_parquet, columns=["id1", "id2", "v1"])

In [20]:
%%time
ddf.groupby(["id1", "id2"], dropna=False, observed=True).agg({"v1": "sum"}).compute()

CPU times: user 755 ms, sys: 305 ms, total: 1.06 s
Wall time: 275 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,v1
id1,id2,Unnamed: 2_level_1
id016,id016,3072
id016,id045,3160
id016,id023,2931
id016,id057,2984
id016,id040,2770
...,...,...
id096,id028,3017
id096,id082,3184
id096,id065,3159
id096,id013,2903


In [21]:
%%time
x.groupby(["id1", "id2"], dropna=False, observed=True).agg({"v1": "sum"}).compute()

CPU times: user 36.3 s, sys: 2.25 s, total: 38.6 s
Wall time: 34.6 s


Unnamed: 0_level_0,Unnamed: 1_level_0,v1
id1,id2,Unnamed: 2_level_1
id016,id016,3072
id016,id045,3160
id016,id023,2931
id016,id057,2984
id016,id040,2770
...,...,...
id096,id028,3017
id096,id082,3184
id096,id065,3159
id096,id013,2903


## q3: sum v1 mean v3 by id3

In [22]:
# Parquet-backed frame for q3; reads the id3 key plus the two aggregated columns.
ddf = dd.read_parquet(group_by_1e7_parquet, columns=["id3", "v1", "v3"])

In [23]:
%%time
ddf.groupby("id3", dropna=False, observed=True).agg(
    {"v1": "sum", "v3": "mean"}
).compute()

CPU times: user 1.46 s, sys: 137 ms, total: 1.6 s
Wall time: 1.13 s


Unnamed: 0_level_0,v1,v3
id3,Unnamed: 1_level_1,Unnamed: 2_level_1
id0000042202,281,53.198223
id0000029558,327,45.245052
id0000071286,322,49.842035
id0000015141,308,50.790698
id0000011083,307,47.676258
...,...,...
id0000009966,318,53.899987
id0000064729,298,48.259954
id0000084431,238,50.899993
id0000089703,323,45.874188


In [24]:
%%time
x.groupby("id3", dropna=False, observed=True).agg({"v1": "sum", "v3": "mean"}).compute()

CPU times: user 37.1 s, sys: 2.01 s, total: 39.1 s
Wall time: 34.4 s


Unnamed: 0_level_0,v1,v3
id3,Unnamed: 1_level_1,Unnamed: 2_level_1
id0000042202,281,53.198223
id0000029558,327,45.245052
id0000071286,322,49.842035
id0000015141,308,50.790698
id0000011083,307,47.676258
...,...,...
id0000009966,318,53.899987
id0000064729,298,48.259954
id0000084431,238,50.899993
id0000089703,323,45.874188


## q4: mean v1:v3 by id4

In [25]:
# Parquet-backed frame for q4; reads the id4 key plus the three averaged columns.
ddf = dd.read_parquet(group_by_1e7_parquet, columns=["id4", "v1", "v2", "v3"])

In [26]:
%%time
ddf.groupby("id4", dropna=False, observed=True).agg(
    {"v1": "mean", "v2": "mean", "v3": "mean"}
).compute()

CPU times: user 573 ms, sys: 130 ms, total: 703 ms
Wall time: 340 ms


Unnamed: 0_level_0,v1,v2,v3
id4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.001875,7.995427,49.983907
2,2.998208,7.996927,49.841268
3,3.00001,7.998429,49.880546
4,3.005033,8.008521,50.053042
5,2.999491,8.01399,50.005535
...,...,...,...
96,2.996474,7.983614,49.889232
97,2.995558,7.990926,50.038346
98,2.986841,8.012647,50.124801
99,2.999043,7.98188,49.944407


In [27]:
%%time
x.groupby("id4", dropna=False, observed=True).agg(
    {"v1": "mean", "v2": "mean", "v3": "mean"}
).compute()

CPU times: user 36.4 s, sys: 2.35 s, total: 38.8 s
Wall time: 35.4 s


Unnamed: 0_level_0,v1,v2,v3
id4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.001875,7.995427,49.983907
2,2.998208,7.996927,49.841268
3,3.00001,7.998429,49.880546
4,3.005033,8.008521,50.053042
5,2.999491,8.01399,50.005535
...,...,...,...
96,2.996474,7.983614,49.889232
97,2.995558,7.990926,50.038346
98,2.986841,8.012647,50.124801
99,2.999043,7.98188,49.944407


## q5: sum v1:v3 by id6

In [28]:
# Parquet-backed frame for q5; reads the id6 key plus the three summed columns.
ddf = dd.read_parquet(group_by_1e7_parquet, columns=["id6", "v1", "v2", "v3"])

In [29]:
%%time
ddf.groupby("id6", dropna=False, observed=True).agg(
    {"v1": "sum", "v2": "sum", "v3": "sum"}
).compute()

CPU times: user 1.22 s, sys: 233 ms, total: 1.45 s
Wall time: 555 ms


Unnamed: 0_level_0,v1,v2,v3
id6,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,278,759,4500.324131
2,296,803,5139.198430
3,233,622,4081.984864
4,282,877,5539.323994
5,267,705,3886.950520
...,...,...,...
99996,339,977,5925.965188
99997,262,745,4610.177984
99998,333,759,5134.606333
99999,305,749,5034.354274


In [30]:
%%time
x.groupby("id6", dropna=False, observed=True).agg(
    {"v1": "sum", "v2": "sum", "v3": "sum"}
).compute()

CPU times: user 36.3 s, sys: 1.66 s, total: 38 s
Wall time: 32.7 s


Unnamed: 0_level_0,v1,v2,v3
id6,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,278,759,4500.324131
2,296,803,5139.198430
3,233,622,4081.984864
4,282,877,5539.323994
5,267,705,3886.950520
...,...,...,...
99996,339,977,5925.965188
99997,262,745,4610.177984
99998,333,759,5134.606333
99999,305,749,5034.354274


## q6: intentionally skipped because the query is commented out

## q7: max v1 - min v2 by id3

In [31]:
# Parquet-backed frame for q7; reads the id3 key plus v1 (max) and v2 (min).
ddf = dd.read_parquet(group_by_1e7_parquet, columns=["id3", "v1", "v2"])

In [32]:
%%time
ddf.groupby("id3", dropna=False, observed=True).agg({"v1": "max", "v2": "min"}).assign(
    range_v1_v2=lambda x: x["v1"] - x["v2"]
)[["range_v1_v2"]].compute()

CPU times: user 1.42 s, sys: 89 ms, total: 1.51 s
Wall time: 1.12 s


Unnamed: 0_level_0,range_v1_v2
id3,Unnamed: 1_level_1
id0000042202,4
id0000029558,4
id0000071286,4
id0000015141,4
id0000011083,4
...,...
id0000009966,4
id0000064729,4
id0000084431,4
id0000089703,4


In [33]:
%%time
x.groupby("id3", dropna=False, observed=True).agg({"v1": "max", "v2": "min"}).assign(
    range_v1_v2=lambda x: x["v1"] - x["v2"]
)[["range_v1_v2"]].compute()

CPU times: user 36.8 s, sys: 1.64 s, total: 38.4 s
Wall time: 33.2 s


Unnamed: 0_level_0,range_v1_v2
id3,Unnamed: 1_level_1
id0000042202,4
id0000029558,4
id0000071286,4
id0000015141,4
id0000011083,4
...,...
id0000009966,4
id0000064729,4
id0000084431,4
id0000089703,4


## q8: largest two v3 by id6

In [34]:
# Parquet-backed frame for q8. The query only filters and ranks on v3 within
# id6, so v1 and v2 are not read (they were previously loaded but never used).
ddf = dd.read_parquet(
    group_by_1e7_parquet,
    columns=["id6", "v3"],
)

In [35]:
%%time
ddf[~ddf["v3"].isna()][["id6", "v3"]].groupby("id6", dropna=False, observed=True).apply(
    lambda x: x.nlargest(2, columns="v3"), meta={"id6": "Int64", "v3": "float64"}
)[["v3"]].compute()

CPU times: user 2min 36s, sys: 12.6 s, total: 2min 49s
Wall time: 2min 35s


Unnamed: 0_level_0,Unnamed: 1_level_0,v3
id6,Unnamed: 1_level_1,Unnamed: 2_level_1
9,708485,99.930063
9,1233683,99.920439
19,82918,97.625727
19,625200,97.036998
33,1183098,99.414996
...,...,...
99994,168013,96.185434
99998,136085,99.405711
99998,197174,99.278838
99999,667180,99.555205


In [36]:
%%time
x[~x["v3"].isna()][["id6", "v3"]].groupby("id6", dropna=False, observed=True).apply(
    lambda x: x.nlargest(2, columns="v3"), meta={"id6": "Int64", "v3": "float64"}
)[["v3"]].compute()

CPU times: user 3min 12s, sys: 14.9 s, total: 3min 26s
Wall time: 3min 9s


Unnamed: 0_level_0,Unnamed: 1_level_0,v3
id6,Unnamed: 1_level_1,Unnamed: 2_level_1
9,708485,99.930063
9,1233683,99.920439
19,82918,97.625727
19,625200,97.036998
33,1183098,99.414996
...,...,...
99994,168013,96.185434
99998,136085,99.405711
99998,197174,99.278838
99999,667180,99.555205


## q9: regression v1 v2 by id2 id4

In [37]:
# Parquet-backed frame for q9; reads the two grouping keys and the two
# columns being correlated.
ddf = dd.read_parquet(group_by_1e7_parquet, columns=["id2", "id4", "v1", "v2"])

In [38]:
%%time
ddf[["id2", "id4", "v1", "v2"]].groupby(
    ["id2", "id4"], dropna=False, observed=True
).apply(
    lambda x: pd.Series({"r2": x.corr()["v1"]["v2"] ** 2}), meta={"r2": "float64"}
).compute()

CPU times: user 8.08 s, sys: 1.6 s, total: 9.68 s
Wall time: 5.63 s


Unnamed: 0_level_0,Unnamed: 1_level_0,r2
id2,id4,Unnamed: 2_level_1
id041,3,0.000002
id041,8,0.002711
id041,24,0.000868
id041,25,0.004970
id041,37,0.000224
...,...,...
id001,35,0.000494
id001,38,0.001098
id001,59,0.000683
id001,90,0.000092


In [39]:
%%time
x[["id2", "id4", "v1", "v2"]].groupby(
    ["id2", "id4"], dropna=False, observed=True
).apply(
    lambda x: pd.Series({"r2": x.corr()["v1"]["v2"] ** 2}), meta={"r2": "float64"}
).compute()

CPU times: user 43.8 s, sys: 3.8 s, total: 47.6 s
Wall time: 40 s


Unnamed: 0_level_0,Unnamed: 1_level_0,r2
id2,id4,Unnamed: 2_level_1
id047,3,3.313240e-05
id047,8,7.364508e-04
id047,24,2.032771e-03
id047,25,2.191219e-04
id047,37,2.745002e-04
...,...,...
id018,35,8.950952e-07
id018,38,2.433695e-05
id018,59,9.727365e-05
id018,90,1.466000e-04


## q10: sum v3 count by id1:id6

In [40]:
# Parquet-backed frame for q10; reads all six id columns plus v1 and v3.
ddf = dd.read_parquet(
    group_by_1e7_parquet,
    columns=[
        "id1", "id2", "id3", "id4", "id5", "id6",  # grouping keys
        "v1", "v3",  # columns fed to the aggregation
    ],
)

In [41]:
%%time
ddf.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], dropna=False, observed=True).agg(
    {"v3": "sum", "v1": "size"}
).compute()

CPU times: user 15.1 s, sys: 3.02 s, total: 18.1 s
Wall time: 14.2 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,v3,v1
id1,id2,id3,id4,id5,id6,Unnamed: 6_level_1,Unnamed: 7_level_1
id016,id016,id0000042202,15,24,5971,37.211254,1
id016,id016,id0000096717,90,36,24881,68.265721,1
id016,id016,id0000050660,56,2,31346,69.197638,1
id016,id016,id0000006913,30,76,22749,60.054044,1
id016,id016,id0000032257,19,19,41901,71.002708,1
...,...,...,...,...,...,...,...
id054,id002,id0000063962,43,67,18942,51.507418,1
id054,id002,id0000098794,99,100,27574,96.151364,1
id054,id002,id0000063695,6,90,7197,2.603732,1
id054,id002,id0000078453,2,42,53619,74.687472,1


In [42]:
%%time
x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], dropna=False, observed=True).agg(
    {"v3": "sum", "v1": "size"}
).compute()

CPU times: user 51.2 s, sys: 5.15 s, total: 56.3 s
Wall time: 49.1 s


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,v3,v1
id1,id2,id3,id4,id5,id6,Unnamed: 6_level_1,Unnamed: 7_level_1
id016,id016,id0000042202,15,24,5971,37.211254,1
id016,id016,id0000096717,90,36,24881,68.265721,1
id016,id016,id0000050660,56,2,31346,69.197638,1
id016,id016,id0000006913,30,76,22749,60.054044,1
id016,id016,id0000032257,19,19,41901,71.002708,1
...,...,...,...,...,...,...,...
id054,id002,id0000063962,43,67,18942,51.507418,1
id054,id002,id0000098794,99,100,27574,96.151364,1
id054,id002,id0000063695,6,90,7197,2.603732,1
id054,id002,id0000078453,2,42,53619,74.687472,1
