In [42]:
import polars as pl

In [21]:
# Locations of the benchmark input on disk: one CSV file and one parquet file.
group_by_1e7_csv, group_by_1e7_parquet = (
    "../data/h2o/G1_1e7_1e2_0_0/csv/G1_1e7_1e2_0_0.csv",
    "../data/h2o/G1_1e7_1e2_0_0/polars-file.parquet",
)

In [22]:
# Read the CSV eagerly with an explicit dtype per column, then convert the three
# string id columns to categoricals. Both steps run inside one StringCache context.
with pl.StringCache():
    df_csv = pl.read_csv(
        group_by_1e7_csv,
        dtype={
            # Key order matches the original mapping: id1..id6, v1, v2, v3.
            **dict.fromkeys(("id1", "id2", "id3"), pl.Utf8),
            **dict.fromkeys(("id4", "id5", "id6", "v1", "v2"), pl.Int32),
            "v3": pl.Float64,
        },
        low_memory=True,
    ).with_columns(
        [pl.col(name).cast(pl.Categorical) for name in ("id1", "id2", "id3")]
    )

In [23]:
# Lazy wrapper around the eagerly loaded CSV frame; the query cells call .collect() on it.
df_csv_lazy = df_csv.lazy()

In [24]:
# Lazy scan of the parquet file; the query cells call .collect() on it.
df_scan_parquet_lazy = pl.scan_parquet(
    group_by_1e7_parquet,
    parallel=True,
    cache=False,
)

## q1: sum v1 by id1

In [25]:
%%time
df_csv_lazy.groupby("id1").agg(pl.sum("v1")).collect()

CPU times: user 206 ms, sys: 122 ms, total: 328 ms
Wall time: 70.1 ms


id1,v1_sum
categorical,i32
"""id087""",301327
"""id038""",301052
"""id060""",299345
"""id069""",300120
"""id023""",299417
"""id068""",299946
"""id088""",298370
"""id065""",300249
"""id029""",298486
"""id078""",300185


In [26]:
%%time
df_scan_parquet_lazy.groupby("id1").agg(pl.sum("v1")).collect()

CPU times: user 424 ms, sys: 179 ms, total: 603 ms
Wall time: 575 ms


id1,v1_sum
categorical,i32
"""id027""",301355
"""id081""",300295
"""id063""",300263
"""id030""",300934
"""id041""",301249
"""id021""",300387
"""id016""",298268
"""id074""",301660
"""id059""",301107
"""id056""",299675


## q2: sum v1 by id1:id2

In [27]:
%%time
df_csv_lazy.groupby(["id1", "id2"]).agg(pl.sum("v1")).collect()

CPU times: user 614 ms, sys: 342 ms, total: 956 ms
Wall time: 444 ms


id1,id2,v1_sum
categorical,categorical,i32
"""id061""","""id014""",3096
"""id064""","""id027""",2939
"""id038""","""id014""",2864
"""id062""","""id037""",3033
"""id038""","""id056""",3057
"""id070""","""id071""",2925
"""id055""","""id008""",2934
"""id061""","""id082""",2910
"""id038""","""id073""",3261
"""id065""","""id067""",3052


In [28]:
%%time
df_scan_parquet_lazy.groupby(["id1", "id2"]).agg(pl.sum("v1")).collect()

CPU times: user 1.14 s, sys: 231 ms, total: 1.37 s
Wall time: 627 ms


id1,id2,v1_sum
categorical,categorical,i32
"""id038""","""id097""",3054
"""id062""","""id082""",2990
"""id062""","""id091""",2933
"""id070""","""id021""",2730
"""id070""","""id056""",3057
"""id048""","""id002""",3014
"""id098""","""id099""",2991
"""id042""","""id100""",2914
"""id062""","""id036""",2907
"""id055""","""id067""",2777


## q3: sum v1 mean v3 by id3

In [30]:
%%time
df_csv_lazy.groupby("id3").agg([pl.sum("v1"), pl.mean("v3")]).collect()

CPU times: user 770 ms, sys: 309 ms, total: 1.08 s
Wall time: 305 ms


id3,v1_sum,v3_mean
categorical,i32,f64
"""id0000099766""",198,46.717145
"""id0000061581""",255,43.675530
"""id0000092084""",271,50.338019
"""id0000076922""",303,47.000091
"""id0000044377""",238,52.416216
"""id0000092426""",319,49.331101
"""id0000070592""",325,51.675724
"""id0000018007""",287,48.407862
"""id0000017295""",282,50.465982
"""id0000042649""",347,51.730890


In [31]:
%%time
df_scan_parquet_lazy.groupby("id3").agg([pl.sum("v1"), pl.mean("v3")]).collect()

CPU times: user 1.22 s, sys: 455 ms, total: 1.68 s
Wall time: 1.56 s


id3,v1_sum,v3_mean
categorical,i32,f64
"""id0000024891""",255,47.641006
"""id0000074752""",258,53.084939
"""id0000073424""",261,53.151360
"""id0000071809""",252,51.743002
"""id0000017439""",250,46.797755
"""id0000073511""",357,48.197989
"""id0000006275""",239,46.589223
"""id0000080135""",268,45.952898
"""id0000060098""",357,55.270853
"""id0000033384""",314,51.812097


## q4: mean v1:v3 by id4

In [32]:
%%time
df_csv_lazy.groupby("id4").agg([pl.mean("v1"), pl.mean("v2"), pl.mean("v3")]).collect()

CPU times: user 193 ms, sys: 436 ms, total: 630 ms
Wall time: 283 ms


id4,v1_mean,v2_mean,v3_mean
i32,f64,f64,f64
71,3.002499,7.992222,50.053616
48,2.993247,8.013077,50.069034
85,3.001773,8.004839,50.123553
41,2.995855,7.990891,49.927829
92,2.996628,7.976274,50.076283
1,3.001875,7.995427,49.983907
35,2.999383,7.992722,49.748566
81,3.002362,7.986087,50.012186
46,3.000858,7.978869,49.930203
21,3.004530,8.008759,49.893877


In [33]:
%%time
df_scan_parquet_lazy.groupby("id4").agg([pl.mean("v1"), pl.mean("v2"), pl.mean("v3")]).collect()

CPU times: user 484 ms, sys: 431 ms, total: 916 ms
Wall time: 544 ms


id4,v1_mean,v2_mean,v3_mean
i32,f64,f64,f64
84,3.005932,7.995268,50.005211
42,3.007564,7.977498,49.973327
74,3.002077,8.002157,49.941287
29,3.005723,8.008890,50.019813
89,3.000934,8.012327,49.949707
35,2.999383,7.992722,49.748566
97,2.995558,7.990926,50.038346
77,3.002807,8.004582,50.118955
14,2.994742,7.993106,50.023876
19,2.999412,7.988161,49.960548


## q5: sum v1:v3 by id6

In [34]:
%%time
df_csv_lazy.groupby("id6").agg([pl.sum("v1"), pl.sum("v2"), pl.sum("v3")]).collect()

CPU times: user 1.1 s, sys: 203 ms, total: 1.3 s
Wall time: 804 ms


id6,v1_sum,v2_sum,v3_sum
i32,i32,i32,f64
34088,294,791,4957.5322
62704,307,846,5132.063682
9432,306,796,6167.292685
75256,298,780,4286.103521
38784,295,860,4326.704569
51048,298,771,4817.976526
14128,313,759,5164.688451
78928,289,708,4831.837661
91192,283,802,4730.884402
26392,288,750,4654.438094


In [35]:
%%time
df_scan_parquet_lazy.groupby("id6").agg([pl.sum("v1"), pl.sum("v2"), pl.sum("v3")]).collect()

CPU times: user 1.16 s, sys: 359 ms, total: 1.52 s
Wall time: 843 ms


id6,v1_sum,v2_sum,v3_sum
i32,i32,i32,f64
81224,307,841,4863.312595
55864,261,788,4642.948043
32232,314,833,4724.900642
54176,232,602,3835.857643
50352,283,776,5383.505275
25696,339,861,5631.970719
99344,205,537,3863.433446
75712,401,971,6267.416160
7744,253,713,4464.721459
65168,316,867,4782.384286


## q6: median v3 sd v3 by id4 id5

In [36]:
%%time
df_csv_lazy.groupby(["id4", "id5"]).agg(
    [pl.median("v3").alias("v3_median"), pl.std("v3").alias("v3_std")]
).collect()

CPU times: user 1.23 s, sys: 164 ms, total: 1.39 s
Wall time: 264 ms


id4,id5,v3_median,v3_std
i32,i32,f64,f64
80,41,50.607965,28.826930
88,84,47.419442,28.282258
32,60,51.145579,28.919549
64,7,45.712481,29.041805
40,91,48.795285,28.356530
16,57,52.530386,28.329669
88,44,50.142609,29.056260
56,83,52.555777,29.149103
80,27,50.274150,28.974154
16,76,47.795457,28.777655


In [37]:
%%time
df_scan_parquet_lazy.groupby(["id4", "id5"]).agg(
    [pl.median("v3").alias("v3_median"), pl.std("v3").alias("v3_std")]
).collect()

CPU times: user 1.34 s, sys: 280 ms, total: 1.62 s
Wall time: 471 ms


id4,id5,v3_median,v3_std
i32,i32,f64,f64
88,54,51.520445,28.759195
8,48,48.348173,28.518831
88,29,50.248226,29.375837
16,7,52.108895,28.832566
80,100,49.029559,29.300020
88,64,49.653749,29.731343
24,84,50.833738,29.190614
88,76,52.003322,29.522862
24,7,48.462402,29.046969
48,49,49.017689,29.740750


## q7: max v1 - min v2 by id3

In [38]:
%%time
df_csv_lazy.groupby("id3").agg([(pl.max("v1") - pl.min("v2")).alias("range_v1_v2")]).collect()

CPU times: user 930 ms, sys: 75.4 ms, total: 1 s
Wall time: 200 ms


id3,range_v1_v2
categorical,i32
"""id0000081899""",4
"""id0000033242""",4
"""id0000043093""",4
"""id0000076855""",4
"""id0000010194""",4
"""id0000010383""",4
"""id0000013738""",4
"""id0000013982""",4
"""id0000045164""",4
"""id0000004499""",4


In [39]:
%%time
df_scan_parquet_lazy.groupby("id3").agg([(pl.max("v1") - pl.min("v2")).alias("range_v1_v2")]).collect()

CPU times: user 1.3 s, sys: 113 ms, total: 1.42 s
Wall time: 847 ms


id3,range_v1_v2
categorical,i32
"""id0000017687""",4
"""id0000006382""",4
"""id0000046382""",4
"""id0000015759""",4
"""id0000066924""",4
"""id0000022434""",4
"""id0000035691""",4
"""id0000080355""",4
"""id0000057770""",4
"""id0000002598""",4


## q8: largest two v3 by id6

In [43]:
%%time
df_csv_lazy.drop_nulls("v3").sort("v3", reverse=True).groupby("id6").agg(
    pl.col("v3").head(2).alias("largest2_v3")
).explode("largest2_v3").collect()

CPU times: user 3.02 s, sys: 608 ms, total: 3.63 s
Wall time: 1.34 s


id6,largest2_v3
i32,f64
24344,96.698832
24344,95.994367
86696,99.822178
86696,98.199091
34992,98.885385
34992,98.53119
7752,99.889331
7752,99.672124
75032,99.013749
75032,97.900235


In [44]:
%%time
df_scan_parquet_lazy.drop_nulls("v3").sort("v3", reverse=True).groupby("id6").agg(
    pl.col("v3").head(2).alias("largest2_v3")
).explode("largest2_v3").collect()

CPU times: user 5.68 s, sys: 1.62 s, total: 7.3 s
Wall time: 4.67 s


id6,largest2_v3
i32,f64
12720,99.961201
12720,99.422082
18032,99.959902
18032,99.734689
90800,99.964151
90800,99.57257
91080,99.779424
91080,98.202578
60912,98.998651
60912,97.995509


## q9: regression v1 v2 by id2 id4

In [45]:
%%time
df_csv_lazy.groupby(["id2","id4"]).agg((pl.pearson_corr("v1","v2")**2).alias("r2")).collect()

CPU times: user 504 ms, sys: 349 ms, total: 853 ms
Wall time: 1.51 s


id2,id4,r2
categorical,i32,f64
"""id042""",59,0.000744
"""id070""",57,0.001791
"""id044""",36,0.000111
"""id044""",59,0.000351
"""id055""",7,0.000195
"""id065""",96,0.002126
"""id061""",23,0.000137
"""id095""",3,0.003343
"""id095""",27,0.003238
"""id095""",62,0.001599


In [46]:
%%time
df_scan_parquet_lazy.groupby(["id2","id4"]).agg((pl.pearson_corr("v1","v2")**2).alias("r2")).collect()

CPU times: user 985 ms, sys: 302 ms, total: 1.29 s
Wall time: 1.1 s


id2,id4,r2
categorical,i32,f64
"""id027""",54,0.000390
"""id083""",24,0.004001
"""id008""",66,0.006612
"""id026""",62,0.000956
"""id082""",91,0.002068
"""id075""",95,0.000008
"""id082""",35,0.001379
"""id017""",2,0.000002
"""id082""",47,0.000077
"""id043""",1,0.000064


## q10: sum v3 count by id1:id6

In [48]:
%%time
df_csv_lazy.groupby(["id1","id2","id3","id4","id5","id6"]).agg([pl.sum("v3").alias("v3"), pl.count("v1").alias("count")]).collect()

CPU times: user 3.48 s, sys: 2.27 s, total: 5.76 s
Wall time: 3.52 s


id1,id2,id3,id4,id5,id6,v3,count
categorical,categorical,categorical,i32,i32,i32,f64,u32
"""id082""","""id100""","""id0000084323""",43,2,26187,71.829537,1
"""id059""","""id045""","""id0000009697""",44,88,45859,5.211935,1
"""id002""","""id009""","""id0000077886""",69,6,95062,93.636932,1
"""id098""","""id092""","""id0000005757""",84,8,56543,41.0484,1
"""id002""","""id024""","""id0000064390""",53,5,34754,61.310467,1
"""id078""","""id051""","""id0000041373""",45,31,47482,93.209817,1
"""id084""","""id048""","""id0000026998""",94,78,39538,20.510492,1
"""id005""","""id077""","""id0000087185""",89,2,46926,79.823783,1
"""id039""","""id013""","""id0000045944""",85,94,85998,76.513022,1
"""id051""","""id082""","""id0000042089""",98,75,91690,48.522458,1


In [49]:
%%time
df_scan_parquet_lazy.groupby(["id1","id2","id3","id4","id5","id6"]).agg([pl.sum("v3").alias("v3"), pl.count("v1").alias("count")]).collect()

CPU times: user 5.35 s, sys: 3.07 s, total: 8.42 s
Wall time: 5.81 s


id1,id2,id3,id4,id5,id6,v3,count
categorical,categorical,categorical,i32,i32,i32,f64,u32
"""id100""","""id014""","""id0000015032""",24,80,94018,18.721224,1
"""id015""","""id042""","""id0000077906""",66,42,95502,32.406103,1
"""id004""","""id058""","""id0000052442""",27,1,3317,42.04537,1
"""id074""","""id003""","""id0000001663""",31,93,32755,54.470882,1
"""id094""","""id056""","""id0000062072""",48,2,53867,70.030391,1
"""id004""","""id088""","""id0000078326""",28,19,39714,62.465973,1
"""id078""","""id074""","""id0000058406""",28,77,1519,64.499194,1
"""id083""","""id045""","""id0000046212""",65,52,40174,72.711423,1
"""id023""","""id086""","""id0000032470""",27,32,17141,81.679704,1
"""id019""","""id081""","""id0000041473""",77,95,89661,30.05641,1
