# PySpark query benchmark with different configurations

In [1]:
from pathlib import Path

import delta
import pyspark
import pyspark.sql.functions as F
from delta import configure_spark_with_delta_pip

## Some optimized config

In [2]:
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.executor.memory", "10G")
    .config("spark.driver.memory", "25G")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config("spark.sql.shuffle.partitions", "2")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-340-delta-240/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1455ef21-4c76-40d9-b4a6-9cb3220c6438;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 121ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default    

### groupby

In [3]:
delta_table = delta.DeltaTable.forPath(
    spark, f"{Path.home()}/data/deltalake/G1_1e9_1e2_0_0"
)

In [4]:
%%time
delta_table.toDF().groupBy("id1").sum("v1").limit(10).collect()

23/10/19 15:08:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

CPU times: user 31.6 ms, sys: 11.4 ms, total: 43 ms
Wall time: 11.9 s


                                                                                

[Row(id1='id016', sum(v1)=30003304),
 Row(id1='id074', sum(v1)=30006309),
 Row(id1='id070', sum(v1)=29990210),
 Row(id1='id054', sum(v1)=30011978),
 Row(id1='id053', sum(v1)=29992360),
 Row(id1='id056', sum(v1)=29987234),
 Row(id1='id057', sum(v1)=29991822),
 Row(id1='id003', sum(v1)=30003365),
 Row(id1='id001', sum(v1)=30009448),
 Row(id1='id015', sum(v1)=30006177)]

In [7]:
%%time
df = spark.read.format("parquet").load(f"{Path.home()}/data/G1_1e9_1e2_0_0.parquet")
df.groupBy("id1").sum("v1").limit(10).collect()



CPU times: user 114 ms, sys: 34.3 ms, total: 148 ms
Wall time: 54.5 s


                                                                                

[Row(id1='id016', sum(v1)=30003304),
 Row(id1='id054', sum(v1)=30011978),
 Row(id1='id029', sum(v1)=30003726),
 Row(id1='id090', sum(v1)=29994958),
 Row(id1='id070', sum(v1)=29990210),
 Row(id1='id078', sum(v1)=29998434),
 Row(id1='id024', sum(v1)=30003956),
 Row(id1='id053', sum(v1)=29992360),
 Row(id1='id058', sum(v1)=29999957),
 Row(id1='id086', sum(v1)=30003608)]

In [8]:
%%time
df = spark.read.format("csv").option("header", True).load(f"{Path.home()}/data/G1_1e7_1e2_0_0.csv")
df.groupBy("id1").sum("v1").limit(10).collect()

AnalysisException: "v1" is not a numeric column. Aggregation function can only be applied on a numeric column.

In [6]:
%%time
delta_table.toDF().groupby("id1", "id2").agg(F.sum("v1")).limit(10).collect()



CPU times: user 52.7 ms, sys: 18.1 ms, total: 70.8 ms
Wall time: 24.3 s


                                                                                

[Row(id1='id073', id2='id066', sum(v1)=298735),
 Row(id1='id053', id2='id100', sum(v1)=300252),
 Row(id1='id002', id2='id094', sum(v1)=299035),
 Row(id1='id016', id2='id022', sum(v1)=300351),
 Row(id1='id070', id2='id094', sum(v1)=299972),
 Row(id1='id056', id2='id005', sum(v1)=298962),
 Row(id1='id001', id2='id022', sum(v1)=300173),
 Row(id1='id017', id2='id067', sum(v1)=298629),
 Row(id1='id017', id2='id092', sum(v1)=300686),
 Row(id1='id071', id2='id062', sum(v1)=301267)]

In [7]:
%%time
delta_table.toDF().groupby("id3").agg(F.sum("v1"), F.mean("v3")).limit(10).collect()

[Stage 26:>                                                         (0 + 1) / 1]

CPU times: user 119 ms, sys: 38.2 ms, total: 157 ms
Wall time: 1min 38s


                                                                                

[Row(id3='id0004622650', sum(v1)=259, avg(v3)=50.07008313684211),
 Row(id3='id0002250243', sum(v1)=303, avg(v3)=48.23652759183673),
 Row(id3='id0006555269', sum(v1)=341, avg(v3)=44.89965477142858),
 Row(id3='id0007655494', sum(v1)=320, avg(v3)=47.21333414285715),
 Row(id3='id0003471135', sum(v1)=334, avg(v3)=53.15563272897195),
 Row(id3='id0006725518', sum(v1)=308, avg(v3)=49.316587625000004),
 Row(id3='id0002568104', sum(v1)=291, avg(v3)=51.92029662765959),
 Row(id3='id0009420145', sum(v1)=327, avg(v3)=46.69890432456141),
 Row(id3='id0001388327', sum(v1)=342, avg(v3)=46.687223669811324),
 Row(id3='id0009701426', sum(v1)=358, avg(v3)=52.993968603448295)]

### join

In [3]:
x = spark.read.format("delta").load(f"{Path.home()}/data/deltalake/J1_1e9_1e9_0_0")
small = spark.read.format("parquet").load(f"{Path.home()}/data/J1_1e9_1e3_0_0.parquet")

In [4]:
x.show()

23/10/14 17:56:29 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+----+-------+----------+------+---------+------------+---------+
| id1|    id2|       id3|   id4|      id5|         id6|       v2|
+----+-------+----------+------+---------+------------+---------+
| 673| 149068|1093346816| id673| id149068|id1093346816| 3.929802|
|1022| 248015| 668387734|id1022| id248015| id668387734|58.864929|
|1055| 322117| 872814678|id1055| id322117| id872814678|42.166249|
|  25| 583593| 562556089|  id25| id583593| id562556089|24.244918|
|  50|  27387| 683435950|  id50|  id27387| id683435950|82.454048|
| 595| 820960| 988037401| id595| id820960| id988037401|72.361449|
| 716| 624005| 148172278| id716| id624005| id148172278|87.780909|
| 598|1069732| 934008457| id598|id1069732| id934008457|26.363869|
| 210| 861867| 656429668| id210| id861867| id656429668|86.439724|
| 712| 438322| 499974074| id712| id438322| id499974074|54.159267|
| 636| 519065| 627582547| id636| id519065| id627582547|89.456829|
|1019|  15141| 970467325|id1019|  id15141| id970467325| 6.681671|
| 698| 285

In [5]:
small.show()

+----+------+---------+
| id1|   id4|       v2|
+----+------+---------+
|1052|id1052|81.884757|
| 840| id840|11.485303|
| 936| id936| 96.43028|
|1054|id1054|72.621375|
|1039|id1039|74.512321|
| 697| id697| 18.95829|
|  97|  id97|12.534013|
| 137| id137| 9.575396|
| 646| id646|58.337589|
| 684| id684|74.377903|
| 791| id791|79.697195|
| 943| id943| 77.58073|
|  78|  id78|58.411993|
| 911| id911|49.164542|
| 968| id968| 89.14426|
|  98|  id98| 3.126798|
| 986| id986|90.414426|
| 459| id459| 52.75793|
| 454| id454|62.363871|
| 781| id781|  7.94866|
+----+------+---------+
only showing top 20 rows



In [6]:
x.createOrReplaceTempView("x")
small.createOrReplaceTempView("small")

In [8]:
%%time
spark.sql(
    "select x.id2, sum(small.v2) from x join small using (id1) group by x.id2"
).show()

[Stage 23:>                                                         (0 + 1) / 1]

+-------+------------------+
|    id2|           sum(v2)|
+-------+------------------+
| 149068| 49455.28425800003|
| 820960| 48469.40992599999|
|1069732| 51441.30430500004|
| 861867| 49053.17513100004|
| 438322| 48230.86005900001|
| 519065| 50182.25589699998|
| 478234|48629.404682999935|
|1006333|49915.407982999976|
| 817683| 50524.43148400001|
| 458992|      49816.501689|
| 894323| 46209.67581399999|
| 922592| 51530.21153800002|
| 330214|49880.050869999985|
| 101669|      48598.789904|
| 630688| 52055.20737700001|
| 928119| 48362.81223500001|
| 755275|50467.938369999996|
|1037788|49822.992801000015|
| 993538|50335.485864000024|
| 232083|49318.394981999976|
+-------+------------------+
only showing top 20 rows

CPU times: user 102 ms, sys: 34.7 ms, total: 137 ms
Wall time: 1min 9s


                                                                                

## Default config

In [2]:
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-340-delta-240/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1a523e97-f6e6-4f83-b55e-bdd7a6cf87b7;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 101ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default    

### groupby

In [4]:
delta_table = delta.DeltaTable.forPath(
    spark, f"{Path.home()}/data/deltalake/G1_1e9_1e2_0_0"
)

In [11]:
%%time
delta_table.toDF().groupBy("id1").sum("v1").limit(10).collect()



CPU times: user 29.4 ms, sys: 11.6 ms, total: 41.1 ms
Wall time: 8.35 s


                                                                                

[Row(id1='id073', sum(v1)=30006820),
 Row(id1='id074', sum(v1)=30006309),
 Row(id1='id055', sum(v1)=30009993),
 Row(id1='id056', sum(v1)=29987234),
 Row(id1='id005', sum(v1)=29993888),
 Row(id1='id003', sum(v1)=30003365),
 Row(id1='id002', sum(v1)=29996534),
 Row(id1='id057', sum(v1)=29991822),
 Row(id1='id054', sum(v1)=30011978),
 Row(id1='id053', sum(v1)=29992360)]

In [10]:
%%time
delta_table.toDF().groupby("id1", "id2").agg(F.sum("v1")).limit(10).collect()



CPU times: user 55.9 ms, sys: 20.7 ms, total: 76.6 ms
Wall time: 23.1 s


                                                                                

[Row(id1='id073', id2='id066', sum(v1)=298735),
 Row(id1='id053', id2='id100', sum(v1)=300252),
 Row(id1='id002', id2='id094', sum(v1)=299035),
 Row(id1='id016', id2='id022', sum(v1)=300351),
 Row(id1='id070', id2='id094', sum(v1)=299972),
 Row(id1='id056', id2='id005', sum(v1)=298962),
 Row(id1='id001', id2='id022', sum(v1)=300173),
 Row(id1='id017', id2='id067', sum(v1)=298629),
 Row(id1='id017', id2='id092', sum(v1)=300686),
 Row(id1='id071', id2='id062', sum(v1)=301267)]

In [12]:
%%time
delta_table.toDF().groupby("id3").agg(F.sum("v1"), F.mean("v3")).limit(10).collect()

23/10/14 10:10:22 WARN NettyRpcEnv: Ignored failure: java.util.concurrent.TimeoutException: Cannot receive any reply from qtk9h72yp0.lan:58048 in 10000 milliseconds
23/10/14 10:10:25 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.rpc.RpcTimeoutException: Futures timed out after [10000 milliseconds]. This timeout is controlled by spark.executor.heartbeatInterval
	at org.apache.spark.rpc.RpcTimeout.org$apache$spark$rpc$RpcTimeout$$createRpcTimeoutException(RpcTimeout.scala:47)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:62)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:58)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:76)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor

KeyboardInterrupt: 

### join

In [3]:
x = spark.read.format("delta").load(f"{Path.home()}/data/deltalake/J1_1e9_1e9_0_0")
small = spark.read.format("parquet").load(f"{Path.home()}/data/J1_1e9_1e3_0_0.parquet")

In [4]:
x.createOrReplaceTempView("x")
small.createOrReplaceTempView("small")

In [5]:
%%time
spark.sql(
    "select x.id2, sum(small.v2) from x join small using (id1) group by x.id2"
).show()

23/10/14 18:02:16 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/10/14 18:02:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/14 18:02:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/14 18:02:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/14 18:02:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/14 18:02:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/14 18:02:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/10/14 18:02:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
2

+-------+------------------+
|    id2|           sum(v2)|
+-------+------------------+
| 817683| 50524.43148400001|
| 741975| 49268.35268600003|
| 391348| 50910.81451599998|
| 699316|47838.761217000014|
|1045749| 52056.23766099999|
| 950306|53588.621632999995|
| 530869| 49266.58849699996|
| 107123| 53521.99532999998|
| 633006|48324.137790000044|
| 811978|       49004.63729|
|1034173| 50913.84845999997|
| 609174|53030.917331000004|
| 529333|      52338.735607|
| 532494| 50555.22520300001|
|  28728| 50148.61006899997|
| 452763| 50709.10099599998|
| 194308| 49473.58427300003|
|  63090|      47047.629647|
| 846856| 50891.77174800001|
| 971210| 50370.75352800001|
+-------+------------------+
only showing top 20 rows

CPU times: user 387 ms, sys: 134 ms, total: 521 ms
Wall time: 1min 51s


                                                                                