# PySpark vs other engines

In [1]:
from pathlib import Path

import delta
import pyspark
import pyspark.sql.functions as F
from delta import configure_spark_with_delta_pip

## PySpark groupby query 1

In [2]:
# Assemble the SparkSession builder one setting at a time: the Delta SQL
# extension and Delta catalog are registered, memory sizes are set for the
# executor and driver, and the shuffle partition count is pinned to 2.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config("spark.executor.memory", "10G")
builder = builder.config("spark.driver.memory", "25G")
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)
builder = builder.config("spark.sql.shuffle.partitions", "2")

# The builder is passed through delta's pip helper before the session is
# created (or an already-running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-340-delta-240/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-144bd826-4b9a-414f-8429-9dad3114a77a;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 106ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default    

In [6]:
%%time
spark.read.format("delta").load(f"{Path.home()}/data/deltalake/G1_1e9_1e2_0_0").groupBy(
    "id1"
).sum("v1").collect()



CPU times: user 25 ms, sys: 9.94 ms, total: 34.9 ms
Wall time: 7.68 s


                                                                                

[Row(id1='id016', sum(v1)=30003304),
 Row(id1='id074', sum(v1)=30006309),
 Row(id1='id070', sum(v1)=29990210),
 Row(id1='id054', sum(v1)=30011978),
 Row(id1='id053', sum(v1)=29992360),
 Row(id1='id056', sum(v1)=29987234),
 Row(id1='id057', sum(v1)=29991822),
 Row(id1='id003', sum(v1)=30003365),
 Row(id1='id001', sum(v1)=30009448),
 Row(id1='id015', sum(v1)=30006177),
 Row(id1='id017', sum(v1)=29995061),
 Row(id1='id018', sum(v1)=29992469),
 Row(id1='id014', sum(v1)=29998476),
 Row(id1='id072', sum(v1)=30003522),
 Row(id1='id071', sum(v1)=29998357),
 Row(id1='id073', sum(v1)=30006820),
 Row(id1='id055', sum(v1)=30009993),
 Row(id1='id004', sum(v1)=30015990),
 Row(id1='id002', sum(v1)=29996534),
 Row(id1='id005', sum(v1)=29993888),
 Row(id1='id050', sum(v1)=30008271),
 Row(id1='id052', sum(v1)=30014118),
 Row(id1='id021', sum(v1)=29982118),
 Row(id1='id041', sum(v1)=29994657),
 Row(id1='id040', sum(v1)=29989173),
 Row(id1='id049', sum(v1)=29978475),
 Row(id1='id051', sum(v1)=29994785),
 

23/10/27 18:13:14 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /private/var/folders/19/_52w4zps3xjc6plz_f63j8sh0000gp/T/blockmgr-f2aa96a0-a934-46ff-9f90-b94d5646b5ef. Falling back to Java IO way
java.io.IOException: Failed to delete: /private/var/folders/19/_52w4zps3xjc6plz_f63j8sh0000gp/T/blockmgr-f2aa96a0-a934-46ff-9f90-b94d5646b5ef
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:177)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:113)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:94)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1231)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:368)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockManager.scala:364)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.Indexe

## Polars groupby query 1

In [1]:
from pathlib import Path

import polars as pl

In [3]:
%%time
pl.scan_delta(f"{Path.home()}/data/deltalake/G1_1e9_1e2_0_0").group_by("id1").agg(
    pl.sum("v1").alias("v1_sum")
).collect()

CPU times: user 29.8 s, sys: 10.4 s, total: 40.3 s
Wall time: 12.4 s


id1,v1_sum
str,i64
"""id054""",30011978
"""id055""",30009993
"""id042""",29989540
"""id084""",30005578
"""id083""",30005209
"""id075""",30013372
"""id096""",29993372
"""id077""",29990807
"""id074""",30006309
"""id035""",30003917


## PySpark groupby query 3 (limit 10)

In [1]:
from pathlib import Path

import delta
import pyspark
import pyspark.sql.functions as F
from delta import configure_spark_with_delta_pip

In [2]:
# Session setup for the query 3 run, written as one builder step per line.
# Registers the Delta SQL extension and Delta catalog, sets executor and
# driver memory, and fixes the shuffle partition count at 2.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config("spark.executor.memory", "10G")
builder = builder.config("spark.driver.memory", "25G")
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)
builder = builder.config("spark.sql.shuffle.partitions", "2")

# Hand the builder to delta's pip helper, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-340-delta-240/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2ac595f3-8c72-4948-b048-c3f574f65c6d;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 105ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default    

In [3]:
%%time
spark.read.format("delta").load(f"{Path.home()}/data/deltalake/G1_1e9_1e2_0_0").createOrReplaceTempView("x")
spark.sql("select id3, sum(v1) as v1, mean(v3) as v3 from x group by id3 limit 10").show()

23/10/27 18:35:37 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 11:>                                                         (0 + 1) / 1]

+------------+---+------------------+
|         id3| v1|                v3|
+------------+---+------------------+
|id0002654924|330|49.091885935185196|
|id0001975574|329|48.816218894230765|
|id0002470600|324|53.640876399999996|
|id0001698780|285|45.204360193548375|
|id0000058673|291|51.325111176470614|
|id0003895227|358| 49.58640469565216|
|id0005293826|295| 52.53704746315788|
|id0007581630|300|48.697405603960384|
|id0000651978|389| 49.91647398305082|
|id0003454065|274|51.122333543478256|
+------------+---+------------------+

CPU times: user 111 ms, sys: 39.8 ms, total: 151 ms
Wall time: 3min 54s


                                                                                

## PySpark groupby query 3 (limit 10) - single-core (`local[1]`) config

In [2]:
from pathlib import Path

import delta
import pyspark
import pyspark.sql.functions as F
from delta import configure_spark_with_delta_pip

In [3]:
# Same session settings as the previous PySpark run, except that the master
# is set explicitly to "local[1]". Builder steps are applied one per line in
# the original call order.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.master("local[1]")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config("spark.executor.memory", "10G")
builder = builder.config("spark.driver.memory", "25G")
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)
builder = builder.config("spark.sql.shuffle.partitions", "2")

# Hand the builder to delta's pip helper, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-340-delta-240/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-79526b79-83da-404f-bd8c-f36f6f38db99;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 109ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default    

In [4]:
%%time
spark.read.format("delta").load(f"{Path.home()}/data/deltalake/G1_1e9_1e2_0_0").createOrReplaceTempView("x")
spark.sql("select id3, sum(v1) as v1, mean(v3) as v3 from x group by id3 limit 10").show()

23/10/27 20:11:24 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 11:>                                                         (0 + 1) / 1]

+------------+---+------------------+
|         id3| v1|                v3|
+------------+---+------------------+
|id0002654924|330|49.091885935185196|
|id0001975574|329|48.816218894230765|
|id0002470600|324|53.640876399999996|
|id0001698780|285|45.204360193548375|
|id0000058673|291|51.325111176470614|
|id0003895227|358| 49.58640469565216|
|id0005293826|295| 52.53704746315788|
|id0007581630|300|48.697405603960384|
|id0000651978|389| 49.91647398305082|
|id0003454065|274|51.122333543478256|
+------------+---+------------------+

CPU times: user 144 ms, sys: 62.2 ms, total: 206 ms
Wall time: 10min 33s


                                                                                

## Polars groupby query 3 (limit 10)

In [1]:
from pathlib import Path

import polars as pl

In [2]:
%%time
x = pl.scan_delta(f"{Path.home()}/data/deltalake/G1_1e9_1e2_0_0")
x.group_by("id3").agg([pl.sum("v1").alias("v1_sum"), pl.mean("v3").alias("v3_mean")]).limit(10).collect()

CPU times: user 3min 33s, sys: 1min 28s, total: 5min 1s
Wall time: 1min 41s


id3,v1_sum,v3_mean
str,i64,f64
"""id0007944062""",344,44.816765
"""id0003196443""",312,49.627266
"""id0004625609""",273,46.772034
"""id0008566319""",286,50.950365
"""id0006815127""",259,52.738839
"""id0000694335""",275,46.795619
"""id0001189103""",265,48.9991
"""id0005475665""",265,44.455095
"""id0006647807""",318,51.353826
"""id0003601460""",305,45.804697
