In [1]:
from pathlib import Path

import delta
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from delta import configure_spark_with_delta_pip

In [2]:
# Session settings gathered in one mapping: the Delta Lake SQL extension and
# catalog, executor/driver memory sizes, and the shuffle partition count.
spark_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.executor.memory": "10G",
    "spark.driver.memory": "25G",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.shuffle.partitions": "2",
}

builder = pyspark.sql.SparkSession.builder.appName("MyApp")
for setting_name, setting_value in spark_settings.items():
    builder = builder.config(setting_name, setting_value)

# Let the delta helper finish configuring the builder, then create the session
# (or reuse one that already exists).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-350-delta-310/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e69e4287-6768-4b43-8716-c700dacb9d22;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 108ms :: artifacts dl 6ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from central in [default]
	io.delta#delta-storage;3.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default 

In [15]:
# Set the Spark log level to OFF so log messages do not clutter cell output.
spark_context = spark.sparkContext
spark_context.setLogLevel("OFF")

In [3]:
import pyspark.pandas as ps



## pandas on Spark

In [16]:
%%time
df = ps.read_parquet("/Users/matthew.powers/data/G1_1e9_1e2_0_0.parquet")[
    ["id1", "id2", "v3"]
]
res = df.query("id1 > 'id098'").groupby("id2").sum().head(3)
print(res)



                 v3
id2                
id058  1.004116e+07
id082  9.989551e+06
id083  1.000824e+07
CPU times: user 120 ms, sys: 39.1 ms, total: 159 ms
Wall time: 56.4 s


                                                                                

## pandas

In [9]:
%%time
df = pd.read_parquet("/Users/matthew.powers/data/G1_1e9_1e2_0_0.parquet")[
    ["id1", "id2", "v3"]
]
df.query("id1 > 'id098'").groupby("id2").sum().head(3)

CPU times: user 26.1 s, sys: 4.39 s, total: 30.5 s
Wall time: 21.2 s


Unnamed: 0_level_0,id1,v3
id2,Unnamed: 1_level_1,Unnamed: 2_level_1
id001,id094id094id094id094id094id094id094id094id094i...,505236.862575
id002,id094id094id094id094id094id094id094id094id094i...,511573.248652
id003,id094id094id094id094id094id094id094id094id094i...,503150.385213


In [19]:
# Drop the reference to the DataFrame from the previous benchmark —
# presumably to release its memory before the next timed run.
del df

## pandas with user optimizations

In [6]:
%%time
df = pd.read_parquet(
    "/Users/matthew.powers/data/G1_1e9_1e2_0_0.parquet",
    columns=["id1", "id2", "v3"],
    filters=[("id1", ">", "id098")],
    engine="pyarrow",
)
df.query("id1 > 'id098'").groupby("id2").sum().head(3)

CPU times: user 4min 55s, sys: 18.9 s, total: 5min 14s
Wall time: 4min 35s


Unnamed: 0_level_0,id1,v3
id2,Unnamed: 1_level_1,Unnamed: 2_level_1
id001,id100id099id099id099id100id100id099id100id100i...,9995667.0
id002,id099id100id100id100id100id099id099id100id099i...,10008080.0
id003,id099id100id099id100id100id100id099id099id099i...,9984115.0


In [5]:
# Number of rows currently held in df.
df.shape[0]

2000461

In [10]:
# Drop the reference to the DataFrame from the previous benchmark —
# presumably to release its memory before the next timed run.
del df

## pandas with incorrect user optimizations

In [11]:
%%time
df = pd.read_parquet(
    "/Users/matthew.powers/data/G1_1e9_1e2_0_0.parquet",
    columns=["id1", "id2", "v3"],
    filters=[("id1", "==", "id001")],
    engine="pyarrow",
)
df.query("id1 > 'id098'").groupby("id2").sum().head(3)

CPU times: user 41.9 s, sys: 4.57 s, total: 46.5 s
Wall time: 6.59 s


Unnamed: 0_level_0,id1,v3
id2,Unnamed: 1_level_1,Unnamed: 2_level_1
