# Install packages

In [1]:
# Component versions this demo targets. sparkmeasure_version is interpolated
# into the %pip install command in the next cell.
spark_version = "3.5.3"
rapids_version = "25.10.0"
sparkmeasure_version = "0.27"

In [2]:
# Install the notebook's Python dependencies into the kernel's environment.
# Only sparkmeasure is version-pinned (sparkmeasure_version plus a ".0" patch
# suffix); the other packages resolve to whatever pip selects at install time.
%pip install --quiet \
  tpcds_pyspark \
  pandas \
  sparkmeasure=={sparkmeasure_version}.0 \
  matplotlib

[0mNote: you may need to restart the kernel to use updated packages.


# Import modules

In [3]:
from importlib.resources import files
from pyspark.sql import SparkSession
from tpcds_pyspark import TPCDS
import glob
import os
import pandas as pd
import re
import time

# Init a SparkSession with RAPIDS Spark

## Detect the Scala version used by the installed Spark jars

In [4]:
# The spark-sql jar's file name embeds the Scala binary version it was built
# for (the regex capture group below), so the version is parsed from the jar
# found in the Spark installation directory.
spark_sql_jar_paths = glob.glob("/usr/lib/spark/jars/spark-sql_*jar")
if not spark_sql_jar_paths:
    # Fail with an actionable message instead of an opaque unpacking error.
    raise FileNotFoundError("No spark-sql_*jar found under /usr/lib/spark/jars")
spark_sql_jar_path = spark_sql_jar_paths[0]
spark_sql_jar = os.path.basename(spark_sql_jar_path)

# The dot between major and minor version is escaped so that it only matches a
# literal '.', not an arbitrary character.
scala_version_match = re.search(r'^spark-sql_(\d+\.\d+)-.*\.jar$', spark_sql_jar)
if scala_version_match is None:
    raise ValueError(f"Cannot parse a Scala version from jar name {spark_sql_jar!r}")
scala_version = scala_version_match.group(1)
scala_version

'2.12'

## Create Spark Session

In [5]:
# Build (or reuse, if one already exists) the notebook's SparkSession with the
# RAPIDS SQL setting switched on. Ending the cell with `spark` displays the
# session object.
session_builder = SparkSession.builder.appName("NDS Example")
session_builder = session_builder.config("spark.rapids.sql.enabled", "true")
spark = session_builder.getOrCreate()
spark

25/11/18 23:46:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Verify SQL Acceleration on GPU can be enabled by checking the query plan

In [6]:
# Enable RAPIDS SQL at runtime, then run a tiny aggregation as a smoke test.
spark.conf.set("spark.rapids.sql.enabled", True)

sum_df = spark.range(1000).selectExpr("SUM(*)")

# Execute the query before explaining it so the printed plan is the one that
# actually ran; Gpu* operators in the plan indicate GPU acceleration.
sum_df.collect()
sum_df.explain()

                                                                                

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   GpuColumnarToRow false, [loreId=22]
   +- GpuHashAggregate (keys=[], functions=[gpubasicsum(id#0L, LongType, false)]), filters=ArrayBuffer(None)) [loreId=21]
      +- GpuShuffleCoalesce 1073741824, [loreId=20]
         +- ShuffleQueryStage 0
            +- GpuColumnarExchange gpusinglepartitioning$(), ENSURE_REQUIREMENTS, [plan_id=64], [loreId=17]
               +- GpuHashAggregate (keys=[], functions=[partial_gpubasicsum(id#0L, LongType, false)]), filters=ArrayBuffer(None)) [loreId=16]
                  +- GpuRange (0, 1000, step=1, splits=2)
+- == Initial Plan ==
   HashAggregate(keys=[], functions=[sum(id#0L)])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=11]
      +- HashAggregate(keys=[], functions=[partial_sum(id#0L)])
         +- Range (0, 1000, step=1, splits=2)




# TPCDS App

In [None]:
# Subset of TPC-DS queries to benchmark. For the available queries see:
# https://github.com/LucaCanali/Miscellaneous/tree/master/Performance_Testing/TPCDS_PySpark/tpcds_pyspark/Queries
# Set `queries = None` instead to run all of them (takes much longer).
queries = [
    'q14a',
    'q14b',
    'q23a',
    'q23b',
    # 'q24a',
    # 'q24b',
    # 'q88',
]

# Wall-clock start of the demo; a later cell subtracts it from time.time() to
# report the total duration.
demo_start = time.time()

# One run, each query executed once, reading the dataset from the given GCS path.
tpcds = TPCDS(data_path='gs://gcs_bucket/parquet_sf3k_decimal/', num_runs=1, queries_repeat_times=1, queries=queries)

sparkMeasure jar path: /opt/conda/lib/python3.11/site-packages/tpcds_pyspark/spark-measure_2.13-0.25.jar
TPCDS queries path: /opt/conda/lib/python3.11/site-packages/tpcds_pyspark/Queries


# Register TPC-DS tables before running queries

In [9]:
# Register the TPC-DS tables as temporary views (one "Creating temporary view"
# line is printed per table) so the benchmark queries can reference them.
tpcds.map_tables()

Creating temporary view catalog_returns


25/11/18 23:47:22 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Creating temporary view catalog_sales


                                                                                

Creating temporary view inventory


                                                                                

Creating temporary view store_returns


                                                                                

Creating temporary view store_sales


                                                                                

Creating temporary view web_returns


                                                                                

Creating temporary view web_sales


                                                                                

Creating temporary view call_center
Creating temporary view catalog_page
Creating temporary view customer
Creating temporary view customer_address
Creating temporary view customer_demographics
Creating temporary view date_dim
Creating temporary view household_demographics
Creating temporary view income_band
Creating temporary view item
Creating temporary view promotion
Creating temporary view reason
Creating temporary view ship_mode
Creating temporary view store
Creating temporary view time_dim
Creating temporary view warehouse
Creating temporary view web_page
Creating temporary view web_site


## Measure Apache Spark on GPU

In [10]:
# Turn RAPIDS SQL on so that this benchmark run is measured with GPU
# acceleration enabled.
tpcds.spark.conf.set('spark.rapids.sql.enabled', True)
# %time reports the CPU and wall time of the whole benchmark run.
%time tpcds.run_TPCDS()
# Keep a copy of the per-query metrics under a GPU-specific name: the CPU cell
# re-runs the benchmark on the same `tpcds` object and reads
# grouped_results_pdf again (presumably replaced by that run - confirm).
gpu_grouped_results = tpcds.grouped_results_pdf.copy()
gpu_grouped_results


Run 0 - query q14a - attempt 0 - starting...


25/11/18 23:48:32 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/18 23:48:32 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/18 23:48:32 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/18 23:48:32 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/18 23:48:32 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/18 23:48:33 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/18 23:48:33 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoi

Job finished
...Start Time = 2025-11-18 23:48:29
...Elapsed Time = 142.7 sec
...Executors Run Time = 4031.78 sec
...Executors CPU Time = 482.07 sec
...Executors JVM GC Time = 27.43 sec
...Average Active Tasks = 28.3

Run 0 - query q14b - attempt 0 - starting...


25/11/18 23:51:04 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/18 23:51:04 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/18 23:51:04 WARN GpuOverrides: 
! <OverwriteByExpressionExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec

                                                                                

Job finished
...Start Time = 2025-11-18 23:51:03
...Elapsed Time = 92.64 sec
...Executors Run Time = 2720.97 sec
...Executors CPU Time = 462.29 sec
...Executors JVM GC Time = 22.78 sec
...Average Active Tasks = 29.4

Run 0 - query q23a - attempt 0 - starting...


25/11/18 23:52:39 WARN GpuOverrides: 
! <OverwriteByExpressionExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec

                                                                                ]

Job finished
...Start Time = 2025-11-18 23:52:38
...Elapsed Time = 246.92 sec
...Executors Run Time = 7679.47 sec
...Executors CPU Time = 2431.34 sec
...Executors JVM GC Time = 58.84 sec
...Average Active Tasks = 31.1

Run 0 - query q23b - attempt 0 - starting...


25/11/18 23:56:47 WARN GpuOverrides: 
! <OverwriteByExpressionExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec

                                                                                ]

Job finished
...Start Time = 2025-11-18 23:56:47
...Elapsed Time = 349.65 sec
...Executors Run Time = 10949.18 sec
...Executors CPU Time = 3601.05 sec
...Executors JVM GC Time = 124.94 sec
...Average Active Tasks = 31.3
CPU times: user 2.13 s, sys: 536 ms, total: 2.67 s
Wall time: 14min 8s


Unnamed: 0,query,numStages,numTasks,elapsedTime,stageDuration,executorRunTime,executorCpuTime,executorDeserializeTime,executorDeserializeCpuTime,resultSerializationTime,...,shuffleLocalBlocksFetched,shuffleRemoteBlocksFetched,shuffleTotalBytesRead,shuffleLocalBytesRead,shuffleRemoteBytesRead,shuffleRemoteBytesReadToDisk,shuffleBytesWritten,shuffleRecordsWritten,avg_active_tasks,elapsed_time_seconds
0,q14a,33,3029,142702,202073,4031784,482069,35938,26477,737,...,52365,52765,17238723794,10959371688,6279352106,0,17224468696,1453677,28,142
1,q14b,27,2924,92635,109468,2720965,462286,19677,19074,510,...,51274,51791,13772369358,7503041602,6269327756,0,13772258680,1325383,29,92
2,q23a,17,5585,246917,321434,7679465,2431345,59686,57146,22,...,1461555,1459430,167343853198,83745729041,83598124157,0,105548884801,2549000,31,246
3,q23b,20,5650,349650,629014,10949183,3601045,60423,59165,43,...,2559681,2558336,290890866947,145433890100,145456976847,0,106425012814,4270104,31,349


## Measure Apache Spark on CPU

In [None]:
# Turn RAPIDS SQL off so that the same queries are measured on the CPU only,
# as the baseline for the speedup comparison.
tpcds.spark.conf.set('spark.rapids.sql.enabled', False)
# %time reports the CPU and wall time of the whole benchmark run.
%time tpcds.run_TPCDS()
# Keep a copy of the per-query metrics under a CPU-specific name so they can
# be merged with the GPU results later.
cpu_grouped_results = tpcds.grouped_results_pdf.copy()
cpu_grouped_results


Run 0 - query q14a - attempt 0 - starting...


25/11/19 00:02:40 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/19 00:02:40 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/19 00:02:40 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/19 00:02:40 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/19 00:02:40 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/19 00:02:40 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/19 00:02:40 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoi

Job finished
...Start Time = 2025-11-19 00:02:38
...Elapsed Time = 623.59 sec
...Executors Run Time = 17863.87 sec
...Executors CPU Time = 15325.5 sec
...Executors JVM GC Time = 199.65 sec
...Average Active Tasks = 28.6

Run 0 - query q14b - attempt 0 - starting...


25/11/19 00:13:08 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

25/11/19 00:13:08 INFO PlanChangeLogger: 
 Dataproc Rule org.apache.spark.sql.catalyst.optimizer.ReplaceIntersectWithSemiJoin effective 1 times.

                                                                                

Job finished
...Start Time = 2025-11-19 00:13:08
...Elapsed Time = 579.5 sec
...Executors Run Time = 16634.54 sec
...Executors CPU Time = 14340.63 sec
...Executors JVM GC Time = 176.47 sec
...Average Active Tasks = 28.7

Run 0 - query q23a - attempt 0 - starting...


                                                                                

Job finished
...Start Time = 2025-11-19 00:22:49
...Elapsed Time = 1875.39 sec
...Executors Run Time = 59083.8 sec
...Executors CPU Time = 55319.68 sec
...Executors JVM GC Time = 510.15 sec
...Average Active Tasks = 31.5

Run 0 - query q23b - attempt 0 - starting...




## Show Speedup Factors achieved by GPU

In [None]:
# Join the CPU and GPU metrics per query, derive the speedup factor
# (CPU elapsedTime divided by GPU elapsedTime) and list the queries that are
# slowest on CPU first.
res = (
    pd.merge(
        cpu_grouped_results,
        gpu_grouped_results,
        on="query",
        how="inner",
        suffixes=["_cpu", "_gpu"],
    )
    .assign(speedup=lambda merged: merged["elapsedTime_cpu"] / merged["elapsedTime_gpu"])
    .sort_values(by="elapsedTime_cpu", ascending=False)
)
res

In [None]:
# Wall-clock time elapsed since demo_start was recorded (just before the TPCDS
# object was created), i.e. covering both benchmark runs.
demo_dur = time.time() - demo_start
# Plain formatting instead of the f-string debug specifier ({demo_dur=}), which
# printed "took: demo_dur=<many decimals> seconds".
print(f"CPU and GPU runs took {demo_dur:.1f} seconds")

In [None]:
# Grouped bars per query comparing CPU and GPU elapsed time.
ax = res.plot(title='TPC-DS query elapsedTime on CPU vs GPU (lower is better)',
              kind='bar', x='query', y=['elapsedTime_cpu', 'elapsedTime_gpu'],
              color=['blue', '#76B900'])
# Label the axes so the figure stands on its own. elapsedTime appears to be in
# milliseconds (142702 in the results table vs. "Elapsed Time = 142.7 sec" in
# the run log). The trailing ';' suppresses the Axes repr in the cell output.
ax.set(xlabel='TPC-DS query', ylabel='elapsedTime (ms)');

In [None]:
# One bar per query showing how many times faster the GPU run was.
ax = res.plot(title='Speedup factors of TPC-DS queries on GPU', kind='bar',
              x='query', y='speedup', color='#76B900')
# Label the axes so the figure stands on its own; the trailing ';' suppresses
# the Axes repr in the cell output.
ax.set(xlabel='TPC-DS query', ylabel='Speedup (CPU elapsedTime / GPU elapsedTime)');