# Microbenchmarks on GPU
This is a notebook for microbenchmarks running on GPU. 

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from time import time
import os
# Cluster endpoint and plugin jar. Override through the environment or edit the
# placeholder defaults. A standalone master URL has the form spark://HOST:PORT.
SPARK_MASTER_URL = os.getenv("SPARK_MASTER_URL", "spark://your-ip:port")
# Location of the RAPIDS Accelerator jar that is put on the driver/executor classpath
RAPIDS_JAR = os.getenv("RAPIDS_JAR", "/your-path/rapids-4-spark_2.12-22.08.0.jar")


Define a helper that runs a query `retryTimes` times and prints the wall-clock time of each run and the average.

In [2]:
def runMicroBenchmark(spark, appName, query, retryTimes):
    """Run ``query`` ``retryTimes`` times and print per-run and average wall-clock time.

    Args:
        spark: Session object exposing ``sql(query)``. The timed work is
            ``spark.sql(query).show(5)``.
        appName: Label used in the printed messages.
        query: SQL text handed to ``spark.sql``.
        retryTimes: Number of timed runs; must be at least 1.

    Raises:
        ValueError: If ``retryTimes`` is less than 1, since there is nothing to average.
    """
    if retryTimes < 1:
        raise ValueError("retryTimes must be at least 1, got {}".format(retryTimes))
    count = 0
    total_time = 0.0
    # You can print the physical plan of each query
    # spark.sql(query).explain()
    while count < retryTimes:
        start = time()
        spark.sql(query).show(5)
        elapsed = time() - start
        # Accumulate the raw duration; rounding happens only when printing
        total_time += elapsed
        count = count + 1
        print("Retry times : {}, ".format(count) + appName + " microbenchmark takes {} seconds".format(round(elapsed, 2)))
    # Keep two decimals so the average is comparable with the per-run figures
    print(appName + " microbenchmark takes average {} seconds after {} retries".format(round(total_time / retryTimes, 2), retryTimes))

In [3]:
# Hardware-dependent knobs: export the environment variables (or edit the
# defaults) so they match the resources of your cluster.
driverMem = os.getenv("DRIVER_MEM", "50g")
executorMem = os.getenv("EXECUTOR_MEM", "16g")
maxPartionBytes = os.getenv("MAX_PARTITION_BYTES", "1g")
pinnedPoolSize = os.getenv("PINNED_POOL_SIZE", "8g")
concurrentGpuTasks = os.getenv("CONCURRENT_GPU_TASKS", "4")
executorCores = int(os.getenv("EXECUTOR_CORES", "16"))
# Each executor owns one GPU, so every task is given 1/executorCores of it
gpuPerExecutor = 1/executorCores

conf = (
    SparkConf()
    .setMaster(SPARK_MASTER_URL)
    .setAppName("Microbenchmark on GPU")
    # ---- Common spark settings ----
    .set("spark.driver.memory", driverMem)
    # Work happens in GPU memory, so a large host heap is not required
    .set("spark.executor.memory", executorMem)
    # Work happens on GPU cores, so many cpu cores are not required
    .set("spark.executor.cores", executorCores)
    .set("spark.locality.wait", "0")
    .set("spark.sql.files.maxPartitionBytes", maxPartionBytes)
    .set("spark.dynamicAllocation.enabled", "false")
    .set("spark.sql.adaptive.enabled", "true")
    # ---- Plugin settings ----
    .set("spark.executor.resource.gpu.amount", "1")
    # Number of tasks allowed on the GPU at the same time (4 by default)
    .set("spark.rapids.sql.concurrentGpuTasks", concurrentGpuTasks)
    # Pinned host memory used for GPU <-> host transfers (8g by default)
    .set("spark.rapids.memory.pinnedPool.size", pinnedPoolSize)
    # With spark.executor.cores tasks per executor (16 by default), each task claims an equal GPU share
    .set("spark.task.resource.gpu.amount", gpuPerExecutor)
    .set("spark.rapids.sql.enabled", "true")
    .set("spark.plugins", "com.nvidia.spark.SQLPlugin")
    .set("spark.rapids.sql.variableFloatAgg.enabled", "true")
    .set("spark.driver.extraClassPath", RAPIDS_JAR)
    .set("spark.executor.extraClassPath", RAPIDS_JAR)
    .set("spark.jars", RAPIDS_JAR)
)

# Start (or reuse) the session with the settings above
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Register every TPC-DS table used by the queries as a temp view of the same name.
# Point DATA_ROOT at the directory that actually holds your data!
dataRoot = os.getenv("DATA_ROOT", "/data")
for tpcdsTable in ("customer", "store_sales", "catalog_sales", "web_sales", "item", "date_dim"):
    spark.read.parquet(dataRoot + "/tpcds/" + tpcdsTable).createOrReplaceTempView(tpcdsTable)
print("-"*50)

--------------------------------------------------


### Expand&HashAggregate
This is a microbenchmark about Expand&HashAggregate expressions running on the GPU. The query calculates the distinct counts of several dimension columns and the average birth year for different c_salutation values of customers, grouped by c_current_hdemo_sk. You will see about 10x speedups in this query, partly because CPU mode involves an additional shuffle introduced by the repartition operator. In addition, GPUExpand and GPUHashAggregate are much faster than Expand and HashAggregate because GPU algorithms allow us to parallelize the computation and utilize most of the GPU cores. The tasks' duration in the third stage is less than one second on GPU but takes 20x-40x longer when running on CPU. The performance improvement becomes more significant as the number of count distinct columns and aggregate functions increases.

In [4]:
# Expand&HashAggregate benchmark query over the `customer` temp view.
# For every c_current_hdemo_sk group it computes 24 conditional count(DISTINCT ...)
# columns (3 salutations x 8 customer columns) and 10 conditional avg(c_birth_year)
# columns, one per salutation literal.
query = '''
select c_current_hdemo_sk,
count(DISTINCT if(c_salutation=="Ms.",c_salutation,null)) as c1,
count(DISTINCT if(c_salutation=="Mr.",c_salutation,null)) as c12,
count(DISTINCT if(c_salutation=="Dr.",c_salutation,null)) as c13,

count(DISTINCT if(c_salutation=="Ms.",c_first_name,null)) as c2,
count(DISTINCT if(c_salutation=="Mr.",c_first_name,null)) as c22,
count(DISTINCT if(c_salutation=="Dr.",c_first_name,null)) as c23,

count(DISTINCT if(c_salutation=="Ms.",c_last_name,null)) as c3,
count(DISTINCT if(c_salutation=="Mr.",c_last_name,null)) as c32,
count(DISTINCT if(c_salutation=="Dr.",c_last_name,null)) as c33,

count(DISTINCT if(c_salutation=="Ms.",c_birth_country,null)) as c4,
count(DISTINCT if(c_salutation=="Mr.",c_birth_country,null)) as c42,
count(DISTINCT if(c_salutation=="Dr.",c_birth_country,null)) as c43,

count(DISTINCT if(c_salutation=="Ms.",c_email_address,null)) as c5,
count(DISTINCT if(c_salutation=="Mr.",c_email_address,null)) as c52,
count(DISTINCT if(c_salutation=="Dr.",c_email_address,null)) as c53,

count(DISTINCT if(c_salutation=="Ms.",c_login,null)) as c6,
count(DISTINCT if(c_salutation=="Mr.",c_login,null)) as c62,
count(DISTINCT if(c_salutation=="Dr.",c_login,null)) as c63,

count(DISTINCT if(c_salutation=="Ms.",c_preferred_cust_flag,null)) as c7,
count(DISTINCT if(c_salutation=="Mr.",c_preferred_cust_flag,null)) as c72,
count(DISTINCT if(c_salutation=="Dr.",c_preferred_cust_flag,null)) as c73,

count(DISTINCT if(c_salutation=="Ms.",c_birth_month,null)) as c8,
count(DISTINCT if(c_salutation=="Mr.",c_birth_month,null)) as c82,
count(DISTINCT if(c_salutation=="Dr.",c_birth_month,null)) as c83,

avg(if(c_salutation=="Ms.",c_birth_year,null)) as avg1,
avg(if(c_salutation=="Mr.",c_birth_year,null)) as avg2,
avg(if(c_salutation=="Dr.",c_birth_year,null)) as avg3,
avg(if(c_salutation=="Miss.",c_birth_year,null)) as avg4,
avg(if(c_salutation=="Mrs.",c_birth_year,null)) as avg5,
avg(if(c_salutation=="Sir.",c_birth_year,null)) as avg6,
avg(if(c_salutation=="Professor.",c_birth_year,null)) as avg7,
avg(if(c_salutation=="Teacher.",c_birth_year,null)) as avg8,
avg(if(c_salutation=="Agent.",c_birth_year,null)) as avg9,
avg(if(c_salutation=="Director.",c_birth_year,null)) as avg10
from customer group by c_current_hdemo_sk
'''

In [5]:
# Time the Expand&HashAggregate query over 2 runs
runMicroBenchmark(spark, appName="Expand&HashAggregate", query=query, retryTimes=2)

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+------------------+------------------+------------------+----+------------------+----+----+----+----+-----+
|c_current_hdemo_sk| c1|c12|c13| c2|c22|c23| c3|c32|c33| c4|c42|c43| c5|c52| c53| c6|c62|c63| c7|c72|c73| c8|c82|c83|              avg1|              avg2|              avg3|avg4|              avg5|avg6|avg7|avg8|avg9|avg10|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+------------------+------------------+------------------+----+------------------+----+----+----+----+-----+
|              1238|  1|  1|  1|284|255|562|358|467|772|194|203|211|452|664|1157|  0|  0|  0|  2|  2|  2| 12| 12| 12|1957.2444933920706|1958.8547655068078|1957.2870771899393|null| 1958.042643923241|null|null|null|null| null|
|              6658|  1|  1|  1|318|253|541|384|492|752|190|203|210|516|647|1115|  0|  0|  0|  2|  2

### Windowing(without data skew)
This is a microbenchmark about windowing expressions running in GPU mode. The sub-query calculates the average ss_sales_price over a fixed-size window (50 rows preceding through 50 rows following) partitioned by ss_customer_sk, and the parent query averages the sub-query's result for each customer. You will see about 25x speedups in this query. The speedup mainly comes from GPUSort/GPUWindow/GPUHashAggregate. The avg aggregation function evaluates all rows which are generated by the sub-query's window function. The performance improvement becomes more significant as the number of aggregate functions in the sub-query increases.

In [6]:
# Windowing benchmark without skew: rows with a null ss_customer_sk are filtered out.
# Inner query: 101-row sliding average of ss_sales_price per customer, ordered by ss_sold_date_sk.
# Outer query: average of those window values per customer, sorted descending.
# NOTE(review): the window is ordered only by ss_sold_date_sk; if a customer has several
# rows on the same date the frame contents may vary between runs -- possibly why repeated
# runs print slightly different averages. Confirm.
query = '''
select ss_customer_sk,avg(avg_price) as avg_price
from
(
SELECT ss_customer_sk ,avg(ss_sales_price) OVER (PARTITION BY ss_customer_sk order by ss_sold_date_sk ROWS BETWEEN 50 PRECEDING AND 50 FOLLOWING ) as avg_price
FROM store_sales
where ss_customer_sk is not null
) group by ss_customer_sk order by 2 desc 
'''
# Visual separator so the cell produces some output
print("-"*50)

--------------------------------------------------


In [7]:
# Time the skew-free windowing query over 2 runs
runMicroBenchmark(spark, appName="Windowing without skew", query=query, retryTimes=2)

+--------------+------------------+
|ss_customer_sk|         avg_price|
+--------------+------------------+
|      15924921|52.375180502283705|
|      24796404| 52.21073975966333|
|      14299506| 52.16263537127018|
|      27571451|52.156112032252395|
|      10174233| 52.06401030721082|
+--------------+------------------+
only showing top 5 rows

Retry times : 1, Windowing without skew microbenchmark takes 11.39 seconds
+--------------+-----------------+
|ss_customer_sk|        avg_price|
+--------------+-----------------+
|      15924921|52.53781291335107|
|      24796404|52.39683466140243|
|      27571451|52.18830023174899|
|      14299506|52.10829141087412|
|      10174233|51.92766214818386|
+--------------+-----------------+
only showing top 5 rows

Retry times : 2, Windowing without skew microbenchmark takes 9.53 seconds
Windowing without skew microbenchmark takes average 10 seconds after 2 retries


### Windowing(with data skew)
Data skew is caused by many null values in the ss_customer_sk column. You will see about 80x speedups in this query. The more heavily skewed a query's tasks are, the larger the performance improvement, because the GPU parallelizes the computation while the CPU is limited to just a single core because of how the algorithms are written.

In [8]:
# Windowing benchmark with skew: same as the skew-free query but without the
# "ss_customer_sk is not null" filter, so null ss_customer_sk rows stay in the input.
query = '''
select ss_customer_sk,avg(avg_price) as avg_price
from
(
SELECT ss_customer_sk ,avg(ss_sales_price) OVER (PARTITION BY ss_customer_sk order by ss_sold_date_sk ROWS BETWEEN 50 PRECEDING AND 50 FOLLOWING ) as avg_price
FROM store_sales
) group by ss_customer_sk order by 2 desc 
'''
# Visual separator so the cell produces some output
print("-"*50)

--------------------------------------------------


In [9]:
# Time the skewed windowing query over 2 runs
runMicroBenchmark(spark, appName="Windowing with skew", query=query, retryTimes=2)

+--------------+------------------+
|ss_customer_sk|         avg_price|
+--------------+------------------+
|      24796404| 52.40675225109215|
|      27571451|52.396675141359374|
|      15924921| 52.30557497833058|
|      10174233|52.088916933379096|
|      14299506|51.995045713009794|
+--------------+------------------+
only showing top 5 rows

Retry times : 1, Windowing with skew microbenchmark takes 17.46 seconds
+--------------+------------------+
|ss_customer_sk|         avg_price|
+--------------+------------------+
|      24796404|52.403564615099896|
|      15924921|52.262694645994465|
|      27571451| 52.14256448618127|
|      10174233| 52.11346591610992|
|      14299506| 51.99180221022445|
+--------------+------------------+
only showing top 5 rows

Retry times : 2, Windowing with skew microbenchmark takes 16.63 seconds
Windowing with skew microbenchmark takes average 17 seconds after 2 retries


### Intersection
This is a microbenchmark about the intersection operation running in GPU mode. The query finds items of the same brand, class, and category that are sold in all three sales channels during three consecutive years (1999 through 2001). You will see about 10x speedups in this query. This is a competition between high cardinality SortMergeJoin and GpuShuffleHashJoin. The main performance improvement comes from the two SortMergeJoins that run on CPU being converted to GpuShuffleHashJoins running on GPU.

In [10]:
# Intersection benchmark: the derived table x is the INTERSECT of the
# (brand, class, category) triples seen in store_sales, catalog_sales and web_sales
# for d_year between 1999 and 1999 + 2; the outer query returns the i_item_sk of
# every item whose brand/class/category matches one of those triples.
query = '''
select i_item_sk ss_item_sk
 from item,
    (select iss.i_brand_id brand_id, iss.i_class_id class_id, iss.i_category_id category_id
     from store_sales, item iss, date_dim d1
     where ss_item_sk = iss.i_item_sk
                    and ss_sold_date_sk = d1.d_date_sk
       and d1.d_year between 1999 AND 1999 + 2
   intersect
     select ics.i_brand_id, ics.i_class_id, ics.i_category_id
     from catalog_sales, item ics, date_dim d2
     where cs_item_sk = ics.i_item_sk
       and cs_sold_date_sk = d2.d_date_sk
       and d2.d_year between 1999 AND 1999 + 2
   intersect
     select iws.i_brand_id, iws.i_class_id, iws.i_category_id
     from web_sales, item iws, date_dim d3
     where ws_item_sk = iws.i_item_sk
       and ws_sold_date_sk = d3.d_date_sk
       and d3.d_year between 1999 AND 1999 + 2) x
 where i_brand_id = brand_id
   and i_class_id = class_id
   and i_category_id = category_id
'''

In [11]:
# Time the intersection query over 2 runs
runMicroBenchmark(spark, appName="NDS Q14a subquery", query=query, retryTimes=2)

+----------+
|ss_item_sk|
+----------+
|      4323|
|      4324|
|      4325|
|      4327|
|      4328|
+----------+
only showing top 5 rows

Retry times : 1, NDS Q14a subquery microbenchmark takes 6.71 seconds
+----------+
|ss_item_sk|
+----------+
|     14103|
|     14104|
|     14105|
|     14107|
|     14108|
+----------+
only showing top 5 rows

Retry times : 2, NDS Q14a subquery microbenchmark takes 6.11 seconds
NDS Q14a subquery microbenchmark takes average 6 seconds after 2 retries


### Crossjoin
This is a microbenchmark for a crossjoin of a 1-million-row table with itself. You will see about 10x speedups in this query. The main performance improvement comes from converting BroadcastNestedLoopJoin running on CPU to GpuBroadcastNestedLoopJoin running on GPU.

In [12]:
# Materialise a 1-million-row sample of the customer table and time the parquet round trip.
# The scratch copy is written under dataRoot so the cell honours DATA_ROOT like every
# other path in the notebook (with the default "/data" this is still /data/tmp/customer1m).
customer1mPath = dataRoot + "/tmp/customer1m"
start = time()
spark.read.parquet(dataRoot + "/tpcds/customer").limit(1000000).write.format("parquet").mode("overwrite").save(customer1mPath)
end = time()
# Parquet file scanning and writing will be about 3 times faster running on GPU
print("scanning and writing parquet cost : {} seconds".format(round(end - start, 2)))
# Re-read the sample, spread it over 200 partitions and expose it as a temp view
spark.read.parquet(customer1mPath).repartition(200).createOrReplaceTempView("costomer_df_1_million")
# Self-join on an inequality: counts the pairs where c1.c_customer_sk > c2.c_customer_sk
query = '''
select count(*) from costomer_df_1_million c1 inner join costomer_df_1_million c2 on c1.c_customer_sk>c2.c_customer_sk
'''
print("-"*50)

scanning and writing parquet cost : 5.31 seconds
--------------------------------------------------


In [13]:
# Time the crossjoin query over 2 runs
runMicroBenchmark(spark, appName="Crossjoin", query=query, retryTimes=2)

+------------+
|    count(1)|
+------------+
|499999500000|
+------------+

Retry times : 1, Crossjoin microbenchmark takes 6.7 seconds
+------------+
|    count(1)|
+------------+
|499999500000|
+------------+

Retry times : 2, Crossjoin microbenchmark takes 6.37 seconds
Crossjoin microbenchmark takes average 7 seconds after 2 retries
