# Microbenchmarks on CPU
This is a notebook for microbenchmarks running on CPU.

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from time import time
import os

# Address of the Spark master (ip:port) for your cluster.
# Taken from the SPARK_MASTER_URL environment variable when it is set;
# otherwise the placeholder below is used and must be edited.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://your-ip:port")


Define a helper that runs a microbenchmark query a given number of times and prints the elapsed time of each run and the average.

In [2]:
def runMicroBenchmark(spark, appName, query, retryTimes):
    """Run a SQL query repeatedly and print per-run and average wall-clock times.

    Args:
        spark: Session object whose ``sql(query)`` result exposes ``show(n)``.
        appName: Label inserted into the printed timing messages.
        query: SQL text passed to ``spark.sql``.
        retryTimes: Number of times the query is executed; must be at least 1.

    Raises:
        ValueError: If ``retryTimes`` is less than 1 (an average over zero runs
            is undefined).
    """
    if retryTimes < 1:
        raise ValueError("retryTimes must be at least 1, got {}".format(retryTimes))
    count = 0
    total_time = 0.0
    # You can print the physical plan of each query
    # spark.sql(query).explain()
    while count < retryTimes:
        start = time()
        # show(5) is the action that is being timed.
        spark.sql(query).show(5)
        elapsed = time() - start
        # Accumulate the unrounded duration; rounding is applied only when printing,
        # so rounding error does not build up across runs.
        total_time += elapsed
        count = count + 1
        print("Retry times : {}, ".format(count) + appName + " Microbenchmark takes {} seconds".format(round(elapsed, 2)))
    # Report the average with the same two-decimal precision as the per-run times.
    print(appName + " Microbenchmark takes average {} seconds after {} retries".format(round(total_time / retryTimes, 2), retryTimes))
    

In [3]:
# Adjust the data path and hardware resources below to match your environment!
# Each value is read from the named environment variable, with a fallback default.
dataRoot = os.getenv("DATA_ROOT", "/data")
driverMem = os.getenv("DRIVER_MEM", "50g")
executorMem = os.getenv("EXECUTOR_MEM", "12g")
maxPartionBytes = os.getenv("MAX_PARTITION_BYTES", "1g")
executorCores = int(os.getenv("EXECUTOR_CORES", "4"))

# Common Spark settings, built as a single chained expression.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER_URL)
    .setAppName("Microbenchmark on CPU")
    .set("spark.driver.memory", driverMem)
    .set("spark.executor.memory", executorMem)
    .set("spark.executor.cores", executorCores)
    .set("spark.locality.wait", "0")
    .set("spark.sql.files.maxPartitionBytes", maxPartionBytes)
    .set("spark.dynamicAllocation.enabled", "false")
    .set("spark.sql.adaptive.enabled", "true")
)

# Create the Spark session from the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Load each TPC-DS table from parquet and register it as a temp view of the same name.
for tableName in ("store_sales", "catalog_sales", "web_sales", "item", "date_dim"):
    spark.read.parquet(dataRoot + "/tpcds/" + tableName).createOrReplaceTempView(tableName)


### Expand&HashAggregate
This microbenchmark exercises the Expand and HashAggregate operators on the CPU. The query groups customers by c_current_hdemo_sk and, for each group, counts the distinct values of several dimension columns and computes the average birth year for different c_salutation values.

In [4]:
# The amount of data handled by each task grows a lot while this query runs.
# Spark tries to spread the data over all tasks in the cluster by default, but on
# large clusters with large parquet files the splittable portions of the files may
# not end up evenly distributed. Re-partitioning up front to redistribute the data
# is faster than dealing with the resulting skew.
(
    spark.read
    .parquet(dataRoot + "/tpcds/customer")
    .repartition(512)
    .createOrReplaceTempView("customer")
)

print("-" * 50)

--------------------------------------------------


In [5]:
# SQL text for the Expand&HashAggregate benchmark.
# Rows of the `customer` view are grouped by c_current_hdemo_sk. For each group the
# query computes count(DISTINCT ...) over several columns, each restricted to one of
# the salutations "Ms.", "Mr." or "Dr.", plus avg(c_birth_year) restricted to each of
# ten salutation literals.
query = '''
select c_current_hdemo_sk,
count(DISTINCT if(c_salutation=="Ms.",c_salutation,null)) as c1,
count(DISTINCT if(c_salutation=="Mr.",c_salutation,null)) as c12,
count(DISTINCT if(c_salutation=="Dr.",c_salutation,null)) as c13,

count(DISTINCT if(c_salutation=="Ms.",c_first_name,null)) as c2,
count(DISTINCT if(c_salutation=="Mr.",c_first_name,null)) as c22,
count(DISTINCT if(c_salutation=="Dr.",c_first_name,null)) as c23,

count(DISTINCT if(c_salutation=="Ms.",c_last_name,null)) as c3,
count(DISTINCT if(c_salutation=="Mr.",c_last_name,null)) as c32,
count(DISTINCT if(c_salutation=="Dr.",c_last_name,null)) as c33,

count(DISTINCT if(c_salutation=="Ms.",c_birth_country,null)) as c4,
count(DISTINCT if(c_salutation=="Mr.",c_birth_country,null)) as c42,
count(DISTINCT if(c_salutation=="Dr.",c_birth_country,null)) as c43,

count(DISTINCT if(c_salutation=="Ms.",c_email_address,null)) as c5,
count(DISTINCT if(c_salutation=="Mr.",c_email_address,null)) as c52,
count(DISTINCT if(c_salutation=="Dr.",c_email_address,null)) as c53,

count(DISTINCT if(c_salutation=="Ms.",c_login,null)) as c6,
count(DISTINCT if(c_salutation=="Mr.",c_login,null)) as c62,
count(DISTINCT if(c_salutation=="Dr.",c_login,null)) as c63,

count(DISTINCT if(c_salutation=="Ms.",c_preferred_cust_flag,null)) as c7,
count(DISTINCT if(c_salutation=="Mr.",c_preferred_cust_flag,null)) as c72,
count(DISTINCT if(c_salutation=="Dr.",c_preferred_cust_flag,null)) as c73,

count(DISTINCT if(c_salutation=="Ms.",c_birth_month,null)) as c8,
count(DISTINCT if(c_salutation=="Mr.",c_birth_month,null)) as c82,
count(DISTINCT if(c_salutation=="Dr.",c_birth_month,null)) as c83,

avg(if(c_salutation=="Ms.",c_birth_year,null)) as avg1,
avg(if(c_salutation=="Mr.",c_birth_year,null)) as avg2,
avg(if(c_salutation=="Dr.",c_birth_year,null)) as avg3,
avg(if(c_salutation=="Miss.",c_birth_year,null)) as avg4,
avg(if(c_salutation=="Mrs.",c_birth_year,null)) as avg5,
avg(if(c_salutation=="Sir.",c_birth_year,null)) as avg6,
avg(if(c_salutation=="Professor.",c_birth_year,null)) as avg7,
avg(if(c_salutation=="Teacher.",c_birth_year,null)) as avg8,
avg(if(c_salutation=="Agent.",c_birth_year,null)) as avg9,
avg(if(c_salutation=="Director.",c_birth_year,null)) as avg10
from customer group by c_current_hdemo_sk
'''
# Print a separator line.
print("-"*50)

--------------------------------------------------


In [6]:
# Execute the Expand&HashAggregate query once and print its timing.
runMicroBenchmark(spark=spark, appName="Expand&HashAggregate", query=query, retryTimes=1)

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+------------------+------------------+------------------+----+------------------+----+----+----+----+-----+
|c_current_hdemo_sk| c1|c12|c13| c2|c22|c23| c3|c32|c33| c4|c42|c43| c5|c52| c53| c6|c62|c63| c7|c72|c73| c8|c82|c83|              avg1|              avg2|              avg3|avg4|              avg5|avg6|avg7|avg8|avg9|avg10|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+------------------+------------------+------------------+----+------------------+----+----+----+----+-----+
|              5803|  1|  1|  1|285|272|592|358|496|791|185|202|210|458|674|1177|  0|  0|  0|  2|  2|  2| 12| 12| 12|1959.5225806451613|1959.6557863501484|1958.5581196581197|null| 1958.873873873874|null|null|null|null| null|
|              1591|  1|  1|  1|283|237|544|374|489|739|193|206|211|476|664|1144|  0|  0|  0|  2|  2

### Windowing (without data skew)
This microbenchmark exercises windowing expressions on the CPU. The sub-query computes the average ss_sales_price over a fixed-size window (50 rows preceding through 50 rows following, ordered by ss_sold_date_sk) partitioned by ss_customer_sk, and the parent query averages the sub-query's result for each customer.

In [10]:
# SQL text for the windowing benchmark without skew.
# The inner query filters out rows whose ss_customer_sk is null, then computes a
# windowed average of ss_sales_price per ss_customer_sk; the outer query averages
# those values per customer and sorts by the average in descending order.
query = '''
select ss_customer_sk,avg(avg_price) as avg_price
from
(
SELECT ss_customer_sk ,avg(ss_sales_price) OVER (PARTITION BY ss_customer_sk order by ss_sold_date_sk ROWS BETWEEN 50 PRECEDING AND 50 FOLLOWING ) as avg_price
FROM store_sales
where ss_customer_sk is not null
) group by ss_customer_sk order by 2 desc 
'''
# Print a separator line.
print("-"*50)

--------------------------------------------------


In [11]:
# Execute the windowing query (null customers filtered out) once and print its timing.
runMicroBenchmark(spark=spark, appName="Windowing without skew", query=query, retryTimes=1)

+--------------+------------------+
|ss_customer_sk|         avg_price|
+--------------+------------------+
|      15924921|52.453036568858586|
|      24796404|52.406491887877976|
|      10174233|52.217149302596276|
|      27571451| 52.14256448618126|
|      14299506| 52.09827897444722|
+--------------+------------------+
only showing top 5 rows

Retry times : 1, Windowing without skew Microbenchmark takes 176.61 seconds
Windowing without skew Microbenchmark takes average 177 seconds after 1 retries


### Windowing (with data skew)
This is the same windowing query without the null filter. The data skew is caused by the many null values in the ss_customer_sk column.

In [15]:
# SQL text for the windowing benchmark with skew.
# Same shape as the non-skewed windowing query, but without the
# "ss_customer_sk is not null" filter, so null ss_customer_sk rows are included
# in the window partitioning.
query = '''
select ss_customer_sk,avg(avg_price) as avg_price
from
(
SELECT ss_customer_sk ,avg(ss_sales_price) OVER (PARTITION BY ss_customer_sk order by ss_sold_date_sk ROWS BETWEEN 50 PRECEDING AND 50 FOLLOWING ) as avg_price
FROM store_sales
) group by ss_customer_sk order by 2 desc 
'''
# Print a separator line.
print("-"*50)

--------------------------------------------------


In [16]:
# Execute the windowing query (null customers included) once and print its timing.
runMicroBenchmark(spark=spark, appName="Windowing with skew", query=query, retryTimes=1)

+--------------+------------------+
|ss_customer_sk|         avg_price|
+--------------+------------------+
|      15924921| 52.44865972015809|
|      24796404|52.406491887877976|
|      10174233|52.215293069577626|
|      27571451| 52.14256448618126|
|      14299506| 52.09827897444722|
+--------------+------------------+
only showing top 5 rows

Retry times : 1, Windowing with skew Microbenchmark takes 1666.07 seconds
Windowing with skew Microbenchmark takes average 1666 seconds after 1 retries


### Intersection
This microbenchmark exercises the intersection operation on the CPU. The query finds the items whose brand, class, and category were sold in all three sales channels (store, catalog, and web) during the years 1999 through 2001 (`d_year between 1999 AND 1999 + 2`).

In [19]:
# SQL text for the intersection benchmark.
# Three sub-selects (store_sales, catalog_sales, web_sales, each joined to item and
# date_dim with d_year between 1999 and 1999 + 2) produce (brand, class, category)
# id triples; their INTERSECT is joined back to item to list the matching i_item_sk.
query = '''
select i_item_sk ss_item_sk
 from item,
    (select iss.i_brand_id brand_id, iss.i_class_id class_id, iss.i_category_id category_id
     from store_sales, item iss, date_dim d1
     where ss_item_sk = iss.i_item_sk
                    and ss_sold_date_sk = d1.d_date_sk
       and d1.d_year between 1999 AND 1999 + 2
   intersect
     select ics.i_brand_id, ics.i_class_id, ics.i_category_id
     from catalog_sales, item ics, date_dim d2
     where cs_item_sk = ics.i_item_sk
       and cs_sold_date_sk = d2.d_date_sk
       and d2.d_year between 1999 AND 1999 + 2
   intersect
     select iws.i_brand_id, iws.i_class_id, iws.i_category_id
     from web_sales, item iws, date_dim d3
     where ws_item_sk = iws.i_item_sk
       and ws_sold_date_sk = d3.d_date_sk
       and d3.d_year between 1999 AND 1999 + 2) x
 where i_brand_id = brand_id
   and i_class_id = class_id
   and i_category_id = category_id
'''

In [20]:
# Execute the intersection query once and print its timing.
runMicroBenchmark(spark=spark, appName="NDS Q14a subquery", query=query, retryTimes=1)

+----------+
|ss_item_sk|
+----------+
|    326835|
|    248465|
|    174935|
|    130715|
|     78159|
+----------+
only showing top 5 rows

Retry times : 1, NDS Q14a subquery Microbenchmark takes 62.42 seconds
NDS Q14a subquery Microbenchmark takes average 62 seconds after 1 retries


### Crossjoin
This microbenchmark joins a 1-million-row table with itself using an inequality join condition (`c1.c_customer_sk > c2.c_customer_sk`) and counts the resulting rows.

In [21]:
# The current Spark session has to be stopped and a new one created,
# because this query performs best with more executors that each have fewer cores.
# The executor memory and core values used here are the ones read in the earlier
# configuration cell (executorMem / executorCores).
spark.stop()

# Common Spark settings, plus a raised broadcast threshold for this benchmark.
conf = (
    SparkConf()
    .setMaster(SPARK_MASTER_URL)
    .setAppName("Crossjoin Microbenchmark on CPU")
    .set("spark.driver.memory", driverMem)
    .set("spark.executor.memory", executorMem)
    .set("spark.executor.cores", executorCores)
    .set("spark.locality.wait", "0")
    .set("spark.sql.files.maxPartitionBytes", maxPartionBytes)
    .set("spark.dynamicAllocation.enabled", "false")
    .set("spark.sql.adaptive.enabled", "true")
    # Broadcasting one table turns the CartesianJoin into a
    # BroadcastNestedLoopJoin, which gives better performance.
    .set("spark.sql.autoBroadcastJoinThreshold", 1000000000)
)

# Get or create the Spark session with the new configuration.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

print("-" * 50)

--------------------------------------------------


In [22]:
# Write up to 1,000,000 customer rows to a scratch parquet dataset and time the
# scan + write. The scratch location is derived from dataRoot so that it follows the
# DATA_ROOT setting instead of a fixed absolute path; with the default DATA_ROOT of
# "/data" it resolves to "/data/tmp/customer1m".
customer1mPath = dataRoot + "/tmp/customer1m"
start = time()
spark.read.parquet(dataRoot + "/tpcds/customer").limit(1000000).write.format("parquet").mode("overwrite").save(customer1mPath)
end = time()
print("scanning and writing parquet cost : {} seconds".format(round(end - start, 2)))
# Load the scratch dataset and register it as a temp view.
# The partition number needs to be tuned to get the best performance.
# (The view name's spelling is kept as-is because the query below refers to it.)
spark.read.parquet(customer1mPath).repartition(16000).createOrReplaceTempView("costomer_df_1_million")
# Self-join of the 1-million-row view on an inequality condition, counting the result rows.
query = '''
select count(*) from costomer_df_1_million c1 inner join costomer_df_1_million c2 on c1.c_customer_sk>c2.c_customer_sk
'''
print("-"*50)

scanning and writing parquet cost : 18.18 seconds
--------------------------------------------------


In [23]:
# Execute the crossjoin query once and print its timing.
runMicroBenchmark(spark=spark, appName="Crossjoin", query=query, retryTimes=1)

+------------+
|    count(1)|
+------------+
|499999500000|
+------------+

Retry times : 1, Crossjoin Microbenchmark takes 78.8 seconds
Crossjoin Microbenchmark takes average 79 seconds after 1 retries


In [24]:
# Stop the Spark session held in `spark` now that all benchmarks have run.
spark.stop()