# Open Street Map Benchmarks
The benchmarks below showcase how a user can run analyses on the OpenStreetMap dataset.

In [3]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from time import time
# Filesystem path of the RAPIDS Accelerator plugin jar (per the file name: rapids-4-spark,
# Scala 2.12 build, release 24.10.0). It is passed to spark.jars and to the driver/executor
# extraClassPath settings when the Spark configuration is built.
RAPIDS_JAR = "/usr/lib/spark/jars/rapids-4-spark_2.12-24.10.0.jar"


In [4]:
def runQuery(spark, appName, query, retryTimes):
    """Run ``query`` ``retryTimes`` times and print the wall-clock time of every run.

    Each run executes ``spark.sql(query).show(5)``, so the measured time covers
    everything needed to compute and display the first five result rows.
    After the last run a summary line with the total and average time is printed.

    Args:
        spark: Session-like object exposing ``sql(query)``; the returned object
            must provide ``show(n)``.
        appName: Label used in the printed timing messages.
        query: SQL text to execute.
        retryTimes: Integer number of runs; a value <= 0 runs nothing and
            prints nothing.

    Returns:
        None. All timings are reported on stdout.
    """
    elapsed_times = []
    for attempt in range(1, retryTimes + 1):
        start = time()
        spark.sql(query).show(5)
        # Keep the raw duration; rounding is applied only when printing so the
        # total below does not accumulate rounding error.
        elapsed = time() - start
        elapsed_times.append(elapsed)
        print("Retry times : {}, ".format(attempt) + appName + " benchmark takes {} seconds".format(round(elapsed, 2)))

    # Report the aggregate only when at least one run happened (avoids a
    # division by zero for retryTimes <= 0).
    if elapsed_times:
        total_time = sum(elapsed_times)
        print(appName + " benchmark total : {} seconds over {} runs, average {} seconds".format(
            round(total_time, 2), len(elapsed_times), round(total_time / len(elapsed_times), 2)))

In [5]:
# Common spark settings
conf = SparkConf()
conf.setMaster("yarn")
conf.setAppName("Open Street Map Benchmarks")
conf.set("spark.executor.instances", "2")
conf.set("spark.executor.cores", "16")
conf.set("spark.driver.memory", "10g")
conf.set("spark.driver.cores", "1")
# The tasks run in GPU memory, so there is no need to set a high host memory
conf.set("spark.executor.memory", "16g")

# Schedule tasks immediately instead of waiting for a data-local slot
conf.set("spark.locality.wait", "0")
conf.set("spark.rapids.sql.reader.batchSizeBytes", "512m")
# Keep the executor count fixed at spark.executor.instances
conf.set("spark.dynamicAllocation.enabled", "false")

# Plugin settings
# One GPU per executor
conf.set("spark.executor.resource.gpu.amount", "1")
# 2 tasks will run concurrently per GPU
conf.set("spark.rapids.sql.concurrentGpuTasks", "2")
# Pin 4G of host memory to transfer data between GPU and host memory
conf.set("spark.rapids.memory.pinnedPool.size", "4G")
# Each task claims 1/16 of a GPU, so 16 tasks can be scheduled concurrently per executor,
# matching spark.executor.cores=16
conf.set("spark.task.resource.gpu.amount", "0.0625")
conf.set("spark.rapids.sql.enabled", "true")
conf.set("spark.plugins", "com.nvidia.spark.SQLPlugin")
conf.set("spark.driver.extraClassPath", RAPIDS_JAR)
conf.set("spark.executor.extraClassPath", RAPIDS_JAR)
conf.set("spark.jars", RAPIDS_JAR)
# Create spark session
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Load dataframes and create temp views.
# Every table lives in its own directory under the same root, and the temp view
# is registered under the directory name.
OSM_PARQUET_ROOT = "gs://bigquery_public_data_for_spark/geo_openstreetmap/"
OSM_TABLES = [
    "history_changesets",
    "history_layers",
    "history_nodes",
    "history_relations",
    "history_ways",
    "planet_changesets",
    "planet_features",
    "planet_features_lines",
    "planet_features_multilinestrings",
    "planet_features_multipolygons",
    "planet_features_other_relations",
    "planet_features_points",
    "planet_layers",
    "planet_nodes",
    "planet_relations",
    "planet_ways",
]
for table_name in OSM_TABLES:
    spark.read.parquet(OSM_PARQUET_ROOT + table_name + "/").createOrReplaceTempView(table_name)
print("-"*50)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/20 09:23:47 INFO SparkEnv: Registering MapOutputTracker
25/01/20 09:23:47 INFO SparkEnv: Registering BlockManagerMaster
25/01/20 09:23:47 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/01/20 09:23:47 INFO SparkEnv: Registering OutputCommitCoordinator
25/01/20 09:23:48 WARN RapidsPluginUtils: RAPIDS Accelerator 24.10.0 using cudf 24.10.0, private revision bd4e99e18e20234ee0c54f95f4b0bfce18a6255e
25/01/20 09:23:48 WARN RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.
25/01/20 09:23:48 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
25/01/20 09:23:48 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.
                                  

--------------------------------------------------


### Calculate length of ways with 'highway' tag in Japan


In [48]:
# Counts the non-null osm_id values in planet_features, grouped by feature_type.
# NOTE(review): this query does not compute highway lengths in Japan, although the
# section heading and the "Japan highway" benchmark label suggest so — confirm which
# of the two is intended.
query = '''
select count(osm_id),feature_type  from planet_features group by feature_type

'''

In [49]:
# Run the microbenchmark; the last argument is the number of times the query is executed (1 here)
runQuery(spark,"Japan highway",query,1)

25/01/20 09:50:03 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU

25/01/20 09:50:03 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU

25/01/20 09:50:03 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU bec

+-------------+----------------+
|count(osm_id)|    feature_type|
+-------------+----------------+
|      5628557|   multipolygons|
|      2782002| other_relations|
|    247031768|           lines|
|    184798630|          points|
|       804708|multilinestrings|
+-------------+----------------+

Retry times : 1, Japan highway benchmark takes 26.28 seconds


                                                                                