In [None]:
from pyspark.sql import SparkSession
import os
# Comma-separated list of jars handed to spark.jars; override with JARS_PATH.
jarsPath = os.getenv(
    "JARS_PATH",
    "/data/cuspatial_data/jars/rapids-4-spark_2.12-23.04.0-SNAPSHOT.jar,/data/cuspatial_data/jars/spark-cuspatial-23.04.0-SNAPSHOT.jar",
)

# Session options, applied to the builder in the order listed.
sessionOptions = {
    "spark.jars": jarsPath,
    "spark.sql.adaptive.enabled": "false",
    "spark.executor.memory": "20GB",
    "spark.executor.cores": "6",
    "spark.plugins": "com.nvidia.spark.SQLPlugin",
    "spark.executor.resource.gpu.amount": "1",
}

sessionBuilder = SparkSession.builder
for optionName, optionValue in sessionOptions.items():
    sessionBuilder = sessionBuilder.config(optionName, optionValue)
spark = sessionBuilder.getOrCreate()

# Expose the Java UDF class to Spark SQL under the name "point_in_polygon".
# The return type is passed as None, i.e. it is not declared on the Python side.
spark.udf.registerJavaFunction(
    "point_in_polygon",
    "com.nvidia.spark.rapids.udf.PointInPolygon",
    None,
)

In [None]:
# Data root for this notebook; override with ROOT_PATH.
rootPath = os.getenv("ROOT_PATH", "/data/cuspatial_data")

# Register both polygon shape-file components (.shp and .shx) with the
# SparkContext via addFile.
for shapeFileSuffix in ("/polygons/polygons.shp", "/polygons/polygons.shx"):
    spark.sparkContext.addFile(rootPath + shapeFileSuffix)

# Points are read from <root>/points/ and results are written to <root>/output/.
inputPath, outputPath = rootPath + "/points/", rootPath + "/output/"

In [None]:
# Name of the shape file, supplied through a runtime SQL conf so it can be
# changed on the live session. Presumably read by the point_in_polygon UDF;
# the value matches the .shp file added to the SparkContext.
spark.conf.set(
    "spark.cuspatial.sql.udf.shapeFileName",
    "polygons.shp",
)

In [None]:
# Load the points from parquet and keep x, y plus the result of calling the
# point_in_polygon UDF on each (x, y) pair.
df = spark.read.parquet(inputPath).selectExpr(
    "x",
    "y",
    "point_in_polygon(x, y) as point_in_polygon",
)

In [None]:
import time

# Time the parquet write. This is the action that triggers the computation of
# the projection defined on `df`, so the figure covers the UDF work as well as
# the output I/O.
#
# perf_counter() is used rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring durations, so the result is not
# skewed if the system wall clock is adjusted while the job is running.
begin = time.perf_counter()
df.write.mode("overwrite").parquet(outputPath)  # replaces any previous output
end = time.perf_counter()
print("==> It took {} s".format(round(end - begin, 2)))